# Source code for brew.preprocessing.smote

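"""
SMOTE (Synthetic Minority Over-sampling Technique) for oversampling a
minority class.

See: Chawla, Bowyer, Hall and Kegelmeyer (2002), "SMOTE: Synthetic Minority
Over-sampling Technique", Journal of Artificial Intelligence Research 16.
"""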

from __future__ import division

import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote(T, N=100, k=1):
    """Generate synthetic minority-class samples with SMOTE.

    Every synthetic sample is placed at a uniformly random position on the
    line segment joining a minority sample and one of its ``k`` nearest
    neighbors (searched among the minority samples themselves).

    Randomness comes from the global ``np.random`` state; seed it with
    ``np.random.seed`` for reproducible output.

    Parameters
    ----------
    T : array-like, shape (n_minority_samples, n_features)
        Minority class data.
    N : int or float, default 100
        Percentage of oversampling. Values below 100 first keep a random
        ``N`` percent of the rows of ``T`` (without replacement) and then
        generate one synthetic sample per kept row. Values of 100 or more
        must be a multiple of 100 and generate ``N / 100`` synthetic samples
        per row.
    k : int, default 1
        Number of nearest neighbors a synthetic sample may be drawn towards.

    Returns
    -------
    numpy.ndarray, shape (n_synthetic_samples, n_features)
        The synthetic samples only (``T`` itself is not included). The array
        has zero rows when there is nothing to oversample, i.e. ``T`` is
        empty or the ``N`` percent subset rounds down to zero rows.

    Raises
    ------
    ValueError
        If ``k < 1``, if ``N`` is negative, or if ``N`` is at least 100 but
        not a multiple of 100.
    """
    T = np.asarray(T)

    if k < 1:
        raise ValueError('k must be >= 1')
    if N < 0:
        raise ValueError('N must be non-negative')
    if N >= 100 and N % 100 != 0:
        raise ValueError('N must be < 100 OR multiple of 100')

    n_features = T.shape[1]
    if T.shape[0] == 0:
        return np.zeros((0, n_features))

    # Randomly select a subset of the data to be used for creating the
    # synthetic samples.
    if N < 100:
        n_kept = int(T.shape[0] * (N / 100))
        kept_idx = np.random.choice(T.shape[0], size=(n_kept,), replace=False)
        T = T[kept_idx, :]
        N = 100
        if n_kept == 0:
            return np.zeros((0, n_features))
    N = int(N / 100)

    # Modification of the original SMOTE so that it does not break when the
    # minority class is too small in relation to k: draw k + 1 rows with
    # replacement. This must happen AFTER the subsampling above, otherwise
    # the subset could again be too small for the neighbor query. Exactly
    # k + 1 rows is already enough, so that case is left untouched.
    if T.shape[0] < k + 1:
        padded_idx = np.random.choice(T.shape[0], size=(k + 1,))
        T = T[padded_idx, :]

    n_minority_samples = T.shape[0]
    synthetic = np.zeros((N * n_minority_samples, n_features))

    # One batched 2-D query for all samples (scikit-learn rejects 1-D query
    # points). The nearest hit of each row is the sample itself -- or an
    # exact duplicate of it, which is equivalent here -- so ask for k + 1
    # neighbors and drop the first column.
    knn = NearestNeighbors(n_neighbors=k + 1)
    knn.fit(T)
    neighbors_idx = knn.kneighbors(T, return_distance=False)[:, 1:]

    for i, (sample, candidates) in enumerate(zip(T, neighbors_idx)):
        # Randomly choose N neighbors of the sample (with replacement).
        chosen_neighbors = T[np.random.choice(candidates, size=(N,)), :]
        # One interpolation factor in [0, 1) per synthetic sample, shaped
        # (N, 1) so it broadcasts across the feature axis.
        gap = np.random.uniform(low=0.0, high=1.0, size=N)[:, np.newaxis]
        start = i * N
        synthetic[start:start + N, :] = sample + gap * (chosen_neighbors - sample)

    return synthetic