# Source code for brew.generation.smote_bagging

from __future__ import division

import numpy as np

import sklearn
from sklearn.neighbors import KNeighborsClassifier

from brew.base import Ensemble
from brew.combination.rules import majority_vote_rule
from brew.combination.combiner import Combiner
from brew.generation import Bagging

from brew.preprocessing.smote import smote

from brew.metrics.evaluation import auc_score
from brew.metrics.diversity.paired import paired_metric_ensemble
from brew.metrics.diversity.non_paired import entropy_measure_e

from .base import PoolGenerator


class SmoteBagging(PoolGenerator):
    """Bagging ensemble that rebalances each bootstrap sample with SMOTE.

    Each base classifier is trained on a bootstrap sample in which every
    minority class is resampled at a rate ``b`` (cycled over 10, 20, ..., 100
    across the ensemble) and then topped up to the majority-class size with
    SMOTE-generated synthetic instances.

    Parameters
    ----------
    base_classifier : estimator instance
        A scikit-learn style estimator; it is cloned (``sklearn.base.clone``)
        for every ensemble member, so pass an *instance*, not a class.
    n_classifiers : int
        Number of ensemble members to train.
    combination_rule : str
        Rule name forwarded to ``Combiner`` (default ``'majority_vote'``).
    k : int
        Number of nearest neighbors used by SMOTE.
    """

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote', k=2):
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        """Draw one SMOTE-rebalanced bootstrap sample of (X, y).

        The majority class is bootstrapped at a 100% rate. Each minority
        class is bootstrapped at ``(majority_count / class_count) * (b/100)``
        and then augmented with SMOTE synthetic samples until it reaches
        the majority-class count.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)
            Class labels. NOTE(review): ``np.bincount(y)`` requires
            non-negative integer labels, and the per-class loop indexes the
            count array with the label itself — labels are assumed to be
            0..n_classes-1 style integers.
        b : int
            Minority-class sampling rate in percent (10..100).
        k : int
            Number of nearest neighbors for SMOTE.

        Returns
        -------
        (data, target) : tuple of ndarrays
            The rebalanced sample; every class appears ``majority_count``
            times (minority classes via bootstrap + synthetic instances).
        """
        classes = np.unique(y)
        count = np.bincount(y)              # instances per class label
        majority_class = count.argmax()
        majority_count = count.max()

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for label in classes:
            class_data = X[(y == label), :]

            if label == majority_class:
                # Majority class: regular bootstrap (100% sampling rate).
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate(
                    (target, label * np.ones((majority_count,))))
            else:
                # Minority class: bootstrap at the requested rate ...
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                n_boot = int(sample_rate * class_data.shape[0])
                idx = np.random.choice(class_data.shape[0], (n_boot,))
                sampled_class_data = class_data[idx, :]

                # ... then generate synthetic samples for the remainder.
                # ceil(...) * 100 keeps N_smote a multiple of 100 (as the
                # smote() implementation expects); the 10e-8 epsilon avoids
                # a zero amount when b == 100.
                N_smote = int(np.ceil(
                    (majority_count / sampled_class_data.shape[0]) *
                    (1 - b / 100 + 10e-8)) * 100)

                # BUGFIX: the original ignored the ``k`` argument and always
                # used self.k; the only internal caller passes k=self.k, so
                # behavior is unchanged for existing callers.
                synthetic = smote(sampled_class_data, N=N_smote, k=k)

                # Top the bootstrapped minority data up to majority_count
                # with randomly chosen synthetic instances.
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

                data = np.concatenate((data, new_class_data))
                target = np.concatenate(
                    (target, label * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        """Train ``n_classifiers`` clones, each on its own SMOTE-rebalanced
        bootstrap sample.

        The minority sampling rate ``b`` cycles through 10, 20, ..., 100
        (then wraps back to 10) across consecutive ensemble members.
        """
        self.ensemble = Ensemble()

        b = 10
        for _ in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(X, y, b=b, k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)

            b = 10 if b >= 100 else b + 10

        return

    def predict(self, X):
        """Predict labels for X by combining the ensemble's outputs."""
        out = self.ensemble.output(X)
        return self.combiner.combine(out)


if __name__ == '__main__':
    # Small smoke test on a synthetic 4-class imbalanced dataset.
    X0 = np.random.random((100, 2))
    y0 = 0 * np.ones((100,), dtype='int64')
    X1 = np.random.random((60, 2))
    y1 = 1 * np.ones((60,), dtype='int64')
    X2 = np.random.random((35, 2))
    y2 = 2 * np.ones((35,), dtype='int64')
    X3 = np.random.random((5, 2))
    y3 = 3 * np.ones((5,), dtype='int64')

    print('DATASET before:')
    print('class 0 : {}'.format(X0.shape))
    print('class 1 : {}'.format(X1.shape))
    print('class 2 : {}'.format(X2.shape))
    print('class 3 : {}'.format(X3.shape))
    print()

    X = np.concatenate((X0, X1, X2, X3))
    y = np.concatenate((y0, y1, y2, y3))

    # BUGFIX: base_classifier must be an estimator *instance* (it is passed
    # to sklearn.base.clone); the original passed the class itself.
    knn = KNeighborsClassifier()
    pool = SmoteBagging(base_classifier=knn, n_classifiers=5, k=3)
    pool.fit(X, y)
    print(np.sum(pool.predict(X) == y) / y.size)