Classifier Machine Learning Template

In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import time
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

#Model Options
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Parallelization
from joblib import Parallel, parallel_backend, register_parallel_backend
from multiprocessing import Pool

Parameter Setting

In [2]:
#Environment Setting (value passed to n_jobs; -1 = use all available cores)
available_processor_count = -1

#Data Splits
pct_data_test = 25
pct_data_train = 75

#Data type (csv, json, hive)
data_type = 'csv'

#Data path
path = "churn.csv"

#Project Type (Classifier or Regression)
ptype = 'Classifier'

#Target Field
target = 'churn'

Data Import, Encoding & Dimensionality Reduction

Principal Component Analysis, or PCA for short, is a method for reducing the dimensionality of data. It can be thought of as a projection method where data with m columns (features) is projected into a subspace with m or fewer columns, whilst retaining the essence of the original data.
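
The cell below defaults to univariate feature selection via SelectKBest and leaves the PCA route commented out. Before committing to a component count, it can help to eyeball the cumulative explained-variance curve; a minimal sketch (it assumes the scaled matrix x built in the cell below already exists):

In [ ]:
#Sketch: plot cumulative explained variance to pick n_components by eye
pca = PCA().fit(x)
cumulative = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumulative) + 1), cumulative)
plt.axhline(y = 0.99, linestyle = '--')   #99% variance threshold
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()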

In [1]:
df = pd.read_csv(path, delimiter=',', header='infer')
number_features = len(df.columns)-1
y = df[[target]]
x = df.drop([target], axis=1)

#encoding with get_dummies
#one-hot encodes categorical fields: e.g. a single gender field holding male/female becomes two boolean 0/1 fields
x = pd.get_dummies( x )

#fill in NA values with zeros
x = x.fillna(0)

#standardize the scale
#note: fitting the scaler on the full dataset before splitting leaks test statistics into training; see the make_pipeline sketch further down
x = StandardScaler().fit_transform(x)

#convert dataframes to numpy arrays
x = np.array(x)
y = np.array(y)

from sklearn.feature_selection import SelectKBest, chi2, f_classif
#keep the k features with the highest ANOVA F-scores, capping k at the available column count
dataset = SelectKBest(f_classif, k=min(50, x.shape[1])).fit_transform(x, y.ravel())

'''
#Find the best number of components that still retain much of the variability from the original dataset
pca = PCA().fit(x)
n_pca = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.99) + 1
pca = PCA(n_components = n_pca)
dataset = pca.fit_transform(x)
'''
train_features, test_features, train_labels, test_labels = train_test_split(dataset, y, test_size = pct_data_test/100)
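
If the target column holds string labels, it is often convenient to integer-encode it up front; LabelEncoder is imported above but never used. A minimal sketch (it assumes y is the target array built in the cell above):

In [ ]:
#Optional: integer-encode a string-valued target with the already-imported LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y.ravel())
#le.classes_ retains the original label names for decoding predictions later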

Choose the best model type

In [39]:
forest_start_time = time.time()

model = RandomForestClassifier()
param_grid = {
    'bootstrap': [True, False],
    'max_depth': list(range(5, 10)),
    'min_samples_leaf': list(range(1, 30)),
    'min_samples_split': list(range(2, 30)),
    'n_estimators': list(range(1, 500))
}

bestforest = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 3, 
                          n_iter = 10, n_jobs = available_processor_count)

bestforest.fit(train_features, train_labels.ravel())
forest_score = bestforest.score(test_features, test_labels.ravel())
print(forest_score)
forest_end_time = time.time()
forest_duration = forest_end_time-forest_start_time
0.55152
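
cross_val_score is imported above but never called. As a sanity check that the randomised search did not overfit its folds, the tuned forest can be re-scored with fresh cross-validation on the training split; a minimal sketch:

In [ ]:
#Sanity check: cross-validate the tuned forest on the training split
cv_scores = cross_val_score(bestforest.best_estimator_, train_features,
                            train_labels.ravel(), cv=5, n_jobs=available_processor_count)
print(cv_scores.mean(), cv_scores.std())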
In [65]:
logistic_start_time = time.time()

model = LogisticRegression()
param_grid = {
    'solver': ['liblinear', 'sag', 'saga'],
    'class_weight': [None, 'balanced'],
    'max_iter': [5, 10, 80, 90, 100, 110]
}

bestlogistic = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 3, 
                          n_iter = 10, n_jobs = available_processor_count)

bestlogistic.fit(train_features, train_labels.ravel())
logistic_score = bestlogistic.score(test_features, test_labels.ravel())
print(logistic_score)

logistic_end_time = time.time()
logistic_duration = logistic_end_time-logistic_start_time
0.57816
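
As noted in the import cell, the scaler was fitted on the full dataset before splitting. make_pipeline (imported above but unused) keeps preprocessing inside each cross-validation fold instead; a minimal sketch of the pattern, shown here on the already-split data purely for illustration:

In [ ]:
#Sketch: fit the scaler inside each CV fold so no test statistics leak into training
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
print(cross_val_score(pipe, train_features, train_labels.ravel(), cv=3).mean())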
In [66]:
neighbour_start_time = time.time()

model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [5, 10, 80, 90, 100, 110],
    'weights': ['uniform', 'distance'],
    'leaf_size': [5, 10, 80, 90, 100, 110]
}

bestneighbour = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 3, 
                          n_iter = 10, n_jobs = available_processor_count)

bestneighbour.fit(train_features, train_labels.ravel())
neighbours_score = bestneighbour.score(test_features, test_labels.ravel())
print(neighbours_score)
neighbour_end_time = time.time()
neighbour_duration = neighbour_end_time-neighbour_start_time
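
RandomizedSearchCV only samples ten points from each grid. GridSearchCV (imported above but unused) can then refine exhaustively around the winner; a minimal sketch for the forest, holding the other winning parameters fixed and varying only n_estimators:

In [ ]:
#Sketch: exhaustive refinement around the random-search winner (forest example)
best = dict(bestforest.best_params_)
centre = best.pop('n_estimators')
refine_grid = {'n_estimators': [max(1, centre - 50), centre, centre + 50]}
refined = GridSearchCV(RandomForestClassifier(**best), refine_grid, cv = 3,
                       n_jobs = available_processor_count)
refined.fit(train_features, train_labels.ravel())
print(refined.best_params_, refined.score(test_features, test_labels.ravel()))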

Recommended model configuration

In [ ]:
if logistic_score > forest_score and logistic_score > neighbours_score:
    selected_model = 'Logistic'
    score = logistic_score
elif forest_score > logistic_score and forest_score > neighbours_score:
    selected_model = 'Forest'
    score = forest_score
elif neighbours_score > logistic_score and neighbours_score > forest_score:
    selected_model = 'Neighbours'
    score = neighbours_score
    
print('''----------------------------------------------''')
print('''BEST MODEL TYPE''')
print('''----------------------------------------------''')
print('''For highest accuracy, the model you should use is: ''' + str(selected_model))
print('''Score: ''' + str(score))
print('''''')
print('''Parameters:''')
if selected_model == 'Logistic':
    print(bestlogistic.best_params_)
elif selected_model == 'Forest':
    print(bestforest.best_params_)
elif selected_model == 'Neighbours':
    print(bestneighbour.best_params_)

change_to_neighbours = round(abs(neighbours_score - score)*100, 2)
change_to_logistic = round(abs(logistic_score - score)*100, 2)
change_to_forest = round(abs(forest_score - score)*100, 2)

neighbour_minus_logistic_duration = round(((neighbour_duration-logistic_duration)/logistic_duration)*100, 2)
forest_minus_logistic_duration = round(((forest_duration-logistic_duration)/logistic_duration)*100, 2)
logistic_minus_forest_duration = round(((logistic_duration-forest_duration)/forest_duration)*100, 2)
neighbour_minus_forest_duration = round(((neighbour_duration-forest_duration)/forest_duration)*100, 2)
forest_minus_neighbour_duration = round(((forest_duration-neighbour_duration)/neighbour_duration)*100, 2)
logistic_minus_neighbour_duration = round(((logistic_duration-neighbour_duration)/neighbour_duration)*100, 2)

if selected_model == 'Logistic':
    print('''''')
    print('''----------------------------------------------''')
    print('''ALTERNATIVE MODEL TYPE''')
    print('''----------------------------------------------''')   
    print('''Nearest neighbours: ''')
    print('''Accuracy Reduction: ''' + str(change_to_neighbours) + '''%''')
    print('''Duration % change (negative = shorter runtime): ''' + str(neighbour_minus_logistic_duration) + '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestneighbour.best_params_)    
    print('''''')
    print('''Random Forest: ''')
    print('''Accuracy Reduction: ''' + str(change_to_forest) + '''%''') 
    print('''Duration % change (negative = shorter runtime): ''' + str(forest_minus_logistic_duration)+ '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestforest.best_params_)
elif selected_model == 'Forest':
    print('''''')
    print('''----------------------------------------------''')
    print('''ALTERNATIVE MODEL TYPE''')
    print('''----------------------------------------------''')   
    print('''Nearest neighbours: ''')
    print('''Accuracy Reduction: ''' + str(change_to_neighbours) + '''%''')
    print('''Duration % change (negative = shorter runtime): ''' + str(neighbour_minus_forest_duration) + '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestneighbour.best_params_)    
    print('''''')
    print('''Logistic regression: ''')
    print('''Accuracy Reduction: ''' + str(change_to_logistic) + '''%''') 
    print('''Duration % change (negative = shorter runtime): ''' + str(logistic_minus_forest_duration)+ '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestlogistic.best_params_)
elif selected_model == 'Neighbours':
    print('''''')
    print('''----------------------------------------------''')
    print('''ALTERNATIVE MODEL TYPE''')
    print('''----------------------------------------------''')
    print('''Random Forest: ''')
    print('''Accuracy Reduction: ''' + str(change_to_forest) + '''%''')
    print('''Duration % change (negative = shorter runtime): ''' + str(forest_minus_neighbour_duration) + '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestforest.best_params_)    
    print('''''')
    print('''Logistic regression: ''')
    print('''Accuracy Reduction: ''' + str(change_to_logistic) + '''%''') 
    print('''Duration % change (negative = shorter runtime): ''' + str(logistic_minus_neighbour_duration)+ '''%''')
    print('''''')
    print('''Parameters: ''')
    print(bestlogistic.best_params_)    
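
Whichever model wins, the search objects have already refit their best configuration on the full training split (refit=True is the default), so the final estimator can be pulled straight out; a minimal sketch:

In [ ]:
#Materialise the winning configuration as a standalone estimator
if selected_model == 'Forest':
    final_model = bestforest.best_estimator_
elif selected_model == 'Logistic':
    final_model = bestlogistic.best_estimator_
else:
    final_model = bestneighbour.best_estimator_
predictions = final_model.predict(test_features)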
In [ ]:
#Add hive support
#Add json support
#Output code
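
For the json item above, pandas covers it directly; hive would need a connector such as pyhive (not shown). A minimal sketch of branching the loader on the data_type parameter from the settings cell:

In [ ]:
#Sketch for the TODO above: branch the loader on data_type
if data_type == 'csv':
    df = pd.read_csv(path, delimiter=',', header='infer')
elif data_type == 'json':
    df = pd.read_json(path)
else:
    raise NotImplementedError('hive support still to be added')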