knitr::opts_chunk$set(echo = TRUE)
#reticulate allows for Python code in the R environment
library(reticulate)
#set environment and options specific for reticulate
use_condaenv("r-reticulate", required=TRUE)
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
Import the data from assignment 1, with only slight alterations, as the data had been prepped for logistic regression (i.e., leaving in some outliers, to which decision trees are more robust).
df = pd.read_csv('data/assignment2_data_preprocessed.csv')
df.drop(columns='Unnamed: 0', inplace=True)
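(The Unnamed: 0 column is just the index the CSV was saved with; assuming the first column is that index, an equivalent one-step read would be:)
#alternative (illustrative): treat the first CSV column as the index on read
df = pd.read_csv('data/assignment2_data_preprocessed.csv', index_col=0)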
Before we start running anything, we want to have a system for keeping track of our experiments.
# Initialize an empty list to store results
experiment_log = []
# Function to log experiments
def log_experiment(model_name, params, accuracy, precision,
recall, f1=None, auc=None, notes=None):
new_exp_dict = {"model": model_name,
"params": params,
"acccuracy": accuracy,
"precision": precision,
"recall": recall,
"f1": f1,
"auc": auc,
"notes":notes}
experiment_log.append(new_exp_dict)
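As a convenience for later comparisons, here is a small optional helper (a sketch; not used in the experiments below) that views the log sorted by F1:
# Helper (illustrative): view logged experiments, best F1 first
def show_log(sort_by='f1'):
    return pd.DataFrame(experiment_log).sort_values(by=sort_by, ascending=False)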
Before running any experiments, we’ll set a baseline with a Decision Tree model.
#split into X and y
y = df['y_yes']
X = df.drop(columns='y_yes')
Run the model, setting class_weight='balanced' as a default (since the classes are imbalanced in the data).
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
#set parameters
params = {'class_weight':'balanced', 'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train, y_train)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model'
Evaluating the model and logging the results.
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8720916772774626
print(f"Precision: {precision}")
## Precision: 0.44766839378238343
print(f"Recall: {recall}")
## Recall: 0.4302788844621514
print(f"F1 Score: {f1}")
## F1 Score: 0.4388014220416455
print(f"AUC: {auc}")
## AUC: 0.6802343996639507
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
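One caveat on the metrics above: roc_auc_score is fed hard 0/1 predictions here, which collapses the ROC curve to a single operating point. A probability-based AUC would typically look like the sketch below; we keep the hard-label version throughout so the logged runs stay comparable.
#illustrative: ROC AUC from predicted probabilities rather than hard labels
y_proba = dtree.predict_proba(X_test)[:, 1]
print(f"Probability-based AUC: {roc_auc_score(y_test, y_proba)}")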
Accuracy is strong, but the other metrics are weak, demonstrating why accuracy alone is a misleading metric on imbalanced data.
Perhaps we need to resample the data to balance the classes.
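An alternative that avoids resampling entirely is to weight samples during fitting; this is what the compute_sample_weight import supports, and it is effectively what class_weight='balanced' did above. A minimal sketch, not logged here:
#illustrative: per-sample weights as an alternative to resampling
weights = compute_sample_weight(class_weight='balanced', y=y_train)
dtree_w = DecisionTreeClassifier(random_state=920).fit(X_train, y_train, sample_weight=weights)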
One way to actually resample is to simply under-sample the majority class until it matches the minority. This is potentially problematic (we may lose important trends), but it is simple and relies only on real observations from the data.
# Same train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#check balance
y_train_res.value_counts()
## y_yes
## 0.0 4017
## 1.0 4017
## Name: count, dtype: int64
Re-run the same base model with no extra parameters (the undersampling has made class_weight moot):
#set parameters
params = {'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model with majority undersampled'
Results:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.7809931705058456
print(f"Precision: {precision}")
## Precision: 0.3198051948051948
print(f"Recall: {recall}")
## Recall: 0.7848605577689243
print(f"F1 Score: {f1}")
## F1 Score: 0.4544405997693195
print(f"AUC: {auc}")
## AUC: 0.782672584058005
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
A major bump in recall, but precision falls off a cliff. Next, SMOTE, which oversamples the minority class with synthetic interpolated examples rather than discarding majority rows:
#Split the data (note: this split uses random_state=42, not the 920 used elsewhere)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#check balance
y_train_res.value_counts()
## y_yes
## 0.0 30537
## 1.0 30537
## Name: count, dtype: int64
Re-run the base model with the new sample:
#set parameters
params = {'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model with SMOTE'
Results:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8671142493344137
print(f"Precision: {precision}")
## Precision: 0.4353680430879713
print(f"Recall: {recall}")
## Recall: 0.48306772908366535
print(f"F1 Score: {f1}")
## F1 Score: 0.45797922568460814
print(f"AUC: {auc}")
## AUC: 0.7003419850395406
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Back to fairly high accuracy, but still middling precision and recall. Still, SMOTE has the best F1 score by a slight margin, so we'll stick with it as our default sampling methodology. Next, we sweep max_depth to control tree complexity:
#Split the data (again with random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#make set of different max depths to try, working down
max_depths = [100, 50, 30, 20, 15, 10, 5, 3, 2]
#Cycle through and record results
for depth in max_depths:
#set new parameters
params = {'class_weight':'balanced',
'max_depth':depth,
'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = f'changed max depth to {depth}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
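To see where the sweep peaked, we can filter the log by the notes we attached (a sketch using the columns defined in log_experiment above):
#illustrative: compare F1 across the max_depth runs
depth_runs = df_log[df_log['notes'].str.contains('max depth', na=False)]
print(depth_runs[['notes', 'f1']].sort_values(by='f1', ascending=False))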
It looks like my best F1 came at max_depth=10. There's more we can do.
While there are other experiments we could run with single Decision Trees, it's worth exploring ensemble methods.
Let's first check the different sampling methodologies again. We could run with SMOTE, but there's no guarantee that it's optimal for Random Forests just because it worked for a single Decision Tree.
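(Worth noting: Random Forests also accept class_weight='balanced_subsample', which recomputes the balanced weights within each bootstrap sample rather than once globally. A sketch of that knob, not logged here:)
#illustrative: per-bootstrap class rebalancing built into the forest
rf_bs = RandomForestClassifier(class_weight='balanced_subsample', random_state=920)
First, plain class_weight='balanced' inside the forest: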
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
#set parameters
params = {'class_weight':'balanced', 'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, balanced classes in model'
Check the results and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8988308832040746
print(f"Precision: {precision}")
## Precision: 0.6431718061674009
print(f"Recall: {recall}")
## Recall: 0.2908366533864542
print(f"F1 Score: {f1}")
## F1 Score: 0.40054869684499317
print(f"AUC: {auc}")
## AUC: 0.6348092893651327
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Now SMOTE:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#set parameters (none in this case, for the baseline)
params = {'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, SMOTE'
Test and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.9046185901145966
print(f"Precision: {precision}")
## Precision: 0.6480263157894737
print(f"Recall: {recall}")
## Recall: 0.39243027888446214
print(f"F1 Score: {f1}")
## F1 Score: 0.48883374689826303
print(f"AUC: {auc}")
## AUC: 0.682200732107588
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
And finally, majority undersampling:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#set parameters (none in this case, for the baseline)
params = {'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, undersampling'
Test and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8393332561639079
print(f"Precision: {precision}")
## Precision: 0.40977443609022557
print(f"Recall: {recall}")
## Recall: 0.8685258964143426
print(f"F1 Score: {f1}")
## F1 Score: 0.5568326947637292
print(f"AUC: {auc}")
## AUC: 0.8520101649720699
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Interesting: precision was fairly high with SMOTE but recall was depressed, while undersampling traded precision away for very high recall. Since the F1 score was better with undersampling, let's go with that.
We can also limit the number of features considered at each split via max_features.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#make set of different max_features values to try
mfs = ['sqrt', 'log2', 0.3, 0.5, 5]
#Cycle through and record results
for mf in mfs:
#set new parameters
params = {'n_estimators':100,
'max_features':mf,
'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = f'undersampling, max_features set to {mf}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
F1 hits a ceiling of about 0.56, with max_features set to 'sqrt'.
When the issue is precision, increasing the number of trees won't necessarily help. Instead, we want to tune min_samples_leaf and min_samples_split.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#make set of different values to try
min_samples_split_vals = [2, 5, 10, 15, 25, 50, 100]  #min_samples_split must be >= 2
min_samples_leaf_vals = [2, 5, 10, 15, 25, 50, 100]
#Cycle through and record results
for split_val in min_samples_split_vals:
for leaf_val in min_samples_leaf_vals:
#set new parameters
params = {'n_estimators':100,
'max_features':'sqrt',
'min_samples_split':split_val,
'min_samples_leaf':leaf_val,
'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'undersampling. See params.'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
No notable improvement over the base model. Notably, performance essentially peaks with min_samples_split and min_samples_leaf at 5, with diminishing returns as those values increase. Setting them there could save computational resources.
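A sketch of that more economical configuration, per the sweep above:
#illustrative: the cheaper forest suggested by the sweep
econ_params = {'n_estimators': 100,
               'max_features': 'sqrt',
               'min_samples_split': 5,
               'min_samples_leaf': 5,
               'random_state': 920}
rf_econ = RandomForestClassifier(**econ_params)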
Time to try boosting.
As a starting point, it stands to reason that I'd want to boost my best decision tree. Let's see what that is:
df_log[df_log['model']=='Decision Tree'].sort_values(by='f1', ascending=False).iloc[0]
## model Decision Tree
## params {'class_weight': 'balanced', 'max_depth': 10, ...
## accuracy 0.899873
## precision 0.58072
## recall 0.498008
## f1 0.536193
## auc 0.725363
## notes changed max depth to 10
## Name: 8, dtype: object
df_log[df_log['model']=='Decision Tree'].sort_values(by='f1', ascending=False).iloc[0]['params']
## {'class_weight': 'balanced', 'max_depth': 10, 'random_state': 920}
So our strongest decision tree used balanced class weights and a max depth of 10.
However, AdaBoost is meant for weak learners, and a max depth of 10 is too high. Let's run another max-depth experiment, but cap it at 8 and increment more slowly.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
for depth in [1,2,3,4,5,6,7,8]:
#set parameters based on results above
params = {'class_weight': 'balanced', 'max_depth': depth, 'random_state': 920}
# Create a Decision Tree Classifier
base_tree = DecisionTreeClassifier(**params)
# AdaBoost with optimized base tree
ada = AdaBoostClassifier(
estimator=base_tree,
n_estimators=100,
random_state=920
)
# Fit model on original training data (no resampling!)
ada.fit(X_train, y_train)
# Evaluate on untouched test set
y_pred = ada.predict(X_test)
#set any relevant notes
notes = 'See params for dtree. AdaBoost with n_estimators=100.'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Adaboost', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
The best F1 came at max depth 3.
Now let's try learning_rate and n_estimators in tandem:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
for n in [20, 50, 100, 200, 300]:
for rate in [1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.01]:
#set parameters based on results above
params = {'class_weight': 'balanced',
'max_depth': 3,
'random_state': 920}
# Create a Decision Tree Classifier
base_tree = DecisionTreeClassifier(**params)
# AdaBoost with optimized base tree
ada = AdaBoostClassifier(
estimator=base_tree,
n_estimators=n,
learning_rate=rate,
random_state=920
)
# Fit model on original training data (no resampling!)
ada.fit(X_train, y_train)
# Evaluate on untouched test set
y_pred = ada.predict(X_test)
#set any relevant notes
notes = f'learning rate: {rate}, n = {n}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Adaboost', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
F1 was highest at a learning rate of 1, irrespective of n. This tells us that if we wanted to use AdaBoost, we're better off tweaking the learning rate than worrying about the number of trees. We can use a relatively low number of estimators (like 20) and still achieve high performance.
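A minimal sketch of that lighter configuration (depth-3 balanced trees, 20 estimators, learning rate 1):
#illustrative: the economical AdaBoost suggested by the sweeps
base_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=3, random_state=920)
ada_light = AdaBoostClassifier(estimator=base_tree, n_estimators=20,
                               learning_rate=1.0, random_state=920).fit(X_train, y_train)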
df_log_top = df_log.sort_values('f1', ascending=False).groupby('model').head(3)
for col in df_log_top.columns:
if df_log_top[col].dtype == float:
df_log_top[col] = df_log_top[col].round(3)
#df_log_top['params'].to_clipboard()
df['duration'].median()
## 180.0