knitr::opts_chunk$set(echo = TRUE)
#reticulate allows for Python code in the R environment
library(reticulate)
#set environment and options specific for reticulate
use_condaenv("r-reticulate", required=TRUE)
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
Import the data from assignment 1, with only slight alterations, as the data had been prepped for logistic regression (i.e., leaving in some outliers, to which decision trees are more robust).
df = pd.read_csv('data/assignment2_data_preprocessed.csv')
df.drop(columns='Unnamed: 0', inplace=True)
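(The Unnamed: 0 column is just the index the CSV was saved with; assuming the first column is that index, an equivalent one-step read would be:)
#alternative (illustrative): treat the first CSV column as the index on read
df = pd.read_csv('data/assignment2_data_preprocessed.csv', index_col=0)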
Before we start running anything, we want to have a system for keeping track of our experiments.
# Initialize an empty list to store results
experiment_log = []
# Function to log experiments
def log_experiment(model_name, params, accuracy, precision,
recall, f1=None, auc=None, notes=None):
new_exp_dict = {"model": model_name,
"params": params,
"acccuracy": accuracy,
"precision": precision,
"recall": recall,
"f1": f1,
"auc": auc,
"notes":notes}
experiment_log.append(new_exp_dict)
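As a convenience for later comparisons, here is a small optional helper (a sketch; not used in the experiments below) that views the log sorted by F1:
# Helper (illustrative): view logged experiments, best F1 first
def show_log(sort_by='f1'):
    return pd.DataFrame(experiment_log).sort_values(by=sort_by, ascending=False)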
Before running any experiments, we’ll set a baseline with a Decision Tree model.
#split into X and y
y = df['y_yes']
X = df.drop(columns='y_yes')
Run the model, setting class_weight='balanced' as a default (since the classes are imbalanced in the data).
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
#set parameters
params = {'class_weight':'balanced', 'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train, y_train)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model'
Evaluating the model and logging the results.
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8720916772774626
print(f"Precision: {precision}")
## Precision: 0.44766839378238343
print(f"Recall: {recall}")
## Recall: 0.4302788844621514
print(f"F1 Score: {f1}")
## F1 Score: 0.4388014220416455
print(f"AUC: {auc}")
## AUC: 0.6802343996639507
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
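One caveat on the metrics above: roc_auc_score is fed hard 0/1 predictions here, which collapses the ROC curve to a single operating point. A probability-based AUC would typically look like the sketch below; we keep the hard-label version throughout so the logged runs stay comparable.
#illustrative: ROC AUC from predicted probabilities rather than hard labels
y_proba = dtree.predict_proba(X_test)[:, 1]
print(f"Probability-based AUC: {roc_auc_score(y_test, y_proba)}")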
Accuracy is strong, but the other metrics are weak, demonstrating why accuracy alone is a misleading metric on imbalanced data.
Perhaps we need to resample the data to balance the classes.
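An alternative that avoids resampling entirely is to weight samples during fitting; this is what the compute_sample_weight import supports, and it is effectively what class_weight='balanced' did above. A minimal sketch, not logged here:
#illustrative: per-sample weights as an alternative to resampling
weights = compute_sample_weight(class_weight='balanced', y=y_train)
dtree_w = DecisionTreeClassifier(random_state=920).fit(X_train, y_train, sample_weight=weights)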
One way to actually resample is to simply under-sample the majority class until it matches the minority. This is potentially problematic (we may lose important trends), but it is simple and relies only on real observations from the data.
# Same train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#check balance
y_train_res.value_counts()
## y_yes
## 0.0 4017
## 1.0 4017
## Name: count, dtype: int64
Re-run the same base model with no extra parameters (the undersampling has made class_weight moot):
#set parameters
params = {'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model with majority undersampled'
Results:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.7809931705058456
print(f"Precision: {precision}")
## Precision: 0.3198051948051948
print(f"Recall: {recall}")
## Recall: 0.7848605577689243
print(f"F1 Score: {f1}")
## F1 Score: 0.4544405997693195
print(f"AUC: {auc}")
## AUC: 0.782672584058005
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
A major bump in recall, but precision falls off a cliff. Next, SMOTE, which oversamples the minority class with synthetic interpolated examples rather than discarding majority rows:
#Split the data (note: this split uses random_state=42, not the 920 used elsewhere)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#check balance
y_train_res.value_counts()
## y_yes
## 0.0 30537
## 1.0 30537
## Name: count, dtype: int64
Re-run the base model with the new sample:
#set parameters
params = {'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = 'baseline model with SMOTE'
Results:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8671142493344137
print(f"Precision: {precision}")
## Precision: 0.4353680430879713
print(f"Recall: {recall}")
## Recall: 0.48306772908366535
print(f"F1 Score: {f1}")
## F1 Score: 0.45797922568460814
print(f"AUC: {auc}")
## AUC: 0.7003419850395406
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Back to fairly high accuracy, but still middling precision and recall. Still, SMOTE has the best F1 score by a slight margin, so we'll stick with it as our default sampling methodology. Next, we sweep max_depth to control tree complexity:
#Split the data (again with random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#make set of different max depths to try, working down
max_depths = [100, 50, 30, 20, 15, 10, 5, 3, 2]
#Cycle through and record results
for depth in max_depths:
#set new parameters
params = {'class_weight':'balanced',
'max_depth':depth,
'random_state':920}
# Create a Decision Tree Classifier
dtree = DecisionTreeClassifier(**params)
# Train the model
dtree = dtree.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = dtree.predict(X_test)
#set any relevant notes
notes = f'changed max depth to {depth}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Decision Tree', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
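To see where the sweep peaked, we can filter the log by the notes we attached (a sketch using the columns defined in log_experiment above):
#illustrative: compare F1 across the max_depth runs
depth_runs = df_log[df_log['notes'].str.contains('max depth', na=False)]
print(depth_runs[['notes', 'f1']].sort_values(by='f1', ascending=False))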
It looks like my best F1 came at max_depth=10. There's more we can do.
While there are other experiments we could run with single Decision Trees, it's worth exploring ensemble methods.
Let's first check the different sampling methodologies again. We could run with SMOTE, but there's no guarantee that it's optimal for Random Forests just because it worked for a single Decision Tree.
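(Worth noting: Random Forests also accept class_weight='balanced_subsample', which recomputes the balanced weights within each bootstrap sample rather than once globally. A sketch of that knob, not logged here:)
#illustrative: per-bootstrap class rebalancing built into the forest
rf_bs = RandomForestClassifier(class_weight='balanced_subsample', random_state=920)
First, plain class_weight='balanced' inside the forest: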
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
#set parameters
params = {'class_weight':'balanced', 'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, balanced classes in model'
Check the results and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8988308832040746
print(f"Precision: {precision}")
## Precision: 0.6431718061674009
print(f"Recall: {recall}")
## Recall: 0.2908366533864542
print(f"F1 Score: {f1}")
## F1 Score: 0.40054869684499317
print(f"AUC: {auc}")
## AUC: 0.6348092893651327
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Now SMOTE:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply SMOTE to training data only
sm = SMOTE(random_state=920)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
#set parameters (none in this case, for the baseline)
params = {'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, SMOTE'
Test and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.9046185901145966
print(f"Precision: {precision}")
## Precision: 0.6480263157894737
print(f"Recall: {recall}")
## Recall: 0.39243027888446214
print(f"F1 Score: {f1}")
## F1 Score: 0.48883374689826303
print(f"AUC: {auc}")
## AUC: 0.682200732107588
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
And finally, majority undersampling:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#set parameters (none in this case, for the baseline)
params = {'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'baseline random forest model, undersampling'
Test and log:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
## Accuracy: 0.8393332561639079
print(f"Precision: {precision}")
## Precision: 0.40977443609022557
print(f"Recall: {recall}")
## Recall: 0.8685258964143426
print(f"F1 Score: {f1}")
## F1 Score: 0.5568326947637292
print(f"AUC: {auc}")
## AUC: 0.8520101649720699
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
Interesting: precision was fairly high with SMOTE but recall was depressed, while undersampling traded precision away for very high recall. Since the F1 score was better with undersampling, let's go with that.
We can also limit the number of features considered at each split via max_features.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#make set of different max_features values to try
mfs = ['sqrt', 'log2', 0.3, 0.5, 5]
#Cycle through and record results
for mf in mfs:
#set new parameters
params = {'n_estimators':100,
'max_features':mf,
'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = f'undersampling, max_features set to {mf}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
F1 hits a ceiling of about 0.56, with max_features set to 'sqrt'.
When the issue is precision, increasing the number of trees won't necessarily help. Instead, we want to tune min_samples_leaf and min_samples_split.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
# Apply undersampling
rus = RandomUnderSampler(random_state=920)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
#make set of different values to try
min_samples_split_vals = [2, 5, 10, 15, 25, 50, 100]  #min_samples_split must be >= 2
min_samples_leaf_vals = [2, 5, 10, 15, 25, 50, 100]
#Cycle through and record results
for split_val in min_samples_split_vals:
for leaf_val in min_samples_leaf_vals:
#set new parameters
params = {'n_estimators':100,
'max_features':'sqrt',
'min_samples_split':split_val,
'min_samples_leaf':leaf_val,
'random_state':920}
# Create a Random Forest Classifier
rf = RandomForestClassifier(**params)
# Train the model
rf = rf.fit(X_train_res, y_train_res)
# Make predictions on the test set
y_pred = rf.predict(X_test)
#set any relevant notes
notes = 'undersampling. See params.'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Random Forest', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
No notable improvement over the base model. Notably, performance essentially peaks with min_samples_split and min_samples_leaf at 5, with diminishing returns as those values increase. Setting them there could save computational resources.
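A sketch of that more economical configuration, per the sweep above:
#illustrative: the cheaper forest suggested by the sweep
econ_params = {'n_estimators': 100,
               'max_features': 'sqrt',
               'min_samples_split': 5,
               'min_samples_leaf': 5,
               'random_state': 920}
rf_econ = RandomForestClassifier(**econ_params)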
Time to try boosting.
As a starting point, it stands to reason that I'd want to boost my best decision tree. Let's see what that is:
df_log[df_log['model']=='Decision Tree'].sort_values(by='f1', ascending=False).iloc[0]
## model Decision Tree
## params {'class_weight': 'balanced', 'max_depth': 10, ...
## accuracy 0.899873
## precision 0.58072
## recall 0.498008
## f1 0.536193
## auc 0.725363
## notes changed max depth to 10
## Name: 8, dtype: object
df_log[df_log['model']=='Decision Tree'].sort_values(by='f1', ascending=False).iloc[0]['params']
## {'class_weight': 'balanced', 'max_depth': 10, 'random_state': 920}
So our strongest decision tree used balanced class weights and a max depth of 10.
However, AdaBoost is meant for weak learners, and a max depth of 10 is too high. Let's run another max-depth experiment, but cap it at 8 and increment more slowly.
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
for depth in [1,2,3,4,5,6,7,8]:
#set parameters based on results above
params = {'class_weight': 'balanced', 'max_depth': depth, 'random_state': 920}
# Create a Decision Tree Classifier
base_tree = DecisionTreeClassifier(**params)
# AdaBoost with optimized base tree
ada = AdaBoostClassifier(
estimator=base_tree,
n_estimators=100,
random_state=920
)
# Fit model on original training data (no resampling!)
ada.fit(X_train, y_train)
# Evaluate on untouched test set
y_pred = ada.predict(X_test)
#set any relevant notes
notes = 'See params for dtree. AdaBoost with n_estimators=100.'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Adaboost', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
The best F1 came at max depth 3.
Now let's try learning_rate and n_estimators in tandem:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=920)
for n in [20, 50, 100, 200, 300]:
for rate in [1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.01]:
#set parameters based on results above
params = {'class_weight': 'balanced',
'max_depth': 3,
'random_state': 920}
# Create a Decision Tree Classifier
base_tree = DecisionTreeClassifier(**params)
# AdaBoost with optimized base tree
ada = AdaBoostClassifier(
estimator=base_tree,
n_estimators=n,
learning_rate=rate,
random_state=920
)
# Fit model on original training data (no resampling!)
ada.fit(X_train, y_train)
# Evaluate on untouched test set
y_pred = ada.predict(X_test)
#set any relevant notes
notes = f'learning rate: {rate}, n = {n}'
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
#log
log_experiment('Adaboost', params, accuracy, precision,
recall, f1, auc, notes)
df_log = pd.DataFrame(experiment_log)
F1 was highest at a learning rate of 1, irrespective of n. This tells us that if we wanted to use AdaBoost, we're better off tweaking the learning rate than worrying about the number of trees. We can use a relatively low number of estimators (like 20) and still achieve high performance.
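A minimal sketch of that lighter configuration (depth-3 balanced trees, 20 estimators, learning rate 1):
#illustrative: the economical AdaBoost suggested by the sweeps
base_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=3, random_state=920)
ada_light = AdaBoostClassifier(estimator=base_tree, n_estimators=20,
                               learning_rate=1.0, random_state=920).fit(X_train, y_train)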
df_log_top = df_log.sort_values('f1', ascending=False).groupby('model').head(3)
for col in df_log_top.columns:
if df_log_top[col].dtype == float:
df_log_top[col] = df_log_top[col].round(3)
#df_log_top['params'].to_clipboard()
df['duration'].median()
## 180.0