Classification modelling

1. Exploratory data analysis

Exploratory data analysis has been undertaken and is documented in the separate CRISP-DM report submission. The business goal of this modelling exercise is to target existing customers for a re-purchase campaign. Data cleaning, processing and model building are outlined below.

#import data
repurchase_training <- read.csv("raw_data/repurchase_training.csv")
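
The analysis below uses a number of packages; a setup chunk along these lines is assumed (package names inferred from the functions called, e.g. melt from reshape2, cv.glmnet from glmnet):

#Packages assumed to be loaded for the analysis below
library(dplyr)        #pipes, filter, group_by, summarize
library(reshape2)     #melt
library(ggplot2)      #plotting
library(glmnet)       #cv.glmnet (LASSO)
library(rpart)        #decision trees
library(rpart.plot)   #prp
library(randomForest) #random forests
library(ROCR)         #prediction/performance (AUC)
library(vip)          #variable importance plots
library(pdp)          #partial (partial dependence)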

Cleaning the data

#Convert the categorical columns (age_band, gender, car_model, car_segment) to character, so their coded values are treated as text rather than numbers

eda_data <- repurchase_training

for (i in 3:6) {
  eda_data[,i] = as.character(eda_data[,i])
}
  
str(eda_data)
## 'data.frame':    131337 obs. of  17 variables:
##  $ ID                       : int  1 2 3 5 6 7 8 9 10 11 ...
##  $ Target                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_band                 : chr  "3. 35 to 44" "NULL" "NULL" "NULL" ...
##  $ gender                   : chr  "Male" "NULL" "Male" "NULL" ...
##  $ car_model                : chr  "model_1" "model_2" "model_3" "model_3" ...
##  $ car_segment              : chr  "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
##  $ age_of_vehicle_years     : int  9 6 9 5 8 7 8 7 1 3 ...
##  $ sched_serv_warr          : int  2 10 10 8 9 4 2 4 2 1 ...
##  $ non_sched_serv_warr      : int  10 3 9 5 4 10 8 9 1 1 ...
##  $ sched_serv_paid          : int  3 10 10 8 10 5 2 6 1 2 ...
##  $ non_sched_serv_paid      : int  7 4 9 4 7 7 9 9 3 1 ...
##  $ total_paid_services      : int  5 9 10 5 9 6 9 8 1 2 ...
##  $ total_services           : int  6 10 10 6 8 8 4 6 2 1 ...
##  $ mth_since_last_serv      : int  9 6 7 4 5 8 7 9 1 1 ...
##  $ annualised_mileage       : int  8 10 10 10 4 5 6 5 1 1 ...
##  $ num_dealers_visited      : int  10 7 6 9 4 10 10 5 2 1 ...
##  $ num_serv_dealer_purchased: int  4 10 10 7 9 4 4 8 3 1 ...
#Find nulls in the data to work out which columns have a lot of empty records
total_nulls <- data.frame(type = names(eda_data[, -1]), 
                    nulls = colSums(eda_data[, -1] == "NULL"),
                    values = colSums(eda_data[, -1] != "NULL")
                    )

total_nulls <- melt(total_nulls, id.var="type")

# We can see from this that 'age_band' has very few values across the training set
# We can also see that gender has a value in only about 50% of rows
# Neither is likely to be very usable for our model
ggplot(total_nulls, aes(x = type, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  coord_flip()
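
As a quick numeric check of those proportions (a one-line sketch, not part of the original output):

#Share of "NULL" entries in the two sparsest columns
colMeans(eda_data[, c("age_band", "gender")] == "NULL")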

Let’s look at some characteristics of the cars to see if we can detect any patterns

#First write a function that will take a characteristic and return a data frame that gives us the count of entries with target = 1 and target = 0
get_counts <- function (attribute_df, count_df) {

  output = data.frame(matrix(ncol = 3, nrow = 0), stringsAsFactors = FALSE)
  attribute_name = colnames(attribute_df)[1]
  names(output) = c(attribute_name, "target", "non_target")
  
  for(i in 1:nrow(attribute_df)) {
    my_attribute = as.character(attribute_df[i, attribute_name])

    my_target_count = count_df %>%
        filter(Target == 1) %>%
        filter(.[,2] == my_attribute)
    my_no_target_count = count_df %>%
        filter(Target == 0) %>%
        filter(.[,2] == my_attribute)
    
    if(nrow(my_target_count) > 0) {
      target_count = as.character(my_target_count$count)
    } else {
      target_count = "0"
    }
      
    if(nrow(my_no_target_count) > 0) {
      no_target_count = as.character(my_no_target_count$count)
    } else {
      no_target_count = "0"
    }
    
    temp <- data.frame(my_attribute, target_count, no_target_count)
    names(temp) <- c(attribute_name, "target", "non_target")
    
    output = rbind(output, temp)
    output[,1] <- as.character(output[,1])
    output[,2] <- as.integer(output[,2])
    output[,3] <- as.integer(output[,3])
    
  }
  
  return(output)
}
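
As an aside, the same per-category counts could be produced more compactly with dplyr and tidyr. A sketch for the car_model attribute, assuming tidyr is available:

#Compact equivalent of get_counts for a single attribute
car_model_wide <- eda_data %>%
  count(car_model, Target) %>%
  tidyr::pivot_wider(names_from = Target, values_from = n,
                     values_fill = 0, names_prefix = "target_")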
#melt and plot car models
car_models <- eda_data %>%
    group_by(car_model) %>%
    summarize(count = n())

car_models_counts <- eda_data %>%
    group_by(Target, car_model) %>%
    summarize(count = n())

car_model_data <- get_counts(car_models, car_models_counts)

car_model_data <- melt(car_model_data, id.var="car_model")

ggplot(car_model_data, aes(x = car_model, y = value, fill = variable)) +
  geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#plot car segments variable
car_segments <- eda_data %>%
    group_by(car_segment) %>%
    summarize(count = n())

car_segments_counts <- eda_data %>%
    group_by(Target, car_segment) %>%
    summarize(count = n())

car_segment_data <- get_counts(car_segments, car_segments_counts)

car_segment_data <- melt(car_segment_data, id.var="car_segment")

ggplot(car_segment_data, aes(x = car_segment, y = value, fill = variable)) +
  geom_bar(stat = "identity")

#plot age of vehicle in years variable
age_of_vehicle_years <- eda_data %>%
   group_by(age_of_vehicle_years) %>%
   summarize(count = n())

age_of_vehicle_years_counts <- eda_data %>%
  group_by(Target, age_of_vehicle_years) %>%
  summarize(count = n())

age_of_vehicle_years_data <- get_counts(age_of_vehicle_years, age_of_vehicle_years_counts)

age_of_vehicle_years_data <- melt(age_of_vehicle_years_data, id.var="age_of_vehicle_years")

ggplot(age_of_vehicle_years_data, aes(x = age_of_vehicle_years, y = value, fill = variable)) +
  geom_bar(stat = "identity")

Look at some service history characteristics of the cars to see if we can detect any patterns

#Number of scheduled services used under warranty
sched_serv_warr_values <- eda_data %>%
  group_by(sched_serv_warr) %>%
    summarize(count = n())

sched_serv_warr_counts <- eda_data %>%
  group_by(Target, sched_serv_warr) %>%
  summarize(count = n())

sched_serv_warr_data <- get_counts(sched_serv_warr_values, sched_serv_warr_counts)

sched_serv_warr_data <- melt(sched_serv_warr_data, id.var="sched_serv_warr")

ggplot(sched_serv_warr_data, aes(x = sched_serv_warr, y = value, fill = variable)) +
  geom_bar(stat = "identity")

#Number of services had at the same dealer where the vehicle was purchased
num_serv_dealer_purchased_values <- eda_data %>%
  group_by(num_serv_dealer_purchased) %>%
    summarize(count = n())

num_serv_dealer_purchased_counts <- eda_data %>%
  group_by(Target, num_serv_dealer_purchased) %>%
  summarize(count = n())

num_serv_dealer_purchased_data <- get_counts(num_serv_dealer_purchased_values, num_serv_dealer_purchased_counts)

num_serv_dealer_purchased_data <- melt(num_serv_dealer_purchased_data, id.var="num_serv_dealer_purchased")

ggplot(num_serv_dealer_purchased_data, aes(x = num_serv_dealer_purchased, y = value, fill = variable)) +
  geom_bar(stat = "identity")

2. Building a linear classification model

Run a logistic regression model (a linear classifier) with the training data.

lrm_data <- repurchase_training

#We fit a logistic regression on the full training data, excluding IDs
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_data[,-1])

summary(glm.model)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_data[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3059  -0.1867  -0.0606  -0.0233   4.8892  
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.622319   0.557257  -8.295  < 2e-16 ***
## age_band2. 25 to 34         1.229115   0.582886   2.109 0.034973 *  
## age_band3. 35 to 44         1.404695   0.576427   2.437 0.014814 *  
## age_band4. 45 to 54         1.656169   0.570158   2.905 0.003675 ** 
## age_band5. 55 to 64         1.774010   0.571009   3.107 0.001891 ** 
## age_band6. 65 to 74         1.489551   0.599670   2.484 0.012993 *  
## age_band7. 75+              1.985368   0.636082   3.121 0.001801 ** 
## age_bandNULL                1.782812   0.550253   3.240 0.001195 ** 
## genderMale                  0.426431   0.069093   6.172 6.75e-10 ***
## genderNULL                 -0.109306   0.064938  -1.683 0.092329 .  
## car_modelmodel_10          -0.793655   0.165332  -4.800 1.58e-06 ***
## car_modelmodel_11          -1.368900   0.353951  -3.867 0.000110 ***
## car_modelmodel_12          -1.014117   0.430728  -2.354 0.018551 *  
## car_modelmodel_13           1.444475   0.204250   7.072 1.53e-12 ***
## car_modelmodel_14         -11.535142  87.036102  -0.133 0.894563    
## car_modelmodel_15           1.611429   0.388465   4.148 3.35e-05 ***
## car_modelmodel_16           0.840084   0.595070   1.412 0.158027    
## car_modelmodel_17          -1.291977   1.034821  -1.249 0.211847    
## car_modelmodel_18          -0.336041   0.516382  -0.651 0.515200    
## car_modelmodel_19         -11.630748 614.640130  -0.019 0.984903    
## car_modelmodel_2            0.647902   0.070808   9.150  < 2e-16 ***
## car_modelmodel_3            0.821045   0.073260  11.207  < 2e-16 ***
## car_modelmodel_4            0.656971   0.087515   7.507 6.05e-14 ***
## car_modelmodel_5            0.309173   0.073812   4.189 2.81e-05 ***
## car_modelmodel_6            0.447349   0.161645   2.767 0.005649 ** 
## car_modelmodel_7            0.843444   0.089540   9.420  < 2e-16 ***
## car_modelmodel_8            0.909639   0.099712   9.123  < 2e-16 ***
## car_modelmodel_9           -0.001115   0.234381  -0.005 0.996206    
## car_segmentLCV                    NA         NA      NA       NA    
## car_segmentOther            0.343629   1.448426   0.237 0.812468    
## car_segmentSmall/Medium           NA         NA      NA       NA    
## age_of_vehicle_years       -0.037538   0.010281  -3.651 0.000261 ***
## sched_serv_warr            -0.313279   0.021700 -14.437  < 2e-16 ***
## non_sched_serv_warr         0.024260   0.019515   1.243 0.213813    
## sched_serv_paid            -0.294926   0.019035 -15.494  < 2e-16 ***
## non_sched_serv_paid         0.321299   0.023184  13.859  < 2e-16 ***
## total_paid_services        -0.048576   0.023816  -2.040 0.041384 *  
## total_services             -0.956269   0.028876 -33.116  < 2e-16 ***
## mth_since_last_serv        -0.354847   0.012025 -29.508  < 2e-16 ***
## annualised_mileage          0.449782   0.011239  40.020  < 2e-16 ***
## num_dealers_visited         0.045751   0.010131   4.516 6.30e-06 ***
## num_serv_dealer_purchased   0.472887   0.014385  32.875  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32432  on 131336  degrees of freedom
## Residual deviance: 20538  on 131297  degrees of freedom
## AIC: 20618
## 
## Number of Fisher Scoring iterations: 13

The summary shows that each level of the factor columns is being treated as a separate predictor (and two coefficients are undefined because of singularities), so we need to clean the data to turn the categories into integers.

Transforming & cleaning data

#Reduce age_band to its leading band number ("N", from "NULL", is restored to "NULL")
lrm_data$age_band <- substr(lrm_data$age_band, 1, 1)
lrm_data$age_band <- gsub("N", "NULL", lrm_data$age_band)

#Encode gender as an integer
lrm_data$gender <- gsub("Male", "1", lrm_data$gender)
lrm_data$gender <- gsub("Female", "2", lrm_data$gender)

#Strip the "model_" prefix so car_model is a number
lrm_data$car_model <- sub('model_', '', lrm_data$car_model)

#Encode car_segment as an integer
lrm_data$car_segment <- sub("Other", "1", lrm_data$car_segment)
lrm_data$car_segment <- sub("Small/Medium", "2", lrm_data$car_segment)
lrm_data$car_segment <- sub("Large/SUV", "3", lrm_data$car_segment)
lrm_data$car_segment <- sub("LCV", "4", lrm_data$car_segment)

lrm_data$age_band <- as.integer(lrm_data$age_band)
## Warning: NAs introduced by coercion
lrm_data$gender <- as.integer(lrm_data$gender)
## Warning: NAs introduced by coercion
lrm_data$car_model <- as.integer(lrm_data$car_model)
lrm_data$car_segment <- as.integer(lrm_data$car_segment)

#We want to keep our Target as a Factor
lrm_data[,2] = as.factor(lrm_data[,2])

We also have two columns, age_band and gender, with high proportions of NAs after the coercion above. They are unlikely to be useful predictors, so let’s remove them.

lrm_data <- lrm_data %>%
    dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)

Create training and test sets of our lrm_data

## 70% of the lrm_data, use floor to round down to nearest integer
lrm_trainset_size <- floor(0.70 * nrow(lrm_data))

set.seed(42)

lrm_trainset_indices <- sample(seq_len(nrow(lrm_data)), size = lrm_trainset_size)

lrm_trainset <- lrm_data[lrm_trainset_indices, ]
lrm_testset <- lrm_data[-lrm_trainset_indices, ]
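
As a sanity check on the split (not in the original run), the positive rate should be similar in both subsets:

#Proportion of Target = 1 in each split
prop.table(table(lrm_trainset$Target))
prop.table(table(lrm_testset$Target))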
#Refit the logistic regression on the newly cleaned data
#(note: as written this fits on the test split; fitting on lrm_trainset would be the orthodox choice)
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_testset[,-1])

summary(glm.model)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_testset[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9480  -0.2123  -0.0702  -0.0278   4.7109  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.936221   0.174590 -11.090  < 2e-16 ***
## car_model                 -0.016126   0.012125  -1.330  0.18353    
## car_segment               -0.100213   0.047507  -2.109  0.03491 *  
## age_of_vehicle_years       0.021718   0.017283   1.257  0.20889    
## sched_serv_warr           -0.309101   0.038368  -8.056 7.86e-16 ***
## non_sched_serv_warr       -0.007022   0.034808  -0.202  0.84012    
## sched_serv_paid           -0.312433   0.034067  -9.171  < 2e-16 ***
## non_sched_serv_paid        0.282285   0.040693   6.937 4.01e-12 ***
## total_paid_services       -0.077255   0.041899  -1.844  0.06520 .  
## total_services            -0.840949   0.050158 -16.766  < 2e-16 ***
## mth_since_last_serv       -0.333633   0.021060 -15.842  < 2e-16 ***
## annualised_mileage         0.405797   0.019764  20.533  < 2e-16 ***
## num_dealers_visited        0.053807   0.017891   3.007  0.00263 ** 
## num_serv_dealer_purchased  0.464105   0.026131  17.761  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9813.5  on 39401  degrees of freedom
## Residual deviance: 6593.4  on 39388  degrees of freedom
## AIC: 6621.4
## 
## Number of Fisher Scoring iterations: 9

Predictions and confusion matrix

# Now to predict probabilities on lrm_testset
glm_prob <- predict.glm(glm.model,lrm_testset[,-1],type="response")

# Create a vector to hold predictions
glm_predict <- rep(0,nrow(lrm_testset[,-1]))
glm_predict[glm_prob>.5] <- 1

#Create a confusion matrix
glm_confusion_matrix <- table(pred=glm_predict,true=lrm_testset$Target)

glm_confusion_matrix
##     true
## pred     0     1
##    0 38300   870
##    1    34   198

With an AIC of 6621.4, compared to 20618 for the earlier experimental model, this model is a better fit than the one built without cleaned and transformed data (though the two models were fit on different numbers of rows, so the comparison is indicative only). However, the confusion matrix reveals a high number of false negatives: the model correctly predicted only 198 of the 1068 actual positives, missing 870.

# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
# Arguments after name are the confusion matrix cells: true negatives, false positives, false negatives, true positives
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  
  accuracy = (tp+tn)/(tp+tn+fp+fn)
  
  precision = tp/(tp+fp)
  
  recall = tp/(tp+fn)
  
  F1 = 2 * ((precision * recall)/(precision + recall))
  
  output = data.frame(name, accuracy, precision, recall, F1)
  
  return(output)
  
}
#Convert the confusion matrix to a data frame and output the evaluation measures (Freq is ordered tn, fp, fn, tp)
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)

lrm_evaluation_measures <- get_evaluation_measures("Linear Regression",
                              glm_confusion_matrix_df$Freq[1],
                              glm_confusion_matrix_df$Freq[2],
                              glm_confusion_matrix_df$Freq[3],
                              glm_confusion_matrix_df$Freq[4])

lrm_evaluation_measures
##                name accuracy precision    recall        F1
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154
#Now to get AUC. We'll do it again further on in our analysis, so write a function
get_auc <- function(probabilities, targets) {
  
  probs = as.vector(probabilities)
  
  pred = prediction(probs,targets)
  
  perf_AUC = performance(pred, "auc")
  
  AUC = perf_AUC@y.values[[1]]
  
  return(AUC)
  
}
#Get the AUC and add it to our evaluation measures data frame
lrm_evaluation_measures$AUC <- get_auc(glm_prob, lrm_testset$Target)

#Create a data frame for all evaluation measures and use lrm_evaluation_measures as the first row
evaluation_measures <- lrm_evaluation_measures

evaluation_measures
##                name accuracy precision    recall        F1       AUC
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154 0.8913693
#Now we want to try LASSO, using cross-validation to grid-search for the optimal value of lambda, i.e. to regularize the model

#convert training data to matrix format
lrm_x <- model.matrix(Target~.,lrm_trainset[,-1])
lrm_y <- lrm_trainset$Target

#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")

plot(lrm_cv.out)

#Using the 1se value of lambda rather than the minimum, see which predictors are discarded

#value of lambda that gives the minimum cross-validated error
lrm_lambda_min <- lrm_cv.out$lambda.min
#largest lambda within one standard error of the minimum (a more conservative choice)
lrm_lambda_1se <- lrm_cv.out$lambda.1se

#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                                      1
## (Intercept)               -2.078821528
## (Intercept)                .          
## car_model                  .          
## car_segment                .          
## age_of_vehicle_years       .          
## sched_serv_warr           -0.215893443
## non_sched_serv_warr        .          
## sched_serv_paid           -0.212896245
## non_sched_serv_paid        0.125448961
## total_paid_services        .          
## total_services            -0.301259090
## mth_since_last_serv       -0.178056998
## annualised_mileage         0.171637603
## num_dealers_visited        0.006766528
## num_serv_dealer_purchased  0.166261320
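
For comparison (a quick check, not part of the original output), the coefficients at lambda.min retain more predictors:

#regression coefficients at the minimum-error lambda
coef(lrm_cv.out, s=lrm_lambda_min)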
#Convert test data to a model matrix (model.matrix adds its own intercept column, which is why the coefficient listing above shows two (Intercept) rows)
lrm_x_test <- model.matrix(Target~.,lrm_testset[,-1])

#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")

#translate probabilities to predictions
lasso_predict <- rep(0,nrow(lrm_testset))

lasso_predict[lasso_prob>.5] <- 1

2.a. Show the confusion matrix and calculate precision, recall, F1 and AUC

#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$Target)

lasso_confusion_matrix
##     true
## pred     0     1
##    0 38334  1066
##    1     0     2
#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
  
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
                        lasso_confusion_matrix_df$Freq[1],
                        lasso_confusion_matrix_df$Freq[2],
                        lasso_confusion_matrix_df$Freq[3],
                        lasso_confusion_matrix_df$Freq[4])

#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$Target)

lasso_evaluation_measures
##    name  accuracy precision      recall          F1       AUC
## 1 Lasso 0.9729455         1 0.001872659 0.003738318 0.8962849
#Visually compare the two models' evaluation metrics by placing in a data frame

evaluation_measures <- rbind(evaluation_measures, lasso_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849

2.b. Which metric will you use to decide your final model – the F1 metric

The F1 metric is the harmonic mean of precision and recall: F1 = 2 × (precision × recall) / (precision + recall). It isn’t as intuitive as the other measures, but since the linear classification models have such strong accuracy and precision yet much poorer recall, a blended metric should be used to choose the final model.
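
To illustrate with the figures from the table above (a quick worked check):

#F1 as the harmonic mean of precision and recall
f1 <- function(p, r) 2 * p * r / (p + r)
f1(0.8534483, 0.185393258)   #logistic regression: ~0.305
f1(1.0000000, 0.001872659)   #lasso: ~0.004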

3. Tree-based classification

Cleaning the data

# Remove ID and the columns with high NULL proportions (age_band and gender)
tree_data <- repurchase_training %>%
  select(Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage,num_dealers_visited,num_serv_dealer_purchased)

# Make sure our target variable is a factor
tree_data$Target <- as.factor(tree_data$Target)

#Get column index of the target variable
targetColNum <- grep("Target",names(tree_data))

Use the same train:test split (seed and proportion) as the linear classification model

# Create training and test sets

## 70% of the sample size, use floor to round down to nearest integer
tree_trainset_size <- floor(0.70 * nrow(tree_data))

#set random seed 
set.seed(42)

tree_trainset_indices <- sample(seq_len(nrow(tree_data)), size = tree_trainset_size)
#assign observations to training and testing sets

tree_trainset <- tree_data[tree_trainset_indices, ]
tree_testset <- tree_data[-tree_trainset_indices, ]

Grow an unpruned tree with rpart

#This is a classification problem so set method="class" (ID was already dropped from tree_data)
rpart_fit <- rpart(Target~.,data = tree_trainset, method='class', model=TRUE)

#predict on test data
rpart_prob <- predict(rpart_fit,tree_testset[,-targetColNum],type="class")

#Create a confusion matrix
rpart_confusion_matrix <- table(pred=rpart_prob,true=tree_testset[,targetColNum])

rpart_confusion_matrix
##     true
## pred     0     1
##    0 38234   528
##    1   100   540
#Convert the confusion matrix to a data frame
rpart_confusion_matrix_df <- as.data.frame(rpart_confusion_matrix)
  
rpart_evaluation_measures <- get_evaluation_measures("RPart unpruned",
                        rpart_confusion_matrix_df$Freq[1],
                        rpart_confusion_matrix_df$Freq[2],
                        rpart_confusion_matrix_df$Freq[3],
                        rpart_confusion_matrix_df$Freq[4])

#No AUC here: we predicted classes rather than probabilities
rpart_evaluation_measures$AUC <- NA

evaluation_measures <- rbind(evaluation_measures, rpart_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
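
An AUC can in fact be obtained from rpart by predicting probabilities rather than classes; a sketch reusing the get_auc helper (not part of the original run):

#probability predictions from the rpart model allow an AUC
rpart_prob_matrix <- predict(rpart_fit, tree_testset[,-targetColNum], type="prob")
get_auc(rpart_prob_matrix[,2], tree_testset[,targetColNum])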

Pruning the tree

#what's the tree size that results in the minimum cross-validated error
rpart_opt <- which.min(rpart_fit$cptable[,"xerror"])

#value of the complexity parameter that gives a tree of that size
rpart_cp <- rpart_fit$cptable[rpart_opt, "CP"]

# "prune" the tree using that value of the complexity parameter
rpart_pruned.model <- prune(rpart_fit,rpart_cp)

#plot pruned tree
prp(rpart_pruned.model)

#predictions from pruned model
rpart_pruned_predict <- predict(rpart_pruned.model,tree_testset[,-targetColNum],type="class")

#confusion matrix (PRUNED model)
rpart_pruned_confusion_matrix <- table(pred=rpart_pruned_predict,true=tree_testset[,targetColNum])

rpart_pruned_confusion_matrix
##     true
## pred     0     1
##    0 38234   528
##    1   100   540
#Convert the confusion matrix to a data frame
rpart_pruned_confusion_matrix_df <- as.data.frame(rpart_pruned_confusion_matrix)
  
rpart_pruned_evaluation_measures <- get_evaluation_measures("RPart pruned",
                        rpart_pruned_confusion_matrix_df$Freq[1],
                        rpart_pruned_confusion_matrix_df$Freq[2],
                        rpart_pruned_confusion_matrix_df$Freq[3],
                        rpart_pruned_confusion_matrix_df$Freq[4])

#No AUC here either: classes were predicted rather than probabilities
rpart_pruned_evaluation_measures$AUC <- NA

evaluation_measures <- rbind(evaluation_measures, rpart_pruned_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 4      RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501        NA

We can see from the evaluation measures that, while precision and accuracy are similar across the models, recall is much better in the tree-based models, and consequently so is the F1 score (our evaluation metric of choice). Pruning made no difference here: the complexity parameter that minimised cross-validated error corresponds to the full tree.

Try a random forest model

#Build random forest model; passing xtest scores the held-out test set as the forest is built
rf.model <- randomForest(Target ~.,
                         data = tree_trainset,
                         importance=TRUE,
                         xtest=tree_testset[,-targetColNum],
                         keep.forest = TRUE,
                         ntree=1000)

#predictions for test set
rf_predictions <- data.frame(tree_testset,rf.model$test$predicted)

#confusion matrix
rf_confusion_matrix <- table(pred=rf.model$test$predicted,true=tree_testset[,targetColNum])

rf_confusion_matrix
##     true
## pred     0     1
##    0 38282   247
##    1    52   821
rf_confusion_matrix_df <- as.data.frame(rf_confusion_matrix)

rf_evaluation_measures <- get_evaluation_measures("Random Forest",
                                                  rf_confusion_matrix_df$Freq[1],
                                                  rf_confusion_matrix_df$Freq[2],
                                                  rf_confusion_matrix_df$Freq[3],
                                                  rf_confusion_matrix_df$Freq[4])

#Get the AUC and add it to our evaluation measures data frame
rf_evaluation_measures$AUC <- get_auc(rf.model$test$votes[,2], tree_testset[,targetColNum])

evaluation_measures <- rbind(evaluation_measures, rf_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 4      RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 5     Random Forest 0.9924116 0.9404353 0.768726592 0.845955693 0.9930775

3.a. How does a tree-based model perform relative to the linear model?

We selected F1 as the blended metric to help choose the final model, and from the table above it’s clear the Random Forest model has the best score.

The linear classification model regularized by Lasso was extremely accurate and precise, but its poor recall gives it a very low F1 score: it would result in very few customers being targeted for communication, even though those few may be the right customers.

The area under the ROC curve (AUC) further highlights the strength of the Random Forest model.

Now, let’s look at variable importance.

#quantitative measure of variable importance
importance(rf.model)
##                                   0           1 MeanDecreaseAccuracy
## car_model                  22.30483  -0.1142544             22.25298
## car_segment                24.42780  -8.2473182             23.36288
## age_of_vehicle_years       98.60089  13.4830131             98.86378
## sched_serv_warr            68.58838  73.1513210             70.61953
## non_sched_serv_warr        50.02090  45.6837522             50.90880
## sched_serv_paid            39.19942  47.5919508             40.24251
## non_sched_serv_paid        36.19129  26.9254542             36.47427
## total_paid_services        36.05429  21.8819590             36.30462
## total_services             59.52957  72.0991582             60.78891
## mth_since_last_serv       157.00513 120.5204063            161.82230
## annualised_mileage        175.21678   9.2144956            175.45747
## num_dealers_visited        58.44822  70.8990345             60.06840
## num_serv_dealer_purchased  93.33044  12.1798271             93.77070
##                           MeanDecreaseGini
## car_model                        228.63798
## car_segment                       49.28325
## age_of_vehicle_years             452.67217
## sched_serv_warr                  312.27809
## non_sched_serv_warr              273.82791
## sched_serv_paid                  262.10793
## non_sched_serv_paid              189.91392
## total_paid_services              253.55395
## total_services                   417.98529
## mth_since_last_serv              768.59623
## annualised_mileage               617.71384
## num_dealers_visited              328.08056
## num_serv_dealer_purchased        580.26730
# Look at variable importance using the built in varImpPlot from randomForest
varImpPlot(rf.model)

# Let's use the vip package instead

vip(rf.model, bar = TRUE, horizontal = TRUE, size = 1.5)

3.b. Variable importance measures, linear vs tree

The top five predictors by importance for each model:

| Linear regression   | Linear with Lasso   | RPart tree (pruned & unpruned) | Random Forest             |
|---------------------|---------------------|--------------------------------|---------------------------|
| sched_serv_warr     | total_services      | sched_serv_warr                | annualised_mileage        |
| sched_serv_paid     | sched_serv_warr     | annualised_mileage             | mth_since_last_serv       |
| total_services      | sched_serv_paid     | mth_since_last_serv            | age_of_vehicle_years      |
| mth_since_last_serv | mth_since_last_serv | total_services                 | num_serv_dealer_purchased |
| annualised_mileage  | annualised_mileage  | total_paid_services            | sched_serv_warr           |

3.d. Partial dependency plots for top 5 most important features

# Create a vector of our top 5 predictors
top_five_predictors = c("annualised_mileage", "mth_since_last_serv", "age_of_vehicle_years", "num_serv_dealer_purchased", "sched_serv_warr")
#Construct the 5 plots

# Annualised Mileage
p1 <- partial(rf.model, pred.var = top_five_predictors[1], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
# Months since last service
p2 <- partial(rf.model, pred.var = top_five_predictors[2], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Age of vehicle in years
p3 <- partial(rf.model, pred.var = top_five_predictors[3], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Number of services had at the same dealer where the vehicle was purchased
p4 <- partial(rf.model, pred.var = top_five_predictors[4], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Number of scheduled services used under warranty
p5 <- partial(rf.model, pred.var = top_five_predictors[5], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")

#Let's also do a sixth for total services
p6 <- partial(rf.model, pred.var = "total_services", plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Plot 1
autoplot(p1, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[1]))

#Plot2
autoplot(p2, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[2]))

#Plot 3
autoplot(p3, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[3]))

#Plot 4
autoplot(p4, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[4]))

#Plot 5
autoplot(p5, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[5]))

#Plot 6
autoplot(p6, ylab = "yhat") + theme_light() + ggtitle("Partial dependency plot for total services")

4. Now use the Random Forest model to predict for the validation set

#Load the validation set
#NOTE: this validation set needed an ID column added (hence the CORRECTED in the file name)
repurchase_validation <- read.csv("raw_data/repurchase_validationCORRECTED.csv")

#Train set is the full repurchase training set after cleaning for the tree models
trainset <- tree_data

#Car model 19 doesn't exist in the validation data set, so we filter it from the training set
trainset <- trainset %>%
    filter(car_model != "model_19")

#Re-factor car_model so it has the same 18 levels as the validation set
trainset$car_model <- factor(trainset$car_model)

#Test set
testset <- repurchase_validation

#Run the model (columns 4 onwards in the testset drop ID, age_band and gender)
validation.model <- randomForest(Target~.,
                                 data = trainset, 
                                 importance = TRUE, 
                                 xtest = testset[,4:ncol(testset)],
                                 ntree = 1000)

#predictions for test set
testset$target_class <- validation.model$test$predicted

testset$target_probability <- validation.model$test$votes[,2]

final_output <- testset %>%
    select(ID, target_probability, target_class)

write.csv(final_output, file="cleaned_data/repurchase_validation_13184650.csv", row.names=FALSE)

#out-of-bag predictions for the training set
final_predictions <- data.frame(trainset,validation.model$predicted)

#confusion matrix of the out-of-bag predictions (rows = predicted, columns = true)
final_confusion_matrix <- table(validation.model$predicted,trainset[,targetColNum])

final_confusion_matrix
##    
##          0      1
##   0 127644    734
##   1    170   2787
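
To put the final model on the same footing as the earlier ones, the evaluation helper from section 2 can be reused on these out-of-bag counts (a sketch; the resulting figures were not part of the original output):

#Evaluation measures for the final model's out-of-bag predictions
#(Freq is ordered tn, fp, fn, tp, as before)
final_confusion_matrix_df <- as.data.frame(final_confusion_matrix)

get_evaluation_measures("Final Random Forest",
                        final_confusion_matrix_df$Freq[1],
                        final_confusion_matrix_df$Freq[2],
                        final_confusion_matrix_df$Freq[3],
                        final_confusion_matrix_df$Freq[4])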