Classification modelling

Linear classification

The goal is to target existing customers for a re-purchase campaign by sending a communication to those who are highly likely to purchase a new vehicle (all customers have purchased at least one vehicle).

#load the packages used in this section and import the data
library(dplyr)          #data manipulation
library(glmnet)         #LASSO / regularised regression
library(ROCR)           #prediction()/performance() for AUC
library(rpart)          #decision trees
library(rpart.plot)     #prp() tree plots
library(randomForest)   #random forests
library(caret)          #varImp()

repurchase_training <- read.csv("raw_data/repurchase_training.csv")

Experimenting to build a linear classification model

Run a first model with the training data.

Using glm we fit a logistic regression (a linear classifier), with Target as the response and ID excluded from the data. As the first summary below shows, the factor columns need cleaning so that their categories become integers.

lrm_data <- repurchase_training

#First pass: fit a logistic regression on the full dataset, excluding ID
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_data[,-1])

summary(glm.model)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_data[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3059  -0.1867  -0.0606  -0.0233   4.8892  
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.622319   0.557257  -8.295  < 2e-16 ***
## age_band2. 25 to 34         1.229115   0.582886   2.109 0.034973 *  
## age_band3. 35 to 44         1.404695   0.576427   2.437 0.014814 *  
## age_band4. 45 to 54         1.656169   0.570158   2.905 0.003675 ** 
## age_band5. 55 to 64         1.774010   0.571009   3.107 0.001891 ** 
## age_band6. 65 to 74         1.489551   0.599670   2.484 0.012993 *  
## age_band7. 75+              1.985368   0.636082   3.121 0.001801 ** 
## age_bandNULL                1.782812   0.550253   3.240 0.001195 ** 
## genderMale                  0.426431   0.069093   6.172 6.75e-10 ***
## genderNULL                 -0.109306   0.064938  -1.683 0.092329 .  
## car_modelmodel_10          -0.793655   0.165332  -4.800 1.58e-06 ***
## car_modelmodel_11          -1.368900   0.353951  -3.867 0.000110 ***
## car_modelmodel_12          -1.014117   0.430728  -2.354 0.018551 *  
## car_modelmodel_13           1.444475   0.204250   7.072 1.53e-12 ***
## car_modelmodel_14         -11.535142  87.036102  -0.133 0.894563    
## car_modelmodel_15           1.611429   0.388465   4.148 3.35e-05 ***
## car_modelmodel_16           0.840084   0.595070   1.412 0.158027    
## car_modelmodel_17          -1.291977   1.034821  -1.249 0.211847    
## car_modelmodel_18          -0.336041   0.516382  -0.651 0.515200    
## car_modelmodel_19         -11.630748 614.640130  -0.019 0.984903    
## car_modelmodel_2            0.647902   0.070808   9.150  < 2e-16 ***
## car_modelmodel_3            0.821045   0.073260  11.207  < 2e-16 ***
## car_modelmodel_4            0.656971   0.087515   7.507 6.05e-14 ***
## car_modelmodel_5            0.309173   0.073812   4.189 2.81e-05 ***
## car_modelmodel_6            0.447349   0.161645   2.767 0.005649 ** 
## car_modelmodel_7            0.843444   0.089540   9.420  < 2e-16 ***
## car_modelmodel_8            0.909639   0.099712   9.123  < 2e-16 ***
## car_modelmodel_9           -0.001115   0.234381  -0.005 0.996206    
## car_segmentLCV                    NA         NA      NA       NA    
## car_segmentOther            0.343629   1.448426   0.237 0.812468    
## car_segmentSmall/Medium           NA         NA      NA       NA    
## age_of_vehicle_years       -0.037538   0.010281  -3.651 0.000261 ***
## sched_serv_warr            -0.313279   0.021700 -14.437  < 2e-16 ***
## non_sched_serv_warr         0.024260   0.019515   1.243 0.213813    
## sched_serv_paid            -0.294926   0.019035 -15.494  < 2e-16 ***
## non_sched_serv_paid         0.321299   0.023184  13.859  < 2e-16 ***
## total_paid_services        -0.048576   0.023816  -2.040 0.041384 *  
## total_services             -0.956269   0.028876 -33.116  < 2e-16 ***
## mth_since_last_serv        -0.354847   0.012025 -29.508  < 2e-16 ***
## annualised_mileage          0.449782   0.011239  40.020  < 2e-16 ***
## num_dealers_visited         0.045751   0.010131   4.516 6.30e-06 ***
## num_serv_dealer_purchased   0.472887   0.014385  32.875  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32432  on 131336  degrees of freedom
## Residual deviance: 20538  on 131297  degrees of freedom
## AIC: 20618
## 
## Number of Fisher Scoring iterations: 13
#The summary shows each level of the factor columns being treated as its own dummy predictor
#(the two NA car_segment coefficients are singularities, likely because car_segment is
#determined by car_model), so we clean the data to turn the categories into integers
#Keep only the leading band number; "NULL" stays non-numeric and becomes NA when coerced below
lrm_data$age_band <- substr(lrm_data$age_band, 1, 1)
lrm_data$age_band <- gsub("N", "NULL", lrm_data$age_band)

lrm_data$gender <- gsub("Male", "1", lrm_data$gender)
lrm_data$gender <- gsub("Female", "2", lrm_data$gender)

lrm_data$car_model <- sub('model_', '', lrm_data$car_model)

lrm_data$car_segment <- sub("Other", "1", lrm_data$car_segment)
lrm_data$car_segment <- sub("Small/Medium", "2", lrm_data$car_segment)
lrm_data$car_segment <- sub("Large/SUV", "3", lrm_data$car_segment)
lrm_data$car_segment <- sub("LCV", "4", lrm_data$car_segment)

lrm_data$age_band <- as.integer(lrm_data$age_band)
## Warning: NAs introduced by coercion
lrm_data$gender <- as.integer(lrm_data$gender)
## Warning: NAs introduced by coercion
lrm_data$car_model <- as.integer(lrm_data$car_model)
lrm_data$car_segment <- as.integer(lrm_data$car_segment)

#We want to keep our Target as a factor
lrm_data$Target <- as.factor(lrm_data$Target)
#After coercion, age_band and gender have high proportions of NAs; they are unlikely to be useful predictors, so remove them

lrm_data <- lrm_data %>%
    dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)
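#(a terser equivalent, assuming only the two NA-heavy columns need dropping:
# lrm_data <- dplyr::select(lrm_data, -age_band, -gender))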
# create training and test sets of our lrm_data

## 70% of the lrm_data, use floor to round down to nearest integer
lrm_trainset_size <- floor(0.70 * nrow(lrm_data))
set.seed(42)

lrm_trainset_indices <- sample(seq_len(nrow(lrm_data)), size = lrm_trainset_size)

lrm_trainset <- lrm_data[lrm_trainset_indices, ]
lrm_testset <- lrm_data[-lrm_trainset_indices, ]
#Refit the logistic regression on the newly cleaned test set
#(note: fitting and evaluating on the same set makes the measures below in-sample;
#fitting on lrm_trainset would give a stricter out-of-sample evaluation)
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_testset[,-1])

#There are a couple of obvious candidates to remove (e.g. age_of_vehicle_years, non_sched_serv_warr), but others are more questionable (e.g. total_paid_services, car_segment, car_model)
summary(glm.model)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_testset[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9480  -0.2123  -0.0702  -0.0278   4.7109  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.936221   0.174590 -11.090  < 2e-16 ***
## car_model                 -0.016126   0.012125  -1.330  0.18353    
## car_segment               -0.100213   0.047507  -2.109  0.03491 *  
## age_of_vehicle_years       0.021718   0.017283   1.257  0.20889    
## sched_serv_warr           -0.309101   0.038368  -8.056 7.86e-16 ***
## non_sched_serv_warr       -0.007022   0.034808  -0.202  0.84012    
## sched_serv_paid           -0.312433   0.034067  -9.171  < 2e-16 ***
## non_sched_serv_paid        0.282285   0.040693   6.937 4.01e-12 ***
## total_paid_services       -0.077255   0.041899  -1.844  0.06520 .  
## total_services            -0.840949   0.050158 -16.766  < 2e-16 ***
## mth_since_last_serv       -0.333633   0.021060 -15.842  < 2e-16 ***
## annualised_mileage         0.405797   0.019764  20.533  < 2e-16 ***
## num_dealers_visited        0.053807   0.017891   3.007  0.00263 ** 
## num_serv_dealer_purchased  0.464105   0.026131  17.761  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9813.5  on 39401  degrees of freedom
## Residual deviance: 6593.4  on 39388  degrees of freedom
## AIC: 6621.4
## 
## Number of Fisher Scoring iterations: 9
# Now to predict probabilities on lrm_testset
glm_prob <- predict.glm(glm.model,lrm_testset[,-1],type="response")

# Create a vector to hold predictions
glm_predict <- rep(0,nrow(lrm_testset[,-1]))
glm_predict[glm_prob>.5] <- 1

#Create a confusion matrix
glm_confusion_matrix <- table(pred=glm_predict,true=lrm_testset$Target)
glm_confusion_matrix
##     true
## pred     0     1
##    0 38300   870
##    1    34   198
# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  
  accuracy = (tp+tn)/(tp+tn+fp+fn)
  
  precision = tp/(tp+fp)
  
  recall = tp/(tp+fn)
  
  F1 = 2 * ((precision * recall)/(precision + recall))
  
  output = data.frame(name, accuracy, precision, recall, F1)
  
  return(output)
  
}
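A quick sanity check of the function on a toy confusion matrix (values are illustrative only):

#expect accuracy = 0.92, precision = 2/7, recall = 0.4, F1 = 1/3
get_evaluation_measures("toy", tn = 90, fp = 5, fn = 3, tp = 2)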
#Convert the confusion matrix to a data frame and output the evaluation measures
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)

lrm_evaluation_measures <- get_evaluation_measures("Linear Regression",
                              glm_confusion_matrix_df$Freq[1],
                              glm_confusion_matrix_df$Freq[2],
                              glm_confusion_matrix_df$Freq[3],
                              glm_confusion_matrix_df$Freq[4])

lrm_evaluation_measures
##                name accuracy precision    recall        F1
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154
#Now to get AUC. We'll do it a few times, so write a function

get_auc <- function(probabilities, targets) {
  
  probs = as.vector(probabilities)
  
  #ROCR: pair the predicted probabilities with the true labels
  pred = prediction(probs, targets)
  
  #area under the ROC curve
  perf_AUC = performance(pred, "auc")
  
  AUC = perf_AUC@y.values[[1]]
  
  return(AUC)
  
}
#Get the AUC and add it to our evaluation measures data frame
lrm_evaluation_measures$AUC <- get_auc(glm_prob, lrm_testset$Target)

#Create a data frame for all evaluation measures and use lrm_evaluation_measures as the first row
evaluation_measures <- lrm_evaluation_measures

evaluation_measures
##                name accuracy precision    recall        F1       AUC
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154 0.8913693
#Now try the LASSO, using cross-validation to search for an optimal value of lambda, i.e. to regularise the model

#convert training data to matrix format
lrm_x <- model.matrix(Target~.,lrm_trainset[,-1])
lrm_y <- lrm_trainset$Target

#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")

plot(lrm_cv.out)

#Using the 1se lambda rather than the minimum lambda, see which predictors are discarded
#lambda giving the minimum cross-validated error
lrm_lambda_min <- lrm_cv.out$lambda.min
#largest lambda within one standard error of the minimum
lrm_lambda_1se <- lrm_cv.out$lambda.1se

#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                                      1
## (Intercept)               -2.078821528
## (Intercept)                .          
## car_model                  .          
## car_segment                .          
## age_of_vehicle_years       .          
## sched_serv_warr           -0.215893443
## non_sched_serv_warr        .          
## sched_serv_paid           -0.212896245
## non_sched_serv_paid        0.125448961
## total_paid_services        .          
## total_services            -0.301259090
## mth_since_last_serv       -0.178056998
## annualised_mileage         0.171637603
## num_dealers_visited        0.006766528
## num_serv_dealer_purchased  0.166261320
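To quantify the shrinkage, a quick sketch (output not shown) counting how many predictors survive at each lambda:

#nonzero coefficients, dropping the first (intercept) row
sum(coef(lrm_cv.out, s = lrm_lambda_min)[-1, 1] != 0)
sum(coef(lrm_cv.out, s = lrm_lambda_1se)[-1, 1] != 0)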
#Convert test data to a model matrix
lrm_x_test <- model.matrix(Target~.,lrm_testset[,-1])

#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")

#translate probabilities to predictions
lasso_predict <- rep(0,nrow(lrm_testset))
lasso_predict[lasso_prob>.5] <- 1

#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$Target)
lasso_confusion_matrix
##     true
## pred     0     1
##    0 38334  1066
##    1     0     2
#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
  
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
                        lasso_confusion_matrix_df$Freq[1],
                        lasso_confusion_matrix_df$Freq[2],
                        lasso_confusion_matrix_df$Freq[3],
                        lasso_confusion_matrix_df$Freq[4])

#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$Target)

lasso_evaluation_measures
##    name  accuracy precision      recall          F1       AUC
## 1 Lasso 0.9729455         1 0.001872659 0.003738318 0.8962849
#Compare the two models' evaluation metrics side by side in one data frame
evaluation_measures <- rbind(evaluation_measures, lasso_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
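The LASSO's near-zero recall is a thresholding artefact of the heavy 1se regularisation: with the coefficients shrunk, almost no predicted probabilities clear the 0.5 cutoff, even though its ranking of customers is marginally better than the unregularised model's (AUC 0.8963 vs 0.8914).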

Tree-based classification

# Remove ID and the high-NA columns (age_band and gender)
tree_data <- repurchase_training %>%
  select(Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage,num_dealers_visited,num_serv_dealer_purchased)

# Make sure our response is a factor
tree_data$Target <- as.factor(tree_data$Target)

#Get the column index of the response variable
targetColNum <- grep("Target",names(tree_data))
# Create training and test sets

## 70% of the sample size, use floor to round down to nearest integer
tree_trainset_size <- floor(0.70 * nrow(tree_data))

#set random seed 
set.seed(42)

tree_trainset_indices <- sample(seq_len(nrow(tree_data)), size = tree_trainset_size)
#assign observations to training and testing sets

tree_trainset <- tree_data[tree_trainset_indices, ]
tree_testset <- tree_data[-tree_trainset_indices, ]

Grow an unpruned tree with rpart

#default params. This is a classification problem so set method="class" (ID was already dropped from tree_data)
rpart_fit <- rpart(Target~.,data = tree_trainset, method='class', model=TRUE)
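#For reference, the fit above relies on rpart's documented defaults,
#equivalent to passing control = rpart.control(minsplit = 20, cp = 0.01, xval = 10)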

#predict class labels on the test data (type="class" returns labels, not probabilities)
rpart_predict <- predict(rpart_fit,tree_testset[,-targetColNum],type="class")

#Create a confusion matrix
rpart_confusion_matrix <- table(pred=rpart_predict,true=tree_testset[,targetColNum])

rpart_confusion_matrix
##     true
## pred     0     1
##    0 38234   528
##    1   100   540
#Convert the confusion matrix to a data frame
rpart_confusion_matrix_df <- as.data.frame(rpart_confusion_matrix)
  
rpart_evaluation_measures <- get_evaluation_measures("RPart unpruned",
                        rpart_confusion_matrix_df$Freq[1],
                        rpart_confusion_matrix_df$Freq[2],
                        rpart_confusion_matrix_df$Freq[3],
                        rpart_confusion_matrix_df$Freq[4])

#AUC needs predicted probabilities, but type="class" gave us labels, so record NA
rpart_evaluation_measures$AUC <- NA
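If we did want an AUC for the tree, rpart can also return class probabilities; a sketch (not run here, so the NA in the table stands):

#column 2 of the probability matrix is P(Target == 1)
rpart_prob <- predict(rpart_fit, tree_testset[,-targetColNum], type="prob")[,2]
get_auc(rpart_prob, tree_testset[,targetColNum])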

evaluation_measures <- rbind(evaluation_measures, rpart_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA

Pruning the tree

#which tree size gives the minimum cross-validated error?
rpart_opt <- which.min(rpart_fit$cptable[,"xerror"])

#value of the complexity parameter (cp) that gives a tree of that size
rpart_cp <- rpart_fit$cptable[rpart_opt, "CP"]

# "prune" the tree using that value of the complexity parameter
rpart_pruned.model <- prune(rpart_fit,rpart_cp)

#plot pruned tree
prp(rpart_pruned.model)

#predictions from pruned model
rpart_pruned_predict <- predict(rpart_pruned.model,tree_testset[,-targetColNum],type="class")

#confusion matrix (PRUNED model)
rpart_pruned_confusion_matrix <- table(pred=rpart_pruned_predict,true=tree_testset[,targetColNum])

rpart_pruned_confusion_matrix
##     true
## pred     0     1
##    0 38234   528
##    1   100   540
#Convert the confusion matrix to a data frame
rpart_pruned_confusion_matrix_df <- as.data.frame(rpart_pruned_confusion_matrix)
  
rpart_pruned_evaluation_measures <- get_evaluation_measures("RPart pruned",
                        rpart_pruned_confusion_matrix_df$Freq[1],
                        rpart_pruned_confusion_matrix_df$Freq[2],
                        rpart_pruned_confusion_matrix_df$Freq[3],
                        rpart_pruned_confusion_matrix_df$Freq[4])

#As before, AUC needs probabilities rather than class labels, so record NA
rpart_pruned_evaluation_measures$AUC <- NA

evaluation_measures <- rbind(evaluation_measures, rpart_pruned_evaluation_measures)

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 4      RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
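The pruned tree's measures are identical to the unpruned tree's: the minimum cross-validated error already occurs at the full-size tree, so pruning removes no splits.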
#Build a random forest model. Supplying xtest= makes randomForest score the test set
#during training, storing its predictions and vote fractions in rf.model$test
rf.model <- randomForest(Target ~.,
                         data = tree_trainset,
                         importance=TRUE,
                         xtest=tree_testset[,-targetColNum],
                         ntree=1000)

#predictions for test set
rf_predictions <- data.frame(tree_testset,rf.model$test$predicted)

#confusion matrix
rf_confusion_matrix <- table(rf.model$test$predicted,tree_testset[,targetColNum])

rf_confusion_matrix_df <- as.data.frame(rf_confusion_matrix)

rf_evaluation_measures <- get_evaluation_measures("Random Forest",
                                                  rf_confusion_matrix_df$Freq[1],
                                                  rf_confusion_matrix_df$Freq[2],
                                                  rf_confusion_matrix_df$Freq[3],
                                                  rf_confusion_matrix_df$Freq[4])

#Get the AUC, using the class-1 vote fractions as predicted probabilities
rf_evaluation_measures$AUC <- get_auc(rf.model$test$votes[,2], tree_testset[,targetColNum])

evaluation_measures <- rbind(evaluation_measures, rf_evaluation_measures)
#Compare model evaluation metrics to determine best model to use

evaluation_measures
##                name  accuracy precision      recall          F1       AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2             Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3    RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 4      RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501        NA
## 5     Random Forest 0.9924116 0.9404353 0.768726592 0.845955693 0.9930775
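The random forest outperforms every other model on accuracy, recall, F1 and AUC (precision aside, where the LASSO's two positive predictions make its perfect score misleading), so it is the model we take forward.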

Variable importance: linear vs tree models

The logistic regression's strongest predictors by |z| value (annualised_mileage, total_services, num_serv_dealer_purchased, mth_since_last_serv) also rank among the most important random forest features below, so the two model families largely agree on which features are worth keeping.

#quantitative measure of variable importance
importance(rf.model)
##                                   0           1 MeanDecreaseAccuracy
## car_model                  22.30483  -0.1142544             22.25298
## car_segment                24.42780  -8.2473182             23.36288
## age_of_vehicle_years       98.60089  13.4830131             98.86378
## sched_serv_warr            68.58838  73.1513210             70.61953
## non_sched_serv_warr        50.02090  45.6837522             50.90880
## sched_serv_paid            39.19942  47.5919508             40.24251
## non_sched_serv_paid        36.19129  26.9254542             36.47427
## total_paid_services        36.05429  21.8819590             36.30462
## total_services             59.52957  72.0991582             60.78891
## mth_since_last_serv       157.00513 120.5204063            161.82230
## annualised_mileage        175.21678   9.2144956            175.45747
## num_dealers_visited        58.44822  70.8990345             60.06840
## num_serv_dealer_purchased  93.33044  12.1798271             93.77070
##                           MeanDecreaseGini
## car_model                        228.63798
## car_segment                       49.28325
## age_of_vehicle_years             452.67217
## sched_serv_warr                  312.27809
## non_sched_serv_warr              273.82791
## sched_serv_paid                  262.10793
## non_sched_serv_paid              189.91392
## total_paid_services              253.55395
## total_services                   417.98529
## mth_since_last_serv              768.59623
## annualised_mileage               617.71384
## num_dealers_visited              328.08056
## num_serv_dealer_purchased        580.26730
imp = varImp(rf.model)

plot(imp)

By MeanDecreaseGini, the five most important features are mth_since_last_serv, annualised_mileage, num_serv_dealer_purchased, age_of_vehicle_years and total_services; partial dependence plots for these would show how each one drives the predicted re-purchase probability (a sketch follows below).

Selecting Random Forest as the main model

Now use the Random Forest model to predict for the validation set
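A sketch of one such partial dependence plot, for the top feature. Because rf.model was trained with xtest= (which sets keep.forest=FALSE), a small refit that keeps its forest is assumed here:

#refit without xtest so the forest is kept (ntree reduced for speed)
rf_pd <- randomForest(Target ~ ., data = tree_trainset, ntree = 200)
#partial dependence of the class-1 vote on months since last service
partialPlot(rf_pd, pred.data = tree_trainset, x.var = "mth_since_last_serv", which.class = "1")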

#Load the validation set
#NOTE: this validation set needed an ID column added (hence the CORRECTED in the file name)
repurchase_validation <- read.csv("raw_data/repurchase_validationCORRECTED.csv")

#Train set is the full repurchase training set after cleaning for the tree models
trainset <- tree_data

#Car model 19 doesn't exist in the validation data set, so we filter it from the training set
trainset <- trainset %>%
    filter(car_model != "model_19")

#Re-level car_model so it has 18 levels, matching the validation set
trainset$car_model <- factor(trainset$car_model)

#Test set
testset <- repurchase_validation

#Run the model (skip the first three testset columns: ID, age_band and gender)
validation.model <- randomForest(Target~.,
                                 data = trainset, 
                                 importance = TRUE, 
                                 xtest = testset[,4:ncol(testset)],
                                 ntree = 1000)

#predictions for test set
testset$target_class <- validation.model$test$predicted

#votes[,2] is the fraction of trees voting for class 1; use it as the purchase probability
testset$target_probability <- validation.model$test$votes[,2]

final_output <- testset %>%
    select(ID, target_probability, target_class)

write.csv(final_output, file="cleaned_data/repurchase_validation_13184650.csv", row.names=FALSE)

View(final_output)

Targeted_IDs <- filter(final_output,target_class==1)

View(Targeted_IDs)