The goal is to target existing customers for a re-purchase campaign by sending a communication to those who are highly likely to purchase a new vehicle (every customer in the data has purchased at least one vehicle).
#import data
repurchase_training <- read.csv("raw_data/repurchase_training.csv")
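This section uses dplyr (piping and select), glmnet (the LASSO), rpart and rpart.plot (decision trees and prp), randomForest, caret (varImp) and ROCR (prediction/performance for AUC). If they are not already loaded earlier in the report, a minimal setup:
#Packages used throughout this section
library(dplyr)         # %>%, select(), filter()
library(glmnet)        # cv.glmnet() for the LASSO
library(rpart)         # decision trees
library(rpart.plot)    # prp() for plotting trees
library(randomForest)  # randomForest(), importance(), partialPlot()
library(caret)         # varImp()
library(ROCR)          # prediction()/performance() for AUC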
Run a logistic regression model on the training data.
Using glm with a binomial family, we built a logistic classification model, with Target as the response and the ID column excluded from the data. The factor data then needed to be cleaned to turn the categories into integers.
lrm_data <- repurchase_training
#We run a logistic regression model on the full training data, excluding ID
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_data[,-1])
summary(glm.model)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_data[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3059 -0.1867 -0.0606 -0.0233 4.8892
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.622319 0.557257 -8.295 < 2e-16 ***
## age_band2. 25 to 34 1.229115 0.582886 2.109 0.034973 *
## age_band3. 35 to 44 1.404695 0.576427 2.437 0.014814 *
## age_band4. 45 to 54 1.656169 0.570158 2.905 0.003675 **
## age_band5. 55 to 64 1.774010 0.571009 3.107 0.001891 **
## age_band6. 65 to 74 1.489551 0.599670 2.484 0.012993 *
## age_band7. 75+ 1.985368 0.636082 3.121 0.001801 **
## age_bandNULL 1.782812 0.550253 3.240 0.001195 **
## genderMale 0.426431 0.069093 6.172 6.75e-10 ***
## genderNULL -0.109306 0.064938 -1.683 0.092329 .
## car_modelmodel_10 -0.793655 0.165332 -4.800 1.58e-06 ***
## car_modelmodel_11 -1.368900 0.353951 -3.867 0.000110 ***
## car_modelmodel_12 -1.014117 0.430728 -2.354 0.018551 *
## car_modelmodel_13 1.444475 0.204250 7.072 1.53e-12 ***
## car_modelmodel_14 -11.535142 87.036102 -0.133 0.894563
## car_modelmodel_15 1.611429 0.388465 4.148 3.35e-05 ***
## car_modelmodel_16 0.840084 0.595070 1.412 0.158027
## car_modelmodel_17 -1.291977 1.034821 -1.249 0.211847
## car_modelmodel_18 -0.336041 0.516382 -0.651 0.515200
## car_modelmodel_19 -11.630748 614.640130 -0.019 0.984903
## car_modelmodel_2 0.647902 0.070808 9.150 < 2e-16 ***
## car_modelmodel_3 0.821045 0.073260 11.207 < 2e-16 ***
## car_modelmodel_4 0.656971 0.087515 7.507 6.05e-14 ***
## car_modelmodel_5 0.309173 0.073812 4.189 2.81e-05 ***
## car_modelmodel_6 0.447349 0.161645 2.767 0.005649 **
## car_modelmodel_7 0.843444 0.089540 9.420 < 2e-16 ***
## car_modelmodel_8 0.909639 0.099712 9.123 < 2e-16 ***
## car_modelmodel_9 -0.001115 0.234381 -0.005 0.996206
## car_segmentLCV NA NA NA NA
## car_segmentOther 0.343629 1.448426 0.237 0.812468
## car_segmentSmall/Medium NA NA NA NA
## age_of_vehicle_years -0.037538 0.010281 -3.651 0.000261 ***
## sched_serv_warr -0.313279 0.021700 -14.437 < 2e-16 ***
## non_sched_serv_warr 0.024260 0.019515 1.243 0.213813
## sched_serv_paid -0.294926 0.019035 -15.494 < 2e-16 ***
## non_sched_serv_paid 0.321299 0.023184 13.859 < 2e-16 ***
## total_paid_services -0.048576 0.023816 -2.040 0.041384 *
## total_services -0.956269 0.028876 -33.116 < 2e-16 ***
## mth_since_last_serv -0.354847 0.012025 -29.508 < 2e-16 ***
## annualised_mileage 0.449782 0.011239 40.020 < 2e-16 ***
## num_dealers_visited 0.045751 0.010131 4.516 6.30e-06 ***
## num_serv_dealer_purchased 0.472887 0.014385 32.875 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32432 on 131336 degrees of freedom
## Residual deviance: 20538 on 131297 degrees of freedom
## AIC: 20618
##
## Number of Fisher Scoring iterations: 13
#The summary shows glm treats each level of the factor columns as a separate predictor, so we clean the data to turn the categories into integers
#Reduce age_band to its leading band number (e.g. "2. 25 to 34" -> "2")
lrm_data$age_band <- substr(lrm_data$age_band, 1, 1)
#substr leaves "NULL" as "N"; restore it so the later coercion yields NA
lrm_data$age_band <- gsub("N", "NULL", lrm_data$age_band)
#Recode gender as an integer string ("NULL" stays and becomes NA on coercion)
lrm_data$gender <- gsub("Male", "1", lrm_data$gender)
lrm_data$gender <- gsub("Female", "2", lrm_data$gender)
#Strip the "model_" prefix so car_model is numeric
lrm_data$car_model <- sub('model_', '', lrm_data$car_model)
#Recode car_segment categories as integer strings
lrm_data$car_segment <- sub("Other", "1", lrm_data$car_segment)
lrm_data$car_segment <- sub("Small/Medium", "2", lrm_data$car_segment)
lrm_data$car_segment <- sub("Large/SUV", "3", lrm_data$car_segment)
lrm_data$car_segment <- sub("LCV", "4", lrm_data$car_segment)
#Coerce to integers; the "NULL" strings become NA (hence the warnings)
lrm_data$age_band <- as.integer(lrm_data$age_band)
## Warning: NAs introduced by coercion
lrm_data$gender <- as.integer(lrm_data$gender)
## Warning: NAs introduced by coercion
lrm_data$car_model <- as.integer(lrm_data$car_model)
lrm_data$car_segment <- as.integer(lrm_data$car_segment)
#We want to keep our Target as a factor
lrm_data$Target <- as.factor(lrm_data$Target)
#age_band and gender now have high proportions of NAs, so they are unlikely to be useful predictors; remove them
lrm_data <- lrm_data %>%
  dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years,
                sched_serv_warr, non_sched_serv_warr, sched_serv_paid,
                non_sched_serv_paid, total_paid_services, total_services,
                mth_since_last_serv, annualised_mileage, num_dealers_visited,
                num_serv_dealer_purchased)
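The high-NA claim is easy to verify against the raw data, where the missing values arrive as the literal string "NULL" (visible as the age_bandNULL and genderNULL coefficients above); a quick check of their shares:
#Proportion of "NULL" entries in the two dropped columns
mean(repurchase_training$age_band == "NULL")
mean(repurchase_training$gender == "NULL")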
# create training and test sets of our lrm_data
## 70% of the lrm_data, use floor to round down to nearest integer
lrm_trainset_size <- floor(0.70 * nrow(lrm_data))
set.seed(42)
lrm_trainset_indices <- sample(seq_len(nrow(lrm_data)), size = lrm_trainset_size)
lrm_trainset <- lrm_data[lrm_trainset_indices, ]
lrm_testset <- lrm_data[-lrm_trainset_indices, ]
#Now try the logistic regression model on our newly cleaned test set
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_testset[,-1])
#There are a couple of obvious predictors we could remove (e.g. age_of_vehicle_years, non_sched_serv_warr), but others are more questionable (e.g. total_paid_services, car_segment, car_model)
summary(glm.model)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_testset[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9480 -0.2123 -0.0702 -0.0278 4.7109
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.936221 0.174590 -11.090 < 2e-16 ***
## car_model -0.016126 0.012125 -1.330 0.18353
## car_segment -0.100213 0.047507 -2.109 0.03491 *
## age_of_vehicle_years 0.021718 0.017283 1.257 0.20889
## sched_serv_warr -0.309101 0.038368 -8.056 7.86e-16 ***
## non_sched_serv_warr -0.007022 0.034808 -0.202 0.84012
## sched_serv_paid -0.312433 0.034067 -9.171 < 2e-16 ***
## non_sched_serv_paid 0.282285 0.040693 6.937 4.01e-12 ***
## total_paid_services -0.077255 0.041899 -1.844 0.06520 .
## total_services -0.840949 0.050158 -16.766 < 2e-16 ***
## mth_since_last_serv -0.333633 0.021060 -15.842 < 2e-16 ***
## annualised_mileage 0.405797 0.019764 20.533 < 2e-16 ***
## num_dealers_visited 0.053807 0.017891 3.007 0.00263 **
## num_serv_dealer_purchased 0.464105 0.026131 17.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9813.5 on 39401 degrees of freedom
## Residual deviance: 6593.4 on 39388 degrees of freedom
## AIC: 6621.4
##
## Number of Fisher Scoring iterations: 9
# Now predict probabilities on lrm_testset
glm_prob <- predict.glm(glm.model, lrm_testset[,-1], type = "response")
# Create a vector of class predictions using a 0.5 cutoff
glm_predict <- rep(0, nrow(lrm_testset))
glm_predict[glm_prob > .5] <- 1
#Create a confusion matrix
glm_confusion_matrix <- table(pred=glm_predict,true=lrm_testset$Target)
glm_confusion_matrix
## true
## pred 0 1
## 0 38300 870
## 1 34 198
# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)
  F1 = 2 * ((precision * recall) / (precision + recall))
  output = data.frame(name, accuracy, precision, recall, F1)
  return(output)
}
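The calls below pull tn, fp, fn and tp from the data frame's Freq column, which relies on table()'s column-major ordering (tn, fp, fn, tp). A slightly safer alternative, sketched here, is to index the confusion matrix by its dimension names:
#Index confusion-matrix cells by name rather than by position
glm_eval_by_name <- get_evaluation_measures("Logistic Regression",
                                            tn = glm_confusion_matrix["0", "0"],
                                            fp = glm_confusion_matrix["1", "0"],
                                            fn = glm_confusion_matrix["0", "1"],
                                            tp = glm_confusion_matrix["1", "1"])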
#Convert the confusion matrix to a data frame (Freq is in column-major order: tn, fp, fn, tp) and output the evaluation measures
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)
lrm_evaluation_measures <- get_evaluation_measures("Logistic Regression",
glm_confusion_matrix_df$Freq[1],
glm_confusion_matrix_df$Freq[2],
glm_confusion_matrix_df$Freq[3],
glm_confusion_matrix_df$Freq[4])
lrm_evaluation_measures
## name accuracy precision recall F1
## 1 Logistic Regression 0.977057 0.8534483 0.1853933 0.3046154
#Now get the AUC. We'll do this a few times, so write a function (uses ROCR)
get_auc <- function(probabilities, targets) {
  probs = as.vector(probabilities)
  pred = prediction(probs, targets)
  perf_AUC = performance(pred, "auc")
  AUC = perf_AUC@y.values[[1]]
  return(AUC)
}
#Get the AUC and add it to our evaluation measures data frame
lrm_evaluation_measures$AUC <- get_auc(glm_prob, lrm_testset$Target)
#Create a data frame for all evaluation measures and use lrm_evaluation_measures as the first row
evaluation_measures <- lrm_evaluation_measures
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Logistic Regression 0.977057 0.8534483 0.1853933 0.3046154 0.8913693
#Now try LASSO: a cross-validated grid search for the optimal value of lambda, i.e. regularise the model
#convert training data to matrix format
lrm_x <- model.matrix(Target~.,lrm_trainset[,-1])
lrm_y <- lrm_trainset$Target
#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")
plot(lrm_cv.out)
#Use the 1se lambda rather than the minimum lambda, and see which predictors are discarded
#value of lambda that minimises cross-validated error
lrm_lambda_min <- lrm_cv.out$lambda.min
#largest lambda within one standard error of the minimum (more regularised)
lrm_lambda_1se <- lrm_cv.out$lambda.1se
#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.078821528
## (Intercept) .
## car_model .
## car_segment .
## age_of_vehicle_years .
## sched_serv_warr -0.215893443
## non_sched_serv_warr .
## sched_serv_paid -0.212896245
## non_sched_serv_paid 0.125448961
## total_paid_services .
## total_services -0.301259090
## mth_since_last_serv -0.178056998
## annualised_mileage 0.171637603
## num_dealers_visited 0.006766528
## num_serv_dealer_purchased 0.166261320
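To list just the predictors that survive at lambda.1se programmatically, a small sketch:
#Extract the names of the nonzero coefficients at lambda.1se
lasso_coefs <- as.matrix(coef(lrm_cv.out, s = lrm_lambda_1se))
rownames(lasso_coefs)[lasso_coefs != 0]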
#Convert test data to a model matrix
lrm_x_test <- model.matrix(Target~.,lrm_testset[,-1])
#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")
#translate probabilities to predictions
lasso_predict <- rep(0,nrow(lrm_testset))
lasso_predict[lasso_prob>.5] <- 1
#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$Target)
lasso_confusion_matrix
## true
## pred 0 1
## 0 38334 1066
## 1 0 2
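At the default 0.5 cutoff the regularised model flags only two positives out of 1,068 in the test set. With roughly 2.7% positive cases overall, the cutoff itself is a tuning choice; a sketch with an illustrative (not tuned) 0.1 cutoff:
#Re-threshold the same predicted probabilities at a lower, illustrative cutoff
lasso_predict_low <- as.integer(lasso_prob > 0.1)
table(pred = lasso_predict_low, true = lrm_testset$Target)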
#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
lasso_confusion_matrix_df$Freq[1],
lasso_confusion_matrix_df$Freq[2],
lasso_confusion_matrix_df$Freq[3],
lasso_confusion_matrix_df$Freq[4])
#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$Target)
lasso_evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Lasso 0.9729455 1 0.001872659 0.003738318 0.8962849
#Visually compare the two models' evaluation metrics by binding them into one data frame
evaluation_measures <- rbind(evaluation_measures, lasso_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Logistic Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
# Remove the high-NA columns (age_band, gender) and ID
tree_data <- repurchase_training %>%
  select(Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr,
         non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid,
         total_paid_services, total_services, mth_since_last_serv,
         annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)
# Make sure our target (response) is a factor
tree_data$Target <- as.factor(tree_data$Target)
#Get the column index of the target variable
targetColNum <- grep("Target",names(tree_data))
# Create training and test sets
## 70% of the sample size, use floor to round down to nearest integer
tree_trainset_size <- floor(0.70 * nrow(tree_data))
#set random seed
set.seed(42)
tree_trainset_indices <- sample(seq_len(nrow(tree_data)), size = tree_trainset_size)
#assign observations to training and testing sets
tree_trainset <- tree_data[tree_trainset_indices, ]
tree_testset <- tree_data[-tree_trainset_indices, ]
#Default params. This is a classification problem, so set method="class" (ID was already excluded from tree_data)
rpart_fit <- rpart(Target~.,data = tree_trainset, method='class', model=TRUE)
#predict on test data (type="class" returns hard class labels, not probabilities)
rpart_prob <- predict(rpart_fit,tree_testset[,-targetColNum],type="class")
#Create a confusion matrix
rpart_confusion_matrix <- table(pred=rpart_prob,true=tree_testset[,targetColNum])
rpart_confusion_matrix
## true
## pred 0 1
## 0 38234 528
## 1 100 540
#Convert the confusion matrix to a data frame
rpart_confusion_matrix_df <- as.data.frame(rpart_confusion_matrix)
rpart_evaluation_measures <- get_evaluation_measures("RPart unpruned",
rpart_confusion_matrix_df$Freq[1],
rpart_confusion_matrix_df$Freq[2],
rpart_confusion_matrix_df$Freq[3],
rpart_confusion_matrix_df$Freq[4])
#AUC needs probabilities, but type="class" gives only hard labels,
#so we can't compute AUC from rpart_prob; record NA for now
rpart_evaluation_measures$AUC <- NA
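AUC is recoverable for the tree if we score with probabilities instead of hard labels; a minimal sketch using type="prob" (not folded into the comparison tables below, which keep the NA):
#type="prob" returns per-class probabilities; column "1" is the positive class
rpart_scores <- predict(rpart_fit, tree_testset[,-targetColNum], type = "prob")[, "1"]
rpart_auc <- get_auc(rpart_scores, tree_testset[,targetColNum])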
evaluation_measures <- rbind(evaluation_measures, rpart_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Logistic Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
#Find the tree size that gives the minimum cross-validated error
rpart_opt <- which.min(rpart_fit$cptable[,"xerror"])
#value of the complexity parameter (cp) that gives a tree of that size
rpart_cp <- rpart_fit$cptable[rpart_opt, "CP"]
#prune the tree using that value of the complexity parameter
rpart_pruned.model <- prune(rpart_fit,rpart_cp)
#plot pruned tree
prp(rpart_pruned.model)
#predictions from pruned model
rpart_pruned_predict <- predict(rpart_pruned.model,tree_testset[,-targetColNum],type="class")
#confusion matrix (PRUNED model) - identical to the unpruned tree, so the default fit was already at the cp-optimal size
rpart_pruned_confusion_matrix <- table(pred=rpart_pruned_predict,true=tree_testset[,targetColNum])
rpart_pruned_confusion_matrix
## true
## pred 0 1
## 0 38234 528
## 1 100 540
#Convert the confusion matrix to a data frame
rpart_pruned_confusion_matrix_df <- as.data.frame(rpart_pruned_confusion_matrix)
rpart_pruned_evaluation_measures <- get_evaluation_measures("RPart pruned",
rpart_pruned_confusion_matrix_df$Freq[1],
rpart_pruned_confusion_matrix_df$Freq[2],
rpart_pruned_confusion_matrix_df$Freq[3],
rpart_pruned_confusion_matrix_df$Freq[4])
#As above, AUC would need probabilities from type="prob"; record NA
rpart_pruned_evaluation_measures$AUC <- NA
evaluation_measures <- rbind(evaluation_measures, rpart_pruned_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Logistic Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 4 RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
#Build a random forest model (supplying xtest scores the test set during training)
rf.model <- randomForest(Target ~ .,
                         data = tree_trainset,
                         importance = TRUE,
                         xtest = tree_testset[,-targetColNum],
                         ntree = 1000)
#predictions for test set
rf_predictions <- data.frame(tree_testset,rf.model$test$predicted)
#confusion matrix
rf_confusion_matrix <- table(rf.model$test$predicted,tree_testset[,targetColNum])
rf_confusion_matrix_df <- as.data.frame(rf_confusion_matrix)
rf_evaluation_measures <- get_evaluation_measures("Random Forest",
rf_confusion_matrix_df$Freq[1],
rf_confusion_matrix_df$Freq[2],
rf_confusion_matrix_df$Freq[3],
rf_confusion_matrix_df$Freq[4])
#Get the AUC (using the class-1 vote fractions as scores) and add it to our evaluation measures data frame
rf_evaluation_measures$AUC <- get_auc(rf.model$test$votes[,2], tree_testset[,targetColNum])
evaluation_measures <- rbind(evaluation_measures, rf_evaluation_measures)
#Compare model evaluation metrics to determine best model to use
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Logistic Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 4 RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 5 Random Forest 0.9924116 0.9404353 0.768726592 0.845955693 0.9930775
Which features work best as predictors, and which should we keep?
#quantitative measure of variable importance
importance(rf.model)
## 0 1 MeanDecreaseAccuracy
## car_model 22.30483 -0.1142544 22.25298
## car_segment 24.42780 -8.2473182 23.36288
## age_of_vehicle_years 98.60089 13.4830131 98.86378
## sched_serv_warr 68.58838 73.1513210 70.61953
## non_sched_serv_warr 50.02090 45.6837522 50.90880
## sched_serv_paid 39.19942 47.5919508 40.24251
## non_sched_serv_paid 36.19129 26.9254542 36.47427
## total_paid_services 36.05429 21.8819590 36.30462
## total_services 59.52957 72.0991582 60.78891
## mth_since_last_serv 157.00513 120.5204063 161.82230
## annualised_mileage 175.21678 9.2144956 175.45747
## num_dealers_visited 58.44822 70.8990345 60.06840
## num_serv_dealer_purchased 93.33044 12.1798271 93.77070
## MeanDecreaseGini
## car_model 228.63798
## car_segment 49.28325
## age_of_vehicle_years 452.67217
## sched_serv_warr 312.27809
## non_sched_serv_warr 273.82791
## sched_serv_paid 262.10793
## non_sched_serv_paid 189.91392
## total_paid_services 253.55395
## total_services 417.98529
## mth_since_last_serv 768.59623
## annualised_mileage 617.71384
## num_dealers_visited 328.08056
## num_serv_dealer_purchased 580.26730
#caret's varImp returns per-class importance scores; plot for a quick visual comparison
imp = varImp(rf.model)
plot(imp)
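By MeanDecreaseAccuracy in the importance table above, the five strongest predictors are annualised_mileage, mth_since_last_serv, age_of_vehicle_years, num_serv_dealer_purchased and sched_serv_warr. Partial dependence plots for these can be drawn with randomForest::partialPlot; a minimal sketch (rf.model kept no forest because xtest was supplied, so we refit with keep.forest = TRUE):
#Refit, keeping the forest so partialPlot can query it
rf_pd.model <- randomForest(Target ~ ., data = tree_trainset, ntree = 1000, keep.forest = TRUE)
#Partial dependence of the repurchase class ("1") on the top five predictors
partialPlot(rf_pd.model, tree_trainset, "annualised_mileage", which.class = "1")
partialPlot(rf_pd.model, tree_trainset, "mth_since_last_serv", which.class = "1")
partialPlot(rf_pd.model, tree_trainset, "age_of_vehicle_years", which.class = "1")
partialPlot(rf_pd.model, tree_trainset, "num_serv_dealer_purchased", which.class = "1")
partialPlot(rf_pd.model, tree_trainset, "sched_serv_warr", which.class = "1")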
###Selecting Random Forest as the main model
####Now use the Random Forest model to predict for the validation set
#Load the validation set
#NOTE: this validation set needed an ID column added (hence the CORRECTED in the file name)
repurchase_validation <- read.csv("raw_data/repurchase_validationCORRECTED.csv")
#Train set is the full repurchase training set after cleaning for the tree models
trainset <- tree_data
#Car model 19 doesn't exist in the validation data set, so we filter it from the training set
trainset <- trainset %>%
filter(car_model != "model_19")
#Re-factor car_model so it has the 18 levels present in the validation set
trainset$car_model <- factor(trainset$car_model)
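A quick sanity check that the training and validation car_model levels now line up (a level mismatch would break scoring via xtest):
#Should return character(0) if every training level appears in the validation set
setdiff(levels(trainset$car_model), unique(as.character(repurchase_validation$car_model)))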
#Test set
testset <- repurchase_validation
#Run the model (exclude the ID, age_band and gender columns from the testset)
validation.model <- randomForest(Target~.,
data = trainset,
importance = TRUE,
xtest = testset[,4:ncol(testset)],
ntree = 1000)
#predictions for the test set
testset$target_class <- validation.model$test$predicted
#votes[,2] is the proportion of trees voting for class "1" (repurchase); use it as the predicted probability
testset$target_probability <- validation.model$test$votes[,2]
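The votes matrix's columns are named after the Target factor levels, which confirms the column choice:
#Expect "0" and "1"; column 2 ("1") is the repurchase vote share
colnames(validation.model$test$votes)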
final_output <- testset %>%
select(ID, target_probability, target_class)
write.csv(final_output, file="cleaned_data/repurchase_validation_13184650.csv", row.names=FALSE)
View(final_output)
#IDs of customers flagged for the re-purchase communication
Targeted_IDs <- filter(final_output, target_class == 1)
View(Targeted_IDs)