Exploratory data analysis has been undertaken and is documented in the separate CRISP-DM report submission. The business goal of this modelling exercise is to target existing customers for a repurchase campaign. Data cleaning, processing and model building are outlined below.
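The code in this section assumes the following packages are loaded; a minimal setup chunk, inferred from the calls used below, might look like this:
library(dplyr)        # data manipulation pipelines
library(reshape2)     # melt() for reshaping before plotting
library(ggplot2)      # plotting
library(glmnet)       # LASSO via cv.glmnet()
library(rpart)        # decision trees
library(rpart.plot)   # prp() for plotting trees
library(randomForest) # random forest models
library(ROCR)         # prediction()/performance() for AUC
library(vip)          # variable importance plots
library(pdp)          # partial() for partial dependence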
#import data
repurchase_training <- read.csv("raw_data/repurchase_training.csv")
#Convert the categorical columns (age_band, gender, car_model, car_segment) to character so their codes are not treated as numbers
eda_data <- repurchase_training
for (i in 3:6) {
  eda_data[, i] <- as.character(eda_data[, i])
}
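For reference, the same conversion can be written more compactly with dplyr; a sketch of an equivalent form (the loop above is what was actually run):
eda_data_alt <- repurchase_training %>%
  mutate(across(3:6, as.character))  # age_band, gender, car_model, car_segment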
str(eda_data)
## 'data.frame': 131337 obs. of 17 variables:
## $ ID : int 1 2 3 5 6 7 8 9 10 11 ...
## $ Target : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_band : chr "3. 35 to 44" "NULL" "NULL" "NULL" ...
## $ gender : chr "Male" "NULL" "Male" "NULL" ...
## $ car_model : chr "model_1" "model_2" "model_3" "model_3" ...
## $ car_segment : chr "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
## $ age_of_vehicle_years : int 9 6 9 5 8 7 8 7 1 3 ...
## $ sched_serv_warr : int 2 10 10 8 9 4 2 4 2 1 ...
## $ non_sched_serv_warr : int 10 3 9 5 4 10 8 9 1 1 ...
## $ sched_serv_paid : int 3 10 10 8 10 5 2 6 1 2 ...
## $ non_sched_serv_paid : int 7 4 9 4 7 7 9 9 3 1 ...
## $ total_paid_services : int 5 9 10 5 9 6 9 8 1 2 ...
## $ total_services : int 6 10 10 6 8 8 4 6 2 1 ...
## $ mth_since_last_serv : int 9 6 7 4 5 8 7 9 1 1 ...
## $ annualised_mileage : int 8 10 10 10 4 5 6 5 1 1 ...
## $ num_dealers_visited : int 10 7 6 9 4 10 10 5 2 1 ...
## $ num_serv_dealer_purchased: int 4 10 10 7 9 4 4 8 3 1 ...
#Find nulls in the data to work out which columns have a lot of empty records
total_nulls <- data.frame(type = names(eda_data[, -1]),
nulls = colSums(eda_data[, -1] == "NULL"),
values = colSums(eda_data[, -1] != "NULL")
)
total_nulls <- melt(total_nulls, id.var="type")
# We can see from this that 'age_band' has values for only a small fraction of the training set
# We can also see that gender is populated in only about half of the rows
# Neither is likely to be a usable predictor for our model
ggplot(total_nulls, aes(x = type, y = value, fill = variable)) +
geom_bar(stat = "identity") +
coord_flip()
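To put numbers on these observations, the NULL proportions can be computed directly; a quick sketch:
#Share of "NULL" entries in the two sparse columns
colMeans(eda_data[, c("age_band", "gender")] == "NULL")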
#First, write a function that takes a set of attribute values and returns a data frame with the count of entries with Target = 1 and Target = 0 for each value
get_counts <- function(attribute_df, count_df) {
  output = data.frame(matrix(ncol = 3, nrow = 0), stringsAsFactors = FALSE)
  attribute_name = colnames(attribute_df)[1]
  names(output) = c(attribute_name, "target", "non_target")
  for (i in 1:nrow(attribute_df)) {
    my_attribute = as.character(attribute_df[i, attribute_name])
    my_target_count = count_df %>%
      filter(Target == 1) %>%
      filter(.[, 2] == my_attribute)
    my_no_target_count = count_df %>%
      filter(Target == 0) %>%
      filter(.[, 2] == my_attribute)
    if (nrow(my_target_count) > 0) {
      target_count = as.character(my_target_count$count)
    } else {
      target_count = "0"
    }
    if (nrow(my_no_target_count) > 0) {
      no_target_count = as.character(my_no_target_count$count)
    } else {
      no_target_count = "0"
    }
    temp <- data.frame(my_attribute, target_count, no_target_count)
    names(temp) <- c(attribute_name, "target", "non_target")
    output = rbind(output, temp)
  }
  # Convert the column types once, after the loop, rather than on every iteration
  output[, 1] <- as.character(output[, 1])
  output[, 2] <- as.integer(output[, 2])
  output[, 3] <- as.integer(output[, 3])
  return(output)
}
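For reference, the same per-category target counts can be produced in a few lines with dplyr and tidyr; a sketch, assuming tidyr is available (the function above is what was actually run):
get_counts_alt <- function(df, attribute) {
  df %>%
    count(Target, .data[[attribute]]) %>%
    tidyr::pivot_wider(names_from = Target, values_from = n,
                       values_fill = 0, names_prefix = "target_")
}
# e.g. get_counts_alt(eda_data, "car_model")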
#melt and plot car models
car_models <- eda_data %>%
group_by(car_model) %>%
summarize(count = n())
car_models_counts <- eda_data %>%
group_by(Target, car_model) %>%
summarize(count = n())
car_model_data <- get_counts(car_models, car_models_counts)
car_model_data <- melt(car_model_data, id.var="car_model")
ggplot(car_model_data, aes(x = car_model, y = value, fill = variable)) +
geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#plot car segments variable
car_segments <- eda_data %>%
group_by(car_segment) %>%
summarize(count = n())
car_segments_counts <- eda_data %>%
group_by(Target, car_segment) %>%
summarize(count = n())
car_segment_data <- get_counts(car_segments, car_segments_counts)
car_segment_data <- melt(car_segment_data, id.var="car_segment")
ggplot(car_segment_data, aes(x = car_segment, y = value, fill = variable)) +
geom_bar(stat = "identity")
#plot age of vehicle in years variable
age_of_vehicle_years <- eda_data %>%
group_by(age_of_vehicle_years) %>%
summarize(count = n())
age_of_vehicle_years_counts <- eda_data %>%
group_by(Target, age_of_vehicle_years) %>%
summarize(count = n())
age_of_vehicle_years_data <- get_counts(age_of_vehicle_years, age_of_vehicle_years_counts)
age_of_vehicle_years_data <- melt(age_of_vehicle_years_data, id.var="age_of_vehicle_years")
ggplot(age_of_vehicle_years_data, aes(x = age_of_vehicle_years, y = value, fill = variable)) +
geom_bar(stat = "identity")
#Number of scheduled services used under warranty
sched_serv_warr_values <- eda_data %>%
group_by(sched_serv_warr) %>%
summarize(count = n())
sched_serv_warr_counts <- eda_data %>%
group_by(Target, sched_serv_warr) %>%
summarize(count = n())
sched_serv_warr_data <- get_counts(sched_serv_warr_values, sched_serv_warr_counts)
sched_serv_warr_data <- melt(sched_serv_warr_data, id.var="sched_serv_warr")
ggplot(sched_serv_warr_data, aes(x = sched_serv_warr, y = value, fill = variable)) +
geom_bar(stat = "identity")
#Number of services had at the same dealer where the vehicle was purchased
num_serv_dealer_purchased_values <- eda_data %>%
group_by(num_serv_dealer_purchased) %>%
summarize(count = n())
num_serv_dealer_purchased_counts <- eda_data %>%
group_by(Target, num_serv_dealer_purchased) %>%
summarize(count = n())
num_serv_dealer_purchased_data <- get_counts(num_serv_dealer_purchased_values, num_serv_dealer_purchased_counts)
num_serv_dealer_purchased_data <- melt(num_serv_dealer_purchased_data, id.var="num_serv_dealer_purchased")
ggplot(num_serv_dealer_purchased_data, aes(x = num_serv_dealer_purchased, y = value, fill = variable)) +
geom_bar(stat = "identity")
Fit a logistic regression model on the training data.
lrm_data <- repurchase_training
#Fit a logistic regression on the full (uncleaned) training data, excluding the ID column
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_data[,-1])
summary(glm.model)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_data[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3059 -0.1867 -0.0606 -0.0233 4.8892
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.622319 0.557257 -8.295 < 2e-16 ***
## age_band2. 25 to 34 1.229115 0.582886 2.109 0.034973 *
## age_band3. 35 to 44 1.404695 0.576427 2.437 0.014814 *
## age_band4. 45 to 54 1.656169 0.570158 2.905 0.003675 **
## age_band5. 55 to 64 1.774010 0.571009 3.107 0.001891 **
## age_band6. 65 to 74 1.489551 0.599670 2.484 0.012993 *
## age_band7. 75+ 1.985368 0.636082 3.121 0.001801 **
## age_bandNULL 1.782812 0.550253 3.240 0.001195 **
## genderMale 0.426431 0.069093 6.172 6.75e-10 ***
## genderNULL -0.109306 0.064938 -1.683 0.092329 .
## car_modelmodel_10 -0.793655 0.165332 -4.800 1.58e-06 ***
## car_modelmodel_11 -1.368900 0.353951 -3.867 0.000110 ***
## car_modelmodel_12 -1.014117 0.430728 -2.354 0.018551 *
## car_modelmodel_13 1.444475 0.204250 7.072 1.53e-12 ***
## car_modelmodel_14 -11.535142 87.036102 -0.133 0.894563
## car_modelmodel_15 1.611429 0.388465 4.148 3.35e-05 ***
## car_modelmodel_16 0.840084 0.595070 1.412 0.158027
## car_modelmodel_17 -1.291977 1.034821 -1.249 0.211847
## car_modelmodel_18 -0.336041 0.516382 -0.651 0.515200
## car_modelmodel_19 -11.630748 614.640130 -0.019 0.984903
## car_modelmodel_2 0.647902 0.070808 9.150 < 2e-16 ***
## car_modelmodel_3 0.821045 0.073260 11.207 < 2e-16 ***
## car_modelmodel_4 0.656971 0.087515 7.507 6.05e-14 ***
## car_modelmodel_5 0.309173 0.073812 4.189 2.81e-05 ***
## car_modelmodel_6 0.447349 0.161645 2.767 0.005649 **
## car_modelmodel_7 0.843444 0.089540 9.420 < 2e-16 ***
## car_modelmodel_8 0.909639 0.099712 9.123 < 2e-16 ***
## car_modelmodel_9 -0.001115 0.234381 -0.005 0.996206
## car_segmentLCV NA NA NA NA
## car_segmentOther 0.343629 1.448426 0.237 0.812468
## car_segmentSmall/Medium NA NA NA NA
## age_of_vehicle_years -0.037538 0.010281 -3.651 0.000261 ***
## sched_serv_warr -0.313279 0.021700 -14.437 < 2e-16 ***
## non_sched_serv_warr 0.024260 0.019515 1.243 0.213813
## sched_serv_paid -0.294926 0.019035 -15.494 < 2e-16 ***
## non_sched_serv_paid 0.321299 0.023184 13.859 < 2e-16 ***
## total_paid_services -0.048576 0.023816 -2.040 0.041384 *
## total_services -0.956269 0.028876 -33.116 < 2e-16 ***
## mth_since_last_serv -0.354847 0.012025 -29.508 < 2e-16 ***
## annualised_mileage 0.449782 0.011239 40.020 < 2e-16 ***
## num_dealers_visited 0.045751 0.010131 4.516 6.30e-06 ***
## num_serv_dealer_purchased 0.472887 0.014385 32.875 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32432 on 131336 degrees of freedom
## Residual deviance: 20538 on 131297 degrees of freedom
## AIC: 20618
##
## Number of Fisher Scoring iterations: 13
The summary shows that each level of the factor columns is treated as a separate dummy predictor (two coefficients were even undefined because of singularities), so we clean the data by encoding the categories as integers.
#Keep only the leading digit of age_band (e.g. "3. 35 to 44" -> "3")
lrm_data$age_band <- substr(lrm_data$age_band, 1, 1)
#Restore the full "NULL" label; it becomes NA when coerced to integer below
lrm_data$age_band <- gsub("N", "NULL", lrm_data$age_band)
#Encode gender as an integer code ("NULL" will coerce to NA)
lrm_data$gender <- gsub("Male", "1", lrm_data$gender)
lrm_data$gender <- gsub("Female", "2", lrm_data$gender)
#Strip the "model_" prefix so car_model is purely numeric
lrm_data$car_model <- sub('model_', '', lrm_data$car_model)
#Encode car_segment as an integer code
lrm_data$car_segment <- sub("Other", "1", lrm_data$car_segment)
lrm_data$car_segment <- sub("Small/Medium", "2", lrm_data$car_segment)
lrm_data$car_segment <- sub("Large/SUV", "3", lrm_data$car_segment)
lrm_data$car_segment <- sub("LCV", "4", lrm_data$car_segment)
lrm_data$age_band <- as.integer(lrm_data$age_band)
## Warning: NAs introduced by coercion
lrm_data$gender <- as.integer(lrm_data$gender)
## Warning: NAs introduced by coercion
lrm_data$car_model <- as.integer(lrm_data$car_model)
lrm_data$car_segment <- as.integer(lrm_data$car_segment)
#We want to keep our Target as a factor
lrm_data$Target <- as.factor(lrm_data$Target)
We also have two columns, age_band and gender, with high proportions of NAs after the coercions above. They are unlikely to be useful predictors, so let's drop them.
lrm_data <- lrm_data %>%
dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)
# 70% of the lrm_data; use floor to round down to the nearest integer
lrm_trainset_size <- floor(0.70 * nrow(lrm_data))
set.seed(42)
lrm_trainset_indices <- sample(seq_len(nrow(lrm_data)), size = lrm_trainset_size)
lrm_trainset <- lrm_data[lrm_trainset_indices, ]
lrm_testset <- lrm_data[-lrm_trainset_indices, ]
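Target = 1 is rare in this data, so it is worth confirming the random split preserves the class balance in both subsets; a quick sketch:
#Proportion of repurchasers in each split
prop.table(table(lrm_trainset$Target))
prop.table(table(lrm_testset$Target))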
#Now refit the logistic regression on the newly cleaned test split, excluding the ID column
glm.model = glm(formula = Target ~ ., family = binomial, data = lrm_testset[,-1])
summary(glm.model)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = lrm_testset[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9480 -0.2123 -0.0702 -0.0278 4.7109
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.936221 0.174590 -11.090 < 2e-16 ***
## car_model -0.016126 0.012125 -1.330 0.18353
## car_segment -0.100213 0.047507 -2.109 0.03491 *
## age_of_vehicle_years 0.021718 0.017283 1.257 0.20889
## sched_serv_warr -0.309101 0.038368 -8.056 7.86e-16 ***
## non_sched_serv_warr -0.007022 0.034808 -0.202 0.84012
## sched_serv_paid -0.312433 0.034067 -9.171 < 2e-16 ***
## non_sched_serv_paid 0.282285 0.040693 6.937 4.01e-12 ***
## total_paid_services -0.077255 0.041899 -1.844 0.06520 .
## total_services -0.840949 0.050158 -16.766 < 2e-16 ***
## mth_since_last_serv -0.333633 0.021060 -15.842 < 2e-16 ***
## annualised_mileage 0.405797 0.019764 20.533 < 2e-16 ***
## num_dealers_visited 0.053807 0.017891 3.007 0.00263 **
## num_serv_dealer_purchased 0.464105 0.026131 17.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9813.5 on 39401 degrees of freedom
## Residual deviance: 6593.4 on 39388 degrees of freedom
## AIC: 6621.4
##
## Number of Fisher Scoring iterations: 9
# Now to predict probabilities on lrm_testset
glm_prob <- predict.glm(glm.model,lrm_testset[,-1],type="response")
# Create a vector to hold predictions
glm_predict <- rep(0,nrow(lrm_testset[,-1]))
glm_predict[glm_prob>.5] <- 1
#Create a confusion matrix
glm_confusion_matrix <- table(pred=glm_predict,true=lrm_testset$Target)
glm_confusion_matrix
## true
## pred 0 1
## 0 38300 870
## 1 34 198
With an AIC of 6621.4, compared to 20618 for the earlier experimental model, this model is a better fit than the one built before cleaning and transforming the data. However, the confusion matrix reveals a high number of false negatives: the model failed to identify 870 of the 1068 actual positives.
# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
F1 = 2 * ((precision * recall)/(precision + recall))
output = data.frame(name, accuracy, precision, recall, F1)
return(output)
}
#Convert the confusion matrix to a data frame and output the evaluation measures; as.data.frame(table(pred, true)) orders Freq as TN, FP, FN, TP
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)
lrm_evaluation_measures <- get_evaluation_measures("Linear Regression",
glm_confusion_matrix_df$Freq[1],
glm_confusion_matrix_df$Freq[2],
glm_confusion_matrix_df$Freq[3],
glm_confusion_matrix_df$Freq[4])
lrm_evaluation_measures
## name accuracy precision recall F1
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154
#Now get the AUC. We'll need it again later in the analysis, so write a function
get_auc <- function(probabilities, targets) {
probs = as.vector(probabilities)
pred = prediction(probs,targets)
perf_AUC = performance(pred, "auc")
AUC = perf_AUC@y.values[[1]]
return(AUC)
}
#Get the AUC and add it to our evaluation measures data frame
lrm_evaluation_measures$AUC <- get_auc(glm_prob, lrm_testset$Target)
#Create a data frame for all evaluation measures and use lrm_evaluation_measures as the first row
evaluation_measures <- lrm_evaluation_measures
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Linear Regression 0.977057 0.8534483 0.1853933 0.3046154 0.8913693
#Now we want to try LASSO, using cross-validation to grid-search for the optimal value of lambda, i.e. to regularize the model
#convert training data to matrix format
lrm_x <- model.matrix(Target~.,lrm_trainset[,-1])
lrm_y <- lrm_trainset$Target
#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")
plot(lrm_cv.out)
#Use the 1-SE lambda rather than the minimum lambda, and see which predictors are discarded
#minimum value of lambda
lrm_lambda_min <- lrm_cv.out$lambda.min
#largest lambda within one standard error of the minimum (the more parsimonious choice)
lrm_lambda_1se <- lrm_cv.out$lambda.1se
#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.078821528
## (Intercept) .
## car_model .
## car_segment .
## age_of_vehicle_years .
## sched_serv_warr -0.215893443
## non_sched_serv_warr .
## sched_serv_paid -0.212896245
## non_sched_serv_paid 0.125448961
## total_paid_services .
## total_services -0.301259090
## mth_since_last_serv -0.178056998
## annualised_mileage 0.171637603
## num_dealers_visited 0.006766528
## num_serv_dealer_purchased 0.166261320
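The retained predictors can also be pulled out programmatically rather than read off the matrix; a minimal sketch:
#Convert the sparse coefficient matrix and keep the non-zero rows
lasso_coefs <- as.matrix(coef(lrm_cv.out, s = lrm_lambda_1se))
rownames(lasso_coefs)[lasso_coefs != 0]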
#Convert test data to a model matrix
lrm_x_test <- model.matrix(Target~.,lrm_testset[,-1])
#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")
#translate probabilities to predictions
lasso_predict <- rep(0,nrow(lrm_testset))
lasso_predict[lasso_prob>.5] <- 1
#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$Target)
lasso_confusion_matrix
## true
## pred 0 1
## 0 38334 1066
## 1 0 2
#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
lasso_confusion_matrix_df$Freq[1],
lasso_confusion_matrix_df$Freq[2],
lasso_confusion_matrix_df$Freq[3],
lasso_confusion_matrix_df$Freq[4])
#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$Target)
lasso_evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Lasso 0.9729455 1 0.001872659 0.003738318 0.8962849
#Visually compare the two models' evaluation metrics by placing them in a single data frame
evaluation_measures <- rbind(evaluation_measures, lasso_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
The F1 metric is the harmonic mean of precision and recall. It is less intuitive than the other measures, but since the linear classification model has such strong accuracy and precision yet much poorer recall, a blended metric should be used to choose the final model.
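As a quick check, plugging the logistic model's precision and recall from the table above into the formula reproduces its F1 score:
precision <- 0.8534483
recall <- 0.1853933
2 * (precision * recall) / (precision + recall)  # ~0.3046, matching the table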
# Remove ID and the high-NA columns (i.e. age_band and gender)
tree_data <- repurchase_training %>%
select(Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage,num_dealers_visited,num_serv_dealer_purchased)
# Make sure our target variable is a factor
tree_data$Target <- as.factor(tree_data$Target)
#Get the column index of the target variable
targetColNum <- grep("Target",names(tree_data))
# Create training and test sets
# 70% of the sample size; use floor to round down to the nearest integer
tree_trainset_size <- floor(0.70 * nrow(tree_data))
#set random seed
set.seed(42)
tree_trainset_indices <- sample(seq_len(nrow(tree_data)), size = tree_trainset_size)
#assign observations to training and testing sets
tree_trainset <- tree_data[tree_trainset_indices, ]
tree_testset <- tree_data[-tree_trainset_indices, ]
#This is a classification problem, so set method = "class" (ID was already excluded when building tree_data)
rpart_fit <- rpart(Target~.,data = tree_trainset, method='class', model=TRUE)
#predict on test data
rpart_prob <- predict(rpart_fit,tree_testset[,-targetColNum],type="class")
#Create a confusion matrix
rpart_confusion_matrix <- table(pred=rpart_prob,true=tree_testset[,targetColNum])
rpart_confusion_matrix
## true
## pred 0 1
## 0 38234 528
## 1 100 540
#Convert the confusion matrix to a data frame
rpart_confusion_matrix_df <- as.data.frame(rpart_confusion_matrix)
rpart_evaluation_measures <- get_evaluation_measures("RPart unpruned",
rpart_confusion_matrix_df$Freq[1],
rpart_confusion_matrix_df$Freq[2],
rpart_confusion_matrix_df$Freq[3],
rpart_confusion_matrix_df$Freq[4])
#AUC isn't available here because the tree was predicted with type = "class" (labels rather than probabilities)
rpart_evaluation_measures$AUC <- NA
evaluation_measures <- rbind(evaluation_measures, rpart_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
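AUC could still be obtained for the tree by requesting class probabilities instead of labels and reusing the helper from earlier; a sketch (not run here, so the table above keeps NA):
#Probability of class 1 from the tree, then AUC via get_auc()
rpart_prob_scores <- predict(rpart_fit, tree_testset[,-targetColNum], type = "prob")[, 2]
get_auc(rpart_prob_scores, tree_testset[,targetColNum])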
#Find the tree size that gives the minimum cross-validated error
rpart_opt <- which.min(rpart_fit$cptable[,"xerror"])
#value of the complexity parameter that gives a tree of that size
rpart_cp <- rpart_fit$cptable[rpart_opt, "CP"]
# "prune" the tree using that value of the complexity parameter
rpart_pruned.model <- prune(rpart_fit,rpart_cp)
#plot pruned tree
prp(rpart_pruned.model)
#predictions from pruned model
rpart_pruned_predict <- predict(rpart_pruned.model,tree_testset[,-targetColNum],type="class")
#confusion matrix (PRUNED model)
rpart_pruned_confusion_matrix <- table(pred=rpart_pruned_predict,true=tree_testset[,targetColNum])
rpart_pruned_confusion_matrix
## true
## pred 0 1
## 0 38234 528
## 1 100 540
#Convert the confusion matrix to a data frame
rpart_pruned_confusion_matrix_df <- as.data.frame(rpart_pruned_confusion_matrix)
rpart_pruned_evaluation_measures <- get_evaluation_measures("RPart pruned",
rpart_pruned_confusion_matrix_df$Freq[1],
rpart_pruned_confusion_matrix_df$Freq[2],
rpart_pruned_confusion_matrix_df$Freq[3],
rpart_pruned_confusion_matrix_df$Freq[4])
#As above, class labels rather than probabilities were predicted, so AUC is not available
rpart_pruned_evaluation_measures$AUC <- NA
evaluation_measures <- rbind(evaluation_measures, rpart_pruned_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 4 RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
We can see from the evaluation measures that, while the precision and accuracy figures are similar across the models, recall is much better in the tree-based model, and consequently the F1 score (which we are using as our evaluation metric of choice) is much higher. Pruning made no difference here: the complexity parameter with the minimum cross-validated error retained the full tree.
#Build random forest model
rf.model <- randomForest(Target ~.,
data = tree_trainset,
importance=TRUE,
xtest=tree_testset[,-targetColNum],
keep.forest = TRUE,
ntree=1000)
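Before scoring the test set, the out-of-bag estimates stored on the model object provide a useful train-side sanity check; a quick sketch:
#OOB error rate and confusion matrix estimated during training
print(rf.model)
#error as a function of the number of trees
plot(rf.model)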
#predictions for test set
rf_predictions <- data.frame(tree_testset,rf.model$test$predicted)
#confusion matrix
rf_confusion_matrix <- table(pred=rf.model$test$predicted,true=tree_testset[,targetColNum])
rf_confusion_matrix
## true
## pred 0 1
## 0 38282 247
## 1 52 821
rf_confusion_matrix_df <- as.data.frame(rf_confusion_matrix)
rf_evaluation_measures <- get_evaluation_measures("Random Forest",
rf_confusion_matrix_df$Freq[1],
rf_confusion_matrix_df$Freq[2],
rf_confusion_matrix_df$Freq[3],
rf_confusion_matrix_df$Freq[4])
#Get the AUC and add it to our evaluation measures data frame
rf_evaluation_measures$AUC <- get_auc(rf.model$test$votes[,2], tree_testset[,targetColNum])
evaluation_measures <- rbind(evaluation_measures, rf_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Linear Regression 0.9770570 0.8534483 0.185393258 0.304615385 0.8913693
## 2 Lasso 0.9729455 1.0000000 0.001872659 0.003738318 0.8962849
## 3 RPart unpruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 4 RPart pruned 0.9840617 0.8437500 0.505617978 0.632318501 NA
## 5 Random Forest 0.9924116 0.9404353 0.768726592 0.845955693 0.9930775
We selected F1 as the blended metric to help choose the final model, and from the table above it is clear the random forest model has the best score.
The linear classification model regularized by Lasso was extremely accurate and precise, but due to its poor recall it has a very low F1 score and would result in very few customers being targeted for communication, even though they may be the right customers.
The Area Under the Curve (AUC) score further highlights the strength of the random forest model.
Now, let’s look at variable importance.
#quantitative measure of variable importance
importance(rf.model)
## 0 1 MeanDecreaseAccuracy
## car_model 22.30483 -0.1142544 22.25298
## car_segment 24.42780 -8.2473182 23.36288
## age_of_vehicle_years 98.60089 13.4830131 98.86378
## sched_serv_warr 68.58838 73.1513210 70.61953
## non_sched_serv_warr 50.02090 45.6837522 50.90880
## sched_serv_paid 39.19942 47.5919508 40.24251
## non_sched_serv_paid 36.19129 26.9254542 36.47427
## total_paid_services 36.05429 21.8819590 36.30462
## total_services 59.52957 72.0991582 60.78891
## mth_since_last_serv 157.00513 120.5204063 161.82230
## annualised_mileage 175.21678 9.2144956 175.45747
## num_dealers_visited 58.44822 70.8990345 60.06840
## num_serv_dealer_purchased 93.33044 12.1798271 93.77070
## MeanDecreaseGini
## car_model 228.63798
## car_segment 49.28325
## age_of_vehicle_years 452.67217
## sched_serv_warr 312.27809
## non_sched_serv_warr 273.82791
## sched_serv_paid 262.10793
## non_sched_serv_paid 189.91392
## total_paid_services 253.55395
## total_services 417.98529
## mth_since_last_serv 768.59623
## annualised_mileage 617.71384
## num_dealers_visited 328.08056
## num_serv_dealer_purchased 580.26730
# Look at variable importance using the built in varImpPlot from randomForest
varImpPlot(rf.model)
# Let's use the vip package instead
vip(rf.model, bar = TRUE, horizontal = TRUE, size = 1.5)
The top five predictors identified by each model are summarized below:

| Logistic regression | Logistic with Lasso | rpart tree (pruned & unpruned) | Random forest |
|---|---|---|---|
| sched_serv_warr | total_services | sched_serv_warr | annualised_mileage |
| sched_serv_paid | sched_serv_warr | annualised_mileage | mth_since_last_serv |
| total_services | sched_serv_paid | mth_since_last_serv | age_of_vehicle_years |
| mth_since_last_serv | mth_since_last_serv | total_services | num_serv_dealer_purchased |
| annualised_mileage | annualised_mileage | total_paid_services | sched_serv_warr |
# Create a vector of our top 5 predictors
top_five_predictors = c("annualised_mileage", "mth_since_last_serv", "age_of_vehicle_years", "num_serv_dealer_purchased", "sched_serv_warr")
#Construct the 5 plots
# Annualised Mileage
p1 <- partial(rf.model, pred.var = top_five_predictors[1], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
# Months since last service
p2 <- partial(rf.model, pred.var = top_five_predictors[2], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Age of vehicle in years
p3 <- partial(rf.model, pred.var = top_five_predictors[3], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Number of services had at the same dealer where the vehicle was purchased
p4 <- partial(rf.model, pred.var = top_five_predictors[4], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Number of scheduled services used under warranty
p5 <- partial(rf.model, pred.var = top_five_predictors[5], plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Let's also do a sixth for total services
p6 <- partial(rf.model, pred.var = "total_services", plot = FALSE, rug = TRUE, plot.engine = "ggplot2")
#Plot 1
autoplot(p1, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[1]))
#Plot2
autoplot(p2, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[2]))
#Plot 3
autoplot(p3, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[3]))
#Plot 4
autoplot(p4, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[4]))
#Plot 5
autoplot(p5, ylab = "yhat") + theme_light() + ggtitle(paste("Partial dependency plot for", top_five_predictors[5]))
#Plot 6
autoplot(p6, ylab = "yhat") + theme_light() + ggtitle("Partial dependency plot for total services")
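For easier comparison, the six partial dependence plots could also be arranged in a single grid; a sketch, assuming the gridExtra package is installed:
library(gridExtra)
grid.arrange(autoplot(p1, ylab = "yhat"), autoplot(p2, ylab = "yhat"),
             autoplot(p3, ylab = "yhat"), autoplot(p4, ylab = "yhat"),
             autoplot(p5, ylab = "yhat"), autoplot(p6, ylab = "yhat"),
             ncol = 2)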
#Load the validation set
#NOTE: this validation set needed an ID column added (hence CORRECTED in the file name)
repurchase_validation <- read.csv("raw_data/repurchase_validationCORRECTED.csv")
#Train set is the full repurchase training set after cleaning for the tree models
trainset <- tree_data
#Car model 19 doesn't exist in the validation data set, so we filter it from the training set
trainset <- trainset %>%
filter(car_model != "model_19")
#Re-factor car_model to have 18 factors as per validation set
trainset$car_model <- factor(trainset$car_model)
#Test set
testset <- repurchase_validation
#Fit the model (the ID, age_band and gender columns of the validation set are excluded from xtest)
validation.model <- randomForest(Target~.,
data = trainset,
importance = TRUE,
xtest = testset[,4:ncol(testset)],
ntree = 1000)
#predictions for test set
testset$target_class <- validation.model$test$predicted
testset$target_probability <- validation.model$test$votes[,2]
final_output <- testset %>%
select(ID, target_probability, target_class)
write.csv(final_output, file="cleaned_data/repurchase_validation_13184650.csv", row.names=FALSE)
#out-of-bag predictions for the full training set
final_predictions <- data.frame(trainset,validation.model$predicted)
#out-of-bag confusion matrix
final_confusion_matrix <- table(validation.model$predicted,trainset[,targetColNum])
final_confusion_matrix
##
## 0 1
## 0 127644 734
## 1 170 2787
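For completeness, the evaluation helper from earlier can summarize this out-of-bag confusion matrix in the same format as the test-set results; a sketch:
final_confusion_matrix_df <- as.data.frame(final_confusion_matrix)
get_evaluation_measures("Random Forest (OOB, full training set)",
                        final_confusion_matrix_df$Freq[1],
                        final_confusion_matrix_df$Freq[2],
                        final_confusion_matrix_df$Freq[3],
                        final_confusion_matrix_df$Freq[4])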