The goal is to target existing customers for a re-purchase campaign, by sending a communication to customers who are highly likely to purchase a new vehicle (all customers have purchased at least one vehicle).
#import data
repurchase_training <- read.csv("raw_data/repurchase_training.csv")
#NOTE:this validation set needs an ID added
repurchase_validation <- read.csv("raw_data/repurchase_validationCORRECTED.csv")
#need to change ID to be "character" in both datasets, so the IDs are not counted as numbers
eda_data <- repurchase_training
for (i in 7:ncol(repurchase_training)) {
repurchase_training[,i] = factor(as.integer(repurchase_training[,i]));
}
for (i in 3:6) {
eda_data[,i] = as.character(eda_data[,i]);
}
str(eda_data)
## 'data.frame': 131337 obs. of 17 variables:
## $ ID : int 1 2 3 5 6 7 8 9 10 11 ...
## $ Target : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_band : chr "3. 35 to 44" "NULL" "NULL" "NULL" ...
## $ gender : chr "Male" "NULL" "Male" "NULL" ...
## $ car_model : chr "model_1" "model_2" "model_3" "model_3" ...
## $ car_segment : chr "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
## $ age_of_vehicle_years : int 9 6 9 5 8 7 8 7 1 3 ...
## $ sched_serv_warr : int 2 10 10 8 9 4 2 4 2 1 ...
## $ non_sched_serv_warr : int 10 3 9 5 4 10 8 9 1 1 ...
## $ sched_serv_paid : int 3 10 10 8 10 5 2 6 1 2 ...
## $ non_sched_serv_paid : int 7 4 9 4 7 7 9 9 3 1 ...
## $ total_paid_services : int 5 9 10 5 9 6 9 8 1 2 ...
## $ total_services : int 6 10 10 6 8 8 4 6 2 1 ...
## $ mth_since_last_serv : int 9 6 7 4 5 8 7 9 1 1 ...
## $ annualised_mileage : int 8 10 10 10 4 5 6 5 1 1 ...
## $ num_dealers_visited : int 10 7 6 9 4 10 10 5 2 1 ...
## $ num_serv_dealer_purchased: int 4 10 10 7 9 4 4 8 3 1 ...
#Find nulls in the data to work out which columns have a lot of empty records
total_nulls <- data.frame(type = names(eda_data[, -1]),
nulls = colSums(eda_data[, -1] == "NULL"),
values = colSums(eda_data[, -1] != "NULL")
)
total_nulls <- melt(total_nulls, id.var="type")
# We can see from this that 'age_band' has very few values across the training set
# We can also see that gender only has value in about 50% of rows
# Neither is likely very usable for our model
ggplot(total_nulls, aes(x = type, y = value, fill = variable)) +
geom_bar(stat = "identity") +
coord_flip()
#match to the data dictionary on the assignment sheet (note something about transforming variables)
#already done in data
#Let's look at some characteristics of the cars to see if we can detect any patterns
#First write a function that will take a characteristic and return a data frame that gives us the count of entries with target = 1 and target = 0
get_counts <- function (attribute_df, count_df) {
output = data.frame(matrix(ncol = 3, nrow = 0), stringsAsFactors = FALSE)
attribute_name = colnames(attribute_df)[1]
names(output) = c(attribute_name, "target", "non_target")
for(i in 1:nrow(attribute_df)) {
my_attribute = as.character(attribute_df[i, attribute_name])
my_target_count = count_df %>%
filter(Target == 1) %>%
filter(.[,2] == my_attribute)
my_no_target_count = count_df %>%
filter(Target == 0) %>%
filter(.[,2] == my_attribute)
if(nrow(my_target_count) > 0) {
target_count = as.character(my_target_count$count)
} else {
target_count = "0"
}
if(nrow(my_no_target_count) > 0) {
no_target_count = as.character(my_no_target_count$count)
} else {
no_target_count = "0"
}
#Can't divide by zero!
# ratio <- ifelse(no_target_count > 0, as.integer(target_count)/as.integer(no_target_count), 0)
temp <- data.frame(my_attribute, target_count, no_target_count)
names(temp) <- c(attribute_name, "target", "non_target")
output = rbind(output, temp)
output[,1] <- as.character(output[,1])
output[,2] <- as.integer(output[,2])
output[,3] <- as.integer(output[,3])
}
return(output)
}
#get data for car models
car_models <- eda_data %>%
group_by(car_model) %>%
summarize(count = n())
car_models_counts <- eda_data %>%
group_by(Target, car_model) %>%
summarize(count = n())
car_model_data <- get_counts(car_models, car_models_counts)
car_model_data <- melt(car_model_data, id.var="car_model")
ggplot(car_model_data, aes(x = car_model, y = value, fill = variable)) +
geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
car_segments <- eda_data %>%
group_by(car_segment) %>%
summarize(count = n())
car_segments_counts <- eda_data %>%
group_by(Target, car_segment) %>%
summarize(count = n())
car_segment_data <- get_counts(car_segments, car_segments_counts)
car_segment_data <- melt(car_segment_data, id.var="car_segment")
ggplot(car_segment_data, aes(x = car_segment, y = value, fill = variable)) +
geom_bar(stat = "identity")
age_of_vehicle_years <- eda_data %>%
group_by(age_of_vehicle_years) %>%
summarize(count = n())
age_of_vehicle_years_counts <- eda_data %>%
group_by(Target, age_of_vehicle_years) %>%
summarize(count = n())
age_of_vehicle_years_data <- get_counts(age_of_vehicle_years, age_of_vehicle_years_counts)
age_of_vehicle_years_data <- melt(age_of_vehicle_years_data, id.var="age_of_vehicle_years")
ggplot(age_of_vehicle_years_data, aes(x = age_of_vehicle_years, y = value, fill = variable)) +
geom_bar(stat = "identity")
comments and chrts
#look at some service history characteristics of the cars to see if we can detect any patterns
sched_serv_warr_values <- eda_data %>%
group_by(sched_serv_warr) %>%
summarize(count = n())
sched_serv_warr_counts <- eda_data %>%
group_by(Target, sched_serv_warr) %>%
summarize(count = n())
sched_serv_warr_data <- get_counts(sched_serv_warr_values, sched_serv_warr_counts)
sched_serv_warr_data <- melt(sched_serv_warr_data, id.var="sched_serv_warr")
ggplot(sched_serv_warr_data, aes(x = sched_serv_warr, y = value, fill = variable)) +
geom_bar(stat = "identity")
Run a linear model with the training data with an experimental classification target, trying to find the right target/predictor variable.
test_data <- repurchase_training
str(test_data)
## 'data.frame': 131337 obs. of 17 variables:
## $ ID : int 1 2 3 5 6 7 8 9 10 11 ...
## $ Target : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_band : Factor w/ 8 levels "1. <25","2. 25 to 34",..: 3 8 8 8 8 8 1 8 8 8 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 3 2 3 1 2 2 2 3 3 ...
## $ car_model : Factor w/ 19 levels "model_1","model_10",..: 1 12 13 13 12 15 13 16 14 14 ...
## $ car_segment : Factor w/ 4 levels "Large/SUV","LCV",..: 2 4 1 1 4 1 1 4 4 4 ...
## $ age_of_vehicle_years : Factor w/ 10 levels "1","2","3","4",..: 9 6 9 5 8 7 8 7 1 3 ...
## $ sched_serv_warr : Factor w/ 10 levels "1","2","3","4",..: 2 10 10 8 9 4 2 4 2 1 ...
## $ non_sched_serv_warr : Factor w/ 10 levels "1","2","3","4",..: 10 3 9 5 4 10 8 9 1 1 ...
## $ sched_serv_paid : Factor w/ 10 levels "1","2","3","4",..: 3 10 10 8 10 5 2 6 1 2 ...
## $ non_sched_serv_paid : Factor w/ 10 levels "1","2","3","4",..: 7 4 9 4 7 7 9 9 3 1 ...
## $ total_paid_services : Factor w/ 10 levels "1","2","3","4",..: 5 9 10 5 9 6 9 8 1 2 ...
## $ total_services : Factor w/ 10 levels "1","2","3","4",..: 6 10 10 6 8 8 4 6 2 1 ...
## $ mth_since_last_serv : Factor w/ 10 levels "1","2","3","4",..: 9 6 7 4 5 8 7 9 1 1 ...
## $ annualised_mileage : Factor w/ 10 levels "1","2","3","4",..: 8 10 10 10 4 5 6 5 1 1 ...
## $ num_dealers_visited : Factor w/ 10 levels "1","2","3","4",..: 10 7 6 9 4 10 10 5 2 1 ...
## $ num_serv_dealer_purchased: Factor w/ 10 levels "1","2","3","4",..: 4 10 10 7 9 4 4 8 3 1 ...
#We run a linear regression model on our testset
glm0 = glm(formula = Target ~ ., family = binomial, data = test_data[,-1])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = test_data[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4003 -0.0890 -0.0046 -0.0001 5.8607
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.726e+00 7.512e-01 -10.285 < 2e-16 ***
## age_band2. 25 to 34 1.255e+00 7.766e-01 1.616 0.106007
## age_band3. 35 to 44 1.100e+00 7.667e-01 1.435 0.151200
## age_band4. 45 to 54 1.402e+00 7.581e-01 1.850 0.064299 .
## age_band5. 55 to 64 1.583e+00 7.593e-01 2.084 0.037132 *
## age_band6. 65 to 74 1.053e+00 7.990e-01 1.318 0.187396
## age_band7. 75+ 1.064e+00 8.666e-01 1.228 0.219413
## age_bandNULL 1.329e+00 7.317e-01 1.817 0.069284 .
## genderMale 4.309e-01 8.983e-02 4.797 1.61e-06 ***
## genderNULL -7.974e-01 8.746e-02 -9.117 < 2e-16 ***
## car_modelmodel_10 -7.750e-01 1.929e-01 -4.018 5.87e-05 ***
## car_modelmodel_11 -3.269e+00 3.840e-01 -8.513 < 2e-16 ***
## car_modelmodel_12 -1.327e+00 4.914e-01 -2.701 0.006922 **
## car_modelmodel_13 1.700e+00 2.690e-01 6.320 2.62e-10 ***
## car_modelmodel_14 -1.955e+01 2.622e+03 -0.007 0.994050
## car_modelmodel_15 2.198e+00 6.185e-01 3.554 0.000379 ***
## car_modelmodel_16 2.016e+00 8.372e-01 2.408 0.016026 *
## car_modelmodel_17 -2.162e+00 1.044e+00 -2.071 0.038383 *
## car_modelmodel_18 -2.265e+00 5.384e-01 -4.206 2.60e-05 ***
## car_modelmodel_19 -1.674e+01 2.061e+04 -0.001 0.999352
## car_modelmodel_2 6.811e-01 9.059e-02 7.518 5.56e-14 ***
## car_modelmodel_3 8.245e-01 9.358e-02 8.811 < 2e-16 ***
## car_modelmodel_4 6.181e-01 1.102e-01 5.609 2.04e-08 ***
## car_modelmodel_5 3.059e-01 9.383e-02 3.260 0.001115 **
## car_modelmodel_6 3.392e-01 2.013e-01 1.685 0.092072 .
## car_modelmodel_7 9.602e-01 1.133e-01 8.475 < 2e-16 ***
## car_modelmodel_8 9.486e-01 1.280e-01 7.413 1.23e-13 ***
## car_modelmodel_9 2.963e-01 2.780e-01 1.066 0.286367
## car_segmentLCV NA NA NA NA
## car_segmentOther 2.146e+00 1.478e+00 1.452 0.146404
## car_segmentSmall/Medium NA NA NA NA
## age_of_vehicle_years2 4.702e-01 9.139e-02 5.145 2.68e-07 ***
## age_of_vehicle_years3 9.684e-01 9.318e-02 10.392 < 2e-16 ***
## age_of_vehicle_years4 2.657e+00 1.063e-01 24.997 < 2e-16 ***
## age_of_vehicle_years5 2.524e+00 1.135e-01 22.242 < 2e-16 ***
## age_of_vehicle_years6 2.167e+00 1.310e-01 16.551 < 2e-16 ***
## age_of_vehicle_years7 9.812e-01 1.706e-01 5.752 8.82e-09 ***
## age_of_vehicle_years8 1.491e-01 2.310e-01 0.646 0.518498
## age_of_vehicle_years9 -1.078e+00 4.178e-01 -2.581 0.009851 **
## age_of_vehicle_years10 -2.997e+00 1.014e+00 -2.957 0.003109 **
## sched_serv_warr2 3.290e-01 7.192e-02 4.574 4.77e-06 ***
## sched_serv_warr3 -2.419e-01 8.739e-02 -2.768 0.005634 **
## sched_serv_warr4 -1.027e+00 1.114e-01 -9.221 < 2e-16 ***
## sched_serv_warr5 -2.038e+00 1.565e-01 -13.019 < 2e-16 ***
## sched_serv_warr6 -3.367e+00 2.511e-01 -13.409 < 2e-16 ***
## sched_serv_warr7 -6.594e+00 1.028e+00 -6.415 1.41e-10 ***
## sched_serv_warr8 -5.819e+00 7.528e-01 -7.730 1.07e-14 ***
## sched_serv_warr9 -1.796e+01 1.869e+02 -0.096 0.923476
## sched_serv_warr10 -5.570e+00 9.064e-01 -6.145 7.99e-10 ***
## non_sched_serv_warr2 6.563e-01 9.905e-02 6.626 3.46e-11 ***
## non_sched_serv_warr3 7.255e-01 1.056e-01 6.871 6.36e-12 ***
## non_sched_serv_warr4 9.172e-01 1.464e-01 6.265 3.73e-10 ***
## non_sched_serv_warr5 6.880e-01 1.476e-01 4.662 3.13e-06 ***
## non_sched_serv_warr6 -3.175e-01 1.747e-01 -1.817 0.069168 .
## non_sched_serv_warr7 -5.941e-01 1.873e-01 -3.172 0.001513 **
## non_sched_serv_warr8 -1.203e+00 2.284e-01 -5.266 1.39e-07 ***
## non_sched_serv_warr9 -1.771e+00 3.167e-01 -5.593 2.23e-08 ***
## non_sched_serv_warr10 -1.505e+00 5.372e-01 -2.802 0.005082 **
## sched_serv_paid2 -2.753e-02 6.552e-02 -0.420 0.674326
## sched_serv_paid3 -3.627e-01 7.938e-02 -4.569 4.90e-06 ***
## sched_serv_paid4 -1.108e+00 1.096e-01 -10.105 < 2e-16 ***
## sched_serv_paid5 -1.247e+00 1.346e-01 -9.267 < 2e-16 ***
## sched_serv_paid6 -3.059e+00 3.174e-01 -9.637 < 2e-16 ***
## sched_serv_paid7 -2.576e+00 3.519e-01 -7.322 2.44e-13 ***
## sched_serv_paid8 -2.256e+00 4.601e-01 -4.903 9.45e-07 ***
## sched_serv_paid9 -1.374e+00 5.280e-01 -2.602 0.009275 **
## sched_serv_paid10 -8.445e-01 5.162e-01 -1.636 0.101881
## non_sched_serv_paid2 -1.193e-01 8.611e-02 -1.385 0.165904
## non_sched_serv_paid3 -5.668e-03 9.107e-02 -0.062 0.950380
## non_sched_serv_paid4 -3.243e-01 1.601e-01 -2.025 0.042870 *
## non_sched_serv_paid5 1.499e-01 1.805e-01 0.830 0.406382
## non_sched_serv_paid6 4.923e-01 2.463e-01 1.999 0.045643 *
## non_sched_serv_paid7 1.246e+00 3.273e-01 3.808 0.000140 ***
## non_sched_serv_paid8 1.260e+00 3.820e-01 3.300 0.000968 ***
## non_sched_serv_paid9 1.989e+00 4.355e-01 4.566 4.96e-06 ***
## non_sched_serv_paid10 2.071e+00 5.445e-01 3.803 0.000143 ***
## total_paid_services2 2.481e-01 7.683e-02 3.229 0.001243 **
## total_paid_services3 9.475e-01 1.395e-01 6.794 1.09e-11 ***
## total_paid_services4 4.854e-01 2.214e-01 2.193 0.028343 *
## total_paid_services5 -1.514e-01 3.131e-01 -0.484 0.628709
## total_paid_services6 -1.354e-01 3.656e-01 -0.370 0.711137
## total_paid_services7 -1.609e-01 4.055e-01 -0.397 0.691546
## total_paid_services8 -5.824e-01 4.561e-01 -1.277 0.201607
## total_paid_services9 -2.998e-01 5.318e-01 -0.564 0.573006
## total_paid_services10 1.327e-02 5.681e-01 0.023 0.981371
## total_services2 6.514e-01 8.411e-02 7.745 9.57e-15 ***
## total_services3 1.034e-01 1.230e-01 0.841 0.400303
## total_services4 -9.161e-01 1.562e-01 -5.866 4.47e-09 ***
## total_services5 -1.757e+00 2.090e-01 -8.408 < 2e-16 ***
## total_services6 -3.917e+00 3.092e-01 -12.669 < 2e-16 ***
## total_services7 -5.702e+00 4.362e-01 -13.071 < 2e-16 ***
## total_services8 -6.150e+00 5.603e-01 -10.977 < 2e-16 ***
## total_services9 -8.335e+00 7.372e-01 -11.307 < 2e-16 ***
## total_services10 -1.032e+01 9.119e-01 -11.321 < 2e-16 ***
## mth_since_last_serv2 5.506e-01 7.699e-02 7.152 8.56e-13 ***
## mth_since_last_serv3 6.648e-01 1.002e-01 6.638 3.18e-11 ***
## mth_since_last_serv4 5.501e-01 1.023e-01 5.379 7.50e-08 ***
## mth_since_last_serv5 -2.527e-01 1.135e-01 -2.228 0.025908 *
## mth_since_last_serv6 -1.508e+00 1.462e-01 -10.316 < 2e-16 ***
## mth_since_last_serv7 -3.037e+00 2.177e-01 -13.951 < 2e-16 ***
## mth_since_last_serv8 -7.104e+00 1.011e+00 -7.025 2.13e-12 ***
## mth_since_last_serv9 -5.894e+00 3.314e-01 -17.785 < 2e-16 ***
## mth_since_last_serv10 -1.852e+01 1.998e+02 -0.093 0.926144
## annualised_mileage2 4.445e-01 8.854e-02 5.020 5.17e-07 ***
## annualised_mileage3 3.364e+00 1.158e-01 29.048 < 2e-16 ***
## annualised_mileage4 3.118e+00 1.172e-01 26.605 < 2e-16 ***
## annualised_mileage5 3.593e+00 1.192e-01 30.135 < 2e-16 ***
## annualised_mileage6 4.370e+00 1.281e-01 34.115 < 2e-16 ***
## annualised_mileage7 3.884e+00 1.405e-01 27.635 < 2e-16 ***
## annualised_mileage8 3.044e+00 1.702e-01 17.887 < 2e-16 ***
## annualised_mileage9 2.926e+00 1.937e-01 15.105 < 2e-16 ***
## annualised_mileage10 -4.396e-02 4.643e-01 -0.095 0.924561
## num_dealers_visited2 4.617e-01 8.377e-02 5.511 3.57e-08 ***
## num_dealers_visited3 -1.343e-01 1.173e-01 -1.145 0.252149
## num_dealers_visited4 -6.009e-01 1.528e-01 -3.932 8.41e-05 ***
## num_dealers_visited5 -4.021e-01 1.484e-01 -2.710 0.006735 **
## num_dealers_visited6 -5.136e-01 1.470e-01 -3.494 0.000476 ***
## num_dealers_visited7 -2.800e-01 1.451e-01 -1.930 0.053552 .
## num_dealers_visited8 -4.597e-01 1.501e-01 -3.062 0.002196 **
## num_dealers_visited9 1.126e+00 1.691e-01 6.660 2.74e-11 ***
## num_dealers_visited10 1.773e+00 1.743e-01 10.177 < 2e-16 ***
## num_serv_dealer_purchased2 5.115e-01 9.664e-02 5.293 1.20e-07 ***
## num_serv_dealer_purchased3 5.939e-01 9.648e-02 6.155 7.49e-10 ***
## num_serv_dealer_purchased4 -7.218e-01 1.416e-01 -5.098 3.42e-07 ***
## num_serv_dealer_purchased5 2.242e-02 1.389e-01 0.161 0.871776
## num_serv_dealer_purchased6 1.141e+00 1.468e-01 7.770 7.83e-15 ***
## num_serv_dealer_purchased7 2.658e+00 1.673e-01 15.884 < 2e-16 ***
## num_serv_dealer_purchased8 4.921e+00 2.304e-01 21.356 < 2e-16 ***
## num_serv_dealer_purchased9 5.759e+00 2.875e-01 20.030 < 2e-16 ***
## num_serv_dealer_purchased10 8.333e+00 4.051e-01 20.570 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32432 on 131336 degrees of freedom
## Residual deviance: 12674 on 131209 degrees of freedom
## AIC: 12930
##
## Number of Fisher Scoring iterations: 20
#It shows the factor data is causing strange things to come out, so we need to clean the data to turn categories into integers
test_data$age_band <- substr(test_data$age_band, 1, 1)
test_data$age_band <- gsub("N", "NULL", test_data$age_band)
test_data$gender <- gsub("Male", "1", test_data$gender)
test_data$gender <- gsub("Female", "2", test_data$gender)
test_data$car_model <- sub('model_', '', test_data$car_model)
test_data$car_segment <- sub("Other", "1", test_data$car_segment)
test_data$car_segment <- sub("Small/Medium", "2", test_data$car_segment)
test_data$car_segment <- sub("Large/SUV", "3", test_data$car_segment)
test_data$car_segment <- sub("LCV", "4", test_data$car_segment)
test_data$age_band <- as.integer(test_data$age_band)
## Warning: NAs introduced by coercion
test_data$gender <- as.integer(test_data$gender)
## Warning: NAs introduced by coercion
test_data$car_model <- as.integer(test_data$car_model)
test_data$car_segment <- as.integer(test_data$car_segment)
#We want to keep our Target as a Factor
test_data[,2] = as.factor(test_data[,2]);
#Remove the columns with high numbers of NAs
test_data <- test_data %>%
dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)
str(test_data)
## 'data.frame': 131337 obs. of 15 variables:
## $ ID : int 1 2 3 5 6 7 8 9 10 11 ...
## $ Target : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ car_model : int 1 2 3 3 2 5 3 6 4 4 ...
## $ car_segment : int 4 2 3 3 2 3 3 2 2 2 ...
## $ age_of_vehicle_years : Factor w/ 10 levels "1","2","3","4",..: 9 6 9 5 8 7 8 7 1 3 ...
## $ sched_serv_warr : Factor w/ 10 levels "1","2","3","4",..: 2 10 10 8 9 4 2 4 2 1 ...
## $ non_sched_serv_warr : Factor w/ 10 levels "1","2","3","4",..: 10 3 9 5 4 10 8 9 1 1 ...
## $ sched_serv_paid : Factor w/ 10 levels "1","2","3","4",..: 3 10 10 8 10 5 2 6 1 2 ...
## $ non_sched_serv_paid : Factor w/ 10 levels "1","2","3","4",..: 7 4 9 4 7 7 9 9 3 1 ...
## $ total_paid_services : Factor w/ 10 levels "1","2","3","4",..: 5 9 10 5 9 6 9 8 1 2 ...
## $ total_services : Factor w/ 10 levels "1","2","3","4",..: 6 10 10 6 8 8 4 6 2 1 ...
## $ mth_since_last_serv : Factor w/ 10 levels "1","2","3","4",..: 9 6 7 4 5 8 7 9 1 1 ...
## $ annualised_mileage : Factor w/ 10 levels "1","2","3","4",..: 8 10 10 10 4 5 6 5 1 1 ...
## $ num_dealers_visited : Factor w/ 10 levels "1","2","3","4",..: 10 7 6 9 4 10 10 5 2 1 ...
## $ num_serv_dealer_purchased: Factor w/ 10 levels "1","2","3","4",..: 4 10 10 7 9 4 4 8 3 1 ...
# create training and test sets
## 70% of the test_data, use floor to round down to nearest integer
trainset_size <- floor(0.70 * nrow(test_data))
set.seed(42)
trainset_indices <- sample(seq_len(nrow(test_data)), size = trainset_size)
trainset <- test_data[trainset_indices, ]
testset <- test_data[-trainset_indices, ]
#rowcounts to check
nrow(trainset)
## [1] 91935
nrow(testset)
## [1] 39402
nrow(test_data)
## [1] 131337
#So try our linear regression model on our test set
glm_model = glm(formula = Target ~ ., family = binomial, data = testset[,-1])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#There's a couple of obvious ones we could remove (e.g. age_of_vehicle_years, non_sched_serv_warr), but some others are questionable (e.g. total_paid_services, car_segment, car_model)
summary(glm_model)
##
## Call:
## glm(formula = Target ~ ., family = binomial, data = testset[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1140 -0.1122 -0.0007 0.0000 4.6885
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.321785 0.301334 -17.661 < 2e-16 ***
## car_model -0.063753 0.015277 -4.173 3.00e-05 ***
## car_segment -0.008342 0.060282 -0.138 0.889943
## age_of_vehicle_years2 0.247220 0.160288 1.542 0.122990
## age_of_vehicle_years3 0.927293 0.159538 5.812 6.16e-09 ***
## age_of_vehicle_years4 2.396002 0.181748 13.183 < 2e-16 ***
## age_of_vehicle_years5 2.665773 0.193280 13.792 < 2e-16 ***
## age_of_vehicle_years6 2.193909 0.229713 9.551 < 2e-16 ***
## age_of_vehicle_years7 1.568182 0.289025 5.426 5.77e-08 ***
## age_of_vehicle_years8 0.312768 0.471324 0.664 0.506950
## age_of_vehicle_years9 -0.056806 0.639165 -0.089 0.929181
## age_of_vehicle_years10 -14.521471 530.135033 -0.027 0.978147
## sched_serv_warr2 0.526864 0.126651 4.160 3.18e-05 ***
## sched_serv_warr3 -0.229156 0.157099 -1.459 0.144656
## sched_serv_warr4 -1.037305 0.198200 -5.234 1.66e-07 ***
## sched_serv_warr5 -1.801043 0.267927 -6.722 1.79e-11 ***
## sched_serv_warr6 -3.293091 0.442662 -7.439 1.01e-13 ***
## sched_serv_warr7 -19.183103 590.054961 -0.033 0.974065
## sched_serv_warr8 -4.729599 0.835025 -5.664 1.48e-08 ***
## sched_serv_warr9 -19.307675 546.038284 -0.035 0.971793
## sched_serv_warr10 -19.522822 475.318915 -0.041 0.967238
## non_sched_serv_warr2 0.467764 0.169838 2.754 0.005884 **
## non_sched_serv_warr3 0.582451 0.180449 3.228 0.001248 **
## non_sched_serv_warr4 0.759299 0.257715 2.946 0.003216 **
## non_sched_serv_warr5 0.512283 0.257373 1.990 0.046543 *
## non_sched_serv_warr6 -0.271581 0.303509 -0.895 0.370893
## non_sched_serv_warr7 -0.697191 0.326683 -2.134 0.032830 *
## non_sched_serv_warr8 -1.031068 0.392659 -2.626 0.008643 **
## non_sched_serv_warr9 -1.918605 0.548952 -3.495 0.000474 ***
## non_sched_serv_warr10 -3.020074 0.934314 -3.232 0.001228 **
## sched_serv_paid2 -0.164578 0.115568 -1.424 0.154424
## sched_serv_paid3 -0.330871 0.137471 -2.407 0.016091 *
## sched_serv_paid4 -1.181155 0.189317 -6.239 4.40e-10 ***
## sched_serv_paid5 -1.403904 0.237000 -5.924 3.15e-09 ***
## sched_serv_paid6 -3.213430 0.566104 -5.676 1.38e-08 ***
## sched_serv_paid7 -2.664652 0.644345 -4.135 3.54e-05 ***
## sched_serv_paid8 -2.274661 0.802060 -2.836 0.004568 **
## sched_serv_paid9 -15.661813 496.513323 -0.032 0.974836
## sched_serv_paid10 -2.318917 1.117235 -2.076 0.037932 *
## non_sched_serv_paid2 -0.206021 0.149160 -1.381 0.167214
## non_sched_serv_paid3 -0.101047 0.157342 -0.642 0.520736
## non_sched_serv_paid4 -0.786024 0.282492 -2.782 0.005395 **
## non_sched_serv_paid5 -0.428696 0.315385 -1.359 0.174058
## non_sched_serv_paid6 -0.089068 0.429123 -0.208 0.835574
## non_sched_serv_paid7 0.297113 0.583934 0.509 0.610883
## non_sched_serv_paid8 0.245353 0.697787 0.352 0.725126
## non_sched_serv_paid9 1.094894 0.802217 1.365 0.172305
## non_sched_serv_paid10 1.315141 0.980104 1.342 0.179648
## total_paid_services2 0.021461 0.134726 0.159 0.873436
## total_paid_services3 0.850394 0.240581 3.535 0.000408 ***
## total_paid_services4 0.526992 0.387381 1.360 0.173705
## total_paid_services5 0.038755 0.550725 0.070 0.943899
## total_paid_services6 0.444404 0.664168 0.669 0.503423
## total_paid_services7 0.054248 0.748411 0.072 0.942217
## total_paid_services8 -0.717669 0.844431 -0.850 0.395389
## total_paid_services9 -0.250286 0.955115 -0.262 0.793285
## total_paid_services10 -0.015846 1.014287 -0.016 0.987536
## total_services2 0.581132 0.146595 3.964 7.36e-05 ***
## total_services3 0.279494 0.213710 1.308 0.190934
## total_services4 -0.670212 0.275198 -2.435 0.014876 *
## total_services5 -1.615524 0.371538 -4.348 1.37e-05 ***
## total_services6 -3.020715 0.504344 -5.989 2.11e-09 ***
## total_services7 -4.218894 0.674028 -6.259 3.87e-10 ***
## total_services8 -4.812766 0.905330 -5.316 1.06e-07 ***
## total_services9 -3.971067 1.088167 -3.649 0.000263 ***
## total_services10 -20.370131 480.018291 -0.042 0.966151
## mth_since_last_serv2 0.506743 0.136416 3.715 0.000203 ***
## mth_since_last_serv3 0.344824 0.175412 1.966 0.049323 *
## mth_since_last_serv4 0.260285 0.180091 1.445 0.148375
## mth_since_last_serv5 -0.458696 0.198843 -2.307 0.021065 *
## mth_since_last_serv6 -1.786010 0.251780 -7.094 1.31e-12 ***
## mth_since_last_serv7 -2.935764 0.331817 -8.848 < 2e-16 ***
## mth_since_last_serv8 -5.894079 1.023980 -5.756 8.61e-09 ***
## mth_since_last_serv9 -4.544963 0.456044 -9.966 < 2e-16 ***
## mth_since_last_serv10 -19.337687 618.884505 -0.031 0.975073
## annualised_mileage2 0.366010 0.158755 2.305 0.021139 *
## annualised_mileage3 3.649939 0.205651 17.748 < 2e-16 ***
## annualised_mileage4 3.408541 0.209283 16.287 < 2e-16 ***
## annualised_mileage5 3.718963 0.213346 17.432 < 2e-16 ***
## annualised_mileage6 4.303324 0.233809 18.405 < 2e-16 ***
## annualised_mileage7 4.039185 0.248380 16.262 < 2e-16 ***
## annualised_mileage8 3.272838 0.300098 10.906 < 2e-16 ***
## annualised_mileage9 2.741438 0.368559 7.438 1.02e-13 ***
## annualised_mileage10 -0.036174 0.869773 -0.042 0.966826
## num_dealers_visited2 0.409238 0.146438 2.795 0.005196 **
## num_dealers_visited3 -0.372042 0.209482 -1.776 0.075731 .
## num_dealers_visited4 -1.084158 0.267515 -4.053 5.06e-05 ***
## num_dealers_visited5 -0.780010 0.256176 -3.045 0.002328 **
## num_dealers_visited6 -1.008538 0.254995 -3.955 7.65e-05 ***
## num_dealers_visited7 -0.782516 0.253863 -3.082 0.002053 **
## num_dealers_visited8 -0.822125 0.258554 -3.180 0.001474 **
## num_dealers_visited9 0.587129 0.306407 1.916 0.055343 .
## num_dealers_visited10 1.178422 0.306919 3.840 0.000123 ***
## num_serv_dealer_purchased2 0.401614 0.167702 2.395 0.016629 *
## num_serv_dealer_purchased3 0.456330 0.167702 2.721 0.006507 **
## num_serv_dealer_purchased4 -0.452801 0.242552 -1.867 0.061926 .
## num_serv_dealer_purchased5 0.043803 0.243425 0.180 0.857197
## num_serv_dealer_purchased6 1.185001 0.257882 4.595 4.32e-06 ***
## num_serv_dealer_purchased7 2.928572 0.295746 9.902 < 2e-16 ***
## num_serv_dealer_purchased8 4.678391 0.391916 11.937 < 2e-16 ***
## num_serv_dealer_purchased9 5.537041 0.495821 11.167 < 2e-16 ***
## num_serv_dealer_purchased10 7.497139 0.588519 12.739 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9813.5 on 39401 degrees of freedom
## Residual deviance: 4179.7 on 39300 degrees of freedom
## AIC: 4383.7
##
## Number of Fisher Scoring iterations: 21
#predict probabilities on testset
glm_prob <- predict.glm(glm_model,testset[,-1],type="response")
# First create vector to hold predictions
glm_predict <- rep(0,nrow(testset[,-1]))
glm_predict[glm_prob>.5] <- 1
table(pred=glm_predict,true=testset$Target)
## true
## pred 0 1
## 0 38227 534
## 1 107 534
#accuracy
mean(glm_predict==testset$Target)
## [1] 0.9837318
#convert training data to matrix format
x <- model.matrix(Target~.,trainset[,-1])
y <- trainset$Target
#perform grid search to find optimal value of lambda
#family= binomial => logistic regression, alpha=1 => lasso
# check docs to explore other type.measure options
cv.out <- cv.glmnet(x, y, alpha=1, family="binomial", type.measure="mse")
plot(cv.out)
#min value of lambda
lambda_min <- cv.out$lambda.min
#best value of lambda
lambda_1se <- cv.out$lambda.1se
#regression coefficients
coef(cv.out,s=lambda_1se)
## 103 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.345592622
## (Intercept) .
## car_model -0.058948339
## car_segment .
## age_of_vehicle_years2 0.113174280
## age_of_vehicle_years3 0.465972703
## age_of_vehicle_years4 1.982647880
## age_of_vehicle_years5 1.900922269
## age_of_vehicle_years6 1.864416135
## age_of_vehicle_years7 0.796137241
## age_of_vehicle_years8 0.147472267
## age_of_vehicle_years9 -0.496992950
## age_of_vehicle_years10 -1.094971035
## sched_serv_warr2 0.198711635
## sched_serv_warr3 -0.209692514
## sched_serv_warr4 -0.861496879
## sched_serv_warr5 -1.947124345
## sched_serv_warr6 -3.187103741
## sched_serv_warr7 -4.731387368
## sched_serv_warr8 -4.986479158
## sched_serv_warr9 -4.980822455
## sched_serv_warr10 -4.096561938
## non_sched_serv_warr2 0.611885011
## non_sched_serv_warr3 0.742175407
## non_sched_serv_warr4 1.072448240
## non_sched_serv_warr5 0.879118154
## non_sched_serv_warr6 -0.058358943
## non_sched_serv_warr7 -0.148736889
## non_sched_serv_warr8 -0.726104878
## non_sched_serv_warr9 -1.109117349
## non_sched_serv_warr10 -0.717253659
## sched_serv_paid2 0.054981206
## sched_serv_paid3 -0.297471935
## sched_serv_paid4 -0.917507163
## sched_serv_paid5 -0.993139430
## sched_serv_paid6 -2.609431818
## sched_serv_paid7 -2.159949461
## sched_serv_paid8 -1.878150528
## sched_serv_paid9 -0.714028780
## sched_serv_paid10 .
## non_sched_serv_paid2 -0.093752966
## non_sched_serv_paid3 -0.004517833
## non_sched_serv_paid4 -0.245894861
## non_sched_serv_paid5 .
## non_sched_serv_paid6 0.247488348
## non_sched_serv_paid7 0.817697734
## non_sched_serv_paid8 0.804677871
## non_sched_serv_paid9 1.174678161
## non_sched_serv_paid10 1.370054653
## total_paid_services2 0.208031703
## total_paid_services3 0.685555246
## total_paid_services4 0.355041221
## total_paid_services5 .
## total_paid_services6 .
## total_paid_services7 0.191942591
## total_paid_services8 .
## total_paid_services9 .
## total_paid_services10 0.140322243
## total_services2 0.677822549
## total_services3 0.078310155
## total_services4 -0.824942151
## total_services5 -1.601261636
## total_services6 -3.756243813
## total_services7 -5.362748620
## total_services8 -5.380509495
## total_services9 -7.761052222
## total_services10 -8.002797965
## mth_since_last_serv2 0.584380574
## mth_since_last_serv3 0.782769036
## mth_since_last_serv4 0.639813072
## mth_since_last_serv5 -0.192871553
## mth_since_last_serv6 -1.325753463
## mth_since_last_serv7 -2.815547367
## mth_since_last_serv8 -5.324625410
## mth_since_last_serv9 -4.747938156
## mth_since_last_serv10 -4.967446669
## annualised_mileage2 0.237736757
## annualised_mileage3 2.824415505
## annualised_mileage4 2.532436233
## annualised_mileage5 3.039364647
## annualised_mileage6 3.847225225
## annualised_mileage7 3.238486585
## annualised_mileage8 2.525621285
## annualised_mileage9 2.470704690
## annualised_mileage10 .
## num_dealers_visited2 0.444414874
## num_dealers_visited3 -0.195623104
## num_dealers_visited4 -0.591550042
## num_dealers_visited5 -0.513379859
## num_dealers_visited6 -0.536408497
## num_dealers_visited7 -0.385058479
## num_dealers_visited8 -0.577726321
## num_dealers_visited9 0.950781961
## num_dealers_visited10 1.475032801
## num_serv_dealer_purchased2 0.365083714
## num_serv_dealer_purchased3 0.396784076
## num_serv_dealer_purchased4 -0.834991547
## num_serv_dealer_purchased5 -0.032191729
## num_serv_dealer_purchased6 0.951202576
## num_serv_dealer_purchased7 2.250061592
## num_serv_dealer_purchased8 4.309230917
## num_serv_dealer_purchased9 4.822079890
## num_serv_dealer_purchased10 6.634483004
#get test data
x_test <- model.matrix(Target~.,testset[,-1])
#predict class, type=”class”
lasso_prob <- predict(cv.out,newx = x_test,s=lambda_1se,type="response")
#translate probabilities to predictions
lasso_predict <- rep(0,nrow(testset))
lasso_predict[lasso_prob>.5] <- 1
#confusion matrix
table(pred=lasso_predict,true=testset$Target)
## true
## pred 0 1
## 0 38237 581
## 1 97 487
#accuracy
mean(lasso_predict==testset$Target)
## [1] 0.9827928