Classification modelling

The goal is to target existing customers for a re-purchase campaign, by sending a communication to customers who are highly likely to purchase a new vehicle (all customers have purchased at least one vehicle).

#import data

repurchase_training <- read.csv("raw_data/repurchase_training.csv")

#NOTE:this validation set needs an ID added
repurchase_validation  <- read.csv("raw_data/repurchase_validationCORRECTED.csv")

Cleaning the data

#need to change ID to be "character" in both datasets, so the IDs are not counted as numbers

eda_data <- repurchase_training

for (i in 7:ncol(repurchase_training)) {
  repurchase_training[,i] = factor(as.integer(repurchase_training[,i]));
}

for (i in 3:6) {
  eda_data[,i] = as.character(eda_data[,i]);
}
  
str(eda_data)
## 'data.frame':    131337 obs. of  17 variables:
##  $ ID                       : int  1 2 3 5 6 7 8 9 10 11 ...
##  $ Target                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_band                 : chr  "3. 35 to 44" "NULL" "NULL" "NULL" ...
##  $ gender                   : chr  "Male" "NULL" "Male" "NULL" ...
##  $ car_model                : chr  "model_1" "model_2" "model_3" "model_3" ...
##  $ car_segment              : chr  "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
##  $ age_of_vehicle_years     : int  9 6 9 5 8 7 8 7 1 3 ...
##  $ sched_serv_warr          : int  2 10 10 8 9 4 2 4 2 1 ...
##  $ non_sched_serv_warr      : int  10 3 9 5 4 10 8 9 1 1 ...
##  $ sched_serv_paid          : int  3 10 10 8 10 5 2 6 1 2 ...
##  $ non_sched_serv_paid      : int  7 4 9 4 7 7 9 9 3 1 ...
##  $ total_paid_services      : int  5 9 10 5 9 6 9 8 1 2 ...
##  $ total_services           : int  6 10 10 6 8 8 4 6 2 1 ...
##  $ mth_since_last_serv      : int  9 6 7 4 5 8 7 9 1 1 ...
##  $ annualised_mileage       : int  8 10 10 10 4 5 6 5 1 1 ...
##  $ num_dealers_visited      : int  10 7 6 9 4 10 10 5 2 1 ...
##  $ num_serv_dealer_purchased: int  4 10 10 7 9 4 4 8 3 1 ...
#Find nulls in the data to work out which columns have a lot of empty records
total_nulls <- data.frame(type = names(eda_data[, -1]), 
                    nulls = colSums(eda_data[, -1] == "NULL"),
                    values = colSums(eda_data[, -1] != "NULL")
                    )

total_nulls <- melt(total_nulls, id.var="type")

# We can see from this that 'age_band' has very few values across the training set
# We can also see that gender only has value in about 50% of rows
# Neither is likely very usable for our model
ggplot(total_nulls, aes(x = type, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  coord_flip()

#match to the data dictionary on the assignment sheet (note something about transforming variables)

#already done in data
#Let's look at some characteristics of the cars to see if we can detect any patterns

#First write a function that will take a characteristic and return a data frame that gives us the count of entries with target = 1 and target = 0

get_counts <- function (attribute_df, count_df) {

  output = data.frame(matrix(ncol = 3, nrow = 0), stringsAsFactors = FALSE)
  attribute_name = colnames(attribute_df)[1]
  names(output) = c(attribute_name, "target", "non_target")
  
  for(i in 1:nrow(attribute_df)) {
    my_attribute = as.character(attribute_df[i, attribute_name])

    my_target_count = count_df %>%
        filter(Target == 1) %>%
        filter(.[,2] == my_attribute)
    my_no_target_count = count_df %>%
        filter(Target == 0) %>%
        filter(.[,2] == my_attribute)
    
    if(nrow(my_target_count) > 0) {
      target_count = as.character(my_target_count$count)
    } else {
      target_count = "0"
    }
      
    if(nrow(my_no_target_count) > 0) {
      no_target_count = as.character(my_no_target_count$count)
    } else {
      no_target_count = "0"
    }
    
    #Can't divide by zero!
   # ratio <- ifelse(no_target_count > 0, as.integer(target_count)/as.integer(no_target_count), 0)
    
    temp <- data.frame(my_attribute, target_count, no_target_count)
    names(temp) <- c(attribute_name, "target", "non_target")
    
    output = rbind(output, temp)
    output[,1] <- as.character(output[,1])
    output[,2] <- as.integer(output[,2])
    output[,3] <- as.integer(output[,3])
    
  }
  
  return(output)
}
#get data for car models

car_models <- eda_data %>%
    group_by(car_model) %>%
    summarize(count = n())

car_models_counts <- eda_data %>%
    group_by(Target, car_model) %>%
    summarize(count = n())

car_model_data <- get_counts(car_models, car_models_counts)

car_model_data <- melt(car_model_data, id.var="car_model")

ggplot(car_model_data, aes(x = car_model, y = value, fill = variable)) +
  geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

car_segments <- eda_data %>%
    group_by(car_segment) %>%
    summarize(count = n())

car_segments_counts <- eda_data %>%
    group_by(Target, car_segment) %>%
    summarize(count = n())

car_segment_data <- get_counts(car_segments, car_segments_counts)

car_segment_data <- melt(car_segment_data, id.var="car_segment")

ggplot(car_segment_data, aes(x = car_segment, y = value, fill = variable)) +
  geom_bar(stat = "identity")

age_of_vehicle_years <- eda_data %>%
   group_by(age_of_vehicle_years) %>%
   summarize(count = n())

age_of_vehicle_years_counts <- eda_data %>%
  group_by(Target, age_of_vehicle_years) %>%
  summarize(count = n())

age_of_vehicle_years_data <- get_counts(age_of_vehicle_years, age_of_vehicle_years_counts)

age_of_vehicle_years_data <- melt(age_of_vehicle_years_data, id.var="age_of_vehicle_years")

ggplot(age_of_vehicle_years_data, aes(x = age_of_vehicle_years, y = value, fill = variable)) +
  geom_bar(stat = "identity")

comments and chrts

#look at some service history characteristics of the cars to see if we can detect any patterns
sched_serv_warr_values <- eda_data %>%
  group_by(sched_serv_warr) %>%
    summarize(count = n())

sched_serv_warr_counts <- eda_data %>%
  group_by(Target, sched_serv_warr) %>%
  summarize(count = n())

sched_serv_warr_data <- get_counts(sched_serv_warr_values, sched_serv_warr_counts)

sched_serv_warr_data <- melt(sched_serv_warr_data, id.var="sched_serv_warr")

ggplot(sched_serv_warr_data, aes(x = sched_serv_warr, y = value, fill = variable)) +
  geom_bar(stat = "identity")

Experiment to build a linear classification model

Run a linear model with the training data with an experimental classification target, trying to find the right target/predictor variable.

test_data <- repurchase_training

str(test_data)
## 'data.frame':    131337 obs. of  17 variables:
##  $ ID                       : int  1 2 3 5 6 7 8 9 10 11 ...
##  $ Target                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_band                 : Factor w/ 8 levels "1. <25","2. 25 to 34",..: 3 8 8 8 8 8 1 8 8 8 ...
##  $ gender                   : Factor w/ 3 levels "Female","Male",..: 2 3 2 3 1 2 2 2 3 3 ...
##  $ car_model                : Factor w/ 19 levels "model_1","model_10",..: 1 12 13 13 12 15 13 16 14 14 ...
##  $ car_segment              : Factor w/ 4 levels "Large/SUV","LCV",..: 2 4 1 1 4 1 1 4 4 4 ...
##  $ age_of_vehicle_years     : Factor w/ 10 levels "1","2","3","4",..: 9 6 9 5 8 7 8 7 1 3 ...
##  $ sched_serv_warr          : Factor w/ 10 levels "1","2","3","4",..: 2 10 10 8 9 4 2 4 2 1 ...
##  $ non_sched_serv_warr      : Factor w/ 10 levels "1","2","3","4",..: 10 3 9 5 4 10 8 9 1 1 ...
##  $ sched_serv_paid          : Factor w/ 10 levels "1","2","3","4",..: 3 10 10 8 10 5 2 6 1 2 ...
##  $ non_sched_serv_paid      : Factor w/ 10 levels "1","2","3","4",..: 7 4 9 4 7 7 9 9 3 1 ...
##  $ total_paid_services      : Factor w/ 10 levels "1","2","3","4",..: 5 9 10 5 9 6 9 8 1 2 ...
##  $ total_services           : Factor w/ 10 levels "1","2","3","4",..: 6 10 10 6 8 8 4 6 2 1 ...
##  $ mth_since_last_serv      : Factor w/ 10 levels "1","2","3","4",..: 9 6 7 4 5 8 7 9 1 1 ...
##  $ annualised_mileage       : Factor w/ 10 levels "1","2","3","4",..: 8 10 10 10 4 5 6 5 1 1 ...
##  $ num_dealers_visited      : Factor w/ 10 levels "1","2","3","4",..: 10 7 6 9 4 10 10 5 2 1 ...
##  $ num_serv_dealer_purchased: Factor w/ 10 levels "1","2","3","4",..: 4 10 10 7 9 4 4 8 3 1 ...
#We run a linear regression model on our testset
glm0 = glm(formula = Target ~ ., family = binomial, data = test_data[,-1])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = test_data[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4003  -0.0890  -0.0046  -0.0001   5.8607  
## 
## Coefficients: (2 not defined because of singularities)
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -7.726e+00  7.512e-01 -10.285  < 2e-16 ***
## age_band2. 25 to 34          1.255e+00  7.766e-01   1.616 0.106007    
## age_band3. 35 to 44          1.100e+00  7.667e-01   1.435 0.151200    
## age_band4. 45 to 54          1.402e+00  7.581e-01   1.850 0.064299 .  
## age_band5. 55 to 64          1.583e+00  7.593e-01   2.084 0.037132 *  
## age_band6. 65 to 74          1.053e+00  7.990e-01   1.318 0.187396    
## age_band7. 75+               1.064e+00  8.666e-01   1.228 0.219413    
## age_bandNULL                 1.329e+00  7.317e-01   1.817 0.069284 .  
## genderMale                   4.309e-01  8.983e-02   4.797 1.61e-06 ***
## genderNULL                  -7.974e-01  8.746e-02  -9.117  < 2e-16 ***
## car_modelmodel_10           -7.750e-01  1.929e-01  -4.018 5.87e-05 ***
## car_modelmodel_11           -3.269e+00  3.840e-01  -8.513  < 2e-16 ***
## car_modelmodel_12           -1.327e+00  4.914e-01  -2.701 0.006922 ** 
## car_modelmodel_13            1.700e+00  2.690e-01   6.320 2.62e-10 ***
## car_modelmodel_14           -1.955e+01  2.622e+03  -0.007 0.994050    
## car_modelmodel_15            2.198e+00  6.185e-01   3.554 0.000379 ***
## car_modelmodel_16            2.016e+00  8.372e-01   2.408 0.016026 *  
## car_modelmodel_17           -2.162e+00  1.044e+00  -2.071 0.038383 *  
## car_modelmodel_18           -2.265e+00  5.384e-01  -4.206 2.60e-05 ***
## car_modelmodel_19           -1.674e+01  2.061e+04  -0.001 0.999352    
## car_modelmodel_2             6.811e-01  9.059e-02   7.518 5.56e-14 ***
## car_modelmodel_3             8.245e-01  9.358e-02   8.811  < 2e-16 ***
## car_modelmodel_4             6.181e-01  1.102e-01   5.609 2.04e-08 ***
## car_modelmodel_5             3.059e-01  9.383e-02   3.260 0.001115 ** 
## car_modelmodel_6             3.392e-01  2.013e-01   1.685 0.092072 .  
## car_modelmodel_7             9.602e-01  1.133e-01   8.475  < 2e-16 ***
## car_modelmodel_8             9.486e-01  1.280e-01   7.413 1.23e-13 ***
## car_modelmodel_9             2.963e-01  2.780e-01   1.066 0.286367    
## car_segmentLCV                      NA         NA      NA       NA    
## car_segmentOther             2.146e+00  1.478e+00   1.452 0.146404    
## car_segmentSmall/Medium             NA         NA      NA       NA    
## age_of_vehicle_years2        4.702e-01  9.139e-02   5.145 2.68e-07 ***
## age_of_vehicle_years3        9.684e-01  9.318e-02  10.392  < 2e-16 ***
## age_of_vehicle_years4        2.657e+00  1.063e-01  24.997  < 2e-16 ***
## age_of_vehicle_years5        2.524e+00  1.135e-01  22.242  < 2e-16 ***
## age_of_vehicle_years6        2.167e+00  1.310e-01  16.551  < 2e-16 ***
## age_of_vehicle_years7        9.812e-01  1.706e-01   5.752 8.82e-09 ***
## age_of_vehicle_years8        1.491e-01  2.310e-01   0.646 0.518498    
## age_of_vehicle_years9       -1.078e+00  4.178e-01  -2.581 0.009851 ** 
## age_of_vehicle_years10      -2.997e+00  1.014e+00  -2.957 0.003109 ** 
## sched_serv_warr2             3.290e-01  7.192e-02   4.574 4.77e-06 ***
## sched_serv_warr3            -2.419e-01  8.739e-02  -2.768 0.005634 ** 
## sched_serv_warr4            -1.027e+00  1.114e-01  -9.221  < 2e-16 ***
## sched_serv_warr5            -2.038e+00  1.565e-01 -13.019  < 2e-16 ***
## sched_serv_warr6            -3.367e+00  2.511e-01 -13.409  < 2e-16 ***
## sched_serv_warr7            -6.594e+00  1.028e+00  -6.415 1.41e-10 ***
## sched_serv_warr8            -5.819e+00  7.528e-01  -7.730 1.07e-14 ***
## sched_serv_warr9            -1.796e+01  1.869e+02  -0.096 0.923476    
## sched_serv_warr10           -5.570e+00  9.064e-01  -6.145 7.99e-10 ***
## non_sched_serv_warr2         6.563e-01  9.905e-02   6.626 3.46e-11 ***
## non_sched_serv_warr3         7.255e-01  1.056e-01   6.871 6.36e-12 ***
## non_sched_serv_warr4         9.172e-01  1.464e-01   6.265 3.73e-10 ***
## non_sched_serv_warr5         6.880e-01  1.476e-01   4.662 3.13e-06 ***
## non_sched_serv_warr6        -3.175e-01  1.747e-01  -1.817 0.069168 .  
## non_sched_serv_warr7        -5.941e-01  1.873e-01  -3.172 0.001513 ** 
## non_sched_serv_warr8        -1.203e+00  2.284e-01  -5.266 1.39e-07 ***
## non_sched_serv_warr9        -1.771e+00  3.167e-01  -5.593 2.23e-08 ***
## non_sched_serv_warr10       -1.505e+00  5.372e-01  -2.802 0.005082 ** 
## sched_serv_paid2            -2.753e-02  6.552e-02  -0.420 0.674326    
## sched_serv_paid3            -3.627e-01  7.938e-02  -4.569 4.90e-06 ***
## sched_serv_paid4            -1.108e+00  1.096e-01 -10.105  < 2e-16 ***
## sched_serv_paid5            -1.247e+00  1.346e-01  -9.267  < 2e-16 ***
## sched_serv_paid6            -3.059e+00  3.174e-01  -9.637  < 2e-16 ***
## sched_serv_paid7            -2.576e+00  3.519e-01  -7.322 2.44e-13 ***
## sched_serv_paid8            -2.256e+00  4.601e-01  -4.903 9.45e-07 ***
## sched_serv_paid9            -1.374e+00  5.280e-01  -2.602 0.009275 ** 
## sched_serv_paid10           -8.445e-01  5.162e-01  -1.636 0.101881    
## non_sched_serv_paid2        -1.193e-01  8.611e-02  -1.385 0.165904    
## non_sched_serv_paid3        -5.668e-03  9.107e-02  -0.062 0.950380    
## non_sched_serv_paid4        -3.243e-01  1.601e-01  -2.025 0.042870 *  
## non_sched_serv_paid5         1.499e-01  1.805e-01   0.830 0.406382    
## non_sched_serv_paid6         4.923e-01  2.463e-01   1.999 0.045643 *  
## non_sched_serv_paid7         1.246e+00  3.273e-01   3.808 0.000140 ***
## non_sched_serv_paid8         1.260e+00  3.820e-01   3.300 0.000968 ***
## non_sched_serv_paid9         1.989e+00  4.355e-01   4.566 4.96e-06 ***
## non_sched_serv_paid10        2.071e+00  5.445e-01   3.803 0.000143 ***
## total_paid_services2         2.481e-01  7.683e-02   3.229 0.001243 ** 
## total_paid_services3         9.475e-01  1.395e-01   6.794 1.09e-11 ***
## total_paid_services4         4.854e-01  2.214e-01   2.193 0.028343 *  
## total_paid_services5        -1.514e-01  3.131e-01  -0.484 0.628709    
## total_paid_services6        -1.354e-01  3.656e-01  -0.370 0.711137    
## total_paid_services7        -1.609e-01  4.055e-01  -0.397 0.691546    
## total_paid_services8        -5.824e-01  4.561e-01  -1.277 0.201607    
## total_paid_services9        -2.998e-01  5.318e-01  -0.564 0.573006    
## total_paid_services10        1.327e-02  5.681e-01   0.023 0.981371    
## total_services2              6.514e-01  8.411e-02   7.745 9.57e-15 ***
## total_services3              1.034e-01  1.230e-01   0.841 0.400303    
## total_services4             -9.161e-01  1.562e-01  -5.866 4.47e-09 ***
## total_services5             -1.757e+00  2.090e-01  -8.408  < 2e-16 ***
## total_services6             -3.917e+00  3.092e-01 -12.669  < 2e-16 ***
## total_services7             -5.702e+00  4.362e-01 -13.071  < 2e-16 ***
## total_services8             -6.150e+00  5.603e-01 -10.977  < 2e-16 ***
## total_services9             -8.335e+00  7.372e-01 -11.307  < 2e-16 ***
## total_services10            -1.032e+01  9.119e-01 -11.321  < 2e-16 ***
## mth_since_last_serv2         5.506e-01  7.699e-02   7.152 8.56e-13 ***
## mth_since_last_serv3         6.648e-01  1.002e-01   6.638 3.18e-11 ***
## mth_since_last_serv4         5.501e-01  1.023e-01   5.379 7.50e-08 ***
## mth_since_last_serv5        -2.527e-01  1.135e-01  -2.228 0.025908 *  
## mth_since_last_serv6        -1.508e+00  1.462e-01 -10.316  < 2e-16 ***
## mth_since_last_serv7        -3.037e+00  2.177e-01 -13.951  < 2e-16 ***
## mth_since_last_serv8        -7.104e+00  1.011e+00  -7.025 2.13e-12 ***
## mth_since_last_serv9        -5.894e+00  3.314e-01 -17.785  < 2e-16 ***
## mth_since_last_serv10       -1.852e+01  1.998e+02  -0.093 0.926144    
## annualised_mileage2          4.445e-01  8.854e-02   5.020 5.17e-07 ***
## annualised_mileage3          3.364e+00  1.158e-01  29.048  < 2e-16 ***
## annualised_mileage4          3.118e+00  1.172e-01  26.605  < 2e-16 ***
## annualised_mileage5          3.593e+00  1.192e-01  30.135  < 2e-16 ***
## annualised_mileage6          4.370e+00  1.281e-01  34.115  < 2e-16 ***
## annualised_mileage7          3.884e+00  1.405e-01  27.635  < 2e-16 ***
## annualised_mileage8          3.044e+00  1.702e-01  17.887  < 2e-16 ***
## annualised_mileage9          2.926e+00  1.937e-01  15.105  < 2e-16 ***
## annualised_mileage10        -4.396e-02  4.643e-01  -0.095 0.924561    
## num_dealers_visited2         4.617e-01  8.377e-02   5.511 3.57e-08 ***
## num_dealers_visited3        -1.343e-01  1.173e-01  -1.145 0.252149    
## num_dealers_visited4        -6.009e-01  1.528e-01  -3.932 8.41e-05 ***
## num_dealers_visited5        -4.021e-01  1.484e-01  -2.710 0.006735 ** 
## num_dealers_visited6        -5.136e-01  1.470e-01  -3.494 0.000476 ***
## num_dealers_visited7        -2.800e-01  1.451e-01  -1.930 0.053552 .  
## num_dealers_visited8        -4.597e-01  1.501e-01  -3.062 0.002196 ** 
## num_dealers_visited9         1.126e+00  1.691e-01   6.660 2.74e-11 ***
## num_dealers_visited10        1.773e+00  1.743e-01  10.177  < 2e-16 ***
## num_serv_dealer_purchased2   5.115e-01  9.664e-02   5.293 1.20e-07 ***
## num_serv_dealer_purchased3   5.939e-01  9.648e-02   6.155 7.49e-10 ***
## num_serv_dealer_purchased4  -7.218e-01  1.416e-01  -5.098 3.42e-07 ***
## num_serv_dealer_purchased5   2.242e-02  1.389e-01   0.161 0.871776    
## num_serv_dealer_purchased6   1.141e+00  1.468e-01   7.770 7.83e-15 ***
## num_serv_dealer_purchased7   2.658e+00  1.673e-01  15.884  < 2e-16 ***
## num_serv_dealer_purchased8   4.921e+00  2.304e-01  21.356  < 2e-16 ***
## num_serv_dealer_purchased9   5.759e+00  2.875e-01  20.030  < 2e-16 ***
## num_serv_dealer_purchased10  8.333e+00  4.051e-01  20.570  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32432  on 131336  degrees of freedom
## Residual deviance: 12674  on 131209  degrees of freedom
## AIC: 12930
## 
## Number of Fisher Scoring iterations: 20
#It shows the factor data is causing strange things to come out, so we need to clean the data to turn categories into integers
test_data$age_band <- substr(test_data$age_band, 1, 1)
test_data$age_band <- gsub("N", "NULL", test_data$age_band)

test_data$gender <- gsub("Male", "1", test_data$gender)
test_data$gender <- gsub("Female", "2", test_data$gender)

test_data$car_model <- sub('model_', '', test_data$car_model)

test_data$car_segment <- sub("Other", "1", test_data$car_segment)
test_data$car_segment <- sub("Small/Medium", "2", test_data$car_segment)
test_data$car_segment <- sub("Large/SUV", "3", test_data$car_segment)
test_data$car_segment <- sub("LCV", "4", test_data$car_segment)

test_data$age_band <- as.integer(test_data$age_band)
## Warning: NAs introduced by coercion
test_data$gender <- as.integer(test_data$gender)
## Warning: NAs introduced by coercion
test_data$car_model <- as.integer(test_data$car_model)
test_data$car_segment <- as.integer(test_data$car_segment)

#We want to keep our Target as a Factor
test_data[,2] = as.factor(test_data[,2]);

#Remove the columns with high numbers of NAs
test_data <- test_data %>%
    dplyr::select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage, num_dealers_visited, num_serv_dealer_purchased)

str(test_data)
## 'data.frame':    131337 obs. of  15 variables:
##  $ ID                       : int  1 2 3 5 6 7 8 9 10 11 ...
##  $ Target                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ car_model                : int  1 2 3 3 2 5 3 6 4 4 ...
##  $ car_segment              : int  4 2 3 3 2 3 3 2 2 2 ...
##  $ age_of_vehicle_years     : Factor w/ 10 levels "1","2","3","4",..: 9 6 9 5 8 7 8 7 1 3 ...
##  $ sched_serv_warr          : Factor w/ 10 levels "1","2","3","4",..: 2 10 10 8 9 4 2 4 2 1 ...
##  $ non_sched_serv_warr      : Factor w/ 10 levels "1","2","3","4",..: 10 3 9 5 4 10 8 9 1 1 ...
##  $ sched_serv_paid          : Factor w/ 10 levels "1","2","3","4",..: 3 10 10 8 10 5 2 6 1 2 ...
##  $ non_sched_serv_paid      : Factor w/ 10 levels "1","2","3","4",..: 7 4 9 4 7 7 9 9 3 1 ...
##  $ total_paid_services      : Factor w/ 10 levels "1","2","3","4",..: 5 9 10 5 9 6 9 8 1 2 ...
##  $ total_services           : Factor w/ 10 levels "1","2","3","4",..: 6 10 10 6 8 8 4 6 2 1 ...
##  $ mth_since_last_serv      : Factor w/ 10 levels "1","2","3","4",..: 9 6 7 4 5 8 7 9 1 1 ...
##  $ annualised_mileage       : Factor w/ 10 levels "1","2","3","4",..: 8 10 10 10 4 5 6 5 1 1 ...
##  $ num_dealers_visited      : Factor w/ 10 levels "1","2","3","4",..: 10 7 6 9 4 10 10 5 2 1 ...
##  $ num_serv_dealer_purchased: Factor w/ 10 levels "1","2","3","4",..: 4 10 10 7 9 4 4 8 3 1 ...
# create training and test sets
## 70% of the test_data, use floor to round down to nearest integer
trainset_size <- floor(0.70 * nrow(test_data))
set.seed(42)

trainset_indices <- sample(seq_len(nrow(test_data)), size = trainset_size)

trainset <- test_data[trainset_indices, ]
testset <- test_data[-trainset_indices, ]

#rowcounts to check
nrow(trainset)
## [1] 91935
nrow(testset)
## [1] 39402
nrow(test_data)
## [1] 131337
#So try our linear regression model on our test set
glm_model = glm(formula = Target ~ ., family = binomial, data = testset[,-1])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#There's a couple of obvious ones we could remove (e.g. age_of_vehicle_years, non_sched_serv_warr), but some others are questionable (e.g. total_paid_services, car_segment, car_model)
summary(glm_model)
## 
## Call:
## glm(formula = Target ~ ., family = binomial, data = testset[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1140  -0.1122  -0.0007   0.0000   4.6885  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -5.321785   0.301334 -17.661  < 2e-16 ***
## car_model                    -0.063753   0.015277  -4.173 3.00e-05 ***
## car_segment                  -0.008342   0.060282  -0.138 0.889943    
## age_of_vehicle_years2         0.247220   0.160288   1.542 0.122990    
## age_of_vehicle_years3         0.927293   0.159538   5.812 6.16e-09 ***
## age_of_vehicle_years4         2.396002   0.181748  13.183  < 2e-16 ***
## age_of_vehicle_years5         2.665773   0.193280  13.792  < 2e-16 ***
## age_of_vehicle_years6         2.193909   0.229713   9.551  < 2e-16 ***
## age_of_vehicle_years7         1.568182   0.289025   5.426 5.77e-08 ***
## age_of_vehicle_years8         0.312768   0.471324   0.664 0.506950    
## age_of_vehicle_years9        -0.056806   0.639165  -0.089 0.929181    
## age_of_vehicle_years10      -14.521471 530.135033  -0.027 0.978147    
## sched_serv_warr2              0.526864   0.126651   4.160 3.18e-05 ***
## sched_serv_warr3             -0.229156   0.157099  -1.459 0.144656    
## sched_serv_warr4             -1.037305   0.198200  -5.234 1.66e-07 ***
## sched_serv_warr5             -1.801043   0.267927  -6.722 1.79e-11 ***
## sched_serv_warr6             -3.293091   0.442662  -7.439 1.01e-13 ***
## sched_serv_warr7            -19.183103 590.054961  -0.033 0.974065    
## sched_serv_warr8             -4.729599   0.835025  -5.664 1.48e-08 ***
## sched_serv_warr9            -19.307675 546.038284  -0.035 0.971793    
## sched_serv_warr10           -19.522822 475.318915  -0.041 0.967238    
## non_sched_serv_warr2          0.467764   0.169838   2.754 0.005884 ** 
## non_sched_serv_warr3          0.582451   0.180449   3.228 0.001248 ** 
## non_sched_serv_warr4          0.759299   0.257715   2.946 0.003216 ** 
## non_sched_serv_warr5          0.512283   0.257373   1.990 0.046543 *  
## non_sched_serv_warr6         -0.271581   0.303509  -0.895 0.370893    
## non_sched_serv_warr7         -0.697191   0.326683  -2.134 0.032830 *  
## non_sched_serv_warr8         -1.031068   0.392659  -2.626 0.008643 ** 
## non_sched_serv_warr9         -1.918605   0.548952  -3.495 0.000474 ***
## non_sched_serv_warr10        -3.020074   0.934314  -3.232 0.001228 ** 
## sched_serv_paid2             -0.164578   0.115568  -1.424 0.154424    
## sched_serv_paid3             -0.330871   0.137471  -2.407 0.016091 *  
## sched_serv_paid4             -1.181155   0.189317  -6.239 4.40e-10 ***
## sched_serv_paid5             -1.403904   0.237000  -5.924 3.15e-09 ***
## sched_serv_paid6             -3.213430   0.566104  -5.676 1.38e-08 ***
## sched_serv_paid7             -2.664652   0.644345  -4.135 3.54e-05 ***
## sched_serv_paid8             -2.274661   0.802060  -2.836 0.004568 ** 
## sched_serv_paid9            -15.661813 496.513323  -0.032 0.974836    
## sched_serv_paid10            -2.318917   1.117235  -2.076 0.037932 *  
## non_sched_serv_paid2         -0.206021   0.149160  -1.381 0.167214    
## non_sched_serv_paid3         -0.101047   0.157342  -0.642 0.520736    
## non_sched_serv_paid4         -0.786024   0.282492  -2.782 0.005395 ** 
## non_sched_serv_paid5         -0.428696   0.315385  -1.359 0.174058    
## non_sched_serv_paid6         -0.089068   0.429123  -0.208 0.835574    
## non_sched_serv_paid7          0.297113   0.583934   0.509 0.610883    
## non_sched_serv_paid8          0.245353   0.697787   0.352 0.725126    
## non_sched_serv_paid9          1.094894   0.802217   1.365 0.172305    
## non_sched_serv_paid10         1.315141   0.980104   1.342 0.179648    
## total_paid_services2          0.021461   0.134726   0.159 0.873436    
## total_paid_services3          0.850394   0.240581   3.535 0.000408 ***
## total_paid_services4          0.526992   0.387381   1.360 0.173705    
## total_paid_services5          0.038755   0.550725   0.070 0.943899    
## total_paid_services6          0.444404   0.664168   0.669 0.503423    
## total_paid_services7          0.054248   0.748411   0.072 0.942217    
## total_paid_services8         -0.717669   0.844431  -0.850 0.395389    
## total_paid_services9         -0.250286   0.955115  -0.262 0.793285    
## total_paid_services10        -0.015846   1.014287  -0.016 0.987536    
## total_services2               0.581132   0.146595   3.964 7.36e-05 ***
## total_services3               0.279494   0.213710   1.308 0.190934    
## total_services4              -0.670212   0.275198  -2.435 0.014876 *  
## total_services5              -1.615524   0.371538  -4.348 1.37e-05 ***
## total_services6              -3.020715   0.504344  -5.989 2.11e-09 ***
## total_services7              -4.218894   0.674028  -6.259 3.87e-10 ***
## total_services8              -4.812766   0.905330  -5.316 1.06e-07 ***
## total_services9              -3.971067   1.088167  -3.649 0.000263 ***
## total_services10            -20.370131 480.018291  -0.042 0.966151    
## mth_since_last_serv2          0.506743   0.136416   3.715 0.000203 ***
## mth_since_last_serv3          0.344824   0.175412   1.966 0.049323 *  
## mth_since_last_serv4          0.260285   0.180091   1.445 0.148375    
## mth_since_last_serv5         -0.458696   0.198843  -2.307 0.021065 *  
## mth_since_last_serv6         -1.786010   0.251780  -7.094 1.31e-12 ***
## mth_since_last_serv7         -2.935764   0.331817  -8.848  < 2e-16 ***
## mth_since_last_serv8         -5.894079   1.023980  -5.756 8.61e-09 ***
## mth_since_last_serv9         -4.544963   0.456044  -9.966  < 2e-16 ***
## mth_since_last_serv10       -19.337687 618.884505  -0.031 0.975073    
## annualised_mileage2           0.366010   0.158755   2.305 0.021139 *  
## annualised_mileage3           3.649939   0.205651  17.748  < 2e-16 ***
## annualised_mileage4           3.408541   0.209283  16.287  < 2e-16 ***
## annualised_mileage5           3.718963   0.213346  17.432  < 2e-16 ***
## annualised_mileage6           4.303324   0.233809  18.405  < 2e-16 ***
## annualised_mileage7           4.039185   0.248380  16.262  < 2e-16 ***
## annualised_mileage8           3.272838   0.300098  10.906  < 2e-16 ***
## annualised_mileage9           2.741438   0.368559   7.438 1.02e-13 ***
## annualised_mileage10         -0.036174   0.869773  -0.042 0.966826    
## num_dealers_visited2          0.409238   0.146438   2.795 0.005196 ** 
## num_dealers_visited3         -0.372042   0.209482  -1.776 0.075731 .  
## num_dealers_visited4         -1.084158   0.267515  -4.053 5.06e-05 ***
## num_dealers_visited5         -0.780010   0.256176  -3.045 0.002328 ** 
## num_dealers_visited6         -1.008538   0.254995  -3.955 7.65e-05 ***
## num_dealers_visited7         -0.782516   0.253863  -3.082 0.002053 ** 
## num_dealers_visited8         -0.822125   0.258554  -3.180 0.001474 ** 
## num_dealers_visited9          0.587129   0.306407   1.916 0.055343 .  
## num_dealers_visited10         1.178422   0.306919   3.840 0.000123 ***
## num_serv_dealer_purchased2    0.401614   0.167702   2.395 0.016629 *  
## num_serv_dealer_purchased3    0.456330   0.167702   2.721 0.006507 ** 
## num_serv_dealer_purchased4   -0.452801   0.242552  -1.867 0.061926 .  
## num_serv_dealer_purchased5    0.043803   0.243425   0.180 0.857197    
## num_serv_dealer_purchased6    1.185001   0.257882   4.595 4.32e-06 ***
## num_serv_dealer_purchased7    2.928572   0.295746   9.902  < 2e-16 ***
## num_serv_dealer_purchased8    4.678391   0.391916  11.937  < 2e-16 ***
## num_serv_dealer_purchased9    5.537041   0.495821  11.167  < 2e-16 ***
## num_serv_dealer_purchased10   7.497139   0.588519  12.739  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9813.5  on 39401  degrees of freedom
## Residual deviance: 4179.7  on 39300  degrees of freedom
## AIC: 4383.7
## 
## Number of Fisher Scoring iterations: 21
#predict probabilities on testset
glm_prob <- predict.glm(glm_model,testset[,-1],type="response")
# First create vector to hold predictions
glm_predict <- rep(0,nrow(testset[,-1]))
glm_predict[glm_prob>.5] <- 1

table(pred=glm_predict,true=testset$Target)
##     true
## pred     0     1
##    0 38227   534
##    1   107   534
#accuracy
mean(glm_predict==testset$Target)
## [1] 0.9837318
#convert training data to matrix format
x <- model.matrix(Target~.,trainset[,-1])
y <- trainset$Target

#perform grid search to find optimal value of lambda
#family= binomial => logistic regression, alpha=1 => lasso
# check docs to explore other type.measure options
cv.out <- cv.glmnet(x, y, alpha=1, family="binomial", type.measure="mse")

plot(cv.out)

#min value of lambda
lambda_min <- cv.out$lambda.min
#best value of lambda
lambda_1se <- cv.out$lambda.1se
#regression coefficients
coef(cv.out,s=lambda_1se)
## 103 x 1 sparse Matrix of class "dgCMatrix"
##                                        1
## (Intercept)                 -5.345592622
## (Intercept)                  .          
## car_model                   -0.058948339
## car_segment                  .          
## age_of_vehicle_years2        0.113174280
## age_of_vehicle_years3        0.465972703
## age_of_vehicle_years4        1.982647880
## age_of_vehicle_years5        1.900922269
## age_of_vehicle_years6        1.864416135
## age_of_vehicle_years7        0.796137241
## age_of_vehicle_years8        0.147472267
## age_of_vehicle_years9       -0.496992950
## age_of_vehicle_years10      -1.094971035
## sched_serv_warr2             0.198711635
## sched_serv_warr3            -0.209692514
## sched_serv_warr4            -0.861496879
## sched_serv_warr5            -1.947124345
## sched_serv_warr6            -3.187103741
## sched_serv_warr7            -4.731387368
## sched_serv_warr8            -4.986479158
## sched_serv_warr9            -4.980822455
## sched_serv_warr10           -4.096561938
## non_sched_serv_warr2         0.611885011
## non_sched_serv_warr3         0.742175407
## non_sched_serv_warr4         1.072448240
## non_sched_serv_warr5         0.879118154
## non_sched_serv_warr6        -0.058358943
## non_sched_serv_warr7        -0.148736889
## non_sched_serv_warr8        -0.726104878
## non_sched_serv_warr9        -1.109117349
## non_sched_serv_warr10       -0.717253659
## sched_serv_paid2             0.054981206
## sched_serv_paid3            -0.297471935
## sched_serv_paid4            -0.917507163
## sched_serv_paid5            -0.993139430
## sched_serv_paid6            -2.609431818
## sched_serv_paid7            -2.159949461
## sched_serv_paid8            -1.878150528
## sched_serv_paid9            -0.714028780
## sched_serv_paid10            .          
## non_sched_serv_paid2        -0.093752966
## non_sched_serv_paid3        -0.004517833
## non_sched_serv_paid4        -0.245894861
## non_sched_serv_paid5         .          
## non_sched_serv_paid6         0.247488348
## non_sched_serv_paid7         0.817697734
## non_sched_serv_paid8         0.804677871
## non_sched_serv_paid9         1.174678161
## non_sched_serv_paid10        1.370054653
## total_paid_services2         0.208031703
## total_paid_services3         0.685555246
## total_paid_services4         0.355041221
## total_paid_services5         .          
## total_paid_services6         .          
## total_paid_services7         0.191942591
## total_paid_services8         .          
## total_paid_services9         .          
## total_paid_services10        0.140322243
## total_services2              0.677822549
## total_services3              0.078310155
## total_services4             -0.824942151
## total_services5             -1.601261636
## total_services6             -3.756243813
## total_services7             -5.362748620
## total_services8             -5.380509495
## total_services9             -7.761052222
## total_services10            -8.002797965
## mth_since_last_serv2         0.584380574
## mth_since_last_serv3         0.782769036
## mth_since_last_serv4         0.639813072
## mth_since_last_serv5        -0.192871553
## mth_since_last_serv6        -1.325753463
## mth_since_last_serv7        -2.815547367
## mth_since_last_serv8        -5.324625410
## mth_since_last_serv9        -4.747938156
## mth_since_last_serv10       -4.967446669
## annualised_mileage2          0.237736757
## annualised_mileage3          2.824415505
## annualised_mileage4          2.532436233
## annualised_mileage5          3.039364647
## annualised_mileage6          3.847225225
## annualised_mileage7          3.238486585
## annualised_mileage8          2.525621285
## annualised_mileage9          2.470704690
## annualised_mileage10         .          
## num_dealers_visited2         0.444414874
## num_dealers_visited3        -0.195623104
## num_dealers_visited4        -0.591550042
## num_dealers_visited5        -0.513379859
## num_dealers_visited6        -0.536408497
## num_dealers_visited7        -0.385058479
## num_dealers_visited8        -0.577726321
## num_dealers_visited9         0.950781961
## num_dealers_visited10        1.475032801
## num_serv_dealer_purchased2   0.365083714
## num_serv_dealer_purchased3   0.396784076
## num_serv_dealer_purchased4  -0.834991547
## num_serv_dealer_purchased5  -0.032191729
## num_serv_dealer_purchased6   0.951202576
## num_serv_dealer_purchased7   2.250061592
## num_serv_dealer_purchased8   4.309230917
## num_serv_dealer_purchased9   4.822079890
## num_serv_dealer_purchased10  6.634483004
#get test data
x_test <- model.matrix(Target~.,testset[,-1])
#predict class, type=”class”
lasso_prob <- predict(cv.out,newx = x_test,s=lambda_1se,type="response")
#translate probabilities to predictions
lasso_predict <- rep(0,nrow(testset))
lasso_predict[lasso_prob>.5] <- 1
#confusion matrix
table(pred=lasso_predict,true=testset$Target)
##     true
## pred     0     1
##    0 38237   581
##    1    97   487
#accuracy
mean(lasso_predict==testset$Target)
## [1] 0.9827928