Objectives

  1. Develop and deploy a slassification model an a product purchase data set.
  2. End to end analysis using R
  3. Learn the caret package for ML
  4. Learn to present the case using Rmarkdown

Read in the dataset

############################################################################################################
# Check that required libraries are present, otherwise install them
############################################################################################################

list_of_packages <- c("tidyverse", "dplyr","ggplot2", "caret", "lattice")
new_packages <- list_of_packages[!(list_of_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

############################################################################################################
# Load required libraries
############################################################################################################

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(lattice) #Caret dependent
library(rpart)
library(modelr)
library(purrr)

# Ask the user to nominate a location for the forecast files to be produced
############################################################################################################

setwd("~/Documents/36106/Assignment_1/36106AT1")

############################################################################################################

## Import training set. 

train_set <- read_csv("repurchase_training.csv")
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   Target = col_integer(),
##   age_band = col_character(),
##   gender = col_character(),
##   car_model = col_character(),
##   car_segment = col_character(),
##   age_of_vehicle_years = col_integer(),
##   sched_serv_warr = col_integer(),
##   non_sched_serv_warr = col_integer(),
##   sched_serv_paid = col_integer(),
##   non_sched_serv_paid = col_integer(),
##   total_paid_services = col_integer(),
##   total_services = col_integer(),
##   mth_since_last_serv = col_integer(),
##   annualised_mileage = col_integer(),
##   num_dealers_visited = col_integer(),
##   num_serv_dealer_purchased = col_integer()
## )
'131,337 observations over 17 variables'
## [1] "131,337 observations over 17 variables"
str(train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame':    131337 obs. of  17 variables:
##  $ ID                       : int  1 2 3 5 6 7 8 9 10 11 ...
##  $ Target                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_band                 : chr  "3. 35 to 44" "NULL" "NULL" "NULL" ...
##  $ gender                   : chr  "Male" "NULL" "Male" "NULL" ...
##  $ car_model                : chr  "model_1" "model_2" "model_3" "model_3" ...
##  $ car_segment              : chr  "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
##  $ age_of_vehicle_years     : int  9 6 9 5 8 7 8 7 1 3 ...
##  $ sched_serv_warr          : int  2 10 10 8 9 4 2 4 2 1 ...
##  $ non_sched_serv_warr      : int  10 3 9 5 4 10 8 9 1 1 ...
##  $ sched_serv_paid          : int  3 10 10 8 10 5 2 6 1 2 ...
##  $ non_sched_serv_paid      : int  7 4 9 4 7 7 9 9 3 1 ...
##  $ total_paid_services      : int  5 9 10 5 9 6 9 8 1 2 ...
##  $ total_services           : int  6 10 10 6 8 8 4 6 2 1 ...
##  $ mth_since_last_serv      : int  9 6 7 4 5 8 7 9 1 1 ...
##  $ annualised_mileage       : int  8 10 10 10 4 5 6 5 1 1 ...
##  $ num_dealers_visited      : int  10 7 6 9 4 10 10 5 2 1 ...
##  $ num_serv_dealer_purchased: int  4 10 10 7 9 4 4 8 3 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 17
##   .. ..$ ID                       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Target                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ age_band                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ gender                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ car_model                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ car_segment              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ age_of_vehicle_years     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ sched_serv_warr          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ non_sched_serv_warr      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ sched_serv_paid          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ non_sched_serv_paid      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ total_paid_services      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ total_services           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ mth_since_last_serv      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ annualised_mileage       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ num_dealers_visited      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ num_serv_dealer_purchased: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
train_set$age_band <- as.factor(train_set$age_band)
levels(train_set$age_band)
## [1] "1. <25"      "2. 25 to 34" "3. 35 to 44" "4. 45 to 54" "5. 55 to 64"
## [6] "6. 65 to 74" "7. 75+"      "NULL"
train_set$gender <- as.factor(train_set$gender)
levels(train_set$gender)
## [1] "Female" "Male"   "NULL"
#order the factors correctly.
train_set$car_model <- as.factor(train_set$car_model)
train_set$car_model <- factor(train_set$car_model,
                              levels = c("model_1", "model_2", "model_3", "model_4", "model_5", "model_6", "model_7", "model_8", "model_9", "model_10", "model_11", "model_12", "model_13", "model_14", "model_15", "model_16", "model_17", "model_18"))
levels(train_set$car_model)
##  [1] "model_1"  "model_2"  "model_3"  "model_4"  "model_5"  "model_6" 
##  [7] "model_7"  "model_8"  "model_9"  "model_10" "model_11" "model_12"
## [13] "model_13" "model_14" "model_15" "model_16" "model_17" "model_18"
train_set$car_segment <- as.factor(train_set$car_segment)
train_set$car_segment <- factor(train_set$car_segment,
                                levels = c("Small/Medium", "Large/SUV", "LCV", "Other"))
levels(train_set$car_segment)
## [1] "Small/Medium" "Large/SUV"    "LCV"          "Other"

Missing Values Analysis

From the summary we can see that the age_band has 112,375 NULL’s, Gender has 69,308 NULL’s and car_model has 2 NA’s.

Quick investigation of missing values can be done using the complete.cases(), and more thorough graphical summary can be done using Amelia.

### Missing Values Analysis

summary(train_set)
##        ID             Target               age_band         gender     
##  Min.   :     1   Min.   :0.00000   NULL       :112375   Female:25957  
##  1st Qu.: 38563   1st Qu.:0.00000   4. 45 to 54:  4058   Male  :36072  
##  Median : 77132   Median :0.00000   3. 35 to 44:  3833   NULL  :69308  
##  Mean   : 77097   Mean   :0.02681   2. 25 to 34:  3548                 
##  3rd Qu.:115668   3rd Qu.:0.00000   5. 55 to 64:  3397                 
##  Max.   :154139   Max.   :1.00000   6. 65 to 74:  2140                 
##                                     (Other)    :  1986                 
##    car_model           car_segment    age_of_vehicle_years
##  model_2:34491   Small/Medium:54553   Min.   : 1.000      
##  model_5:24674   Large/SUV   :52120   1st Qu.: 3.000      
##  model_3:17074   LCV         :24606   Median : 5.000      
##  model_1:15331   Other       :   58   Mean   : 5.493      
##  model_4:15155                        3rd Qu.: 8.000      
##  (Other):24610                        Max.   :10.000      
##  NA's   :    2                                            
##  sched_serv_warr  non_sched_serv_warr sched_serv_paid  non_sched_serv_paid
##  Min.   : 1.000   Min.   : 1.000      Min.   : 1.000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.000     
##  Median : 5.000   Median : 5.000      Median : 5.000   Median : 5.000     
##  Mean   : 5.452   Mean   : 5.473      Mean   : 5.452   Mean   : 5.497     
##  3rd Qu.: 8.000   3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :10.000      Max.   :10.000   Max.   :10.000     
##                                                                           
##  total_paid_services total_services   mth_since_last_serv
##  Min.   : 1.000      Min.   : 1.000   Min.   : 1.00      
##  1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.00      
##  Median : 5.000      Median : 5.000   Median : 5.00      
##  Mean   : 5.482      Mean   : 5.455   Mean   : 5.47      
##  3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.00      
##  Max.   :10.000      Max.   :10.000   Max.   :10.00      
##                                                          
##  annualised_mileage num_dealers_visited num_serv_dealer_purchased
##  Min.   : 1.000     Min.   : 1.000      Min.   : 1.000           
##  1st Qu.: 3.000     1st Qu.: 3.000      1st Qu.: 3.000           
##  Median : 5.000     Median : 5.000      Median : 5.000           
##  Mean   : 5.503     Mean   : 5.485      Mean   : 5.481           
##  3rd Qu.: 8.000     3rd Qu.: 8.000      3rd Qu.: 8.000           
##  Max.   :10.000     Max.   :10.000      Max.   :10.000           
## 
#Missing cases NA:
map_int(train_set,~sum(is.na(.x)))
##                        ID                    Target 
##                         0                         0 
##                  age_band                    gender 
##                         0                         0 
##                 car_model               car_segment 
##                         2                         0 
##      age_of_vehicle_years           sched_serv_warr 
##                         0                         0 
##       non_sched_serv_warr           sched_serv_paid 
##                         0                         0 
##       non_sched_serv_paid       total_paid_services 
##                         0                         0 
##            total_services       mth_since_last_serv 
##                         0                         0 
##        annualised_mileage       num_dealers_visited 
##                         0                         0 
## num_serv_dealer_purchased 
##                         0
#Missing cases NULL:
train_set %>%
  filter(gender == "NULL")
## # A tibble: 69,308 x 17
##       ID Target age_band gender car_model car_segment age_of_vehicle_…
##    <int>  <int> <fct>    <fct>  <fct>     <fct>                  <int>
##  1     2      0 NULL     NULL   model_2   Small/Medi…                6
##  2     5      0 NULL     NULL   model_3   Large/SUV                  5
##  3    10      0 NULL     NULL   model_4   Small/Medi…                1
##  4    11      0 NULL     NULL   model_4   Small/Medi…                3
##  5    13      0 NULL     NULL   model_5   Large/SUV                  3
##  6    17      0 NULL     NULL   model_8   Large/SUV                  5
##  7    19      0 NULL     NULL   model_8   Large/SUV                  4
##  8    22      0 4. 45 t… NULL   model_3   Large/SUV                  5
##  9    26      0 NULL     NULL   model_5   Large/SUV                  3
## 10    27      0 NULL     NULL   model_5   Large/SUV                  2
## # ... with 69,298 more rows, and 10 more variables: sched_serv_warr <int>,
## #   non_sched_serv_warr <int>, sched_serv_paid <int>,
## #   non_sched_serv_paid <int>, total_paid_services <int>,
## #   total_services <int>, mth_since_last_serv <int>,
## #   annualised_mileage <int>, num_dealers_visited <int>,
## #   num_serv_dealer_purchased <int>
train_set %>%
  filter(age_band == "NULL")
## # A tibble: 112,375 x 17
##       ID Target age_band gender car_model car_segment age_of_vehicle_…
##    <int>  <int> <fct>    <fct>  <fct>     <fct>                  <int>
##  1     2      0 NULL     NULL   model_2   Small/Medi…                6
##  2     3      0 NULL     Male   model_3   Large/SUV                  9
##  3     5      0 NULL     NULL   model_3   Large/SUV                  5
##  4     6      0 NULL     Female model_2   Small/Medi…                8
##  5     7      0 NULL     Male   model_5   Large/SUV                  7
##  6     9      0 NULL     Male   model_6   Small/Medi…                7
##  7    10      0 NULL     NULL   model_4   Small/Medi…                1
##  8    11      0 NULL     NULL   model_4   Small/Medi…                3
##  9    13      0 NULL     NULL   model_5   Large/SUV                  3
## 10    14      0 NULL     Male   model_7   LCV                        9
## # ... with 112,365 more rows, and 10 more variables:
## #   sched_serv_warr <int>, non_sched_serv_warr <int>,
## #   sched_serv_paid <int>, non_sched_serv_paid <int>,
## #   total_paid_services <int>, total_services <int>,
## #   mth_since_last_serv <int>, annualised_mileage <int>,
## #   num_dealers_visited <int>, num_serv_dealer_purchased <int>

Train/Test Split

Here, I’m splitting up the train dataset into a training and a testing dataset. Right now, I don’t intend to upload any results back into Kaggle, so I’m just going to evaluate my models on my testing split.

“you can fit a substantially wider variety of Gaussian models with a GLM than with regression.”

“Classification: to predict a categorical class label, such as weather: rainy, sunnny, cloudy or snowy

Regressive evaluative measure.

MAE - Mean Absolute Error MSE - Mean Square Error RMSE - Route Mean Square Error

Avoid overfitting. (performs well on training data but porly on unseen data)

Training testing 80/20

k-fold cross validation 1. split data set into k subsets of equal size 2. reserve one for test 3. Average performance of all above.

Using the crossv_kfold method from modelr, it automatically splits the data into k partitions and uses each partition for a testing training split.

nrow(train_set)
## [1] 131337
ncol(train_set)
## [1] 17
summary(train_set)
##        ID             Target               age_band         gender     
##  Min.   :     1   Min.   :0.00000   NULL       :112375   Female:25957  
##  1st Qu.: 38563   1st Qu.:0.00000   4. 45 to 54:  4058   Male  :36072  
##  Median : 77132   Median :0.00000   3. 35 to 44:  3833   NULL  :69308  
##  Mean   : 77097   Mean   :0.02681   2. 25 to 34:  3548                 
##  3rd Qu.:115668   3rd Qu.:0.00000   5. 55 to 64:  3397                 
##  Max.   :154139   Max.   :1.00000   6. 65 to 74:  2140                 
##                                     (Other)    :  1986                 
##    car_model           car_segment    age_of_vehicle_years
##  model_2:34491   Small/Medium:54553   Min.   : 1.000      
##  model_5:24674   Large/SUV   :52120   1st Qu.: 3.000      
##  model_3:17074   LCV         :24606   Median : 5.000      
##  model_1:15331   Other       :   58   Mean   : 5.493      
##  model_4:15155                        3rd Qu.: 8.000      
##  (Other):24610                        Max.   :10.000      
##  NA's   :    2                                            
##  sched_serv_warr  non_sched_serv_warr sched_serv_paid  non_sched_serv_paid
##  Min.   : 1.000   Min.   : 1.000      Min.   : 1.000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.000     
##  Median : 5.000   Median : 5.000      Median : 5.000   Median : 5.000     
##  Mean   : 5.452   Mean   : 5.473      Mean   : 5.452   Mean   : 5.497     
##  3rd Qu.: 8.000   3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :10.000      Max.   :10.000   Max.   :10.000     
##                                                                           
##  total_paid_services total_services   mth_since_last_serv
##  Min.   : 1.000      Min.   : 1.000   Min.   : 1.00      
##  1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.00      
##  Median : 5.000      Median : 5.000   Median : 5.00      
##  Mean   : 5.482      Mean   : 5.455   Mean   : 5.47      
##  3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.00      
##  Max.   :10.000      Max.   :10.000   Max.   :10.00      
##                                                          
##  annualised_mileage num_dealers_visited num_serv_dealer_purchased
##  Min.   : 1.000     Min.   : 1.000      Min.   : 1.000           
##  1st Qu.: 3.000     1st Qu.: 3.000      1st Qu.: 3.000           
##  Median : 5.000     Median : 5.000      Median : 5.000           
##  Mean   : 5.503     Mean   : 5.485      Mean   : 5.481           
##  3rd Qu.: 8.000     3rd Qu.: 8.000      3rd Qu.: 8.000           
##  Max.   :10.000     Max.   :10.000      Max.   :10.000           
## 
#getindex of predicted variable
typeColNum <- grep("Target", names(train_set))

#k-fold 5
set.seed(42)

folds <- crossv_kfold(train_set, k = 6)

folds
## # A tibble: 6 x 3
##   train          test           .id  
##   <list>         <list>         <chr>
## 1 <S3: resample> <S3: resample> 1    
## 2 <S3: resample> <S3: resample> 2    
## 3 <S3: resample> <S3: resample> 3    
## 4 <S3: resample> <S3: resample> 4    
## 5 <S3: resample> <S3: resample> 5    
## 6 <S3: resample> <S3: resample> 6
folds$test[[1]]
## <resample [21,890 x 17]> 5, 16, 19, 28, 43, 50, 69, 83, 98, 99, ...

EDA

The first step in the analysis is to explore the data numerically and graphically. I always split up my EDA investigation as follows:

This gives me a structured approach towards larger datasets. My professor at Northwestern taught me to always complete a thorough intimate numeric & graphical EDA on the data, no matter how large the data 1. Anscombe (1973) clearly shows the importance of graphical analyses.

Say we’re interested in predicting Repurchase (Target) with all other variables. With the whole data set, we’d do this via:

glm(Target ~ ., data = train_set)

Instead, we want to run this model on each set of training data (data referenced in each train cell). We can do this as follows:

folds <- folds %>% mutate(model = map(train, ~ glm(Target ~ ., data = .)))
folds
## # A tibble: 6 x 4
##   train          test           .id   model    
##   <list>         <list>         <chr> <list>   
## 1 <S3: resample> <S3: resample> 1     <S3: glm>
## 2 <S3: resample> <S3: resample> 2     <S3: glm>
## 3 <S3: resample> <S3: resample> 3     <S3: glm>
## 4 <S3: resample> <S3: resample> 4     <S3: glm>
## 5 <S3: resample> <S3: resample> 5     <S3: glm>
## 6 <S3: resample> <S3: resample> 6     <S3: glm>
folds$model[[1]] %>% summary()
## 
## Call:
## glm(formula = Target ~ ., data = .)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.23870  -0.06237  -0.02090   0.02215   0.99997  
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.564e-02  5.725e-03  -4.478 7.53e-06 ***
## ID                         9.380e-07  1.016e-08  92.347  < 2e-16 ***
## age_band2. 25 to 34        5.307e-03  5.914e-03   0.897 0.369570    
## age_band3. 35 to 44        1.037e-02  5.875e-03   1.765 0.077628 .  
## age_band4. 45 to 54        1.246e-02  5.848e-03   2.130 0.033181 *  
## age_band5. 55 to 64        1.474e-02  5.949e-03   2.477 0.013245 *  
## age_band6. 65 to 74        1.459e-02  6.346e-03   2.299 0.021483 *  
## age_band7. 75+             2.130e-02  7.374e-03   2.888 0.003874 ** 
## age_bandNULL               1.535e-02  5.310e-03   2.891 0.003841 ** 
## genderMale                 7.032e-03  1.383e-03   5.083 3.72e-07 ***
## genderNULL                -3.925e-03  1.341e-03  -2.928 0.003416 ** 
## car_modelmodel_2           1.745e-02  1.652e-03  10.561  < 2e-16 ***
## car_modelmodel_3           2.664e-02  1.849e-03  14.404  < 2e-16 ***
## car_modelmodel_4           1.783e-02  1.969e-03   9.053  < 2e-16 ***
## car_modelmodel_5           1.007e-02  1.710e-03   5.892 3.82e-09 ***
## car_modelmodel_6           1.379e-02  3.291e-03   4.191 2.79e-05 ***
## car_modelmodel_7           2.243e-02  2.266e-03   9.901  < 2e-16 ***
## car_modelmodel_8           2.536e-02  2.447e-03  10.366  < 2e-16 ***
## car_modelmodel_9           7.445e-03  5.361e-03   1.389 0.164931    
## car_modelmodel_10         -2.478e-02  3.218e-03  -7.699 1.38e-14 ***
## car_modelmodel_11         -3.289e-02  6.775e-03  -4.855 1.21e-06 ***
## car_modelmodel_12         -2.244e-02  6.676e-03  -3.361 0.000778 ***
## car_modelmodel_13          3.788e-02  6.248e-03   6.063 1.34e-09 ***
## car_modelmodel_14         -4.314e-02  1.857e-02  -2.323 0.020166 *  
## car_modelmodel_15          3.749e-02  9.181e-03   4.084 4.43e-05 ***
## car_modelmodel_16          1.999e-02  1.522e-02   1.313 0.189181    
## car_modelmodel_17         -1.571e-02  1.695e-02  -0.927 0.353991    
## car_modelmodel_18         -2.663e-02  1.607e-02  -1.657 0.097450 .  
## car_segmentLarge/SUV              NA         NA      NA       NA    
## car_segmentLCV                    NA         NA      NA       NA    
## car_segmentOther          -5.437e-03  2.815e-02  -0.193 0.846827    
## age_of_vehicle_years       6.663e-04  2.535e-04   2.629 0.008571 ** 
## sched_serv_warr           -3.031e-03  4.398e-04  -6.892 5.55e-12 ***
## non_sched_serv_warr        1.119e-03  4.062e-04   2.754 0.005891 ** 
## sched_serv_paid           -9.931e-03  4.048e-04 -24.535  < 2e-16 ***
## non_sched_serv_paid        3.404e-04  4.442e-04   0.766 0.443429    
## total_paid_services        8.076e-03  5.266e-04  15.336  < 2e-16 ***
## total_services            -1.713e-02  5.758e-04 -29.745  < 2e-16 ***
## mth_since_last_serv       -7.069e-03  2.152e-04 -32.851  < 2e-16 ***
## annualised_mileage         9.623e-03  2.780e-04  34.619  < 2e-16 ***
## num_dealers_visited        1.809e-03  1.952e-04   9.270  < 2e-16 ***
## num_serv_dealer_purchased  6.383e-03  2.200e-04  29.022  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.02225838)
## 
##     Null deviance: 2856.3  on 109444  degrees of freedom
## Residual deviance: 2435.2  on 109405  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: -105809
## 
## Number of Fisher Scoring iterations: 2
"By looking at the model summary, the AIC value of -105,809 is not bad for a cross sectional data of 131,337 observations. The individual regression coefficients, it is seen that as many as non_sched_serv_paid variable is not statistically significant.
Further we can plot the model diagnostic checking for other problems such as normality of error term, heteroscedasticity etc.

Predicting the test data
The next step is to use each model for predicting the outcome variable in the corresponding test data. There are many ways to achieve this. One general approach might be:

folds %>% mutate(predicted = map2(model, test,  ))
map2(model, test, ...) iterates through each model and set of test data in parallel. By referencing these in the function for predicting the test data, this would add a predicted column with the predicted results.

For many common models, an elegant alternative is to use augment from broom. For regression, augment will take a fitted model and a new data frame, and return a data frame of the predicted results, which is what we want! Following above, we can use augment as follows:"
## [1] "By looking at the model summary, the AIC value of -105,809 is not bad for a cross sectional data of 131,337 observations. The individual regression coefficients, it is seen that as many as non_sched_serv_paid variable is not statistically significant.\nFurther we can plot the model diagnostic checking for other problems such as normality of error term, heteroscedasticity etc.\n\nPredicting the test data\nThe next step is to use each model for predicting the outcome variable in the corresponding test data. There are many ways to achieve this. One general approach might be:\n\nfolds %>% mutate(predicted = map2(model, test,  ))\nmap2(model, test, ...) iterates through each model and set of test data in parallel. By referencing these in the function for predicting the test data, this would add a predicted column with the predicted results.\n\nFor many common models, an elegant alternative is to use augment from broom. For regression, augment will take a fitted model and a new data frame, and return a data frame of the predicted results, which is what we want! Following above, we can use augment as follows:"
plot(folds$model[[1]])

library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
folds %>% mutate(predicted = map2(model, test, ~ augment(.x, newdata = .y)))
## # A tibble: 6 x 5
##   train          test           .id   model     predicted             
##   <list>         <list>         <chr> <list>    <list>                
## 1 <S3: resample> <S3: resample> 1     <S3: glm> <tibble [21,890 × 19]>
## 2 <S3: resample> <S3: resample> 2     <S3: glm> <tibble [21,890 × 19]>
## 3 <S3: resample> <S3: resample> 3     <S3: glm> <tibble [21,890 × 19]>
## 4 <S3: resample> <S3: resample> 4     <S3: glm> <tibble [21,889 × 19]>
## 5 <S3: resample> <S3: resample> 5     <S3: glm> <tibble [21,889 × 19]>
## 6 <S3: resample> <S3: resample> 6     <S3: glm> <tibble [21,889 × 19]>
"
To extract the relevant information from these predicted results, we’ll unnest the data frames thanks to the tidyr package:"
## [1] "\nTo extract the relevant information from these predicted results, we’ll unnest the data frames thanks to the tidyr package:"
folds %>%
  mutate(predicted = map2(model, test, ~ augment(.x, newdata = .y))) %>% 
  unnest(predicted)
## # A tibble: 131,337 x 20
##    .id      ID Target age_band gender car_model car_segment
##    <chr> <int>  <int> <fct>    <fct>  <fct>     <fct>      
##  1 1         6      0 NULL     Female model_2   Small/Medi…
##  2 1        19      0 NULL     NULL   model_8   Large/SUV  
##  3 1        23      0 NULL     Female model_4   Small/Medi…
##  4 1        35      0 NULL     Male   model_2   Small/Medi…
##  5 1        50      0 NULL     NULL   model_1   LCV        
##  6 1        59      0 NULL     NULL   model_7   LCV        
##  7 1        82      0 NULL     Female model_5   Large/SUV  
##  8 1       100      0 NULL     NULL   model_2   Small/Medi…
##  9 1       117      0 NULL     NULL   model_3   Large/SUV  
## 10 1       118      0 NULL     Female model_4   Small/Medi…
## # ... with 131,327 more rows, and 13 more variables:
## #   age_of_vehicle_years <int>, sched_serv_warr <int>,
## #   non_sched_serv_warr <int>, sched_serv_paid <int>,
## #   non_sched_serv_paid <int>, total_paid_services <int>,
## #   total_services <int>, mth_since_last_serv <int>,
## #   annualised_mileage <int>, num_dealers_visited <int>,
## #   num_serv_dealer_purchased <int>, .fitted <dbl>, .se.fit <dbl>
"This was to show you the intermediate steps. In practice we can skip the mutate step:"
## [1] "This was to show you the intermediate steps. In practice we can skip the mutate step:"
predicted <- folds %>% unnest(map2(model, test, ~ augment(.x, newdata = .y)))
predicted
## # A tibble: 131,337 x 20
##    .id      ID Target age_band gender car_model car_segment
##    <chr> <int>  <int> <fct>    <fct>  <fct>     <fct>      
##  1 1         6      0 NULL     Female model_2   Small/Medi…
##  2 1        19      0 NULL     NULL   model_8   Large/SUV  
##  3 1        23      0 NULL     Female model_4   Small/Medi…
##  4 1        35      0 NULL     Male   model_2   Small/Medi…
##  5 1        50      0 NULL     NULL   model_1   LCV        
##  6 1        59      0 NULL     NULL   model_7   LCV        
##  7 1        82      0 NULL     Female model_5   Large/SUV  
##  8 1       100      0 NULL     NULL   model_2   Small/Medi…
##  9 1       117      0 NULL     NULL   model_3   Large/SUV  
## 10 1       118      0 NULL     Female model_4   Small/Medi…
## # ... with 131,327 more rows, and 13 more variables:
## #   age_of_vehicle_years <int>, sched_serv_warr <int>,
## #   non_sched_serv_warr <int>, sched_serv_paid <int>,
## #   non_sched_serv_paid <int>, total_paid_services <int>,
## #   total_services <int>, mth_since_last_serv <int>,
## #   annualised_mileage <int>, num_dealers_visited <int>,
## #   num_serv_dealer_purchased <int>, .fitted <dbl>, .se.fit <dbl>
"We now have a tibble of the test data for each fold (.id = fold number) and the corresponding .fitted, or predicted values for the outcome variable (Target) in each case.

Validating the model
We can compute and examine the residuals:"
## [1] "We now have a tibble of the test data for each fold (.id = fold number) and the corresponding .fitted, or predicted values for the outcome variable (Target) in each case.\n\nValidating the model\nWe can compute and examine the residuals:"
# Compute the residuals
predicted <- predicted %>% 
  mutate(residual = .fitted - Target)

"It looks like our models could be overestimating Target around 20-30 and underestimating higher Target. But there are clearly fewer data points, making prediction difficult.

We can also use these values to calculate the overall proportion of variance accounted for by each model:"
## [1] "It looks like our models could be overestimating Target around 20-30 and underestimating higher Target. But there are clearly fewer data points, making prediction difficult.\n\nWe can also use these values to calculate the overall proportion of variance accounted for by each model:"
rs <- predicted %>%
  group_by(.id) %>% 
  summarise(
    sst = sum((Target - mean(Target)) ^ 2), # Sum of Squares Total
    sse = sum(residual ^ 2),          # Sum of Squares Residual/Error
    r.squared = 1 - sse / sst         # Proportion of variance accounted for
    )
rs
## # A tibble: 6 x 4
##   .id     sst   sse r.squared
##   <chr> <dbl> <dbl>     <dbl>
## 1 1      570.  492.     0.138
## 2 2      586.   NA     NA    
## 3 3      567.   NA     NA    
## 4 4      554.  474.     0.144
## 5 5      587.  498.     0.151
## 6 6      562.  480.     0.146
# Overall
mean(rs$r.squared)
## [1] NA
#we're getting an error NA for set 3. lets see whats going on.

which(is.na(predicted), arr.ind=TRUE)
##        row col
## [1,] 36170   6
## [2,] 48181   6
## [3,] 36170  19
## [4,] 48181  19
## [5,] 36170  20
## [6,] 48181  20
## [7,] 36170  21
## [8,] 48181  21
newdataset1<-predicted[c(48260, 58109, 48260, 58109, 48260, 58109, 48260, 58109), ]

newdataset1
## # A tibble: 8 x 21
##   .id       ID Target age_band gender car_model car_segment
##   <chr>  <int>  <int> <fct>    <fct>  <fct>     <fct>      
## 1 3      31538      0 NULL     Male   model_2   Small/Medi…
## 2 3     100435      0 NULL     Female model_3   Large/SUV  
## 3 3      31538      0 NULL     Male   model_2   Small/Medi…
## 4 3     100435      0 NULL     Female model_3   Large/SUV  
## 5 3      31538      0 NULL     Male   model_2   Small/Medi…
## 6 3     100435      0 NULL     Female model_3   Large/SUV  
## 7 3      31538      0 NULL     Male   model_2   Small/Medi…
## 8 3     100435      0 NULL     Female model_3   Large/SUV  
## # ... with 14 more variables: age_of_vehicle_years <int>,
## #   sched_serv_warr <int>, non_sched_serv_warr <int>,
## #   sched_serv_paid <int>, non_sched_serv_paid <int>,
## #   total_paid_services <int>, total_services <int>,
## #   mth_since_last_serv <int>, annualised_mileage <int>,
## #   num_dealers_visited <int>, num_serv_dealer_purchased <int>,
## #   .fitted <dbl>, .se.fit <dbl>, residual <dbl>
#these NA values are derived from this table, as we can see these people have bought a car however are purely customers that are male and don't aid the predicted results. lets remove these NAs.

predicted <- predicted[-c(48260, 58109, 48260, 58109, 48260, 58109, 48260, 58109), ]

rs <- predicted %>%
  group_by(.id) %>% 
  summarise(
    sst = sum((Target - mean(Target)) ^ 2), # Sum of Squares Total
    sse = sum(residual ^ 2),          # Sum of Squares Residual/Error
    r.squared = 1 - sse / sst         # Proportion of variance accounted for
  )
rs
## # A tibble: 6 x 4
##   .id     sst   sse r.squared
##   <chr> <dbl> <dbl>     <dbl>
## 1 1      570.  492.     0.138
## 2 2      586.   NA     NA    
## 3 3      567.   NA     NA    
## 4 4      554.  474.     0.144
## 5 5      587.  498.     0.151
## 6 6      562.  480.     0.146
# Overall
mean(rs$r.squared)
## [1] NA
"
So, across the folds, the regression models have accounted for an average of 14.47% of the variance of Target in new, test data.

Plotting these results:"
## [1] "\nSo, across the folds, the regression models have accounted for an average of 14.47% of the variance of Target in new, test data.\n\nPlotting these results:"
rs %>% 
ggplot(aes(r.squared, fill  = .id)) +
geom_histogram() +
geom_vline(aes(xintercept = mean(r.squared)))  # Overall mean
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_vline).

"The model performed poorly"
## [1] "The model performed poorly"

model <- glm(Target ~ .,data = train_set, family = “binomial”) summary(model)

predictions <- predict(model, newdata = test,type=‘response’)

predictions <- ifelse(predictions > 0.5,1,0) misClasificError <- mean(predictions != test$left) print(paste(‘Accuracy’,1-misClasificError))

train_control <- trainControl(method=“cv”, number=10, savePredictions = TRUE) output <- train(Target ~ age_band + gender + car_model + car_segment + age_of_vehicle_years + sched_serv_warr + non_sched_serv_warr + non_sched_serv_warr + total_paid_services + total_services + mth_since_last_serv + annualised_mileage + num_dealers_visited + num_serv_dealer_purchased, data=train_set, trControl=train_control, method=“lm”)

folds <- crossv_kfold(train_set, k = 5)

trControl <- trainControl(method=“cv”, number=10, savePredictions = TRUE)

fit <- train(Target ~ ., method = “lm”, trControl = trControl, data = train_set)

flds <- createFolds(train_set, k = 6, list = TRUE, returnTrain = FALSE) names(flds)[1] <- “train 1” names(flds)[2] <- “train 2” names(flds)[3] <- “train 3” names(flds)[4] <- “train 4” names(flds)[5] <- “train 5” names(flds)[6] <- “test 1” flds

n.folds <- 10 folds <- cut(sample(seq_len(nrow(train_set))), breaks=n.folds, labels=FALSE)

all.confusion.tables <- list() for (i in seq_len(n.folds)) { # Create training set train <- filter(train_set, folds != i) # Take all other samples than i

# Create test set test <- filter(train_set, folds == i)

# Fit the glm model on the training set glm.train <- glm(Target ~ age_band + gender + car_model + car_segment + age_of_vehicle_years + sched_serv_warr + non_sched_serv_warr + non_sched_serv_warr + total_paid_services + total_services + mth_since_last_serv + annualised_mileage + num_dealers_visited + num_serv_dealer_purchased, family = binomial, data = train_set)

# Use the fitted model on the test set logit.prob <- predict(glm.train, newdata = test)

# Classifiy based on the predictions pred.class <- ifelse(logit.prob < 0, 0, 1)

# Construct the confusion table all.confustion.tables[[i]] <- table(pred = pred.class, true = test$low) }

all.confusion.tables[[2]]

linear model training

train1.glm <- glm(data = train_set[ flds[[1]], ], formula = Target ~ ., family = binomial)

summary(train.glm)