caret package for ML############################################################################################################
# Check that required libraries are present, otherwise install them
############################################################################################################
list_of_packages <- c("tidyverse", "dplyr","ggplot2", "caret", "lattice")
new_packages <- list_of_packages[!(list_of_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
############################################################################################################
# Load required libraries
############################################################################################################
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(lattice) #Caret dependent
library(rpart)
library(modelr)
library(purrr)
# Ask the user to nominate a location for the forecast files to be produced
############################################################################################################
setwd("~/Documents/36106/Assignment_1/36106AT1")
############################################################################################################
## Import training set.
train_set <- read_csv("repurchase_training.csv")
## Parsed with column specification:
## cols(
## ID = col_integer(),
## Target = col_integer(),
## age_band = col_character(),
## gender = col_character(),
## car_model = col_character(),
## car_segment = col_character(),
## age_of_vehicle_years = col_integer(),
## sched_serv_warr = col_integer(),
## non_sched_serv_warr = col_integer(),
## sched_serv_paid = col_integer(),
## non_sched_serv_paid = col_integer(),
## total_paid_services = col_integer(),
## total_services = col_integer(),
## mth_since_last_serv = col_integer(),
## annualised_mileage = col_integer(),
## num_dealers_visited = col_integer(),
## num_serv_dealer_purchased = col_integer()
## )
'131,337 observations over 17 variables'
## [1] "131,337 observations over 17 variables"
str(train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame': 131337 obs. of 17 variables:
## $ ID : int 1 2 3 5 6 7 8 9 10 11 ...
## $ Target : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_band : chr "3. 35 to 44" "NULL" "NULL" "NULL" ...
## $ gender : chr "Male" "NULL" "Male" "NULL" ...
## $ car_model : chr "model_1" "model_2" "model_3" "model_3" ...
## $ car_segment : chr "LCV" "Small/Medium" "Large/SUV" "Large/SUV" ...
## $ age_of_vehicle_years : int 9 6 9 5 8 7 8 7 1 3 ...
## $ sched_serv_warr : int 2 10 10 8 9 4 2 4 2 1 ...
## $ non_sched_serv_warr : int 10 3 9 5 4 10 8 9 1 1 ...
## $ sched_serv_paid : int 3 10 10 8 10 5 2 6 1 2 ...
## $ non_sched_serv_paid : int 7 4 9 4 7 7 9 9 3 1 ...
## $ total_paid_services : int 5 9 10 5 9 6 9 8 1 2 ...
## $ total_services : int 6 10 10 6 8 8 4 6 2 1 ...
## $ mth_since_last_serv : int 9 6 7 4 5 8 7 9 1 1 ...
## $ annualised_mileage : int 8 10 10 10 4 5 6 5 1 1 ...
## $ num_dealers_visited : int 10 7 6 9 4 10 10 5 2 1 ...
## $ num_serv_dealer_purchased: int 4 10 10 7 9 4 4 8 3 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 17
## .. ..$ ID : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Target : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ age_band : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ car_model : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ car_segment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ age_of_vehicle_years : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ sched_serv_warr : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ non_sched_serv_warr : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ sched_serv_paid : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ non_sched_serv_paid : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ total_paid_services : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ total_services : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ mth_since_last_serv : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ annualised_mileage : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ num_dealers_visited : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ num_serv_dealer_purchased: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
train_set$age_band <- as.factor(train_set$age_band)
levels(train_set$age_band)
## [1] "1. <25" "2. 25 to 34" "3. 35 to 44" "4. 45 to 54" "5. 55 to 64"
## [6] "6. 65 to 74" "7. 75+" "NULL"
train_set$gender <- as.factor(train_set$gender)
levels(train_set$gender)
## [1] "Female" "Male" "NULL"
#order the factors correctly.
train_set$car_model <- as.factor(train_set$car_model)
train_set$car_model <- factor(train_set$car_model,
levels = c("model_1", "model_2", "model_3", "model_4", "model_5", "model_6", "model_7", "model_8", "model_9", "model_10", "model_11", "model_12", "model_13", "model_14", "model_15", "model_16", "model_17", "model_18"))
levels(train_set$car_model)
## [1] "model_1" "model_2" "model_3" "model_4" "model_5" "model_6"
## [7] "model_7" "model_8" "model_9" "model_10" "model_11" "model_12"
## [13] "model_13" "model_14" "model_15" "model_16" "model_17" "model_18"
train_set$car_segment <- as.factor(train_set$car_segment)
train_set$car_segment <- factor(train_set$car_segment,
levels = c("Small/Medium", "Large/SUV", "LCV", "Other"))
levels(train_set$car_segment)
## [1] "Small/Medium" "Large/SUV" "LCV" "Other"
From the summary we can see that the age_band has 112,375 NULL’s, Gender has 69,308 NULL’s and car_model has 2 NA’s.
Quick investigation of missing values can be done using the complete.cases(), and more thorough graphical summary can be done using Amelia.
### Missing Values Analysis
summary(train_set)
## ID Target age_band gender
## Min. : 1 Min. :0.00000 NULL :112375 Female:25957
## 1st Qu.: 38563 1st Qu.:0.00000 4. 45 to 54: 4058 Male :36072
## Median : 77132 Median :0.00000 3. 35 to 44: 3833 NULL :69308
## Mean : 77097 Mean :0.02681 2. 25 to 34: 3548
## 3rd Qu.:115668 3rd Qu.:0.00000 5. 55 to 64: 3397
## Max. :154139 Max. :1.00000 6. 65 to 74: 2140
## (Other) : 1986
## car_model car_segment age_of_vehicle_years
## model_2:34491 Small/Medium:54553 Min. : 1.000
## model_5:24674 Large/SUV :52120 1st Qu.: 3.000
## model_3:17074 LCV :24606 Median : 5.000
## model_1:15331 Other : 58 Mean : 5.493
## model_4:15155 3rd Qu.: 8.000
## (Other):24610 Max. :10.000
## NA's : 2
## sched_serv_warr non_sched_serv_warr sched_serv_paid non_sched_serv_paid
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.452 Mean : 5.473 Mean : 5.452 Mean : 5.497
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## total_paid_services total_services mth_since_last_serv
## Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.00
## Median : 5.000 Median : 5.000 Median : 5.00
## Mean : 5.482 Mean : 5.455 Mean : 5.47
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.00
## Max. :10.000 Max. :10.000 Max. :10.00
##
## annualised_mileage num_dealers_visited num_serv_dealer_purchased
## Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.503 Mean : 5.485 Mean : 5.481
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000
##
#Missing cases NA:
map_int(train_set,~sum(is.na(.x)))
## ID Target
## 0 0
## age_band gender
## 0 0
## car_model car_segment
## 2 0
## age_of_vehicle_years sched_serv_warr
## 0 0
## non_sched_serv_warr sched_serv_paid
## 0 0
## non_sched_serv_paid total_paid_services
## 0 0
## total_services mth_since_last_serv
## 0 0
## annualised_mileage num_dealers_visited
## 0 0
## num_serv_dealer_purchased
## 0
#Missing cases NULL:
train_set %>%
filter(gender == "NULL")
## # A tibble: 69,308 x 17
## ID Target age_band gender car_model car_segment age_of_vehicle_…
## <int> <int> <fct> <fct> <fct> <fct> <int>
## 1 2 0 NULL NULL model_2 Small/Medi… 6
## 2 5 0 NULL NULL model_3 Large/SUV 5
## 3 10 0 NULL NULL model_4 Small/Medi… 1
## 4 11 0 NULL NULL model_4 Small/Medi… 3
## 5 13 0 NULL NULL model_5 Large/SUV 3
## 6 17 0 NULL NULL model_8 Large/SUV 5
## 7 19 0 NULL NULL model_8 Large/SUV 4
## 8 22 0 4. 45 t… NULL model_3 Large/SUV 5
## 9 26 0 NULL NULL model_5 Large/SUV 3
## 10 27 0 NULL NULL model_5 Large/SUV 2
## # ... with 69,298 more rows, and 10 more variables: sched_serv_warr <int>,
## # non_sched_serv_warr <int>, sched_serv_paid <int>,
## # non_sched_serv_paid <int>, total_paid_services <int>,
## # total_services <int>, mth_since_last_serv <int>,
## # annualised_mileage <int>, num_dealers_visited <int>,
## # num_serv_dealer_purchased <int>
train_set %>%
filter(age_band == "NULL")
## # A tibble: 112,375 x 17
## ID Target age_band gender car_model car_segment age_of_vehicle_…
## <int> <int> <fct> <fct> <fct> <fct> <int>
## 1 2 0 NULL NULL model_2 Small/Medi… 6
## 2 3 0 NULL Male model_3 Large/SUV 9
## 3 5 0 NULL NULL model_3 Large/SUV 5
## 4 6 0 NULL Female model_2 Small/Medi… 8
## 5 7 0 NULL Male model_5 Large/SUV 7
## 6 9 0 NULL Male model_6 Small/Medi… 7
## 7 10 0 NULL NULL model_4 Small/Medi… 1
## 8 11 0 NULL NULL model_4 Small/Medi… 3
## 9 13 0 NULL NULL model_5 Large/SUV 3
## 10 14 0 NULL Male model_7 LCV 9
## # ... with 112,365 more rows, and 10 more variables:
## # sched_serv_warr <int>, non_sched_serv_warr <int>,
## # sched_serv_paid <int>, non_sched_serv_paid <int>,
## # total_paid_services <int>, total_services <int>,
## # mth_since_last_serv <int>, annualised_mileage <int>,
## # num_dealers_visited <int>, num_serv_dealer_purchased <int>
Here, I’m splitting up the train dataset into a training and a testing dataset. Right now, I don’t intend to upload any results back into Kaggle, so I’m just going to evaluate my models on my testing split.
“you can fit a substantially wider variety of Gaussian models with a GLM than with regression.”
“Classification: to predict a categorical class label, such as weather: rainy, sunnny, cloudy or snowy
Regressive evaluative measure.
MAE - Mean Absolute Error MSE - Mean Square Error RMSE - Route Mean Square Error
Avoid overfitting. (performs well on training data but porly on unseen data)
Training testing 80/20
k-fold cross validation 1. split data set into k subsets of equal size 2. reserve one for test 3. Average performance of all above.
Using the crossv_kfold method from modelr, it automatically splits the data into k partitions and uses each partition for a testing training split.
nrow(train_set)
## [1] 131337
ncol(train_set)
## [1] 17
summary(train_set)
## ID Target age_band gender
## Min. : 1 Min. :0.00000 NULL :112375 Female:25957
## 1st Qu.: 38563 1st Qu.:0.00000 4. 45 to 54: 4058 Male :36072
## Median : 77132 Median :0.00000 3. 35 to 44: 3833 NULL :69308
## Mean : 77097 Mean :0.02681 2. 25 to 34: 3548
## 3rd Qu.:115668 3rd Qu.:0.00000 5. 55 to 64: 3397
## Max. :154139 Max. :1.00000 6. 65 to 74: 2140
## (Other) : 1986
## car_model car_segment age_of_vehicle_years
## model_2:34491 Small/Medium:54553 Min. : 1.000
## model_5:24674 Large/SUV :52120 1st Qu.: 3.000
## model_3:17074 LCV :24606 Median : 5.000
## model_1:15331 Other : 58 Mean : 5.493
## model_4:15155 3rd Qu.: 8.000
## (Other):24610 Max. :10.000
## NA's : 2
## sched_serv_warr non_sched_serv_warr sched_serv_paid non_sched_serv_paid
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.452 Mean : 5.473 Mean : 5.452 Mean : 5.497
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## total_paid_services total_services mth_since_last_serv
## Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.00
## Median : 5.000 Median : 5.000 Median : 5.00
## Mean : 5.482 Mean : 5.455 Mean : 5.47
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.00
## Max. :10.000 Max. :10.000 Max. :10.00
##
## annualised_mileage num_dealers_visited num_serv_dealer_purchased
## Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.503 Mean : 5.485 Mean : 5.481
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000
##
#getindex of predicted variable
typeColNum <- grep("Target", names(train_set))
#k-fold 5
set.seed(42)
folds <- crossv_kfold(train_set, k = 6)
folds
## # A tibble: 6 x 3
## train test .id
## <list> <list> <chr>
## 1 <S3: resample> <S3: resample> 1
## 2 <S3: resample> <S3: resample> 2
## 3 <S3: resample> <S3: resample> 3
## 4 <S3: resample> <S3: resample> 4
## 5 <S3: resample> <S3: resample> 5
## 6 <S3: resample> <S3: resample> 6
folds$test[[1]]
## <resample [21,890 x 17]> 5, 16, 19, 28, 43, 50, 69, 83, 98, 99, ...
The first step in the analysis is to explore the data numerically and graphically. I always split up my EDA investigation as follows:
This gives me a structured approach towards larger datasets. My professor at Northwestern taught me to always complete a thorough intimate numeric & graphical EDA on the data, no matter how large the data 1. Anscombe (1973) clearly shows the importance of graphical analyses.
Say we’re interested in predicting Repurchase (Target) with all other variables. With the whole data set, we’d do this via:
glm(Target ~ ., data = train_set)
Instead, we want to run this model on each set of training data (data referenced in each train cell). We can do this as follows:
folds <- folds %>% mutate(model = map(train, ~ glm(Target ~ ., data = .)))
folds
## # A tibble: 6 x 4
## train test .id model
## <list> <list> <chr> <list>
## 1 <S3: resample> <S3: resample> 1 <S3: glm>
## 2 <S3: resample> <S3: resample> 2 <S3: glm>
## 3 <S3: resample> <S3: resample> 3 <S3: glm>
## 4 <S3: resample> <S3: resample> 4 <S3: glm>
## 5 <S3: resample> <S3: resample> 5 <S3: glm>
## 6 <S3: resample> <S3: resample> 6 <S3: glm>
folds$model[[1]] %>% summary()
##
## Call:
## glm(formula = Target ~ ., data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.23870 -0.06237 -0.02090 0.02215 0.99997
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.564e-02 5.725e-03 -4.478 7.53e-06 ***
## ID 9.380e-07 1.016e-08 92.347 < 2e-16 ***
## age_band2. 25 to 34 5.307e-03 5.914e-03 0.897 0.369570
## age_band3. 35 to 44 1.037e-02 5.875e-03 1.765 0.077628 .
## age_band4. 45 to 54 1.246e-02 5.848e-03 2.130 0.033181 *
## age_band5. 55 to 64 1.474e-02 5.949e-03 2.477 0.013245 *
## age_band6. 65 to 74 1.459e-02 6.346e-03 2.299 0.021483 *
## age_band7. 75+ 2.130e-02 7.374e-03 2.888 0.003874 **
## age_bandNULL 1.535e-02 5.310e-03 2.891 0.003841 **
## genderMale 7.032e-03 1.383e-03 5.083 3.72e-07 ***
## genderNULL -3.925e-03 1.341e-03 -2.928 0.003416 **
## car_modelmodel_2 1.745e-02 1.652e-03 10.561 < 2e-16 ***
## car_modelmodel_3 2.664e-02 1.849e-03 14.404 < 2e-16 ***
## car_modelmodel_4 1.783e-02 1.969e-03 9.053 < 2e-16 ***
## car_modelmodel_5 1.007e-02 1.710e-03 5.892 3.82e-09 ***
## car_modelmodel_6 1.379e-02 3.291e-03 4.191 2.79e-05 ***
## car_modelmodel_7 2.243e-02 2.266e-03 9.901 < 2e-16 ***
## car_modelmodel_8 2.536e-02 2.447e-03 10.366 < 2e-16 ***
## car_modelmodel_9 7.445e-03 5.361e-03 1.389 0.164931
## car_modelmodel_10 -2.478e-02 3.218e-03 -7.699 1.38e-14 ***
## car_modelmodel_11 -3.289e-02 6.775e-03 -4.855 1.21e-06 ***
## car_modelmodel_12 -2.244e-02 6.676e-03 -3.361 0.000778 ***
## car_modelmodel_13 3.788e-02 6.248e-03 6.063 1.34e-09 ***
## car_modelmodel_14 -4.314e-02 1.857e-02 -2.323 0.020166 *
## car_modelmodel_15 3.749e-02 9.181e-03 4.084 4.43e-05 ***
## car_modelmodel_16 1.999e-02 1.522e-02 1.313 0.189181
## car_modelmodel_17 -1.571e-02 1.695e-02 -0.927 0.353991
## car_modelmodel_18 -2.663e-02 1.607e-02 -1.657 0.097450 .
## car_segmentLarge/SUV NA NA NA NA
## car_segmentLCV NA NA NA NA
## car_segmentOther -5.437e-03 2.815e-02 -0.193 0.846827
## age_of_vehicle_years 6.663e-04 2.535e-04 2.629 0.008571 **
## sched_serv_warr -3.031e-03 4.398e-04 -6.892 5.55e-12 ***
## non_sched_serv_warr 1.119e-03 4.062e-04 2.754 0.005891 **
## sched_serv_paid -9.931e-03 4.048e-04 -24.535 < 2e-16 ***
## non_sched_serv_paid 3.404e-04 4.442e-04 0.766 0.443429
## total_paid_services 8.076e-03 5.266e-04 15.336 < 2e-16 ***
## total_services -1.713e-02 5.758e-04 -29.745 < 2e-16 ***
## mth_since_last_serv -7.069e-03 2.152e-04 -32.851 < 2e-16 ***
## annualised_mileage 9.623e-03 2.780e-04 34.619 < 2e-16 ***
## num_dealers_visited 1.809e-03 1.952e-04 9.270 < 2e-16 ***
## num_serv_dealer_purchased 6.383e-03 2.200e-04 29.022 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.02225838)
##
## Null deviance: 2856.3 on 109444 degrees of freedom
## Residual deviance: 2435.2 on 109405 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: -105809
##
## Number of Fisher Scoring iterations: 2
"By looking at the model summary, the AIC value of -105,809 is not bad for a cross sectional data of 131,337 observations. The individual regression coefficients, it is seen that as many as non_sched_serv_paid variable is not statistically significant.
Further we can plot the model diagnostic checking for other problems such as normality of error term, heteroscedasticity etc.
Predicting the test data
The next step is to use each model for predicting the outcome variable in the corresponding test data. There are many ways to achieve this. One general approach might be:
folds %>% mutate(predicted = map2(model, test, ))
map2(model, test, ...) iterates through each model and set of test data in parallel. By referencing these in the function for predicting the test data, this would add a predicted column with the predicted results.
For many common models, an elegant alternative is to use augment from broom. For regression, augment will take a fitted model and a new data frame, and return a data frame of the predicted results, which is what we want! Following above, we can use augment as follows:"
## [1] "By looking at the model summary, the AIC value of -105,809 is not bad for a cross sectional data of 131,337 observations. The individual regression coefficients, it is seen that as many as non_sched_serv_paid variable is not statistically significant.\nFurther we can plot the model diagnostic checking for other problems such as normality of error term, heteroscedasticity etc.\n\nPredicting the test data\nThe next step is to use each model for predicting the outcome variable in the corresponding test data. There are many ways to achieve this. One general approach might be:\n\nfolds %>% mutate(predicted = map2(model, test, ))\nmap2(model, test, ...) iterates through each model and set of test data in parallel. By referencing these in the function for predicting the test data, this would add a predicted column with the predicted results.\n\nFor many common models, an elegant alternative is to use augment from broom. For regression, augment will take a fitted model and a new data frame, and return a data frame of the predicted results, which is what we want! Following above, we can use augment as follows:"
plot(folds$model[[1]])
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
folds %>% mutate(predicted = map2(model, test, ~ augment(.x, newdata = .y)))
## # A tibble: 6 x 5
## train test .id model predicted
## <list> <list> <chr> <list> <list>
## 1 <S3: resample> <S3: resample> 1 <S3: glm> <tibble [21,890 × 19]>
## 2 <S3: resample> <S3: resample> 2 <S3: glm> <tibble [21,890 × 19]>
## 3 <S3: resample> <S3: resample> 3 <S3: glm> <tibble [21,890 × 19]>
## 4 <S3: resample> <S3: resample> 4 <S3: glm> <tibble [21,889 × 19]>
## 5 <S3: resample> <S3: resample> 5 <S3: glm> <tibble [21,889 × 19]>
## 6 <S3: resample> <S3: resample> 6 <S3: glm> <tibble [21,889 × 19]>
"
To extract the relevant information from these predicted results, we’ll unnest the data frames thanks to the tidyr package:"
## [1] "\nTo extract the relevant information from these predicted results, we’ll unnest the data frames thanks to the tidyr package:"
folds %>%
mutate(predicted = map2(model, test, ~ augment(.x, newdata = .y))) %>%
unnest(predicted)
## # A tibble: 131,337 x 20
## .id ID Target age_band gender car_model car_segment
## <chr> <int> <int> <fct> <fct> <fct> <fct>
## 1 1 6 0 NULL Female model_2 Small/Medi…
## 2 1 19 0 NULL NULL model_8 Large/SUV
## 3 1 23 0 NULL Female model_4 Small/Medi…
## 4 1 35 0 NULL Male model_2 Small/Medi…
## 5 1 50 0 NULL NULL model_1 LCV
## 6 1 59 0 NULL NULL model_7 LCV
## 7 1 82 0 NULL Female model_5 Large/SUV
## 8 1 100 0 NULL NULL model_2 Small/Medi…
## 9 1 117 0 NULL NULL model_3 Large/SUV
## 10 1 118 0 NULL Female model_4 Small/Medi…
## # ... with 131,327 more rows, and 13 more variables:
## # age_of_vehicle_years <int>, sched_serv_warr <int>,
## # non_sched_serv_warr <int>, sched_serv_paid <int>,
## # non_sched_serv_paid <int>, total_paid_services <int>,
## # total_services <int>, mth_since_last_serv <int>,
## # annualised_mileage <int>, num_dealers_visited <int>,
## # num_serv_dealer_purchased <int>, .fitted <dbl>, .se.fit <dbl>
"This was to show you the intermediate steps. In practice we can skip the mutate step:"
## [1] "This was to show you the intermediate steps. In practice we can skip the mutate step:"
predicted <- folds %>% unnest(map2(model, test, ~ augment(.x, newdata = .y)))
predicted
## # A tibble: 131,337 x 20
## .id ID Target age_band gender car_model car_segment
## <chr> <int> <int> <fct> <fct> <fct> <fct>
## 1 1 6 0 NULL Female model_2 Small/Medi…
## 2 1 19 0 NULL NULL model_8 Large/SUV
## 3 1 23 0 NULL Female model_4 Small/Medi…
## 4 1 35 0 NULL Male model_2 Small/Medi…
## 5 1 50 0 NULL NULL model_1 LCV
## 6 1 59 0 NULL NULL model_7 LCV
## 7 1 82 0 NULL Female model_5 Large/SUV
## 8 1 100 0 NULL NULL model_2 Small/Medi…
## 9 1 117 0 NULL NULL model_3 Large/SUV
## 10 1 118 0 NULL Female model_4 Small/Medi…
## # ... with 131,327 more rows, and 13 more variables:
## # age_of_vehicle_years <int>, sched_serv_warr <int>,
## # non_sched_serv_warr <int>, sched_serv_paid <int>,
## # non_sched_serv_paid <int>, total_paid_services <int>,
## # total_services <int>, mth_since_last_serv <int>,
## # annualised_mileage <int>, num_dealers_visited <int>,
## # num_serv_dealer_purchased <int>, .fitted <dbl>, .se.fit <dbl>
"We now have a tibble of the test data for each fold (.id = fold number) and the corresponding .fitted, or predicted values for the outcome variable (Target) in each case.
Validating the model
We can compute and examine the residuals:"
## [1] "We now have a tibble of the test data for each fold (.id = fold number) and the corresponding .fitted, or predicted values for the outcome variable (Target) in each case.\n\nValidating the model\nWe can compute and examine the residuals:"
# Compute the residuals
predicted <- predicted %>%
mutate(residual = .fitted - Target)
"It looks like our models could be overestimating Target around 20-30 and underestimating higher Target. But there are clearly fewer data points, making prediction difficult.
We can also use these values to calculate the overall proportion of variance accounted for by each model:"
## [1] "It looks like our models could be overestimating Target around 20-30 and underestimating higher Target. But there are clearly fewer data points, making prediction difficult.\n\nWe can also use these values to calculate the overall proportion of variance accounted for by each model:"
rs <- predicted %>%
group_by(.id) %>%
summarise(
sst = sum((Target - mean(Target)) ^ 2), # Sum of Squares Total
sse = sum(residual ^ 2), # Sum of Squares Residual/Error
r.squared = 1 - sse / sst # Proportion of variance accounted for
)
rs
## # A tibble: 6 x 4
## .id sst sse r.squared
## <chr> <dbl> <dbl> <dbl>
## 1 1 570. 492. 0.138
## 2 2 586. NA NA
## 3 3 567. NA NA
## 4 4 554. 474. 0.144
## 5 5 587. 498. 0.151
## 6 6 562. 480. 0.146
# Overall
mean(rs$r.squared)
## [1] NA
#we're getting an error NA for set 3. lets see whats going on.
which(is.na(predicted), arr.ind=TRUE)
## row col
## [1,] 36170 6
## [2,] 48181 6
## [3,] 36170 19
## [4,] 48181 19
## [5,] 36170 20
## [6,] 48181 20
## [7,] 36170 21
## [8,] 48181 21
newdataset1<-predicted[c(48260, 58109, 48260, 58109, 48260, 58109, 48260, 58109), ]
newdataset1
## # A tibble: 8 x 21
## .id ID Target age_band gender car_model car_segment
## <chr> <int> <int> <fct> <fct> <fct> <fct>
## 1 3 31538 0 NULL Male model_2 Small/Medi…
## 2 3 100435 0 NULL Female model_3 Large/SUV
## 3 3 31538 0 NULL Male model_2 Small/Medi…
## 4 3 100435 0 NULL Female model_3 Large/SUV
## 5 3 31538 0 NULL Male model_2 Small/Medi…
## 6 3 100435 0 NULL Female model_3 Large/SUV
## 7 3 31538 0 NULL Male model_2 Small/Medi…
## 8 3 100435 0 NULL Female model_3 Large/SUV
## # ... with 14 more variables: age_of_vehicle_years <int>,
## # sched_serv_warr <int>, non_sched_serv_warr <int>,
## # sched_serv_paid <int>, non_sched_serv_paid <int>,
## # total_paid_services <int>, total_services <int>,
## # mth_since_last_serv <int>, annualised_mileage <int>,
## # num_dealers_visited <int>, num_serv_dealer_purchased <int>,
## # .fitted <dbl>, .se.fit <dbl>, residual <dbl>
#these NA values are derived from this table, as we can see these people have bought a car however are purely customers that are male and don't aid the predicted results. lets remove these NAs.
predicted <- predicted[-c(48260, 58109, 48260, 58109, 48260, 58109, 48260, 58109), ]
rs <- predicted %>%
group_by(.id) %>%
summarise(
sst = sum((Target - mean(Target)) ^ 2), # Sum of Squares Total
sse = sum(residual ^ 2), # Sum of Squares Residual/Error
r.squared = 1 - sse / sst # Proportion of variance accounted for
)
rs
## # A tibble: 6 x 4
## .id sst sse r.squared
## <chr> <dbl> <dbl> <dbl>
## 1 1 570. 492. 0.138
## 2 2 586. NA NA
## 3 3 567. NA NA
## 4 4 554. 474. 0.144
## 5 5 587. 498. 0.151
## 6 6 562. 480. 0.146
# Overall
mean(rs$r.squared)
## [1] NA
"
So, across the folds, the regression models have accounted for an average of 14.47% of the variance of Target in new, test data.
Plotting these results:"
## [1] "\nSo, across the folds, the regression models have accounted for an average of 14.47% of the variance of Target in new, test data.\n\nPlotting these results:"
rs %>%
ggplot(aes(r.squared, fill = .id)) +
geom_histogram() +
geom_vline(aes(xintercept = mean(r.squared))) # Overall mean
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_vline).
"The model performed poorly"
## [1] "The model performed poorly"
model <- glm(Target ~ .,data = train_set, family = “binomial”) summary(model)
predictions <- predict(model, newdata = test,type=‘response’)
predictions <- ifelse(predictions > 0.5,1,0) misClasificError <- mean(predictions != test$left) print(paste(‘Accuracy’,1-misClasificError))
train_control <- trainControl(method=“cv”, number=10, savePredictions = TRUE) output <- train(Target ~ age_band + gender + car_model + car_segment + age_of_vehicle_years + sched_serv_warr + non_sched_serv_warr + non_sched_serv_warr + total_paid_services + total_services + mth_since_last_serv + annualised_mileage + num_dealers_visited + num_serv_dealer_purchased, data=train_set, trControl=train_control, method=“lm”)
folds <- crossv_kfold(train_set, k = 5)
trControl <- trainControl(method=“cv”, number=10, savePredictions = TRUE)
fit <- train(Target ~ ., method = “lm”, trControl = trControl, data = train_set)
flds <- createFolds(train_set, k = 6, list = TRUE, returnTrain = FALSE) names(flds)[1] <- “train 1” names(flds)[2] <- “train 2” names(flds)[3] <- “train 3” names(flds)[4] <- “train 4” names(flds)[5] <- “train 5” names(flds)[6] <- “test 1” flds
n.folds <- 10 folds <- cut(sample(seq_len(nrow(train_set))), breaks=n.folds, labels=FALSE)
all.confusion.tables <- list() for (i in seq_len(n.folds)) { # Create training set train <- filter(train_set, folds != i) # Take all other samples than i
# Create test set test <- filter(train_set, folds == i)
# Fit the glm model on the training set glm.train <- glm(Target ~ age_band + gender + car_model + car_segment + age_of_vehicle_years + sched_serv_warr + non_sched_serv_warr + non_sched_serv_warr + total_paid_services + total_services + mth_since_last_serv + annualised_mileage + num_dealers_visited + num_serv_dealer_purchased, family = binomial, data = train_set)
# Use the fitted model on the test set logit.prob <- predict(glm.train, newdata = test)
# Classifiy based on the predictions pred.class <- ifelse(logit.prob < 0, 0, 1)
# Construct the confusion table all.confustion.tables[[i]] <- table(pred = pred.class, true = test$low) }
all.confusion.tables[[2]]
train1.glm <- glm(data = train_set[ flds[[1]], ], formula = Target ~ ., family = binomial)
summary(train.glm)