Model Validation Methods

by Anand Jage

Cross-Validation Methods: 1. Hold-Out 2. K-Fold 3. Repeated K-Fold 4. Leave-One-Out Cross-Validation

Install & Load Packages

library(lattice)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(caret)
library(car)
## Loading required package: carData
library(carData)

1. Hold Out Method

Import the dataset and split it into training and test data using createDataPartition() from the caret package. The first argument is any column vector from the dataset that is part of the model; usually the dependent variable is selected. The second argument, p = 0.8, is the proportion of observations assigned to the training data (the remaining 20% become the test data). The third argument, list = FALSE, returns the selected row indices as a matrix instead of a list.

# Load the motor insurance claims data.
motor <- read.csv("Motor Insurance claim amount.csv")
# Shapiro-Wilk normality test on the response variable (claimamt).
shapiro.test(motor$claimamt)
## 
##  Shapiro-Wilk normality test
## 
## data:  motor$claimamt
## W = 0.96652, p-value = 2.124e-14
head(motor)
##   vehage   CC Length Weight claimamt
## 1      4 1495   4250   1023  72000.0
## 2      2 1061   3495    875  72000.0
## 3      2 1405   3675    980  50400.0
## 4      7 1298   4090    930  39960.0
## 5      2 1495   4250   1023 106800.0
## 6      1 1086   3565    854  69592.8
# createDataPartition() samples rows at random; fix the RNG seed so the same
# 80/20 split (and every result derived from it) is reproduced on each run.
# NOTE: the outputs recorded in this report come from an earlier unseeded run,
# so the exact numbers printed on a re-run may differ from those shown.
set.seed(123)
index <- createDataPartition(motor$claimamt, p = 0.8, list = FALSE)
head(index)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         4
## [5,]         5
## [6,]         6
dim(index)
## [1] 800   1
# Rows listed in `index` form the training set; all remaining rows form the
# test set.
traindata <- motor[index, ]
testdata <- motor[-index, ]
dim(traindata)
## [1] 800   5
dim(testdata)
## [1] 200   5

Generate the full model and check for multicollinearity.

# Fit the full linear model: claimamt regressed on every other column of the
# training data, then print the fitted coefficients.
model <- lm(formula = claimamt ~ ., data = traindata)
model
## 
## Call:
## lm(formula = claimamt ~ ., data = traindata)
## 
## Coefficients:
## (Intercept)       vehage           CC       Length       Weight  
##   -59418.28     -6678.36        15.70        37.73       -20.56
# Variance inflation factor of each predictor (multicollinearity check).
car::vif(model)
##   vehage       CC   Length   Weight 
## 1.042477 6.205176 3.491471 6.921608

Remove the variable with the highest VIF value and regenerate the model. Check the VIF values again for multicollinearity. Retain the model if the VIF of each feature is below 5; otherwise, repeat the previous step.

# Refit without Weight (the predictor with the largest VIF in the full model)
# and re-check the variance inflation factors of the remaining predictors.
model <- lm(
  claimamt ~ vehage + CC + Length,
  data = traindata
)
car::vif(model)
##   vehage       CC   Length 
## 1.042383 2.923070 2.974068

Calculate residuals and predicted values of model for training data

# Attach the model's residuals and its predictions to the training rows.
traindata$residuals <- resid(model)
traindata$predicted <- predict(model, newdata = traindata)
head(traindata)
##   vehage   CC Length Weight claimamt  residuals predicted
## 1      4 1495   4250   1023  72000.0  -1845.942  73845.94
## 2      2 1061   3495    875  72000.0  13282.269  58717.73
## 3      2 1405   3675    980  50400.0 -16878.232  67278.23
## 4      7 1298   4090    930  39960.0  -7016.086  46976.09
## 5      2 1495   4250   1023 106800.0  19581.393  87218.61
## 6      1 1086   3565    854  69592.8   1658.128  67934.67

Calculate RMSE for training data

# Root mean squared error on the training data: square the residuals,
# average them, and take the square root.
rmse_train <- sqrt(mean(traindata$residuals^2))
rmse_train
## [1] 11543.02

Calculate residuals and predicted values for test data

# Score the held-out rows with the model fitted on the training data, then
# compute each residual as observed claim amount minus predicted claim amount.
testdata$predicted <- predict(model, newdata = testdata)
testdata$residuals <- with(testdata, claimamt - predicted)
head(testdata)
##    vehage   CC Length Weight claimamt predicted   residuals
## 14      3 1405   3675    980 48000.00  60591.90 -12591.8994
## 22      2 1298   4095    875 93600.00  80575.40  13024.6027
## 24      0 1405   3675    980 80048.16  80650.90   -602.7365
## 46      3  796   3495    740 39120.00  50086.19 -10966.1869
## 48      2 1298   3695    980 88800.10  67163.41  21636.6897
## 51      1  796   3495    740 52320.00  63458.85 -11138.8516

Calculate RMSE for test data

# Root mean squared error on the test data: square the residuals,
# average them, and take the square root.
rmse_test <- sqrt(mean(testdata$residuals^2))
rmse_test
## [1] 11070.49

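Check that the training and test RMSEs do not differ from each other by more than 15% (i.e. their ratio should lie roughly between 0.85 and 1.15)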

# Ratio of training RMSE to test RMSE; a value close to 1 means the model
# performs similarly on the data it was fitted to and on the held-out data.
print(rmse_train / rmse_test)
## [1] 1.042684