The validation set approach consists of randomly splitting the data into two sets: one is used to train the model and the other is used to test it. I split the data set so that 80% is used to train a linear regression model and 20% is used to evaluate its performance.
library("readxl")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
my_data <- read_excel("MBA6636_SM21_Professor_Proposes_Data.xlsx")
# Convert the categorical variables to factors
my_data <- my_data %>%
  mutate(across(c(Colour, Clarity, Cut, Certification, Polish, Symmetry), as.factor))
#str(my_data)
set.seed(123)
training.samples <- my_data$Price %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- my_data[training.samples, ]
test.data <- my_data[-training.samples, ]
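Because createDataPartition() stratifies the split on the outcome, the Price distribution should look similar in both sets. A quick sanity check (a minimal sketch using the objects created above):
# Compare split sizes and the outcome distribution in each set
nrow(train.data)
nrow(test.data)
summary(train.data$Price)
summary(test.data$Price)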
# Build the model
model <- lm(Price ~ Carat + Colour + Clarity + Cut + Certification + Symmetry + Wholesaler, data = train.data)
# Make predictions and compute the R2, RMSE and MAE on the test set
predictions <- predict(model, test.data)
data.frame(R2 = R2(predictions, test.data$Price),
           RMSE = RMSE(predictions, test.data$Price),
           MAE = MAE(predictions, test.data$Price))
## R2 RMSE MAE
## 1 0.9767481 180.9923 131.0195
When comparing two models, the one that produces the lower test-set RMSE is the preferred model. The RMSE and the MAE are measured on the same scale as the outcome variable. Dividing the RMSE by the mean of the outcome variable gives the prediction error rate, which should be as small as possible:
RMSE(predictions, test.data$Price)/mean(test.data$Price)
## [1] 0.1040502
Below, the leave-one-out cross-validation (LOOCV) method is implemented:
# Define training control
train.control <- trainControl(method = "LOOCV")
# Train the model
model <- train(Price ~ Carat + Colour + Clarity + Cut + Certification + Symmetry + Wholesaler, data = my_data, method = "lm", trControl = train.control)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Summarize the results
print(model)
## Linear Regression
##
## 440 samples
## 7 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 439, 439, 439, 439, 439, 439, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 177.6076 0.9771334 122.951
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
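The rank-deficient warning typically means some dummy-coded factor levels are linearly dependent (for example, a level combination that never occurs), so lm() drops the corresponding coefficients. A quick way to see which terms are affected (a sketch, refitting the same formula on the full data):
# Fit the formula directly and list the dropped (aliased) coefficients
full_fit <- lm(Price ~ Carat + Colour + Clarity + Cut + Certification +
                 Symmetry + Wholesaler, data = my_data)
names(which(is.na(coef(full_fit))))  # coefficients dropped due to rank deficiency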
Next, I used 10-fold cross-validation to estimate the prediction error.
# Define training control
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
model <- train(Price ~ Carat + Colour + Clarity + Cut + Certification + Symmetry + Wholesaler, data = my_data, method = "lm", trControl = train.control)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Summarize the results
print(model)
## Linear Regression
##
## 440 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 396, 395, 396, 396, 395, 396, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 173.7868 0.9787337 121.7843
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
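caret also keeps the per-fold metrics on the fitted object, which shows how much the error estimate varies across folds (a minimal sketch using the model object above):
# Held-out metrics for each of the 10 folds
model$resample
# Spread of the fold-level RMSE values
summary(model$resample$RMSE)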
In addition to plain k-fold cross-validation, I also used 10-fold cross-validation with 3 repeats:
# Define training control
set.seed(123)
train.control <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)
# Train the model
model <- train(Price ~ Carat + Colour + Clarity + Cut + Certification + Symmetry + Wholesaler, data = my_data, method = "lm", trControl = train.control)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Summarize the results
print(model)
## Linear Regression
##
## 440 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 396, 395, 396, 396, 395, 396, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 176.8466 0.9779106 123.3918
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
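For repeated CV, model$results also reports the standard deviation of each metric across the 30 resamples, which indicates how stable the estimates are (a sketch using the model object above):
# Averaged metrics with their across-resample standard deviations
model$results[, c("RMSE", "RMSESD", "Rsquared", "RsquaredSD", "MAE", "MAESD")]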
I used a bootstrap with 1000 resamples to estimate the variability of the linear regression model's R-squared.
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
# Function to obtain R-squared from a bootstrap resample
r_squared <- function(formula, data, indices) {
  val <- data[indices, ]  # select the bootstrap sample
  fit <- lm(formula, data = val)
  return(summary(fit)$r.square)
}
# Performing 1000 replications with boot
output <- boot(data = my_data, statistic = r_squared, R = 1000, formula = Price ~ .)
# Printing the bootstrap summary
output
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = my_data, statistic = r_squared, R = 1000, formula = Price ~
## .)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.9809477 0.001554798 0.001809535
plot(output)  # histogram and Q-Q plot of the bootstrap replicates
# Obtaining a 95% confidence interval
boot.ci(output, type="bca")
## Warning in norm.inter(t, adj.alpha): extreme order statistics used as endpoints
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = output, type = "bca")
##
## Intervals :
## Level BCa
## 95% ( 0.9762, 0.9831 )
## Calculations and Intervals on Original Scale
## Warning : BCa Intervals used Extreme Quantiles
## Some BCa intervals may be unstable
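Because the BCa warning indicates that extreme order statistics were used as endpoints, a percentile or normal-approximation interval is a useful cross-check on the same boot object (a minimal sketch):
# Alternative interval types as a sanity check on the BCa result
boot.ci(output, type = c("norm", "perc"))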
The dataset used for the classification problem is "bank-additional.csv", because the full "bank-additional-full.csv" file took a significant amount of time to process.
bank_data <- read.table("bank-additional.csv", header = TRUE, sep = ";", stringsAsFactors = TRUE)
# Table headers
# names(bank_data)
# summary(bank_data)
# Number of observations
# nrow(bank_data)
# Define training control
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
model <- train(y ~ age + job + cons.price.idx + euribor3m, data = bank_data, method = "glm", trControl = train.control, family = binomial())
# Summarize the results
print(model)
## Generalized Linear Model
##
## 4119 samples
## 4 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3707, 3707, 3706, 3707, 3707, 3708, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8883216 0.06980823
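The high accuracy paired with a near-zero Kappa suggests strong class imbalance: a model that always predicts "no" would already score close to the cross-validated accuracy. Checking the class distribution makes this concrete (a minimal sketch):
# Baseline: proportion of each class in the outcome
prop.table(table(bank_data$y))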
# Define training control
set.seed(123)
train.control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
# Train the model
model <- train(y ~ age + job + cons.price.idx + euribor3m, data = bank_data, method = "glm", trControl = train.control, family = binomial())
# Summarize the results
print(model)
## Generalized Linear Model
##
## 4119 samples
## 4 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3707, 3707, 3706, 3707, 3707, 3708, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8887262 0.07331853
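A cross-validated confusion matrix shows where the errors fall; caret aggregates the held-out confusion matrices from the resamples (a sketch using the repeated-CV model above; if the predictions were not retained, refit with trainControl(..., savePredictions = "final")):
# Average cell percentages across the 30 held-out folds
confusionMatrix(model)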
The LOOCV code segment below has been commented out because it took too long to run.
# Define training control
#train.control <- trainControl(method = "LOOCV")
# Train the model
#model <- train(y ~ age + job + cons.price.idx + euribor3m, data = bank_data, method = "glm", trControl = train.control, family = binomial())
# Summarize the results
#print(model)
I used a bootstrap with 1000 resamples to estimate the variability of the logistic regression coefficients.
library(boot)
# Function to obtain the logistic regression coefficients from a bootstrap resample
logit_test <- function(d, indices) {
  d <- d[indices, ]  # select the bootstrap sample
  fit <- glm(y ~ age + job + cons.price.idx + euribor3m, data = d, family = "binomial")
  return(coef(fit))
}
boot_fit <- boot(
  data = bank_data,
  statistic = logit_test,
  R = 1000
)
# Printing the bootstrap summary
boot_fit
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = bank_data, statistic = logit_test, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -56.95137250 -0.6989970077 9.552414565
## t2* 0.01472573 -0.0002795477 0.005787553
## t3* -0.70238098 0.0076640313 0.166607107
## t4* -0.85113278 -0.0656509018 0.459758742
## t5* -0.32144902 -0.0274050047 0.357531124
## t6* -0.47320121 -0.0080787007 0.219043093
## t7* -0.10370361 0.0094122629 0.270814823
## t8* -0.58267534 -0.0329459634 0.344063688
## t9* -0.52424621 -0.0150635893 0.206724879
## t10* 0.09756921 -0.0045230041 0.311343329
## t11* -0.04869290 0.0050951874 0.162996045
## t12* 0.14113332 -0.0239873398 0.278727089
## t13* -0.27614399 -0.3380599261 1.878867534
## t14* 0.60340337 0.0075885981 0.102865078
## t15* -0.62891243 -0.0026290134 0.037955588
plot(boot_fit)  # plot.boot shows only the first statistic (the intercept) by default
# Obtaining a 95% confidence interval
#boot.ci(boot_fit, type="bca")
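To get an interval for an individual coefficient, boot.ci() accepts an index argument selecting which statistic to summarize (a sketch, e.g. for the age coefficient, which is the second statistic t2 above):
# 95% percentile CI for the age coefficient (statistic index 2)
boot.ci(boot_fit, type = "perc", index = 2)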