library(tidyverse)
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('fastDummies')
## Warning: package 'fastDummies' was built under R version 4.0.4
library('boot')
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
In k-fold cross-validation the data are divided into k groups (folds) of roughly equal size. The model is trained on k-1 folds and the remaining fold is used for testing. This is repeated k times, so every fold serves as the test set exactly once, and the k test-error estimates are averaged.
Relative to the validation set approach, k-fold CV has lower variability, uses all of the data for both fitting and testing, and avoids the validation set's tendency to over-estimate the test error. Its disadvantage is computational: the validation set approach fits the model only once.
Relative to LOOCV, k-fold CV is less computationally demanding and has a better bias-variance trade-off. Its disadvantage is that the error estimate varies with the random fold assignment, whereas LOOCV is deterministic. A sketch of both with boot::cv.glm follows.
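As a minimal sketch (assuming the Default data from ISLR, which is also loaded below), boot::cv.glm runs k-fold CV on a fitted glm; the cost function turns posterior probabilities into a misclassification rate:
data("Default", package = "ISLR")
set.seed(1)
fit_full <- glm(default ~ income + balance, data = Default, family = "binomial")
cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5)  # misclassification rate
cv_10 <- cv.glm(Default, fit_full, cost = cost, K = 10)
cv_10$delta[1]  # averaged 10-fold estimate of the test error
# LOOCV is the special case K = n (cv.glm's default); with n = 10000 it would
# refit the model 10000 times, which is why k-fold wins computationally:
# cv.glm(Default, fit_full, cost = cost)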
set.seed(123)
data("Default")
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
glm_default <- glm(default~income+balance, data= Default, family = "binomial")
summary(glm_default)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
index <- createDataPartition(y = Default$default, p = 0.6999, list = F)  # p just under 0.7 gives a training set of exactly 7000 rows
train <- Default[index,]
test<- Default[-index,]
glm_default_train <- glm(default ~ income+balance, data= train, family = "binomial" )
summary(glm_default_train)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5089 -0.1466 -0.0595 -0.0219 3.7097
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.165e+01 5.235e-01 -22.25 < 2e-16 ***
## income 2.683e-05 5.988e-06 4.48 7.45e-06 ***
## balance 5.605e-03 2.710e-04 20.68 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2050.5 on 6999 degrees of freedom
## Residual deviance: 1120.8 on 6997 degrees of freedom
## AIC: 1126.8
##
## Number of Fisher Scoring iterations: 8
pred_test <- factor(ifelse(predict(glm_default_train,newdata = test, type= 'response')>0.5,"Yes", "No"))
pred_test[1:9]
## 6 12 19 23 24 26 29 33 34
## No No No No No No No No No
## Levels: No Yes
mean(test$default != pred_test)
## [1] 0.02633333
Now repeat the validation set approach with different random splits. First split (75/25):
set.seed(12)
index <- createDataPartition(y=Default$default,p= 0.75, list= F)
train <- Default[index,]
test<- Default[-index,]
glm_default_train <- glm(default ~ income+balance, data= train, family = "binomial" )
pred_test <- factor(ifelse(predict(glm_default_train,newdata = test, type= 'response')>0.5,"Yes", "No"))
mean(test$default != pred_test)
## [1] 0.02721088
Second split (80/20):
set.seed(12)
index <- createDataPartition(y=Default$default,p= 0.80, list= F)
train <- Default[index,]
test<- Default[-index,]
glm_default_train <- glm(default ~ income+balance, data= train, family = "binomial" )
pred_test <- factor(ifelse(predict(glm_default_train,newdata = test, type= 'response')>0.5,"Yes", "No"))
mean(test$default != pred_test)
## [1] 0.02951476
Third split (85/15):
set.seed(12)
index <- createDataPartition(y=Default$default,p= 0.85, list= F)
train <- Default[index,]
test<- Default[-index,]
glm_default_train <- glm(default ~ income+balance, data= train, family = "binomial" )
pred_test <- factor(ifelse(predict(glm_default_train,newdata = test, type= 'response')>0.5,"Yes", "No"))
mean(test$default != pred_test)
## [1] 0.02535023
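The three splits give noticeably different estimates (about 2.7%, 3.0%, and 2.5%), which illustrates the variability of the validation set approach. A sketch that wraps the split-fit-evaluate steps above in a helper (the name val_error is hypothetical) and repeats them over several seeds:
val_error <- function(p, seed) {
  set.seed(seed)
  idx <- createDataPartition(y = Default$default, p = p, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = "binomial")
  prob <- predict(fit, newdata = Default[-idx, ], type = "response")
  mean(Default$default[-idx] != ifelse(prob > 0.5, "Yes", "No"))
}
sapply(1:5, function(s) val_error(p = 0.75, seed = s))  # five splits, five error estimates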
set.seed(12)
index <- createDataPartition(y = Default$default, p = 0.7, list = F)
train <- Default[index, ]
test <- Default[-index, ]
log_def <- glm(default ~ ., data = train, family = "binomial")
summary(log_def)
##
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4433 -0.1473 -0.0583 -0.0218 3.7086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.059e+01 5.749e-01 -18.420 <2e-16 ***
## studentYes -7.118e-01 2.788e-01 -2.553 0.0107 *
## balance 5.640e-03 2.726e-04 20.691 <2e-16 ***
## income 7.322e-07 9.730e-06 0.075 0.9400
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2050.6 on 7000 degrees of freedom
## Residual deviance: 1120.9 on 6997 degrees of freedom
## AIC: 1128.9
##
## Number of Fisher Scoring iterations: 8
test_pred <- factor(ifelse(predict(log_def, newdata = test, type = "response") > 0.5, "Yes", "No"))
mean(test$default != test_pred)
## [1] 0.02834278
Now refit with an explicit dummy variable for student, created with fastDummies.
New_default <- dummy_cols(Default, select_columns = "student")  # adds student_No and student_Yes indicator columns
New_default <- subset(New_default, select = -c(student))        # drop the original factor
head(New_default)
## default balance income student_No student_Yes
## 1 No 729.5265 44361.625 1 0
## 2 No 817.1804 12106.135 0 1
## 3 No 1073.5492 31767.139 1 0
## 4 No 529.2506 35704.494 1 0
## 5 No 785.6559 38463.496 1 0
## 6 No 919.5885 7491.559 0 1
set.seed(12)
index <- createDataPartition(y = New_default$default, p = 0.7, list = F)
train <- New_default[index, ]
test <- New_default[-index, ]
log_def <- glm(default ~ ., data = train, family = "binomial")
summary(log_def)
##
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4433 -0.1473 -0.0583 -0.0218 3.7086
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.130e+01 5.156e-01 -21.920 <2e-16 ***
## balance 5.640e-03 2.726e-04 20.691 <2e-16 ***
## income 7.322e-07 9.730e-06 0.075 0.9400
## student_No 7.118e-01 2.788e-01 2.553 0.0107 *
## student_Yes NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2050.6 on 7000 degrees of freedom
## Residual deviance: 1120.9 on 6997 degrees of freedom
## AIC: 1128.9
##
## Number of Fisher Scoring iterations: 8
test_pred <- factor(ifelse(predict(log_def, newdata = test, type = "response") > 0.5, "Yes", "No"))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
mean(test$default != test_pred)
## [1] 0.02834278
So including a dummy variable for student does not reduce the test error rate: the dummies encode exactly the same information as the factor, which is why one column (student_Yes) is dropped as redundant and the fit triggers the rank-deficiency warning above.
log_def <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(log_def)$coefficients[, 2]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
boot.fn <- function(data, index = 1:nrow(data)) {
  # refit the logistic regression on the bootstrap sample and return the
  # income and balance coefficients; [-1] drops the intercept
  coef(glm(default ~ income + balance, data = data, subset = index, family = "binomial"))[-1]
}
boot.fn(Default)
## income balance
## 2.080898e-05 5.647103e-03
set.seed(123)
boot(data = Default, statistic = boot.fn, R = 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.483646e-07 5.124683e-06
## t2* 5.647103e-03 2.836407e-06 2.150082e-04
The estimated standard errors obtained by the two methods are similar.
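To put the two sets of standard errors side by side programmatically, one can re-run the bootstrap with the result stored (the boot_out name and the data.frame layout here are illustrative):
set.seed(123)
boot_out <- boot(data = Default, statistic = boot.fn, R = 100)
data.frame(
  formula_se = summary(log_def)$coefficients[c("income", "balance"), 2],
  bootstrap_se = apply(boot_out$t, 2, sd)  # empirical SD of the bootstrap replicates
)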
attach(Boston)
mu_hat <-mean(medv)
mu_hat
## [1] 22.53281
se_hat <- sd(medv)/sqrt(dim(Boston)[1])
se_hat
## [1] 0.4088611
set.seed(123)
boot.fn <- function(data,index){
mu <- mean(data[index])
return(mu)
}
boot(medv,boot.fn,1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.01607372 0.4045557
t.test(medv)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
CI_mu_hat <- c(22.53 - 2 * 0.4045557, 22.53 + 2 * 0.4045557)  # mu_hat +/- 2 bootstrap standard errors
CI_mu_hat
## [1] 21.72089 23.33911
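The two-standard-error interval (21.72, 23.34) is nearly identical to the t-test interval (21.73, 23.34). boot::boot.ci can produce a normal-approximation interval directly, assuming the bootstrap output is stored first:
set.seed(123)
boot_mu <- boot(medv, boot.fn, R = 1000)
boot.ci(boot_mu, type = "norm")  # compare with the t.test interval above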
med_hat <- median(medv)
med_hat
## [1] 21.2
boot.fn <- function(vector, index) {
median(vector[index])
}
set.seed(77)
(boot_results <- boot(data = Boston$medv, statistic = boot.fn, R = 1000))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0119 0.371108
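The bootstrap standard error of the median (about 0.37) is slightly smaller than that of the mean (about 0.40), and the bias is negligible.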
quantile(Boston$medv, 0.1)
## 10%
## 12.75
boot.fn <- function(vector, index) {
quantile(vector[index], 0.1)
}
set.seed(77)
(boot_results <- boot(data = Boston$medv, statistic = boot.fn, R = 1000))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.01305 0.4850335
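The estimated tenth percentile of medv is 12.75 with a bootstrap standard error of about 0.49, larger than the standard errors of the mean and median but still small relative to the estimate itself.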