library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
# Fix the RNG seed so the random splits drawn later are reproducible.
set.seed(1)
# NOTE(review): every model call visible in this file passes `data =`
# explicitly, so attach() looks unnecessary here -- confirm before removing.
attach(Default)
# Logistic regression of default status on income and balance, fitted on the
# full Default data set.
glm.fit <- glm(
  default ~ income + balance,
  data = Default,
  family = "binomial"
)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Validation-set approach: draw 80% of the row numbers at random for training
# and hold out the remaining rows for testing.
index <- sample(seq_len(nrow(Default)), 0.8 * nrow(Default))
train <- Default[index, ]
test <- Default[-index, ]
# Refit the income + balance logistic model on the training rows only.
glm.fit.train <- glm(
  default ~ income + balance,
  data = train,
  family = "binomial"
)
summary(glm.fit.train)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4758 -0.1413 -0.0563 -0.0210 3.4620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.168e+01 4.893e-01 -23.879 < 2e-16 ***
## income 2.547e-05 5.631e-06 4.523 6.1e-06 ***
## balance 5.613e-03 2.531e-04 22.176 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2313.6 on 7999 degrees of freedom
## Residual deviance: 1239.2 on 7997 degrees of freedom
## AIC: 1245.2
##
## Number of Fisher Scoring iterations: 8
# Predicted default probabilities for the held-out rows.
pred <- predict(glm.fit.train, newdata = test, type = "response")
# Label an observation "Yes" when its predicted probability exceeds 0.5,
# otherwise "No".
glm.pred <- rep("No", length(pred))
glm.pred[pred > 0.5] <- "Yes"
# Validation set error: share of held-out rows whose label is mispredicted.
mean(glm.pred != test$default)
## [1] 0.026
The validation set error is 2.6% for the 80-20 split.
# Repeat the validation-set approach with a 70/30 train/test split.
index <- sample(seq_len(nrow(Default)), 0.7 * nrow(Default))
train <- Default[index, ]
test <- Default[-index, ]
# Refit the income + balance logistic model on the new training rows.
glm.fit.train <- glm(
  default ~ income + balance,
  data = train,
  family = "binomial"
)
summary(glm.fit.train)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4932 -0.1390 -0.0544 -0.0197 3.7591
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.176e+01 5.299e-01 -22.193 < 2e-16 ***
## income 2.140e-05 6.001e-06 3.567 0.000362 ***
## balance 5.749e-03 2.756e-04 20.859 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2057.2 on 6999 degrees of freedom
## Residual deviance: 1084.8 on 6997 degrees of freedom
## AIC: 1090.8
##
## Number of Fisher Scoring iterations: 8
# Predicted default probabilities for the held-out rows of the 70/30 split.
pred <- predict(glm.fit.train, newdata = test, type = "response")
# Threshold the probabilities at 0.5 to obtain "Yes"/"No" labels.
glm.pred <- rep("No", length(pred))
glm.pred[pred > 0.5] <- "Yes"
# Fraction of held-out rows that are misclassified.
mean(glm.pred != test$default)
## [1] 0.02533333
The validation set error is now 2.53% for the 70-30 split.
# Repeat the validation-set approach with a 75/25 train/test split.
index <- sample(seq_len(nrow(Default)), 0.75 * nrow(Default))
train <- Default[index, ]
test <- Default[-index, ]
# Refit the income + balance logistic model on the new training rows.
glm.fit.train <- glm(
  default ~ income + balance,
  data = train,
  family = "binomial"
)
summary(glm.fit.train)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5332 -0.1393 -0.0541 -0.0193 3.7533
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.184e+01 5.178e-01 -22.86 < 2e-16 ***
## income 2.155e-05 5.794e-06 3.72 0.000199 ***
## balance 5.821e-03 2.708e-04 21.49 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2198.9 on 7499 degrees of freedom
## Residual deviance: 1159.9 on 7497 degrees of freedom
## AIC: 1165.9
##
## Number of Fisher Scoring iterations: 8
# Predicted default probabilities for the held-out rows of the 75/25 split.
pred <- predict(glm.fit.train, newdata = test, type = "response")
# Threshold the probabilities at 0.5 to obtain "Yes"/"No" labels.
glm.pred <- rep("No", length(pred))
glm.pred[pred > 0.5] <- "Yes"
# Fraction of held-out rows that are misclassified.
mean(glm.pred != test$default)
## [1] 0.0276
The validation set error is now 2.76% for the 75-25 split.
# Repeat the validation-set approach with a 90/10 train/test split.
index <- sample(seq_len(nrow(Default)), 0.9 * nrow(Default))
train <- Default[index, ]
test <- Default[-index, ]
# Refit the income + balance logistic model on the new training rows.
glm.fit.train <- glm(
  default ~ income + balance,
  data = train,
  family = "binomial"
)
summary(glm.fit.train)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4686 -0.1406 -0.0553 -0.0204 3.7360
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.159e+01 4.644e-01 -24.963 < 2e-16 ***
## income 1.951e-05 5.312e-06 3.673 0.000239 ***
## balance 5.691e-03 2.421e-04 23.509 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2630.6 on 8999 degrees of freedom
## Residual deviance: 1404.9 on 8997 degrees of freedom
## AIC: 1410.9
##
## Number of Fisher Scoring iterations: 8
# Predicted default probabilities for the held-out rows of the 90/10 split.
pred <- predict(glm.fit.train, newdata = test, type = "response")
# Threshold the probabilities at 0.5 to obtain "Yes"/"No" labels.
glm.pred <- rep("No", length(pred))
glm.pred[pred > 0.5] <- "Yes"
# Fraction of held-out rows that are misclassified.
mean(glm.pred != test$default)
## [1] 0.026
The validation set error is now 2.6% for the 90-10 split.
From the above iterations, we can see that every time a different split of the data is used, the validation error changes, although it stays within a narrow range (roughly 2.5% to 2.8%).
# Fresh 80/20 train/test split for the model that also includes student.
index <- sample(seq_len(nrow(Default)), 0.8 * nrow(Default))
train <- Default[index, ]
test <- Default[-index, ]
# Logistic regression of default on income, balance and the student factor,
# fitted on the training rows only.
glm.fit.train <- glm(
  default ~ income + balance + student,
  data = train,
  family = "binomial"
)
summary(glm.fit.train)
##
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1723 -0.1400 -0.0543 -0.0198 3.5656
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.076e+01 5.556e-01 -19.376 < 2e-16 ***
## income -1.959e-06 9.393e-06 -0.209 0.83477
## balance 5.782e-03 2.651e-04 21.808 < 2e-16 ***
## studentYes -7.684e-01 2.676e-01 -2.872 0.00408 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2286.5 on 7999 degrees of freedom
## Residual deviance: 1240.1 on 7996 degrees of freedom
## AIC: 1248.1
##
## Number of Fisher Scoring iterations: 8
# Predicted default probabilities from the model that includes student.
pred <- predict(glm.fit.train, newdata = test, type = "response")
# Threshold the probabilities at 0.5 to obtain "Yes"/"No" labels.
glm.pred <- rep("No", length(pred))
glm.pred[pred > 0.5] <- "Yes"
# Fraction of held-out rows that are misclassified.
mean(glm.pred != test$default)
## [1] 0.0245
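After adding the dummy variable student, the validation set error is 2.45%. This is only marginally lower than the 2.5% to 2.8% obtained without it, and the difference is comparable to the variation seen across random splits, so adding student does not appear to produce a meaningful reduction in the error.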
# Reset the RNG seed before the bootstrap part of the analysis.
set.seed(1)
# Logistic regression of default on income and balance using all rows; the
# summary reports the model-based standard errors of the coefficients.
glm.fit <- glm(
  default ~ income + balance,
  data = Default,
  family = "binomial"
)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Bootstrap statistic: coefficients of the logistic regression
# default ~ income + balance fitted to the rows of `data` selected by `index`.
#
# data:  data frame containing the columns default, income and balance.
# index: row indices to fit on (passed to glm() as `subset`).
# Returns the named coefficient vector of the fitted model.
boot.fn <- function(data, index) {
  fit <- glm(
    default ~ income + balance,
    data = data,
    subset = index,
    family = "binomial"
  )
  coef(fit)
}
# Sanity check: evaluate the statistic on every row in its original order.
boot.fn(Default, seq_len(nrow(Default)))
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
#boot(Default,boot.fn,1000)
library(MASS)
# Sample mean of medv.
mu <- mean(Boston$medv)
mu
## [1] 22.53281
# Standard error of the sample mean: sample standard deviation divided by the
# square root of the number of observations.
with(Boston, sd(medv) / sqrt(length(medv)))
## [1] 0.4088611
library(boot)
# Bootstrap statistic: mean of the resampled observations.
#
# data:  vector of observations.
# index: positions of the observations that make up the resample.
# Returns the mean of data[index].
boot.fn <- function(data, index) {
  resampled <- data[index]
  mean(resampled)
}
# Bootstrap the statistic boot.fn over medv using 1000 resamples.
boot(
  data = Boston$medv,
  statistic = boot.fn,
  R = 1000,
  parallel = "multicore"
)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000, parallel = "multicore")
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
# One-sample t-test on medv; the output below includes a 95 percent confidence
# interval for the mean, which serves as a reference for the manual interval.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# Approximate 95% confidence interval for the mean of medv: estimate +/- 2 SE.
# Both inputs are copied from the results printed above because the boot()
# result is not stored in a variable: the sample mean (22.53281) and the
# bootstrap standard error (0.4106622). The standard error was previously
# mistyped as 0.4016, which made the interval too narrow.
mu <- 22.53281
se <- 0.4106622
mu - 2 * se
## [1] 21.71149
mu + 2 * se
## [1] 23.35413
# Sample median of medv.
mumed <- median(Boston$medv)
mumed
## [1] 21.2
# Bootstrap statistic: median of the resampled observations.
#
# data:  vector of observations.
# index: positions of the observations that make up the resample.
# Returns the median of data[index].
boot.fn <- function(data, index) {
  resampled <- data[index]
  median(resampled)
}
# Reset the RNG seed so the bootstrap run that follows is reproducible.
set.seed(1)
# Bootstrap the statistic boot.fn over medv using 1000 resamples.
boot(
  data = Boston$medv,
  statistic = boot.fn,
  R = 1000,
  parallel = "multicore"
)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000, parallel = "multicore")
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.02295 0.3778075
# Tenth percentile of medv.
mu_0.1 <- quantile(Boston$medv, probs = 0.1)
mu_0.1
## 10%
## 12.75
# Bootstrap statistic: tenth percentile of the resampled observations.
#
# data:  vector of observations.
# index: positions of the observations that make up the resample.
# Returns the 10% quantile of data[index] (a length-one vector named "10%").
boot.fn <- function(data, index) {
  resampled <- data[index]
  quantile(resampled, probs = 0.1)
}
# Reset the RNG seed so the bootstrap run that follows is reproducible.
set.seed(1)
# Bootstrap the statistic boot.fn over medv using 1000 resamples.
boot(
  data = Boston$medv,
  statistic = boot.fn,
  R = 1000,
  parallel = "multicore"
)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000, parallel = "multicore")
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0339 0.4767526