Libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
set.seed(1)
train = sample(392,196)
lm.fit = lm(mpg~ horsepower, data = Auto, subset = train)
# predict mpg and keep only the held-out (validation) observations
mpg.prediction = predict(lm.fit, Auto)[-train]
# validation-set MSE: compare predictions to the observed values
mean((Auto$mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 23.26601
lm.fit2 = lm(mpg~poly(horsepower,2), data = Auto, subset = train)
attach(Auto)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 23.26601
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
## [1] 18.71646
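The same comparison extends naturally to a higher-degree fit; a minimal sketch, assuming the Auto data and the train index created above (lm.fit.cubic is an illustrative name, output not shown):
# cubic polynomial fit on the training half
lm.fit.cubic = lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
# validation-set MSE for the cubic model
mean((Auto$mpg - predict(lm.fit.cubic, Auto))[-train]^2)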
# fit the linear model with glm() so that cv.glm() can be used
glm.fit = glm(mpg~horsepower,data = Auto)
# leave-one-out cross-validation (LOOCV) error
cv.err = cv.glm(Auto,glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
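To see how the LOOCV error changes with model flexibility, the same call can be repeated for increasing polynomial degrees; a minimal sketch (cv.error and glm.fit.i are illustrative names, output not shown):
# LOOCV error for polynomial degrees 1 through 5
cv.error = rep(0, 5)
for (i in 1:5) {
  glm.fit.i = glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] = cv.glm(Auto, glm.fit.i)$delta[1]
}
cv.error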
# K-fold cross-validation
glm.fit = glm(mpg~horsepower,data = Auto)
# cross-validation error with K = 10 folds
cv.err = cv.glm(Auto,glm.fit, K = 10)
cv.err$delta
## [1] 24.19879 24.18553
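The degree comparison can also be run with 10-fold CV, which is much faster than LOOCV; a sketch under the same assumptions (output not shown):
# 10-fold CV error for polynomial degrees 1 through 5
set.seed(17)
cv.error.10 = rep(0, 5)
for (i in 1:5) {
  glm.fit.i = glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] = cv.glm(Auto, glm.fit.i, K = 10)$delta[1]
}
cv.error.10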
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
library(boot)
library(ISLR2)
attach(Default)
View(Default)
glm.fit3 = glm(default~income + balance, data = Default, family = "binomial")
summary(glm.fit3)$coefficients[,2]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
Write a function, boot.fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.
Use the boot() function together with your boot.fn() function to estimate the standard errors of the logistic regression coefficients for income and balance.
Comment on the estimated standard errors obtained using the glm() function and using your bootstrap function: the two sets of standard errors are almost identical.
set.seed(17)
boot.fn = function(data, index){
  # refit the logistic regression on the observations selected by index
  glm.fits = glm(default ~ income + balance, data = data, family = "binomial", subset = index)
  return(coef(glm.fits))
}
# quick check of the function on the first 100 observations
boot.fn(Default,1:100)
## (Intercept) income balance
## -2.656607e+01 -8.165470e-19 -2.914073e-19
set.seed(123)
# bootstrap with R = 1000 resamples of the Default data
results = boot(Default, boot.fn, R = 1000)
# column-wise SD of the bootstrap replicates = bootstrap standard errors
apply(results$t, 2, sd)
## [1] 4.204817e-01 4.729534e-06 2.217214e-04
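For a direct side-by-side comparison of the two sets of standard errors, a small sketch using the glm.fit3 and results objects created above (output not shown):
# row 1: formula-based SEs from glm(); row 2: bootstrap SEs
rbind(glm = summary(glm.fit3)$coefficients[, 2],
      bootstrap = apply(results$t, 2, sd))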
data(Boston)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
# estimate of the population mean of medv
mu = mean(Boston$medv)
# standard error of the sample mean: s / sqrt(n)
sd(Boston$medv)/sqrt(nrow(Boston))
## [1] 0.4088611
The estimated standard error of the sample mean is roughly 0.41 (medv is measured in thousands of dollars).
# bootstrap statistic: the mean of the resampled medv values
boot.fn2 = function(data, index){
  return(mean(data[index]))
}
# bootstrap estimate of the standard error of the mean (R = 1000)
bstrap = boot(Boston$medv, boot.fn2, R = 1000)
print(bstrap)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn2, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.02171561 0.4163195
# approximate 95% CI: estimate +/- 2 standard errors
# (the hard-coded SE 0.3994119 appears to come from an earlier bootstrap run; sd(bstrap$t) gives the SE from the run above)
conf.int = c(bstrap$t0 - (2 * 0.3994119), bstrap$t0 + (2 * 0.3994119))
conf.int
## [1] 21.73398 23.33163
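As a cross-check, the boot package's boot.ci() function can compute an interval directly from the bootstrap output; a minimal sketch using the percentile method (output not shown):
# 95% percentile bootstrap confidence interval for the mean of medv
boot.ci(bstrap, type = "perc")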
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The bootstrap confidence interval is very close to the 95% confidence interval produced by t.test().
# sample median of medv
median.u = median(Boston$medv)
# bootstrap statistic: the median of the resampled medv values
boot.median = function(data, index){
  return(median(data[index]))
}
# bootstrap estimate of the standard error of the median (R = 10000)
bstrap.median = boot(Boston$medv, boot.median, R = 10000)
print(bstrap.median)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.median, R = 10000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.012735 0.3801743
The bootstrap standard error of the median (about 0.38) is slightly smaller than the standard error of the mean.
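An analogous rough 95% confidence interval for the median can be formed from the bootstrap standard error; a minimal sketch using median.u and bstrap.median from above (output not shown):
# median estimate +/- 2 bootstrap standard errors
c(median.u - 2 * sd(bstrap.median$t[, 1]), median.u + 2 * sd(bstrap.median$t[, 1]))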
quantile(Boston$medv,probs = c(.1))
## 10%
## 12.75
# bootstrap statistic: the tenth percentile of the resampled medv values
bstrap.quant = function(data, index){
  return(quantile(data[index], probs = c(.1)))
}
# bootstrap estimate of the standard error of the tenth percentile (R = 10000)
bstrap.quant.ex = boot(Boston$medv, bstrap.quant, R = 10000)
print(bstrap.quant.ex)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = bstrap.quant, R = 10000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.00645 0.5032919
The bootstrap standard error of the tenth percentile (about 0.50) is the largest of the three estimates.
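A rough 95% confidence interval for the tenth percentile follows the same recipe; a sketch in which decile.hat is an illustrative name (output not shown):
# tenth-percentile estimate +/- 2 bootstrap standard errors
decile.hat = quantile(Boston$medv, probs = c(.1))
c(decile.hat - 2 * sd(bstrap.quant.ex$t[, 1]), decile.hat + 2 * sd(bstrap.quant.ex$t[, 1]))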