How to Predict

split data into training and test

fit a bunch of models

make a prediction

confusion Matrix (classifactaion) & RMSE (regression)

Libraries

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.4.2

library(boot)

## Warning: package 'boot' was built under R version 4.4.3

set.seed(1)
train = sample(392,196)

lm.fit = lm(mpg~ horsepower, data = Auto, subset = train)

#make a prediction
mpg.prediction = predict(lm.fit, Auto)[-train]

# compare predictions to observed
mean((Auto$mpg-predict(lm.fit,Auto))[-train]^2)

## [1] 23.26601

lm.fit2 = lm(mpg~poly(horsepower,2), data = Auto, subset = train)
attach(Auto)
mean((mpg-predict(lm.fit,Auto))[-train]^2)

## [1] 23.26601

mean((mpg-predict(lm.fit2,Auto))[-train]^2)

## [1] 18.71646

Leave one out cross validation

#fit a simple linear model
glm.fit = glm(mpg~horsepower,data = Auto)
#calculating the cross validation error 
cv.err = cv.glm(Auto,glm.fit)
cv.err$delta

## [1] 24.23151 24.23114

# K fold cross validation
glm.fit = glm(mpg~horsepower,data = Auto)
#calculating the cross validation error 
cv.err = cv.glm(Auto,glm.fit, K = 10)
cv.err$delta

## [1] 24.19879 24.18553

Problem 6

We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.

library(boot)
library(ISLR2)
attach(Default)
View(Default)

Using the summary() and glm() functions, determine the esti- mated standard errors for the coefficients associated with income and balance in a multiple logistic regression model that uses both predictors.

glm.fit3 = glm(default~income + balance, data = Default, family = "binomial")
summary(glm.fit3)$coefficients[,2]

##  (Intercept)       income      balance 
## 4.347564e-01 4.985167e-06 2.273731e-04

Write a function, boot.fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.
Use the boot() function together with your boot.fn() function to estimate the standard errors of the logistic regression coefficients for income and balance.
Comment on the estimated standard errors obtained using the glm() function and using your bootstrap function. The standard errors are almost identitical

set.seed(17)
boot.fn = function(data,index){
  glm.fits = glm(default~ income + balance, data = Default, family = "binomial",subset = index)



  
  return(coef(glm.fits))
}
boot.fn(Default,1:100)

##   (Intercept)        income       balance 
## -2.656607e+01 -8.165470e-19 -2.914073e-19

set.seed(123)
results = boot(Default,boot.fn,R = 1000)

apply(results$t,2,sd)

## [1] 4.204817e-01 4.729534e-06 2.217214e-04

data(Boston)
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

mu = mean(Boston$medv)


sd(Boston$medv)/sqrt(nrow(Boston))

## [1] 0.4088611

The error is roughly 40%

boot.fn2 = function(data,index){
  return(mean(data[index]))
}

bstrap = boot(Boston$medv,boot.fn2,1000)
print(bstrap)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn2, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original     bias    std. error
## t1* 22.53281 0.02171561   0.4163195

conf.int = c(bstrap$t0 - (2 * 0.3994119), bstrap$t0 + (2  * 0.3994119))
conf.int

## [1] 21.73398 23.33163

t.test(Boston$medv)

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

The standard error is very close to the error in section above

median.u = median(Boston$medv)


boot.median = function(data,index){
  return(median(data[index]))
}

bstrap.median = boot(Boston$medv,boot.median,10000)
print(bstrap.median)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.median, R = 10000)
## 
## 
## Bootstrap Statistics :
##     original    bias    std. error
## t1*     21.2 -0.012735   0.3801743

The error is getting smaller

quantile(Boston$medv,probs = c(.1))

##   10% 
## 12.75

bstrap.quant = function(data,index){
  return(quantile(data[index],probs = c(.1)))
}

bstrap.quant.ex = boot(Boston$medv,bstrap.quant,10000)
print(bstrap.quant.ex)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = bstrap.quant, R = 10000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 0.00645   0.5032919

The error is the highest here

Sampling Technique

Matthew Mills

2025-03-24