# Chapter 5 page 197: 3,5,6,9

#3

# In k-fold cross-validation, the set of n observations is randomly divided into k non-overlapping groups (folds) of approximately n/k observations each. Each fold in turn acts as a validation set, while the remaining k-1 folds act as the training set for the model under study. The test error is then estimated by averaging the k resulting MSE estimates.
# The validation set approach has two main drawbacks compared to k-fold cross-validation. First, its estimate of the test error rate can be highly variable, depending on which observations are included in the training set and which in the validation set. Second, only a subset of the observations is used to fit the model; since statistical methods tend to perform worse when trained on fewer observations, the validation set error rate tends to overestimate the test error of a model fit on the entire data set.
# LOOCV is the special case of k-fold cross-validation in which k=n. It has two drawbacks relative to k-fold cross-validation. First, it requires fitting the (potentially computationally expensive) model n times, whereas k-fold cross-validation requires only k fits. Second, although LOOCV gives approximately unbiased estimates of the test error (each training set contains n-1 observations), it has higher variance than k-fold cross-validation: we are averaging the outputs of n models trained on nearly identical sets of observations, so those outputs are highly correlated, and the mean of highly correlated quantities has higher variance than the mean of less correlated ones.
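# As an illustration of the trade-offs above, cv.glm() from the boot package
# implements both LOOCV (its default, K = n) and k-fold cross-validation (via
# the K argument). A minimal sketch on simulated data (the simulated data set
# and the choice k = 10 are arbitrary, for illustration only):
library(boot)
set.seed(1)
sim=data.frame(x=rnorm(100))
sim$y=sim$x+rnorm(100)
fit=glm(y~x,data=sim)
cv.glm(sim,fit)$delta[1]        # LOOCV estimate: requires n model fits
cv.glm(sim,fit,K=10)$delta[1]   # 10-fold CV estimate: requires only 10 fits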

#5

library(ISLR)
attach(Default)
set.seed(1)
fit.glm=glm(default~income+balance,data=Default,family="binomial")
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
train=sample(dim(Default)[1],dim(Default)[1]/2)
fit.glm=glm(default~income+balance,data=Default,family="binomial",subset=train)
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5830  -0.1428  -0.0573  -0.0213   3.3395  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.178e-01 -19.333  < 2e-16 ***
## income       3.262e-05  7.024e-06   4.644 3.41e-06 ***
## balance      5.689e-03  3.158e-04  18.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1523.8  on 4999  degrees of freedom
## Residual deviance:  803.3  on 4997  degrees of freedom
## AIC: 809.3
## 
## Number of Fisher Scoring iterations: 8
Probs=predict(fit.glm,newdata=Default[-train,],type="response")
pred.glm=rep("No",length(Probs))
pred.glm[Probs>0.5]="Yes"
mean(pred.glm!=Default[-train,]$default)
## [1] 0.0254
train=sample(dim(Default)[1],dim(Default)[1]/2)
fit.glm=glm(default~income+balance,data=Default,family="binomial",subset=train)
Probs=predict(fit.glm,newdata=Default[-train,],type="response")
pred.glm=rep("No",length(Probs))
pred.glm[Probs>0.5]="Yes"
mean(pred.glm!=Default[-train,]$default)
## [1] 0.0274
train=sample(dim(Default)[1],dim(Default)[1]/2)
fit.glm=glm(default~income+balance,data=Default,family="binomial",subset=train)
Probs=predict(fit.glm,newdata=Default[-train,],type="response")
pred.glm=rep("No",length(Probs))
pred.glm[Probs>0.5]="Yes"
mean(pred.glm!=Default[-train,]$default)
## [1] 0.0244
train=sample(dim(Default)[1],dim(Default)[1]/2)
fit.glm=glm(default~income+balance+student,data=Default,family="binomial",subset=train)
Probs=predict(fit.glm,newdata=Default[-train,],type="response")
pred.glm=rep("No",length(Probs))
pred.glm[Probs>0.5]="Yes"
mean(pred.glm!=Default[-train,]$default)
## [1] 0.0242
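# The three validation set estimates above (2.54%, 2.74%, 2.44%) illustrate the
# variability of this approach across random splits, and including the student
# dummy variable (2.42%) does not appear to appreciably reduce the test error.
# A sketch of how the repeated splits could be written as a loop (the seeds
# here are arbitrary, so the exact estimates will differ from those above):
val.err=sapply(1:3,function(i){
  set.seed(i)
  train=sample(dim(Default)[1],dim(Default)[1]/2)
  fit=glm(default~income+balance,data=Default,family="binomial",subset=train)
  Probs=predict(fit,newdata=Default[-train,],type="response")
  pred=ifelse(Probs>0.5,"Yes","No")
  mean(pred!=Default[-train,]$default)
})
val.err
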
#6

set.seed(1)
fit.glm=glm(default~income+balance,data=Default,family="binomial")
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
boot.fn=function(data,index){
  fit=glm(default~income+balance,data=data,family="binomial",subset=index)
  return(coef(fit))
}
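# boot.fn returns the coefficient estimates for a given bootstrap sample, so it
# can be passed to boot() from the boot package to estimate the standard errors
# of the income and balance coefficients. A minimal sketch (R = 1000 bootstrap
# replicates is an arbitrary choice); the bootstrap standard errors should be
# close to the formula-based ones reported by summary(fit.glm) above:
library(boot)
boot(Default,boot.fn,R=1000)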

#9

library(MASS)
attach(Boston)
mu.hat=mean(medv)
mu.hat
## [1] 22.53281
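# Formula-based standard error of the sample mean: SE(mu.hat) = s/sqrt(n)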
se.hat=sd(medv)/sqrt(dim(Boston)[1])
se.hat
## [1] 0.4088611
set.seed(1)
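# With the seed set, the standard error of mu.hat can also be estimated with
# the bootstrap. A minimal sketch (mean.fn is a helper name introduced here,
# and R = 1000 replicates is an arbitrary choice); the bootstrap estimate
# should be close to the formula-based se.hat above:
mean.fn=function(data,index) return(mean(data[index]))
boot(medv,mean.fn,R=1000)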