R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
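
In the .Rmd source, such a chunk looks like the following (this is the default RStudio template's pressure-plot chunk; the rendered plot itself is not reproduced here):

```{r pressure, echo=FALSE}
plot(pressure)
```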

Problem 3: We now review k-fold cross-validation.

  1. Explain how k-fold cross-validation is implemented (a minimal sketch follows this list).
  2. What are the advantages and disadvantages of k-fold cross-validation relative to:
     a. The validation set approach?
     b. LOOCV?
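
For part 1, here is a minimal sketch of how k-fold cross-validation could be run on the logistic model used in Problem 5 below, using cv.glm() from the boot package (loaded later in this document anyway). The 0.5-cutoff misclassification cost function is my assumption; cv.glm() defaults to squared error:

library(ISLR)
library(boot)
set.seed(1)  # seed fixed only so the sketch is reproducible
# cost(): misclassification rate at a 0.5 cutoff; r is the observed 0/1
# response, pi the predicted probability for the held-out fold
cost = function(r, pi) mean(abs(r - pi) > 0.5)
fit = glm(default ~ balance + income, data = Default, family = binomial)
# split into K = 10 folds, refit on 9 folds, score the held-out fold, average;
# delta[1] is the raw cross-validated error estimate
cv.glm(Default, fit, cost = cost, K = 10)$delta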

Problem 5:

library(ISLR)
data("Default")
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
# (a) fit a logistic regression of default on balance and income
logmodel1 = glm(default ~ balance + income, data = Default, family = binomial)

summary(logmodel1)
## 
## Call:
## glm(formula = default ~ balance + income, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
# i. split the observations into equal training and validation halves
#    (no seed is set, so every run of this chunk draws a different split)
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]

# ii. fit the logistic regression using only the training observations
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)

# iii. classify each validation observation as "Yes" when its posterior
#      probability of default exceeds 0.5
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##             
## log.pred_def   No  Yes
##          No  4831  106
##          Yes   19   44
# iv. validation set error rate
mean(log.pred_def != testDefault$default)
## [1] 0.025
# (c) first repetition: a new random split, same model
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##             
## log.pred_def   No  Yes
##          No  4820   99
##          Yes   28   53
mean(log.pred_def != testDefault$default)
## [1] 0.0254
# (c) second repetition
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##             
## log.pred_def   No  Yes
##          No  4813  116
##          Yes   23   48
mean(log.pred_def != testDefault$default)
## [1] 0.0278
# (c) third repetition
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##             
## log.pred_def   No  Yes
##          No  4803  118
##          Yes   27   52
mean(log.pred_def != testDefault$default)
## [1] 0.029
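The initial split and the three repetitions above duplicate the same four steps. A small helper, sketched here under the hypothetical name val_set_error (not part of the original code), makes the procedure explicit; each call draws a fresh random split, so the error rate fluctuates around 2.5 to 3 percent:

val_set_error = function(formula) {
  train = sample(nrow(Default), nrow(Default) / 2)            # i. random half split
  fit = glm(formula, data = Default, family = binomial,
            subset = train)                                   # ii. fit on the training half
  prob = predict(fit, Default[-train, ], type = "response")   # iii. posterior probabilities
  pred = ifelse(prob > 0.5, "Yes", "No")                      #     classify at the 0.5 cutoff
  mean(pred != Default$default[-train])                       # iv. validation error rate
}
replicate(3, val_set_error(default ~ balance + income))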
# (d) include the student dummy variable, again with a fresh random split
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]
LOGmodel2 = glm(default ~ balance + income + student, data = Default, family = binomial, subset = trainDefault)
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##             
## log.pred_def   No  Yes
##          No  4814  123
##          Yes   14   49
mean(log.pred_def != testDefault$default)
## [1] 0.0274
  1. Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

With the student dummy (the final block above), the validation error is 0.0274, which falls inside the 0.025 to 0.029 range observed without it, so including student does not appear to reduce the test error rate.

Problem 6:

# (a) coefficient standard errors from the usual glm() summary output
LOGmodel3 = glm(default ~ balance + income, data = Default, family = binomial)

summary(LOGmodel3)
## 
## Call:
## glm(formula = default ~ balance + income, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
# (b) boot.fn() returns the coefficient estimates from a logistic fit on the
#     observations selected by `index`
boot.fn = function(data, index) {
  coef(glm(default ~ balance + income, data = data, family = binomial, subset = index))
}

# (c) bootstrap standard errors of the coefficients from R = 100 resamples
library(boot)
boot(Default, boot.fn, 100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -2.936869e-02 4.312253e-01
## t2*  5.647103e-03  1.482567e-05 2.307195e-04
## t3*  2.080898e-05  2.937906e-07 4.808245e-06
  1. Comment on the estimated standard errors obtained using the glm() function and using your bootstrap function.

The two approaches agree closely here: glm() reports standard errors of 4.348e-01 (intercept), 2.274e-04 (balance), and 4.985e-06 (income), while the bootstrap gives 4.312e-01, 2.307e-04, and 4.808e-06.
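
As a rough side-by-side check, the two sets of standard errors can be tabulated directly (a sketch reusing LOGmodel3 and boot.fn from above; the bootstrap row changes from run to run because it resamples again):

se.glm = summary(LOGmodel3)$coefficients[, "Std. Error"]  # formula-based SEs
b = boot(Default, boot.fn, 100)
se.boot = apply(b$t, 2, sd)                               # SD across bootstrap replicates
rbind(glm = se.glm, bootstrap = se.boot)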

Problem 9:

library(MASS)
data("Boston")
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
attach(Boston)

# sample mean of medv as an estimate of the population mean
mean.medv = mean(medv)
mean.medv
## [1] 22.53281
# analytic standard error of the sample mean: sd(x) / sqrt(n)
stderr.mean = sd(medv)/sqrt(length(medv))
stderr.mean
## [1] 0.4088611
# bootstrap estimate of the standard error of the mean (R = 100 resamples)
boot.fn2 = function(data, index) return(mean(data[index]))
boot2 = boot(medv, boot.fn2, 100)
boot2
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn2, R = 100)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 -0.07331621   0.3633222
t.test(Boston$medv)  # compare with the t-based 95% confidence interval
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281
# approximate 95% CI as mean +/- 2 SE; the SE of 0.4174872 used here comes
# from an earlier bootstrap run (boot2 above happened to give 0.3633)
CI.bos = c(22.53 - 2 * 0.4174872, 22.53 + 2 * 0.4174872)
CI.bos
## [1] 21.69503 23.36497
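The boot package can also construct this interval directly from the boot object; a one-line sketch using boot2 from above (the bounds shift slightly with each set of resamples):

boot.ci(boot2, type = "norm")  # normal-approximation 95% CI from the bootstrap SE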
# sample median of medv
median.medv = median(medv)
median.medv
## [1] 21.2
# bootstrap standard error of the median (no simple closed-form formula exists)
boot.fn3 = function(data, index) return(median(data[index]))
boot3 = boot(medv, boot.fn3, 1000)
boot3
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn3, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0187    0.390048
# estimate of the tenth percentile of medv
tenth.medv = quantile(medv, c(0.1))
tenth.medv
##   10% 
## 12.75
# bootstrap standard error of the tenth-percentile estimate
boot.fn4 = function(data, index) return(quantile(data[index], c(0.1)))
boot4 = boot(medv, boot.fn4, 1000)
boot4
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = medv, statistic = boot.fn4, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0376   0.4939385