# title: "Homework 4"
# author: "Elaine Perera"
# date: "3/18/2021"
# output: html_document
#
# Chapter 05 (page 197): 3, 5, 6, 9
# Q3
# a.
# This approach (k-fold cross-validation) involves randomly dividing the set of observations into k groups, or folds, of approximately equal size. The first fold is treated as a validation set, and the method is fit on the remaining k - 1 folds. The mean squared error, MSE_1, is then computed on the observations in the held-out fold. This procedure is repeated k times; each time, a different group of observations is treated as the validation set. This process results in k estimates of the test error, MSE_1, MSE_2, ..., MSE_k. The k-fold CV estimate is computed by averaging these values: CV_(k) = (1/k) * sum_{i=1}^{k} MSE_i.
#b.
# i.
# Advantages: The validation set approach is conceptually simple and easy to implement.
# Disadvantages: The validation MSE can be highly variable, and only a subset of the observations (the training data) is used to fit the model.
# ii.
# Advantages: LOOCV has less bias. The validation approach produces a different MSE each time it is applied because of the randomness in the splitting process, while performing LOOCV multiple times will always yield the same result, because each split leaves out exactly one observation.
# Disadvantage: LOOCV is computationally intensive, because the model must be refit once per observation.
# Q5 ----
library(ISLR)
data("Default")
summary(Default)

# Seed the RNG so the random train/validation splits below are reproducible.
set.seed(1)

# a. Logistic regression of default on balance and income, using all rows.
logmodel1 <- glm(default ~ balance + income, data = Default, family = binomial)
summary(logmodel1)

#' Run one validation-set evaluation of a logistic regression for `default`.
#'
#' Samples a training set without replacement, fits `formula` on it with a
#' binomial GLM, predicts probabilities for the remaining rows, labels a row
#' "Yes" when its probability exceeds `threshold` (otherwise "No"), and
#' compares the labels with the `default` column of the held-out rows.
#'
#' @param formula Model formula passed to `glm()`.
#' @param data Data frame holding the formula's variables and a `default`
#'   column. Assumes "Yes" is the level modelled as success -- confirm if
#'   used on data other than `Default`.
#' @param train_frac Fraction of rows drawn for training (default 0.5).
#' @param threshold Probability cut-off for predicting "Yes" (default 0.5).
#' @return A list with `train_idx` (sampled row indices), `test` (held-out
#'   rows), `model` (fitted glm), `prob` (predicted probabilities), `pred`
#'   (predicted labels), `confusion` (predicted-vs-actual table) and `error`
#'   (misclassification rate on the held-out rows).
validation_run <- function(formula, data, train_frac = 0.5, threshold = 0.5) {
  n_obs <- nrow(data)
  train_idx <- sample(n_obs, floor(n_obs * train_frac))
  test <- data[-train_idx, ]

  # Subset the data frame directly: glm(subset = ) would look `train_idx` up
  # in the formula's environment rather than in this function.
  model <- glm(formula, data = data[train_idx, ], family = binomial)
  prob <- predict(model, newdata = test, type = "response")

  # Size the label vector from the validation set itself instead of assuming
  # it is exactly half of the data.
  pred <- rep("No", nrow(test))
  pred[prob > threshold] <- "Yes"

  list(
    train_idx = train_idx,
    test = test,
    model = model,
    prob = prob,
    pred = pred,
    confusion = table(predicted = pred, actual = test$default),
    error = mean(pred != test$default)
  )
}

# b. One 50/50 split (i-iv): confusion table and validation error.
run_b <- validation_run(default ~ balance + income, Default)
run_b$confusion
run_b$error

# c. Repeat part b three times with three different random splits.
for (i in seq_len(3)) {
  run_c <- validation_run(default ~ balance + income, Default)
  print(run_c$confusion)
  print(run_c$error)
}

# d. Same procedure with the student dummy variable added (i-iv).
run_d <- validation_run(default ~ balance + income + student, Default)
run_d$confusion
run_d$error

# Keep the names the original script left in the workspace, bound to the
# final (part d) split as before.
trainDefault <- run_d$train_idx
testDefault <- run_d$test
LOGmodel2 <- run_d$model
log.prob_def <- run_d$prob
log.pred_def <- run_d$pred
# Q6 ----
# Loaded here as well so this section runs on its own; `Default` comes from
# ISLR and `boot()` from the boot package.
library(ISLR)
library(boot)

# a. Standard errors from the usual glm() summary.
set.seed(1)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(fit.glm)

# b.
#' Logistic-regression coefficients for one bootstrap resample.
#'
#' @param data Data frame with `default`, `income` and `balance` columns.
#' @param index Row indices selecting the resample (repeats allowed).
#' @return Named numeric vector of coefficients, in the order
#'   (Intercept), income, balance.
boot.fn <- function(data, index) {
  fit <- glm(
    default ~ income + balance,
    data = data[index, , drop = FALSE],
    family = "binomial"
  )
  coef(fit)
}

# c. Bootstrap the coefficients; boot() reports t1/t2/t3 in the order
# returned by boot.fn: (Intercept), income, balance.
boot(Default, boot.fn, R = 100)

# d.
# The estimated standard errors for the coefficients for income and balance
# are 2.294044e-04 and 3.875525e-06.
# NOTE(review): the two values above look swapped. Given the scale of the
# predictors, 3.875525e-06 is presumably the income SE (t2) and 2.294044e-04
# the balance SE (t3) -- confirm against the boot() output.
# Q9 ----
# boot is loaded here as well so this section runs on its own.
library(MASS)
library(boot)
data("Boston")
summary(Boston)

# Pull the response out once instead of attach()-ing the whole data frame;
# the bare name `medv` stays available for the steps below.
medv <- Boston$medv

# a. Sample mean of medv.
mean.medv <- mean(medv)
mean.medv

# b. Analytic standard error of the sample mean.
stderr.mean <- sd(medv) / sqrt(length(medv))
stderr.mean

# c. Bootstrap standard error of the mean.
#' Mean of the resampled observations `data[index]`.
boot.fn2 <- function(data, index) {
  mean(data[index])
}
boot2 <- boot(medv, boot.fn2, 100)
boot2

# d. t-test interval versus an approximate 95% interval from the bootstrap:
# estimate +/- 2 * bootstrap SE. Both numbers are read from boot2 (t0 is the
# statistic on the original data, t holds the replicates) rather than being
# copied by hand from an earlier run.
t.test(Boston$medv)
CI.bos <- boot2$t0 + c(-2, 2) * sd(boot2$t[, 1])
CI.bos

# e. Sample median of medv.
median.medv <- median(medv)
median.medv

# f. Bootstrap standard error of the median.
#' Median of the resampled observations `data[index]`.
boot.fn3 <- function(data, index) {
  median(data[index])
}
boot3 <- boot(medv, boot.fn3, 1000)
boot3

# g. Tenth percentile of medv.
tenth.medv <- quantile(medv, c(0.1))
tenth.medv

# h. Bootstrap standard error of the tenth percentile.
#' Tenth percentile of the resampled observations `data[index]`.
boot.fn4 <- function(data, index) {
  quantile(data[index], c(0.1))
}
boot4 <- boot(medv, boot.fn4, 1000)
boot4