library(ISLR2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(stringr)
library(rsample)
library(caret)
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
library(naivebayes)
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:rsample':
##
## permutations
library(class)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(pcr)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Chapter 4
# 16. Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the me-dian. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
data("Boston")
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Create binary response variable indicating whether the crime rate is above or below the median
Boston$crime_factor <- ifelse(Boston$crim > median(Boston$crim), "High", "Low")
# Convert to factor
Boston$crime_factor <- factor(Boston$crime_factor, levels = c("Low", "High"))
# Scatterplot
Boston %>%
dplyr::select(crim, crime_factor, rad, nox, tax, age, dis, indus) %>%
gather(Variable, value, -crim, -crime_factor) %>%
mutate(Variable = stringr::str_to_title(Variable)) %>%
ggplot(aes(value, crim)) +
geom_point(aes(col = crime_factor)) +
facet_wrap(~ Variable, scales = 'free') +
geom_smooth(method = 'lm', formula = y ~ poly(x, 3), se = FALSE) +
guides(col = FALSE) +
labs(title = 'Scatterplots')
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Splitting the data
set.seed(610)
boston_split <- initial_split(Boston, prop=0.8,strata= crime_factor)
boston_training <- training(boston_split)
boston_testing <- testing(boston_split)
# 1. Logistic Regression
glm_model <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) +
poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3),
data = boston_training, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_model)
##
## Call:
## glm(formula = crime_factor ~ poly(rad, 3) + poly(nox, 3) + poly(tax,
## 3) + poly(age, 3) + poly(dis, 3) + poly(indus, 3), family = "binomial",
## data = boston_training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.790e+14 3.339e+06 -113518879 <2e-16 ***
## poly(rad, 3)1 8.745e+16 4.751e+08 184071758 <2e-16 ***
## poly(rad, 3)2 8.225e+15 1.075e+08 76514466 <2e-16 ***
## poly(rad, 3)3 1.953e+15 7.773e+07 25122280 <2e-16 ***
## poly(nox, 3)1 1.888e+16 1.771e+08 106597240 <2e-16 ***
## poly(nox, 3)2 1.054e+15 1.016e+08 10377240 <2e-16 ***
## poly(nox, 3)3 2.177e+15 9.908e+07 21975041 <2e-16 ***
## poly(tax, 3)1 -6.941e+16 4.130e+08 -168075367 <2e-16 ***
## poly(tax, 3)2 -3.403e+16 1.859e+08 -183065250 <2e-16 ***
## poly(tax, 3)3 1.628e+16 8.421e+07 193318513 <2e-16 ***
## poly(age, 3)1 -1.028e+15 1.234e+08 -8335643 <2e-16 ***
## poly(age, 3)2 2.885e+15 7.958e+07 36254178 <2e-16 ***
## poly(age, 3)3 1.990e+15 7.152e+07 27820397 <2e-16 ***
## poly(dis, 3)1 -4.984e+15 1.606e+08 -31035200 <2e-16 ***
## poly(dis, 3)2 3.258e+15 9.902e+07 32900446 <2e-16 ***
## poly(dis, 3)3 1.957e+15 8.154e+07 23994946 <2e-16 ***
## poly(indus, 3)1 1.115e+16 1.411e+08 78989311 <2e-16 ***
## poly(indus, 3)2 4.318e+15 1.171e+08 36885916 <2e-16 ***
## poly(indus, 3)3 8.514e+15 1.210e+08 70376384 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.06 on 403 degrees of freedom
## Residual deviance: 2811.40 on 385 degrees of freedom
## AIC: 2849.4
##
## Number of Fisher Scoring iterations: 25
glm_fit <- predict(glm_model,type="response", newdata=boston_testing)
predict_binary_glm <- rep("Low", 102)
predict_binary_glm[glm_fit > .5] = "High"
predict_binary_glm <- predict_binary_glm %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
## New names:
## • `` -> `...1`
colnames(predict_binary_glm) <- c("predicted_value", "actual_value")
predict_binary_glm$predicted_value <- as.factor(predict_binary_glm$predicted_value)
predict_binary_glm$actual_value <- as.factor(predict_binary_glm$actual_value)
confusion_glm <- confusionMatrix(predict_binary_glm$predicted_value, predict_binary_glm$actual_value)
## Warning in confusionMatrix.default(predict_binary_glm$predicted_value,
## predict_binary_glm$actual_value): Levels are not in the same order for
## reference and data. Refactoring data to match.
confusion_glm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 49 4
## High 2 47
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8824
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9608
## Specificity : 0.9216
## Pos Pred Value : 0.9245
## Neg Pred Value : 0.9592
## Prevalence : 0.5000
## Detection Rate : 0.4804
## Detection Prevalence : 0.5196
## Balanced Accuracy : 0.9412
##
## 'Positive' Class : Low
##
# 2. Linear Discriminant Analysis
lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) +
poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3),
data = boston_training)
predict_lda <- predict(lda_model, type= "response", newdata=boston_testing)$class
predict_result_lda <- predict_lda %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
## New names:
## • `` -> `...1`
colnames(predict_result_lda) <- c("predicted_value", "actual_value")
confusion_lda <- confusionMatrix(predict_result_lda$predicted_value, predict_result_lda$actual_value)
confusion_lda
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 46 1
## High 5 50
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8824
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9020
## Specificity : 0.9804
## Pos Pred Value : 0.9787
## Neg Pred Value : 0.9091
## Prevalence : 0.5000
## Detection Rate : 0.4510
## Detection Prevalence : 0.4608
## Balanced Accuracy : 0.9412
##
## 'Positive' Class : Low
##
# 3. Naivebayes
naivebayes_model <- naiveBayes(crime_factor ~ rad + nox+ tax + age + indus,
data = boston_training)
naivebayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Low High
## 0.5 0.5
##
## Conditional probabilities:
## rad
## Y [,1] [,2]
## Low 4.168317 1.645554
## High 14.475248 9.592798
##
## nox
## Y [,1] [,2]
## Low 0.4709698 0.05651158
## High 0.6326337 0.09464589
##
## tax
## Y [,1] [,2]
## Low 306.1089 83.91098
## High 503.1485 168.82912
##
## age
## Y [,1] [,2]
## Low 50.87376 24.98412
## High 85.07129 18.87520
##
## indus
## Y [,1] [,2]
## Low 6.998465 5.471512
## High 15.143713 5.522585
predict_naivebayes <- predict(naivebayes_model, type= "class", newdata=boston_testing)
predict_result_naivebayes <- predict_naivebayes %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
## New names:
## • `` -> `...1`
colnames(predict_result_naivebayes) <- c("predicted_value", "actual_value")
confusion_naivebayes <- confusionMatrix(predict_result_naivebayes$predicted_value, predict_result_naivebayes$actual_value)
confusion_naivebayes
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 47 11
## High 4 40
##
## Accuracy : 0.8529
## 95% CI : (0.7691, 0.9153)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 8.267e-14
##
## Kappa : 0.7059
##
## Mcnemar's Test P-Value : 0.1213
##
## Sensitivity : 0.9216
## Specificity : 0.7843
## Pos Pred Value : 0.8103
## Neg Pred Value : 0.9091
## Prevalence : 0.5000
## Detection Rate : 0.4608
## Detection Prevalence : 0.5686
## Balanced Accuracy : 0.8529
##
## 'Positive' Class : Low
##
# 4. K-nearest neighbors
variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')
x_training <- boston_training[, variables]
y_training <- boston_training$crime_factor
x_testing <- boston_testing[, variables]
acc <- list()
for (i in 1:20) {
knn_pred <- knn(train = x_training, test = x_testing, cl = y_training, k = i)
acc[as.character(i)] = mean(knn_pred == boston_testing$crime_factor)
}
acc <- unlist(acc)
data_frame(acc = acc) %>%
mutate(k = row_number()) %>%
ggplot(aes(k, acc)) +
geom_col(aes(fill = k == which.max(acc))) +
labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy') +
scale_x_continuous(breaks = 1:20) +
scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
digits = 3)) +
geom_hline(yintercept = max(acc), lty = 2) +
coord_cartesian(ylim = c(min(acc), max(acc))) +
guides(fill = FALSE)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#final model
knn_final <- knn(train = x_training, test = x_testing, cl = y_training, k = 1)
head(knn_final)
## [1] Low Low Low High Low Low
## Levels: Low High
predict_result_knn <- knn_final %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
## New names:
## • `` -> `...1`
colnames(predict_result_knn) <- c("predicted_value", "actual_value")
confusion_knn <- confusionMatrix(predict_result_knn$predicted_value, predict_result_knn$actual_value)
confusion_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 48 1
## High 3 50
##
## Accuracy : 0.9608
## 95% CI : (0.9026, 0.9892)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9216
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9412
## Specificity : 0.9804
## Pos Pred Value : 0.9796
## Neg Pred Value : 0.9434
## Prevalence : 0.5000
## Detection Rate : 0.4706
## Detection Prevalence : 0.4804
## Balanced Accuracy : 0.9608
##
## 'Positive' Class : Low
##
#Chapter 5: Resampling Methods
# 5. In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
# a. Fit a logistic regression model that uses income and balance to predict default
attach(Default)
set.seed(106)
fit.glm <- glm(default ~ income+balance, data=Default, family="binomial")
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# b. Using the validation set approach, estimate the test error of this model
train <- sample(dim(Default)[1], dim(Default)[1]/2)
fit.glm <- glm(default ~ income+balance, data=Default, family="binomial")
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train,]$default)
## [1] 0.0248
# c. Repeat the process in (b) 3 times, using 3 different splits of the obervations unto a training set and a validation set. Comment on the results obtained
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0256
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0254
# d. Consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variables for student. Estimate the test erroe for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.022
# Chapter 6: Linear Model Selection and Regularization
# 9. In this exercise, we will predict the number of applications received using the other variables in the College data set.
# a. Split the data into a training set and a test set
sample_split <- function(dataset, split_ratio, seed = NULL) {
set.seed(seed)
train_subset <- sample(nrow(dataset) * split_ratio)
return(list(
train = dataset[train_subset, ],
test = dataset[-train_subset, ]
))
}
college_split <- sample_split(College, 0.7, seed = 601)
# b. Fit a linear model using least squares on the training set, and report the test error obtained
ols_model <- lm(Apps ~ . , data = college_split$train)
ols_predictions <- predict(ols_model, college_split$test)
# MSE function
calc_rmse <- function(y, y_hat) {
return(sqrt(mean((y - y_hat)^2)))
}
ols_rmse <- calc_rmse(college_split$test$Apps, ols_predictions)
ols_rmse
## [1] 1188.832
# c. Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
train_matrix <- model.matrix(Apps ~ . , data = college_split$train)
test_matrix <- model.matrix(Apps ~ . , data = college_split$test)
grid <- 10 ^ seq(4, -2, length = 100)
ridge_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
ridge_predictions <- predict(ridge_model, test_matrix, s = ridge_model$lambda.min)
ridge_rmse <- calc_rmse(college_split$test$Apps, ridge_predictions)
ridge_rmse
## [1] 1188.815
# d. Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
lasso_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
lasso_predictions <- predict(lasso_model, test_matrix, s = lasso_model$lambda.min)
lasso_rmse <- calc_rmse(college_split$test$Apps, lasso_predictions)
lasso_rmse
## [1] 1188.799
predict(lasso_model, s = lasso_model$lambda.min, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -601.38112596
## (Intercept) .
## PrivateYes -321.41076249
## Accept 1.70685797
## Enroll -1.35810665
## Top10perc 45.45111099
## Top25perc -15.88692306
## F.Undergrad 0.09983028
## P.Undergrad 0.01567394
## Outstate -0.09222852
## Room.Board 0.11816525
## Books -0.04056782
## Personal 0.05792203
## PhD -5.56419852
## Terminal -5.43103094
## S.F.Ratio 21.59623616
## perc.alumni 1.87814752
## Expend 0.10806715
## Grad.Rate 8.13596816
# e. Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pcr_model <- pcr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")

pcr_predictions <- predict(pcr_model, college_split$test, ncomp = 10)
pcr_rmse <- calc_rmse(college_split$test$Apps, pcr_predictions)
pcr_rmse
## [1] 1556.821
# f. Fit a PLS model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pls_model <- plsr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")

pls_predictions <- predict(pls_model, college_split$test, ncomp = 10)
pls_rmse <- calc_rmse(college_split$test$Apps, pls_predictions)
pls_rmse
## [1] 1178.361
# g. Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
# Function to compute R-squared
calc_r2 <- function(y, y_hat) {
y_bar <- mean(y)
rss <- sum((y - y_hat)^2)
tss <- sum((y - y_bar)^2)
return(1 - (rss / tss))
}
ols_r2 <- calc_r2(college_split$test$Apps, ols_predictions)
ridge_r2 <- calc_r2(college_split$test$Apps, ridge_predictions)
lasso_r2 <- calc_r2(college_split$test$Apps, lasso_predictions)
pcr_r2 <- calc_r2(college_split$test$Apps, pcr_predictions)
pls_r2 <- calc_r2(college_split$test$Apps, pls_predictions)
barplot(c(ols_r2, ridge_r2, lasso_r2, pcr_r2, pls_r2),
names.arg = c("OLS", "Ridge", "Lasso", "PCR", "PLS"))
