112035149_final exam

library(ISLR2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(stringr)
library(rsample)
library(caret)

## Loading required package: lattice

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## The following object is masked from 'package:ISLR2':
## 
##     Boston

library(naivebayes)

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

library(e1071)

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:rsample':
## 
##     permutations

library(class)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-8

library(pcr)
library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:caret':
## 
##     R2

## The following object is masked from 'package:stats':
## 
##     loadings

# Chapter 4
# 16. Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the me-dian. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
data("Boston")
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Create binary response variable indicating whether the crime rate is above or below the median
Boston$crime_factor <- ifelse(Boston$crim > median(Boston$crim), "High", "Low")

# Convert to factor
Boston$crime_factor <- factor(Boston$crime_factor, levels = c("Low", "High"))
# Scatterplot
Boston %>%
  dplyr::select(crim, crime_factor, rad, nox, tax, age, dis, indus) %>%
  gather(Variable, value, -crim, -crime_factor) %>%
  mutate(Variable = stringr::str_to_title(Variable)) %>%
  ggplot(aes(value, crim)) +
  geom_point(aes(col = crime_factor)) +
  facet_wrap(~ Variable, scales = 'free') +
  geom_smooth(method = 'lm', formula = y ~ poly(x, 3), se = FALSE) +
  guides(col = FALSE) +
  labs(title = 'Scatterplots')

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Splitting the data
set.seed(610)
boston_split <- initial_split(Boston, prop=0.8,strata= crime_factor)
boston_training <- training(boston_split)
boston_testing <- testing(boston_split)
# 1. Logistic Regression
glm_model <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3), 
                 data = boston_training, family = "binomial")

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(glm_model)

## 
## Call:
## glm(formula = crime_factor ~ poly(rad, 3) + poly(nox, 3) + poly(tax, 
##     3) + poly(age, 3) + poly(dis, 3) + poly(indus, 3), family = "binomial", 
##     data = boston_training)
## 
## Coefficients:
##                   Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)     -3.790e+14  3.339e+06 -113518879   <2e-16 ***
## poly(rad, 3)1    8.745e+16  4.751e+08  184071758   <2e-16 ***
## poly(rad, 3)2    8.225e+15  1.075e+08   76514466   <2e-16 ***
## poly(rad, 3)3    1.953e+15  7.773e+07   25122280   <2e-16 ***
## poly(nox, 3)1    1.888e+16  1.771e+08  106597240   <2e-16 ***
## poly(nox, 3)2    1.054e+15  1.016e+08   10377240   <2e-16 ***
## poly(nox, 3)3    2.177e+15  9.908e+07   21975041   <2e-16 ***
## poly(tax, 3)1   -6.941e+16  4.130e+08 -168075367   <2e-16 ***
## poly(tax, 3)2   -3.403e+16  1.859e+08 -183065250   <2e-16 ***
## poly(tax, 3)3    1.628e+16  8.421e+07  193318513   <2e-16 ***
## poly(age, 3)1   -1.028e+15  1.234e+08   -8335643   <2e-16 ***
## poly(age, 3)2    2.885e+15  7.958e+07   36254178   <2e-16 ***
## poly(age, 3)3    1.990e+15  7.152e+07   27820397   <2e-16 ***
## poly(dis, 3)1   -4.984e+15  1.606e+08  -31035200   <2e-16 ***
## poly(dis, 3)2    3.258e+15  9.902e+07   32900446   <2e-16 ***
## poly(dis, 3)3    1.957e+15  8.154e+07   23994946   <2e-16 ***
## poly(indus, 3)1  1.115e+16  1.411e+08   78989311   <2e-16 ***
## poly(indus, 3)2  4.318e+15  1.171e+08   36885916   <2e-16 ***
## poly(indus, 3)3  8.514e+15  1.210e+08   70376384   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  560.06  on 403  degrees of freedom
## Residual deviance: 2811.40  on 385  degrees of freedom
## AIC: 2849.4
## 
## Number of Fisher Scoring iterations: 25

glm_fit <- predict(glm_model,type="response", newdata=boston_testing) 

predict_binary_glm <- rep("Low", 102)
predict_binary_glm[glm_fit > .5] = "High"
predict_binary_glm <- predict_binary_glm %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))

## New names:
## • `` -> `...1`

colnames(predict_binary_glm) <- c("predicted_value", "actual_value")
predict_binary_glm$predicted_value <- as.factor(predict_binary_glm$predicted_value)
predict_binary_glm$actual_value <- as.factor(predict_binary_glm$actual_value)
confusion_glm <- confusionMatrix(predict_binary_glm$predicted_value, predict_binary_glm$actual_value)

## Warning in confusionMatrix.default(predict_binary_glm$predicted_value,
## predict_binary_glm$actual_value): Levels are not in the same order for
## reference and data. Refactoring data to match.

confusion_glm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low High
##       Low   49    4
##       High   2   47
##                                           
##                Accuracy : 0.9412          
##                  95% CI : (0.8764, 0.9781)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8824          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9608          
##             Specificity : 0.9216          
##          Pos Pred Value : 0.9245          
##          Neg Pred Value : 0.9592          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4804          
##    Detection Prevalence : 0.5196          
##       Balanced Accuracy : 0.9412          
##                                           
##        'Positive' Class : Low             
##

# 2. Linear Discriminant Analysis
lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3), 
                 data = boston_training)
predict_lda <- predict(lda_model, type= "response", newdata=boston_testing)$class
predict_result_lda <- predict_lda %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))

## New names:
## • `` -> `...1`

colnames(predict_result_lda) <- c("predicted_value", "actual_value")
confusion_lda <- confusionMatrix(predict_result_lda$predicted_value, predict_result_lda$actual_value)
confusion_lda

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low High
##       Low   46    1
##       High   5   50
##                                           
##                Accuracy : 0.9412          
##                  95% CI : (0.8764, 0.9781)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8824          
##                                           
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.9020          
##             Specificity : 0.9804          
##          Pos Pred Value : 0.9787          
##          Neg Pred Value : 0.9091          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4510          
##    Detection Prevalence : 0.4608          
##       Balanced Accuracy : 0.9412          
##                                           
##        'Positive' Class : Low             
##

# 3. Naivebayes
naivebayes_model <- naiveBayes(crime_factor ~ rad + nox+ tax + age + indus, 
                               data = boston_training)
naivebayes_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##  Low High 
##  0.5  0.5 
## 
## Conditional probabilities:
##       rad
## Y           [,1]     [,2]
##   Low   4.168317 1.645554
##   High 14.475248 9.592798
## 
##       nox
## Y           [,1]       [,2]
##   Low  0.4709698 0.05651158
##   High 0.6326337 0.09464589
## 
##       tax
## Y          [,1]      [,2]
##   Low  306.1089  83.91098
##   High 503.1485 168.82912
## 
##       age
## Y          [,1]     [,2]
##   Low  50.87376 24.98412
##   High 85.07129 18.87520
## 
##       indus
## Y           [,1]     [,2]
##   Low   6.998465 5.471512
##   High 15.143713 5.522585

predict_naivebayes <- predict(naivebayes_model, type= "class", newdata=boston_testing)
predict_result_naivebayes <- predict_naivebayes %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))

## New names:
## • `` -> `...1`

colnames(predict_result_naivebayes) <- c("predicted_value", "actual_value")
confusion_naivebayes <- confusionMatrix(predict_result_naivebayes$predicted_value, predict_result_naivebayes$actual_value)
confusion_naivebayes

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low High
##       Low   47   11
##       High   4   40
##                                           
##                Accuracy : 0.8529          
##                  95% CI : (0.7691, 0.9153)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 8.267e-14       
##                                           
##                   Kappa : 0.7059          
##                                           
##  Mcnemar's Test P-Value : 0.1213          
##                                           
##             Sensitivity : 0.9216          
##             Specificity : 0.7843          
##          Pos Pred Value : 0.8103          
##          Neg Pred Value : 0.9091          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4608          
##    Detection Prevalence : 0.5686          
##       Balanced Accuracy : 0.8529          
##                                           
##        'Positive' Class : Low             
##

# 4. K-nearest neighbors
variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')

x_training <- boston_training[, variables]
y_training <- boston_training$crime_factor
x_testing <- boston_testing[, variables]
acc <- list()

for (i in 1:20) {
  knn_pred <- knn(train = x_training, test = x_testing, cl = y_training, k = i)
  acc[as.character(i)] = mean(knn_pred == boston_testing$crime_factor)
}

acc <- unlist(acc)

data_frame(acc = acc) %>%
  mutate(k = row_number()) %>%
  ggplot(aes(k, acc)) +
  geom_col(aes(fill = k == which.max(acc))) +
  labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy') +
  scale_x_continuous(breaks = 1:20) +
  scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
                                    digits = 3)) +
  geom_hline(yintercept = max(acc), lty = 2) +
  coord_cartesian(ylim = c(min(acc), max(acc))) +
  guides(fill = FALSE)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#final model
knn_final <- knn(train = x_training, test = x_testing, cl = y_training, k = 1)
head(knn_final)

## [1] Low  Low  Low  High Low  Low 
## Levels: Low High

predict_result_knn <- knn_final %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))

## New names:
## • `` -> `...1`

colnames(predict_result_knn) <- c("predicted_value", "actual_value")
confusion_knn <- confusionMatrix(predict_result_knn$predicted_value, predict_result_knn$actual_value)
confusion_knn

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low High
##       Low   48    1
##       High   3   50
##                                           
##                Accuracy : 0.9608          
##                  95% CI : (0.9026, 0.9892)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9216          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9412          
##             Specificity : 0.9804          
##          Pos Pred Value : 0.9796          
##          Neg Pred Value : 0.9434          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4706          
##    Detection Prevalence : 0.4804          
##       Balanced Accuracy : 0.9608          
##                                           
##        'Positive' Class : Low             
##

#Chapter 5: Resampling Methods
# 5. In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
# a. Fit a logistic regression model that uses income and balance to predict default

attach(Default)
set.seed(106)
fit.glm <- glm(default ~ income+balance, data=Default, family="binomial")
summary(fit.glm)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# b. Using the validation set approach, estimate the test error of this model

train <- sample(dim(Default)[1], dim(Default)[1]/2)
fit.glm <- glm(default ~ income+balance, data=Default, family="binomial")
summary(fit.glm)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train,]$default)

## [1] 0.0248

# c. Repeat the process in (b) 3 times, using 3 different splits of the obervations unto a training set and a validation set. Comment on the results obtained
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)

## [1] 0.0256

train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)

## [1] 0.0254

# d. Consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variables for student. Estimate the test erroe for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)

## [1] 0.022

# Chapter 6: Linear Model Selection and Regularization
# 9. In this exercise, we will predict the number of applications received using the other variables in the College data set.
# a. Split the data into a training set and a test set
sample_split <- function(dataset, split_ratio, seed = NULL) {
  set.seed(seed)
  
  train_subset <- sample(nrow(dataset) * split_ratio)
  
  return(list(
    train = dataset[train_subset, ],
    test = dataset[-train_subset, ]
  ))
}

college_split <- sample_split(College, 0.7, seed = 601)
# b. Fit a linear model using least squares on the training set, and report the test error obtained
ols_model <- lm(Apps ~ . , data = college_split$train)
ols_predictions <- predict(ols_model, college_split$test)
# MSE function
calc_rmse <- function(y, y_hat) {
  return(sqrt(mean((y - y_hat)^2)))
}
ols_rmse <- calc_rmse(college_split$test$Apps, ols_predictions)
ols_rmse

## [1] 1188.832

# c. Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
train_matrix <- model.matrix(Apps ~ . , data = college_split$train)
test_matrix <- model.matrix(Apps ~ . , data = college_split$test)

grid <- 10 ^ seq(4, -2, length = 100)

ridge_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)

ridge_predictions <- predict(ridge_model, test_matrix, s = ridge_model$lambda.min)

ridge_rmse <- calc_rmse(college_split$test$Apps, ridge_predictions)
ridge_rmse

## [1] 1188.815

# d. Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
lasso_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)

lasso_predictions <- predict(lasso_model, test_matrix, s = lasso_model$lambda.min)

lasso_rmse <- calc_rmse(college_split$test$Apps, lasso_predictions)
lasso_rmse

## [1] 1188.799

predict(lasso_model, s = lasso_model$lambda.min, type = "coefficients")

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -601.38112596
## (Intercept)    .         
## PrivateYes  -321.41076249
## Accept         1.70685797
## Enroll        -1.35810665
## Top10perc     45.45111099
## Top25perc    -15.88692306
## F.Undergrad    0.09983028
## P.Undergrad    0.01567394
## Outstate      -0.09222852
## Room.Board     0.11816525
## Books         -0.04056782
## Personal       0.05792203
## PhD           -5.56419852
## Terminal      -5.43103094
## S.F.Ratio     21.59623616
## perc.alumni    1.87814752
## Expend         0.10806715
## Grad.Rate      8.13596816

# e. Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pcr_model <- pcr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")

validationplot(pcr_model, val.type = "MSEP")

pcr_predictions <- predict(pcr_model, college_split$test, ncomp = 10)

pcr_rmse <- calc_rmse(college_split$test$Apps, pcr_predictions)
pcr_rmse

## [1] 1556.821

# f. Fit a PLS model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pls_model <- plsr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")

validationplot(pls_model, val.type = "MSEP")

pls_predictions <- predict(pls_model, college_split$test, ncomp = 10)

pls_rmse <- calc_rmse(college_split$test$Apps, pls_predictions)
pls_rmse

## [1] 1178.361

# g. Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
# Function to compute R-squared
calc_r2 <- function(y, y_hat) {
  y_bar <- mean(y)
  rss <- sum((y - y_hat)^2)
  tss <- sum((y - y_bar)^2)
  return(1 - (rss / tss))
}
ols_r2 <- calc_r2(college_split$test$Apps, ols_predictions)
ridge_r2 <- calc_r2(college_split$test$Apps, ridge_predictions)
lasso_r2 <- calc_r2(college_split$test$Apps, lasso_predictions)
pcr_r2 <- calc_r2(college_split$test$Apps, pcr_predictions)
pls_r2 <- calc_r2(college_split$test$Apps, pls_predictions)

barplot(c(ols_r2, ridge_r2, lasso_r2, pcr_r2, pls_r2),
        names.arg = c("OLS", "Ridge", "Lasso", "PCR", "PLS"))

112035149_final exam_part1

Munkhjin Sambuu

2024-06-13