#Question 1

library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
#install.packages("kableExtra")
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#install.packages('corrplot')
library(corrplot)
## corrplot 0.92 loaded
#install.packages("ggthemes") 
library(ggthemes)
library(knitr)
library(broom)
data('Boston')

Boston <- Boston %>%
  mutate(chas = factor(chas),
         crime_factor = factor(ifelse(crim > median(crim), 
                                      'High', 'Low'), 
                               levels = c('High', 'Low')))

cor_test <- Boston %>%
  select(-chas, -crime_factor) %>%
  cor.mtest(conf.level = .95)

Boston %>%
  select(-chas, -crime_factor) %>%
  cor %>%
  corrplot(method = 'color', 
           order = 'hclust', addrect = 2,
           tl.col = 'black', addCoef.col = 'black', number.cex = 0.65,
           p.mat = cor_test$p, sig.level = .05)

scale_this <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

table_1 <- Boston %>%
  dplyr::select(zn:crime_factor) %>%
  gather(Variable, value, -crime_factor, -chas) %>%
  group_by(Variable) %>%
  mutate(value = scale_this(value)) %>%
  group_by(Variable, crime_factor) %>%
  summarize(Q10 = quantile(value, .10),
            Q25 = quantile(value, .25),
            median = median(value), 
            mean = mean(value), 
            Q75 = quantile(value, .75),
            Q90 = quantile(value, .90))
## `summarise()` has grouped output by 'Variable'. You can override using the
## `.groups` argument.
kable(table_1,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1:2, bold = T) %>%
  scroll_box(height = '400px')
Variable crime_factor Q10 Q25 median mean Q75 Q90
age High -0.234 0.470 0.846 0.613 1.042 1.116
age Low -1.782 -1.317 -0.713 -0.613 0.118 0.750
black High -2.942 -0.298 0.292 -0.351 0.421 0.441
black Low 0.222 0.356 0.405 0.351 0.441 0.441
dis High -1.108 -0.970 -0.786 -0.616 -0.355 0.098
dis Low -0.631 -0.202 0.628 0.616 1.275 1.915
indus High -0.720 -0.180 1.015 0.603 1.015 1.231
indus Low -1.306 -1.132 -0.801 -0.603 -0.376 0.247
lstat High -0.914 -0.290 0.352 0.453 1.076 1.931
lstat Low -1.149 -0.964 -0.580 -0.453 -0.045 0.457
medv High -1.319 -0.971 -0.504 -0.263 0.051 0.995
medv Low -0.436 -0.243 0.062 0.263 0.649 1.345
nox High -0.412 -0.092 0.598 0.723 1.254 1.599
nox Low -1.256 -1.067 -0.912 -0.723 -0.343 -0.046
ptratio High -1.735 -0.026 0.806 0.253 0.806 1.175
ptratio Low -1.504 -0.857 -0.210 -0.253 0.298 0.806
rad High -0.637 -0.522 1.660 0.619 1.660 1.660
rad Low -0.867 -0.752 -0.637 -0.619 -0.522 -0.408
rm High -1.337 -0.751 -0.222 -0.156 0.265 1.221
rm Low -0.672 -0.429 0.036 0.156 0.630 1.229
tax High -0.618 -0.601 1.529 0.608 1.529 1.529
tax Low -1.105 -0.957 -0.719 -0.608 -0.375 -0.061
zn High -0.487 -0.487 -0.487 -0.436 -0.487 -0.487
zn Low -0.487 -0.487 -0.487 0.436 0.971 2.943
difference <- function(x) x[1] - x[2]

table_2 <- table_1 %>%
  group_by(Variable) %>%
  summarize(diff_Q10 = difference(Q10),
            diff_Q25 = difference(Q25),
            diff_med = difference(median),
            diff_mean = difference(mean),
            diff_Q75 = difference(Q75),
            diff_Q90 = difference(Q90))


kable(table_2,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1, bold = T) %>%
  add_header_above(c(' ' = 1, 'Between-Groups Differences' = 6)) %>%
  scroll_box(height = '400px')
Between-Groups Differences
Variable diff_Q10 diff_Q25 diff_med diff_mean diff_Q75 diff_Q90
age 1.548 1.787 1.560 1.227 0.924 0.366
black -3.165 -0.654 -0.113 -0.702 -0.020 0.000
dis -0.477 -0.768 -1.414 -1.231 -1.630 -1.817
indus 0.586 0.952 1.816 1.205 1.391 0.984
lstat 0.235 0.674 0.933 0.906 1.122 1.474
medv -0.883 -0.728 -0.565 -0.526 -0.598 -0.350
nox 0.844 0.975 1.510 1.445 1.597 1.645
ptratio -0.231 0.831 1.016 0.507 0.508 0.370
rad 0.230 0.230 2.297 1.238 2.182 2.067
rm -0.666 -0.322 -0.258 -0.312 -0.364 -0.007
tax 0.487 0.356 2.249 1.216 1.905 1.590
zn 0.000 0.000 0.000 -0.871 -1.458 -3.430
table_3 <- table_2 %>%
  gather(Measure, value, -Variable) %>%
  group_by(Variable) %>%
  summarize(`Absolute Mean Differences` = abs(mean(value))) %>%
  arrange(desc(`Absolute Mean Differences`))
## Warning: attributes are not identical across measure variables; they will be
## dropped
kable(table_3,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1, bold = T) %>%
  scroll_box(height = '400px')
Variable Absolute Mean Differences
rad 1.374
nox 1.336
tax 1.300
age 1.235
dis 1.223
indus 1.156
zn 0.960
lstat 0.890
black 0.776
medv 0.608
ptratio 0.500
rm 0.322
Boston %>%
  dplyr::select(zn:crime_factor) %>%
  gather(value_type, value, -crime_factor, -chas) %>%
  ggplot(aes(value_type, value, fill = crime_factor)) +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~value_type, scales = 'free') +
  scale_fill_discrete(name = 'Crime Rate') +
  theme(legend.position = 'top')

#LOGISTIC REGRESSION
set.seed(1)
train_size <- nrow(Boston) * 0.75
inTrain <- sample(1:nrow(Boston), size = train_size)
train <- Boston[inTrain,]
test <- Boston[-inTrain,]

log_reg <- glm(crime_factor ~ rad + nox + tax + 
                 age + dis + indus, data = train, family = binomial)

log_reg %>%
  tidy %>%
  kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 31.312 4.984 6.282 0.000
rad -0.602 0.132 -4.543 0.000
nox -53.440 9.090 -5.879 0.000
tax 0.008 0.003 2.903 0.004
age -0.018 0.011 -1.697 0.090
dis -0.544 0.179 -3.034 0.002
indus 0.095 0.049 1.950 0.051
pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)

table(pred_value, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)
Predicted
Observed
High Low
High 58 14
Low 6 49
a 0.84251968503937
log_reg <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                 poly(tax, 3) + poly(age, 3) + poly(dis, 3), 
               data = train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
tidy(log_reg) %>%
  kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) -336.446 156.083 -2.156 0.031
poly(rad, 3)1 -13319.953 5336.905 -2.496 0.013
poly(rad, 3)2 -2181.427 867.504 -2.515 0.012
poly(rad, 3)3 -297.994 126.652 -2.353 0.019
poly(nox, 3)1 -1674.159 657.049 -2.548 0.011
poly(nox, 3)2 -1384.744 580.787 -2.384 0.017
poly(nox, 3)3 -512.519 215.952 -2.373 0.018
poly(tax, 3)1 3380.046 1285.347 2.630 0.009
poly(tax, 3)2 1213.841 456.842 2.657 0.008
poly(tax, 3)3 233.440 94.770 2.463 0.014
poly(age, 3)1 -3.601 8.031 -0.448 0.654
poly(age, 3)2 -12.508 5.997 -2.086 0.037
poly(age, 3)3 0.976 6.002 0.163 0.871
poly(dis, 3)1 -13.737 15.247 -0.901 0.368
poly(dis, 3)2 2.369 10.966 0.216 0.829
poly(dis, 3)3 -16.354 9.667 -1.692 0.091
pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)

table(pred_value, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)
Predicted
Observed
High Low
High 63 7
Low 1 56
a 0.937007874015748
#LDA
lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3), data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)
Predicted
Observed
High Low
High 62 11
Low 2 52
a 0.897637795275591
lda_model <- lda(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)
Predicted
Observed
High Low
High 49 1
Low 15 62
a 0.874015748031496
#Naive Bayes
library(e1071)
nb_model <- naiveBayes(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(nb_model, test)
acc <- mean(pred == test$crime_factor)

table(pred, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)
Predicted
Observed
High Low
High 49 4
Low 15 59
a 0.850393700787402
#KNN
library(class)
variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')

x_train <- train[, variables]
y_train <- train$crime_factor
x_test <- test[, variables]
acc <- list()

for (i in 1:20) {
  knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = i)
  acc[as.character(i)] = mean(knn_pred == test$crime_factor)
}

acc <- unlist(acc)

data_frame(acc = acc) %>%
  mutate(k = row_number()) %>%
  ggplot(aes(k, acc)) +
  geom_col(aes(fill = k == which.max(acc))) +
  labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy for different values of K') +
  scale_x_continuous(breaks = 1:20) +
  scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
                                    digits = 3)) +
  geom_hline(yintercept = max(acc), lty = 2) +
  coord_cartesian(ylim = c(min(acc), max(acc))) +
  guides(fill = FALSE)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#We can conclude that Logistic regression with polynomial model is the best classifier to use with 93.7% accuracy. 

#Question 2 We perform best subset, forward stepwise, and backward stepwise selection on a single data set. For each approach, we obtain p+1 models, containing 0,1,2,…,p predictors. Explain your answers:

  1. Which of the three models with k predictors has the smallest training RSS?

The model with best subset selection has the smallest training RSS since it considers every possible model with k predictrors.

  1. Which of the three models with k predictors has the smallest test RSS?

Best subset selection may have the smallest test RSS because it takes into account more models than the other methods. However, the other methods might also pick a model with smaller test RSS by sheer luck. So it is hard to say which model has the smallest test RSS.

C.True or False: i. The predictors in the k-variable model identified by forwardstep wise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. True. The model with (k+1) predictors is obtained by augmenting the predictors in the model with k predictors with one additional predictor.

  1. The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. True. The model with k predictors is obtained by removing one predictor from the model with (k+1) predictors.

  2. The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. False. There is no direct link between the models obtained from forward and backward selection.

iv.The predictors in the k-variable model identified by forward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. False. There is no direct link between the models obtained from forward and backward selection.

  1. The predictors in the k-variable model identified by best subset are a subset of the predictors in the (k+1)-variable model identified by best subset selection. False. The model with (k+1) predictors is obtained by selecting among all possible models with (k+1) predictors, and so does not necessarily contain all the predictors selected for the k-variable model.

#Question 3 a) Split the data set into a training set and a test set.

library(ISLR2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidyverse)
data('College')
set.seed(1)

inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)

training <- College[inTrain,]
testing <- College[-inTrain,]

preObj <- preProcess(training, method = c('center', 'scale'))

training <- predict(preObj, training)
testing <- predict(preObj, testing)

y_train <- training$Apps
y_test <- testing$Apps

one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)
  1. Fit a linear model using least squares on the training set, and report the test error obtained.
lin_model <- lm(Apps ~ ., data = training)

pred <- predict(lin_model, testing)

(lin_info <- postResample(pred, testing$Apps))
##      RMSE  Rsquared       MAE 
## 0.2799768 0.9201765 0.1568743
  1. Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet', 
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 10e2, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.2853247 0.9211286 0.1645806
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.034871314
## Private.No   0.075423210
## Private.Yes -0.076037580
## Accept       0.665628735
## Enroll       0.090243371
## Top10perc    0.107160248
## Top25perc    0.011628030
## F.Undergrad  0.063308800
## P.Undergrad  0.017427317
## Outstate    -0.028995432
## Room.Board   0.048720533
## Books        0.012799145
## Personal    -0.002894430
## PhD         -0.017989250
## Terminal    -0.010434665
## S.F.Ratio    0.006920126
## perc.alumni -0.031683867
## Expend       0.083525070
## Grad.Rate    0.058131023
plot(ridge_fit)

plot(varImp(ridge_fit))

d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso_fit <- train(x = x_train, y = y_train, 
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.2914812 0.9141364 0.1511801
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.470609e-02
## Private.No   5.410732e-02
## Private.Yes -1.041607e-14
## Accept       8.883839e-01
## Enroll       .           
## Top10perc    1.092201e-01
## Top25perc    .           
## F.Undergrad  .           
## P.Undergrad  .           
## Outstate     .           
## Room.Board   1.483337e-04
## Books        .           
## Personal     .           
## PhD          .           
## Terminal     .           
## S.F.Ratio    .           
## perc.alumni  .           
## Expend       4.067011e-02
## Grad.Rate    4.062729e-06
plot(lasso_fit)

plot(varImp(lasso_fit))

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.