library(ISLR2)
data <- Boston
head(data,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rsample)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(kknn)
##
## Attaching package: 'kknn'
##
## The following object is masked from 'package:caret':
##
## contr.dummy
data <- data %>%
  mutate(chas = factor(chas),
         crime_rate = factor(ifelse(crim > median(crim), 'High', 'Low'),
                             levels = c('High', 'Low')))
head(data,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## crime_rate
## 1 Low
## 2 Low
## 3 Low
## 4 Low
## 5 Low
## 6 Low
summary(data)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio lstat medv crime_rate
## Min. :187.0 Min. :12.60 Min. : 1.73 Min. : 5.00 High:253
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95 1st Qu.:17.02 Low :253
## Median :330.0 Median :19.05 Median :11.36 Median :21.20
## Mean :408.2 Mean :18.46 Mean :12.65 Mean :22.53
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :711.0 Max. :22.00 Max. :37.97 Max. :50.00
set.seed(123)
data_split <- initial_split(data, prop = .7, strata = "crim")
data_train <- training(data_split)
head(data_train,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## 36 0.06417 0 5.96 0 0.499 5.933 68.2 3.3603 5 279 19.2 9.68 18.9
## 40 0.02763 75 2.95 0 0.428 6.595 21.8 5.4011 3 252 18.3 4.32 30.8
## 53 0.05360 21 5.64 0 0.439 6.511 21.1 6.8147 4 243 16.8 5.28 25.0
## crime_rate
## 4 Low
## 5 Low
## 6 Low
## 36 Low
## 40 Low
## 53 Low
data_test <- testing(data_split)
head(data_test,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 17.10 18.9
## 16 0.62739 0.0 8.14 0 0.538 5.834 56.5 4.4986 4 307 21.0 8.47 19.9
## 25 0.75026 0.0 8.14 0 0.538 5.924 94.1 4.3996 4 307 21.0 16.30 15.6
## crime_rate
## 1 Low
## 2 Low
## 3 Low
## 10 Low
## 16 High
## 25 High
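Because the split is stratified on crim and the classes were defined by the median of crim, the High/Low balance should carry over to both partitions. A quick sanity check, a minimal sketch using the objects above:
# Class balance in each partition; both should be close to 50/50
prop.table(table(data_train$crime_rate))
prop.table(table(data_test$crime_rate))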
#Logistic regression
model1 <- glm(crime_rate ~ chas + indus + age + dis + rad + tax, data = data_train, family = binomial)
summary(model1)
##
## Call:
## glm(formula = crime_rate ~ chas + indus + age + dis + rad + tax,
## family = binomial, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.66606 -0.01960 0.02098 0.55750 1.75338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.270142 1.179986 1.924 0.05437 .
## chas1 0.309733 0.658693 0.470 0.63820
## indus -0.075372 0.037364 -2.017 0.04367 *
## age -0.025658 0.008654 -2.965 0.00303 **
## dis 0.433994 0.133896 3.241 0.00119 **
## rad -0.537394 0.127793 -4.205 2.61e-05 ***
## tax 0.005480 0.002473 2.216 0.02671 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 487.98 on 351 degrees of freedom
## Residual deviance: 230.44 on 345 degrees of freedom
## AIC: 244.44
##
## Number of Fisher Scoring iterations: 8
predicted1 <- predict(model1, data_train, type = "response")  # type = "response" gives P(crime_rate == "Low")
predicted2 <- if_else(predicted1 > 0.5, "Low", "High")
cross.tab1 <- table(data_train$crime_rate, predicted2)  # confusion matrix against the actual class labels
cross.tab1
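The same model can also be scored on the held-out split. A minimal sketch reusing model1 and data_test from above (the 0.5 cutoff mirrors the training-set rule; with levels c('High', 'Low'), glm models P(crime_rate == "Low")):
test.prob <- predict(model1, data_test, type = "response")
test.pred <- if_else(test.prob > 0.5, "Low", "High")
table(test.pred, data_test$crime_rate)   # test-set confusion matrix
mean(test.pred == data_test$crime_rate)  # test-set accuracy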
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
model3 <- lda(crime_rate ~ chas + indus + age + dis + rad + tax, data = data_train)
summary(model3)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 12 -none- numeric
## scaling 6 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 4 -none- call
## terms 3 terms call
## xlevels 1 -none- list
predicted5 <- predict(model3, data_test)
predicted6 <- predicted5$class
crosstab2 <- table(predicted6, data_test$crime_rate)
crosstab2
##
## predicted6 High Low
## High 61 5
## Low 16 72
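The table above implies an overall test accuracy of (61 + 72) / 154, roughly 0.86; a one-line check using the same objects:
mean(predicted6 == data_test$crime_rate)  # LDA test accuracy, about 0.86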
#QDA
model4 <- qda(crime_rate ~ chas + indus + age + dis + rad + tax, data = data_train)
predicted7 <- predict(model4, data_test)
predicted8 <- predicted7$class
table(predicted8, data_test$crime_rate)
##
## predicted8 High Low
## High 64 6
## Low 13 71
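By the same arithmetic, QDA's test accuracy is (64 + 71) / 154, roughly 0.88:
mean(predicted8 == data_test$crime_rate)  # QDA test accuracy, about 0.88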
#KNN
model5 <- knn3(crime_rate ~ chas + indus + age + dis + rad + tax, data = data_train)  # k = 5 by default
summary(model5)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 1 -none- list
## theDots 0 -none- list
model5
## 5-nearest neighbor model
## Training set outcome distribution:
##
## High Low
## 176 176
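model5 is fit but never evaluated above. A minimal sketch of test-set scoring with caret's predict method for knn3 objects (type = "class" returns hard class labels):
knn.pred <- predict(model5, data_test, type = "class")
table(knn.pred, data_test$crime_rate)   # KNN test-set confusion matrix
mean(knn.pred == data_test$crime_rate)  # KNN test accuracy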
#Naivebayes
library("naivebayes")
## naivebayes 0.9.7 loaded
model6 <- naive_bayes(crime_rate ~ chas + indus + age + dis + rad + tax, data = data_train)
model6
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = crime_rate ~ chas + indus + age +
##     dis + rad + tax, data = data_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## High Low
## 0.5 0.5
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: chas (Bernoulli)
## ---------------------------------------------------------------------------------
##
## chas High Low
## 0 0.93750000 0.95454545
## 1 0.06250000 0.04545455
##
## ---------------------------------------------------------------------------------
## ::: indus (Gaussian)
## ---------------------------------------------------------------------------------
##
## indus High Low
## mean 15.132500 7.135398
## sd 5.532886 5.639466
##
## ---------------------------------------------------------------------------------
## ::: age (Gaussian)
## ---------------------------------------------------------------------------------
##
## age High Low
## mean 84.86818 52.30227
## sd 18.79913 25.83849
##
## ---------------------------------------------------------------------------------
## ::: dis (Gaussian)
## ---------------------------------------------------------------------------------
##
## dis High Low
## mean 2.564850 5.058627
## sd 1.136806 2.112513
##
## ---------------------------------------------------------------------------------
## ::: rad (Gaussian)
## ---------------------------------------------------------------------------------
##
## rad High Low
## mean 15.045455 4.255682
## sd 9.555742 1.648366
##
## ---------------------------------------------------------------------------------
##
## # ... and 1 more table
##
## ---------------------------------------------------------------------------------
predicted10 <- predict(model6, data_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
a <- table(predicted10, data_test$crime_rate)
a
##
## predicted10 High Low
## High 56 6
## Low 21 71
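With all the classifiers scored on the same test set, a short summary makes the comparison explicit. A sketch using the prediction vectors computed above (knn.pred assumes the KNN scoring sketch earlier was run):
acc <- c(LDA = mean(predicted6 == data_test$crime_rate),
         QDA = mean(predicted8 == data_test$crime_rate),
         KNN = mean(knn.pred == data_test$crime_rate),
         NB  = mean(predicted10 == data_test$crime_rate))
sort(acc, decreasing = TRUE)  # rank models by test accuracy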
2a: Which of the three models with k predictors has the smallest training RSS?
When performing best subset selection, the model with k predictors is the one with the smallest RSS among all (p choose k) models containing exactly k predictors.
When performing forward stepwise selection, the model with k predictors is the one with the smallest RSS among the p−k models that augment the predictors in M(k−1) with one additional predictor.
When performing backward stepwise selection, the model with k predictors is the one with the smallest RSS among the k+1 models that contain all but one of the predictors in M(k+1).
So the model with k predictors that has the smallest training RSS is the one obtained from best subset selection: the candidates considered by either stepwise method at size k are a subset of the (p choose k) models best subset searches, so best subset can never do worse on training RSS.
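This ordering is easy to verify empirically. A minimal sketch with leaps::regsubsets (assuming the leaps package is installed), regressing crim on the other Boston variables: for every size k, best subset's training RSS is no larger than forward stepwise's.
library(leaps)
fit.best <- regsubsets(crim ~ ., data = ISLR2::Boston, nvmax = 5)
fit.fwd <- regsubsets(crim ~ ., data = ISLR2::Boston, nvmax = 5, method = "forward")
# One row per model size k; the best-subset column is <= the forward column
cbind(best = summary(fit.best)$rss, forward = summary(fit.fwd)$rss)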
2b: Which of the three models with k predictors has the smallest test RSS?
Best subset selection may have the smallest test RSS because it considers more candidate models than the other methods.
However, the stepwise methods might happen to select a model that fits the test data better, since their more restricted searches make them less prone to overfitting the training data.
The outcome depends more heavily on the choice of test set / validation method than on the selection method, so no method is guaranteed the smallest test RSS.
2c: True or False:
The predictors in the k-variable model identified by forward stepwise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. TRUE. The (k+1)-variable model is obtained by augmenting the k-variable model with one additional predictor.
The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. TRUE. The k-variable model is obtained by removing one predictor from the (k+1)-variable model.
The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. FALSE. There is no direct link between the models obtained from forward and backward selection.
The predictors in the k-variable model identified by forward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. FALSE. There is no direct link between the models obtained from forward and backward selection.
The predictors in the k-variable model identified by best subset are a subset of the predictors in the (k+1)-variable model identified by best subset selection. FALSE. Best subset selection searches all models of each size independently, so the (k+1)-variable model need not contain the predictors of the k-variable model.
# Question 3:
# Importing data
data2 <- College
head(data2, 6)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
summary(data2)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
set.seed(123)
data_split1 <- initial_split(data2, prop = .7, strata = "Apps")
data_train1 <- training(data_split1)
data_test1 <- testing(data_split1)
head(data_train1,6)
## Private Apps Accept Enroll Top10perc Top25perc
## Albertus Magnus College Yes 353 340 103 17 45
## Alderson-Broaddus College Yes 582 498 172 21 44
## Alverno College Yes 494 313 157 23 46
## Antioch University Yes 713 661 252 25 44
## Aquinas College Yes 619 516 219 20 51
## Arkansas College (Lyon College) Yes 708 334 166 46 74
## F.Undergrad P.Undergrad Outstate Room.Board
## Albertus Magnus College 416 230 13290 5720
## Alderson-Broaddus College 799 78 10468 3380
## Alverno College 1317 1235 8352 3640
## Antioch University 712 23 15476 3336
## Aquinas College 1251 767 11208 4124
## Arkansas College (Lyon College) 530 182 8644 3922
## Books Personal PhD Terminal S.F.Ratio
## Albertus Magnus College 500 1500 90 93 11.5
## Alderson-Broaddus College 660 1800 40 41 11.5
## Alverno College 650 2449 36 69 11.1
## Antioch University 400 1100 69 82 11.3
## Aquinas College 350 1615 55 65 12.7
## Arkansas College (Lyon College) 500 800 79 88 12.6
## perc.alumni Expend Grad.Rate
## Albertus Magnus College 26 8861 63
## Alderson-Broaddus College 15 8991 52
## Alverno College 26 8127 55
## Antioch University 35 42926 48
## Aquinas College 25 6584 65
## Arkansas College (Lyon College) 24 14579 54
head(data_test1,6)
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Agnes Scott College Yes 417 349 137 60
## Alaska Pacific University Yes 193 146 55 16
## Albertson College Yes 587 479 158 38
## Allentown Coll. of St. Francis de Sales Yes 1179 780 290 38
## Top25perc F.Undergrad P.Undergrad
## Abilene Christian University 52 2885 537
## Adelphi University 29 2683 1227
## Agnes Scott College 89 510 63
## Alaska Pacific University 44 249 869
## Albertson College 62 678 41
## Allentown Coll. of St. Francis de Sales 64 1130 638
## Outstate Room.Board Books Personal PhD
## Abilene Christian University 7440 3300 450 2200 70
## Adelphi University 12280 6450 750 1500 29
## Agnes Scott College 12960 5450 450 875 92
## Alaska Pacific University 7560 4120 800 1500 76
## Albertson College 13500 3335 500 675 67
## Allentown Coll. of St. Francis de Sales 9690 4785 600 1000 60
## Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 78 18.1 12 7041
## Adelphi University 30 12.2 16 10527
## Agnes Scott College 97 7.7 37 19016
## Alaska Pacific University 72 11.9 2 10922
## Albertson College 73 9.4 11 9727
## Allentown Coll. of St. Francis de Sales 84 13.3 21 7940
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
## Allentown Coll. of St. Francis de Sales 74
lm.fit <- lm(Apps ~ ., data = data_train1)
lm.pred <- predict(lm.fit, data_test1)
mean((data_test1[, "Apps"] - lm.pred)^2)
## [1] 1630969
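A raw test MSE is hard to judge on its own; comparing against an intercept-only baseline gives a test R^2. A minimal sketch using the objects above:
mse.lm <- mean((data_test1$Apps - lm.pred)^2)
mse.null <- mean((data_test1$Apps - mean(data_train1$Apps))^2)  # predict the training mean everywhere
1 - mse.lm / mse.null  # test R^2 of the least-squares fit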
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-6
train.mat <- model.matrix(Apps ~ . -1 , data = data_train1)
test.mat <- model.matrix(Apps ~ . -1, data = data_test1)
grid <- 10 ^ seq(4, -2, length = 100)
mod.ridge <- cv.glmnet(train.mat, data_train1[, "Apps"],
alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best <- mod.ridge$lambda.min
lambda.best
## [1] 32.74549
ridge.pred <- predict(mod.ridge, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - ridge.pred)^2)
## [1] 1856874
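Ridge regression shrinks every coefficient toward zero as lambda grows but never zeroes them out exactly; the coefficient path makes this visible. A one-line sketch on the fitted object:
plot(mod.ridge$glmnet.fit, xvar = "lambda")  # one curve per predictor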
mod.lasso <- cv.glmnet(train.mat, data_train1[, "Apps"],
alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best <- mod.lasso$lambda.min
lambda.best
## [1] 0.1873817
lasso.pred <- predict(mod.lasso, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - lasso.pred)^2)
## [1] 1633075
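Unlike ridge, the lasso can set coefficients exactly to zero, so it is worth checking how sparse the selected model is. A minimal sketch on the fitted object:
lasso.coef <- predict(mod.lasso, s = lambda.best, type = "coefficients")
sum(lasso.coef != 0)  # number of nonzero coefficients, intercept included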