# Load required packages
library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(e1071)
library(class)
# Load Boston data set
data("Boston")
# Create response variable: crime_rate_above_median
crime_rate_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
# Add the response variable to the Boston data set
Boston$crime_rate_above_median <- crime_rate_above_median
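# Sanity check (a sketch added here, not in the original run): the median cut
# splits the 506 towns into two equal classes, matching the 0.5/0.5 prior
# probabilities reported by lda() and naiveBayes() below
table(Boston$crime_rate_above_median)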
# Logistic Regression (note: the response is defined from `crim`, and `crim` is
# itself among the predictors, so the classes are perfectly separable and glm warns)
logistic_model <- glm(crime_rate_above_median ~ ., data = Boston, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
##
## Call:
## glm(formula = crime_rate_above_median ~ ., family = "binomial",
## data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.437e+01 1.202e+05 0.000 1.000
## crim 1.083e+03 1.773e+04 0.061 0.951
## zn 2.194e+00 5.856e+01 0.037 0.970
## indus -2.510e+00 9.002e+02 -0.003 0.998
## chas 4.489e+00 1.014e+04 0.000 1.000
## nox -2.585e+02 1.458e+05 -0.002 0.999
## rm -3.953e+01 1.653e+03 -0.024 0.981
## age 3.437e-01 5.798e+01 0.006 0.995
## dis -1.742e+01 2.146e+03 -0.008 0.994
## rad -5.933e+00 2.642e+03 -0.002 0.998
## tax 1.639e-01 1.078e+02 0.002 0.999
## ptratio 5.525e+00 3.640e+03 0.002 0.999
## black 3.266e-02 1.208e+01 0.003 0.998
## lstat -1.687e+00 3.560e+02 -0.005 0.996
## medv 2.358e+00 5.382e+02 0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.0146e+02 on 505 degrees of freedom
## Residual deviance: 2.8371e-05 on 491 degrees of freedom
## AIC: 30
##
## Number of Fisher Scoring iterations: 25
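# The warnings above reflect perfect separation: the response is an indicator of
# crim > median(crim), and crim is among the predictors. A cleaner fit (a sketch,
# not the run shown above; `logistic_model2` is a name introduced here) drops crim:
logistic_model2 <- glm(crime_rate_above_median ~ . - crim, data = Boston,
                       family = "binomial")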
# LDA
lda_model <- lda(crime_rate_above_median ~ ., data = Boston)
lda_model
## Call:
## lda(crime_rate_above_median ~ ., data = Boston)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## crim zn indus chas nox rm age dis
## 0 0.0955715 21.525692 7.002292 0.05138340 0.4709711 6.394395 51.31028 5.091596
## 1 7.1314756 1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953 2.498489
## rad tax ptratio black lstat medv
## 0 4.158103 305.7431 17.90711 388.7061 9.419486 24.94941
## 1 14.940711 510.7312 19.00395 324.6420 15.886640 20.11621
##
## Coefficients of linear discriminants:
## LD1
## crim 0.0046376592
## zn -0.0056431194
## indus 0.0126159626
## chas -0.0592836851
## nox 8.1826206579
## rm 0.0874007870
## age 0.0112829040
## dis 0.0453643651
## rad 0.0699133176
## tax -0.0008444666
## ptratio 0.0513806507
## black -0.0009892799
## lstat 0.0143945059
## medv 0.0386990631
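# A quick training-set check of the LDA fit (a sketch; these lines were not
# part of the original run):
lda.pred <- predict(lda_model)$class
table(lda.pred, Boston$crime_rate_above_median)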
# Naive Bayes
naive_bayes_model <- naiveBayes(as.factor(crime_rate_above_median) ~ ., data = Boston)
naive_bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5 0.5
##
## Conditional probabilities:
## crim
## Y [,1] [,2]
## 0 0.0955715 0.06281773
## 1 7.1314756 11.10912294
##
## zn
## Y [,1] [,2]
## 0 21.525692 29.319808
## 1 1.201581 4.798611
##
## indus
## Y [,1] [,2]
## 0 7.002292 5.514454
## 1 15.271265 5.439010
##
## chas
## Y [,1] [,2]
## 0 0.05138340 0.2212161
## 1 0.08695652 0.2823299
##
## nox
## Y [,1] [,2]
## 0 0.4709711 0.05559789
## 1 0.6384190 0.09870365
##
## rm
## Y [,1] [,2]
## 0 6.394395 0.5556856
## 1 6.174874 0.8101381
##
## age
## Y [,1] [,2]
## 0 51.31028 25.88190
## 1 85.83953 17.87423
##
## dis
## Y [,1] [,2]
## 0 5.091596 2.081304
## 1 2.498489 1.085521
##
## rad
## Y [,1] [,2]
## 0 4.158103 1.659121
## 1 14.940711 9.529843
##
## tax
## Y [,1] [,2]
## 0 305.7431 87.4837
## 1 510.7312 167.8553
##
## ptratio
## Y [,1] [,2]
## 0 17.90711 1.811216
## 1 19.00395 2.346947
##
## black
## Y [,1] [,2]
## 0 388.7061 22.83774
## 1 324.6420 118.83084
##
## lstat
## Y [,1] [,2]
## 0 9.419486 4.923497
## 1 15.886640 7.546922
##
## medv
## Y [,1] [,2]
## 0 24.94941 7.232047
## 1 20.11621 10.270362
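# The analogous training-set check for naive Bayes (again a sketch, not from
# the original run):
nb.pred <- predict(naive_bayes_model, newdata = Boston)
table(nb.pred, Boston$crime_rate_above_median)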
set.seed(123) # For reproducibility
train_indices <- sample(1:nrow(Boston), 0.7*nrow(Boston)) # 70% for training
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
# Note: `[, -1]` drops only `crim`; the response column itself remains among
# the (unscaled) KNN predictors here, so the table below is optimistic
knn_model <- knn(train_data[, -1], test_data[, -1], train_data$crime_rate_above_median, k = 5)
table(knn_model, test_data$crime_rate_above_median)
##
## knn_model 0 1
## 0 69 5
## 1 6 72
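# A leakage-free, scaled variant of the KNN fit above (a sketch; `predictors`
# and `knn_fixed` are names introduced here, and the table above comes from the
# original call):
predictors <- scale(Boston[, !(names(Boston) %in% c("crim", "crime_rate_above_median"))])
knn_fixed <- knn(predictors[train_indices, ], predictors[-train_indices, ],
                 Boston$crime_rate_above_median[train_indices], k = 5)
table(knn_fixed, Boston$crime_rate_above_median[-train_indices])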
# CH5
# (a) Fit a logistic regression model that uses income and balance to predict default.
library(ISLR)
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
set.seed(1)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# (b) Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:
# i. Split the sample set into a training set and a validation set.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
# ii. Fit a multiple logistic regression model using only the training observations.
fit.glm <- glm(default ~ income + balance, data = Default[train, ], family = "binomial")
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
# Both calls fit the same model on the training rows; the subset form is used below.
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.194e+01 6.178e-01 -19.333 < 2e-16 ***
## income 3.262e-05 7.024e-06 4.644 3.41e-06 ***
## balance 5.689e-03 3.158e-04 18.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1523.8 on 4999 degrees of freedom
## Residual deviance: 803.3 on 4997 degrees of freedom
## AIC: 809.3
##
## Number of Fisher Scoring iterations: 8
# iii. Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.
glm.probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
# iv. Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0254
# (c) Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0274
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0244
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0244
# The three estimates (2.74%, 2.44%, 2.44%) differ only slightly across splits,
# all close to the 2.54% from (b): the validation error depends on the split
# but is fairly stable at about 2.5%.
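# The three repeats above could equally be wrapped in a small helper
# (a sketch; `val_err` is a name introduced here, not from the original):
val_err <- function() {
  train <- sample(dim(Default)[1], dim(Default)[1] / 2)
  fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
  probs <- predict(fit, newdata = Default[-train, ], type = "response")
  mean(ifelse(probs > 0.5, "Yes", "No") != Default[-train, ]$default)
}
replicate(3, val_err())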
# (d) Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
pred.glm <- rep("No", length(probs))
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0278
# Adding the student dummy gives a 2.78% validation error, about the same as
# without it, so it does not appear to reduce the test error rate.
# CH6
# (a) Split the data set into a training set and a test set.
data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
# (b) Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit <- lm(Apps ~ ., data = College, subset = train)
lm.pred <- predict(lm.fit, College[test, ])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
# (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x <- model.matrix(Apps ~ ., data = College)[, -1]
x.train <- x[train, ]
y <- College$Apps
y.train <- y[train]
set.seed(7)
ridge.fit <- glmnet(x.train, y.train, alpha = 0)
cv.ridge.fit <- cv.glmnet(x.train, y.train, alpha = 0)
plot(cv.ridge.fit)
bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
ridge.pred <- predict(ridge.fit, s = bestlam, newx = x[test, ])
ridge.error <- mean((ridge.pred - y.test)^2)
ridge.error
## [1] 1138030
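# Equivalently (a hedged alternative to the call above, not from the original
# run), a cv.glmnet object can predict directly at its CV-chosen lambda:
ridge.pred2 <- predict(cv.ridge.fit, s = "lambda.min", newx = x[test, ])
mean((ridge.pred2 - y.test)^2)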
# (d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
set.seed(2)
lasso.fit <- glmnet(x.train, y.train, alpha = 1)
cv.lasso.fit <- cv.glmnet(x.train, y.train, alpha = 1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
lasso.pred <- predict(lasso.fit, s = bestlam.lasso, newx = x[test, ])
lasso.error <- mean((lasso.pred - y.test)^2)
lasso.error
## [1] 1045922
# Use the lasso's own CV-chosen lambda here (bestlam.lasso), not the ridge
# bestlam: at s = 424.3 the lasso shrinks all but 3 coefficients to exactly
# zero. Also index all 18 rows (intercept plus 17 predictors).
lasso.c <- predict(lasso.fit, type = "coefficients", s = bestlam.lasso)[1:18, ]
length(lasso.c[lasso.c != 0])
# (e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr.fit <- pcr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 4463 2362 2385 2092 1835 1832
## adjCV 4440 4465 2359 2384 2066 1824 1823
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1829 1821 1753 1764 1771 1772 1784
## adjCV 1818 1813 1746 1757 1764 1764 1777
## 14 comps 15 comps 16 comps 17 comps
## CV 1778 1747 1401 1338
## adjCV 1773 1701 1385 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.4816 57.40 64.84 70.54 75.80 80.23 84.04 87.64
## Apps 0.1398 72.45 72.50 80.45 84.74 84.77 85.06 85.16
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.87 93.29 95.33 96.98 98.05 98.75 99.37
## Apps 85.93 86.16 86.21 86.32 86.40 86.61 92.49
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.65 94.32
validationplot(pcr.fit, val.type = "MSEP")
# CV error is minimized at M = 17, i.e. all components, so PCR here coincides
# with the least-squares fit (note pcr.error equals lm.error)
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 17)
pcr.error <- mean((pcr.pred - y.test)^2)
pcr.error
## [1] 1093608
# (f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
set.seed(1)
pls.fit <- plsr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 2153 1701 1711 1650 1498 1413
## adjCV 4440 2147 1677 1704 1615 1470 1392
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1378 1358 1356 1342 1338 1337 1337
## adjCV 1361 1342 1339 1326 1322 1321 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1338 1338 1338 1338
## adjCV 1322 1322 1322 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.76 31.99 61.88 65.00 68.68 73.82 78.33 81.38
## Apps 77.92 87.80 88.38 92.06 93.59 93.92 94.01 94.09
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.24 85.59 87.76 90.49 92.69 94.27 97.03
## Apps 94.20 94.27 94.30 94.31 94.32 94.32 94.32
## 16 comps 17 comps
## X 99.21 100.00
## Apps 94.32 94.32
validationplot(pls.fit, val.type = "MSEP")
# CV error levels off by about M = 12 components, so ncomp = 12 is chosen
pls.pred <- predict(pls.fit, x[test, ], ncomp = 12)
pls.error <- mean((pls.pred - y.test)^2)
pls.error
## [1] 1085346
# (g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))
print(sort(errors))
## lasso pls pcr lm ridge
## 1045922 1085346 1093608 1093608 1138030
# All five test MSEs fall in a narrow band (about 1.05e6 to 1.14e6): the lasso
# is slightly best and ridge slightly worst, with PCR at M = 17 matching least
# squares exactly. No approach predicts Apps much more accurately than the rest.
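# To put these MSEs on an interpretable scale (a sketch added here, not part of
# the original analysis), a test-set R^2 for the best model; given the
# intercept-only CV RMSEP of about 4440 in the PCR output, this comes out to
# roughly 0.9, i.e. applications are predicted quite accurately:
1 - lasso.error / mean((y.test - mean(y.test))^2)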