# Load required packages
library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(e1071)
library(class)

# Load Boston data set
data("Boston")

# Create response variable: crime_rate_above_median
crime_rate_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)

# Add the response variable to the Boston data set
Boston$crime_rate_above_median <- crime_rate_above_median

# Logistic Regression
logistic_model <- glm(crime_rate_above_median ~ ., data = Boston, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
## 
## Call:
## glm(formula = crime_rate_above_median ~ ., family = "binomial", 
##     data = Boston)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.437e+01  1.202e+05   0.000    1.000
## crim         1.083e+03  1.773e+04   0.061    0.951
## zn           2.194e+00  5.856e+01   0.037    0.970
## indus       -2.510e+00  9.002e+02  -0.003    0.998
## chas         4.489e+00  1.014e+04   0.000    1.000
## nox         -2.585e+02  1.458e+05  -0.002    0.999
## rm          -3.953e+01  1.653e+03  -0.024    0.981
## age          3.437e-01  5.798e+01   0.006    0.995
## dis         -1.742e+01  2.146e+03  -0.008    0.994
## rad         -5.933e+00  2.642e+03  -0.002    0.998
## tax          1.639e-01  1.078e+02   0.002    0.999
## ptratio      5.525e+00  3.640e+03   0.002    0.999
## black        3.266e-02  1.208e+01   0.003    0.998
## lstat       -1.687e+00  3.560e+02  -0.005    0.996
## medv         2.358e+00  5.382e+02   0.004    0.997
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7.0146e+02  on 505  degrees of freedom
## Residual deviance: 2.8371e-05  on 491  degrees of freedom
## AIC: 30
## 
## Number of Fisher Scoring iterations: 25
# LDA
lda_model <- lda(crime_rate_above_median ~ ., data = Boston)
lda_model
## Call:
## lda(crime_rate_above_median ~ ., data = Boston)
## 
## Prior probabilities of groups:
##   0   1 
## 0.5 0.5 
## 
## Group means:
##        crim        zn     indus       chas       nox       rm      age      dis
## 0 0.0955715 21.525692  7.002292 0.05138340 0.4709711 6.394395 51.31028 5.091596
## 1 7.1314756  1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953 2.498489
##         rad      tax  ptratio    black     lstat     medv
## 0  4.158103 305.7431 17.90711 388.7061  9.419486 24.94941
## 1 14.940711 510.7312 19.00395 324.6420 15.886640 20.11621
## 
## Coefficients of linear discriminants:
##                   LD1
## crim     0.0046376592
## zn      -0.0056431194
## indus    0.0126159626
## chas    -0.0592836851
## nox      8.1826206579
## rm       0.0874007870
## age      0.0112829040
## dis      0.0453643651
## rad      0.0699133176
## tax     -0.0008444666
## ptratio  0.0513806507
## black   -0.0009892799
## lstat    0.0143945059
## medv     0.0386990631
# Naive Bayes
naive_bayes_model <- naiveBayes(as.factor(crime_rate_above_median) ~ ., data = Boston)
naive_bayes_model
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##   0   1 
## 0.5 0.5 
## 
## Conditional probabilities:
##    crim
## Y        [,1]        [,2]
##   0 0.0955715  0.06281773
##   1 7.1314756 11.10912294
## 
##    zn
## Y        [,1]      [,2]
##   0 21.525692 29.319808
##   1  1.201581  4.798611
## 
##    indus
## Y        [,1]     [,2]
##   0  7.002292 5.514454
##   1 15.271265 5.439010
## 
##    chas
## Y         [,1]      [,2]
##   0 0.05138340 0.2212161
##   1 0.08695652 0.2823299
## 
##    nox
## Y        [,1]       [,2]
##   0 0.4709711 0.05559789
##   1 0.6384190 0.09870365
## 
##    rm
## Y       [,1]      [,2]
##   0 6.394395 0.5556856
##   1 6.174874 0.8101381
## 
##    age
## Y       [,1]     [,2]
##   0 51.31028 25.88190
##   1 85.83953 17.87423
## 
##    dis
## Y       [,1]     [,2]
##   0 5.091596 2.081304
##   1 2.498489 1.085521
## 
##    rad
## Y        [,1]     [,2]
##   0  4.158103 1.659121
##   1 14.940711 9.529843
## 
##    tax
## Y       [,1]     [,2]
##   0 305.7431  87.4837
##   1 510.7312 167.8553
## 
##    ptratio
## Y       [,1]     [,2]
##   0 17.90711 1.811216
##   1 19.00395 2.346947
## 
##    black
## Y       [,1]      [,2]
##   0 388.7061  22.83774
##   1 324.6420 118.83084
## 
##    lstat
## Y        [,1]     [,2]
##   0  9.419486 4.923497
##   1 15.886640 7.546922
## 
##    medv
## Y       [,1]      [,2]
##   0 24.94941  7.232047
##   1 20.11621 10.270362

KNN

set.seed(123) # For reproducibility
train_indices <- sample(1:nrow(Boston), 0.7*nrow(Boston)) # 70% for training
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
knn_model <- knn(train_data[, -1], test_data[, -1], train_data$crime_rate_above_median, k = 5)
table(knn_model, test_data$crime_rate_above_median)
##          
## knn_model  0  1
##         0 69  5
##         1  6 72

#CH5 #a.Fit a logistic regression model that uses income and balance to predict default.

library(ISLR)
## 
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
## 
##     Auto, Credit
set.seed(1)
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial")
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

#B. i.Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps: Split the sample set into a training set and a validation set.

train = sample(dim(Default)[1], dim(Default)[1] / 2)

#ii. Fit a multiple logistic regression model using only the train- ing observations.

 fit.glm = glm(default ~ income + balance, data = Default[train,], family = "binomial")
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
#Both above formulas have the same outcome.
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.178e-01 -19.333  < 2e-16 ***
## income       3.262e-05  7.024e-06   4.644 3.41e-06 ***
## balance      5.689e-03  3.158e-04  18.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1523.8  on 4999  degrees of freedom
## Residual deviance:  803.3  on 4997  degrees of freedom
## AIC: 809.3
## 
## Number of Fisher Scoring iterations: 8

#iii.Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

glm.probs = predict(fit.glm, newdata = Default[-train, ], type="response")
glm.pred=rep("No",5000)
glm.pred[glm.probs>0.5] = "Yes"

#iv.Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

mean(glm.pred != Default[-train, ]$default)
## [1] 0.0254

#c.Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.

train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0274
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0244
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0244

#d.Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
pred.glm <- rep("No", length(probs))
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0278

#CH6

data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]

#B.(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lm.fit <- lm(Apps ~., data=College, subset = train)
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608

#(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

library (glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]

set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)

bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,]) 
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030

#(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,]) 
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
lasso.c <- predict(lasso.fit,type="coefficients", s=bestlam)[1:17,]
length(lasso.c[lasso.c != 0])
## [1] 3

#(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

library (pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     4463     2362     2385     2092     1835     1832
## adjCV         4440     4465     2359     2384     2066     1824     1823
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1829     1821     1753      1764      1771      1772      1784
## adjCV     1818     1813     1746      1757      1764      1764      1777
##        14 comps  15 comps  16 comps  17 comps
## CV         1778      1747      1401      1338
## adjCV      1773      1701      1385      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     31.4816    57.40    64.84    70.54    75.80    80.23    84.04    87.64
## Apps   0.1398    72.45    72.50    80.45    84.74    84.77    85.06    85.16
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.87     93.29     95.33     96.98     98.05     98.75     99.37
## Apps    85.93     86.16     86.21     86.32     86.40     86.61     92.49
##       16 comps  17 comps
## X        99.85    100.00
## Apps     93.65     94.32
validationplot(pcr.fit, val.type = "MSEP")

pcr.pred <- predict(pcr.fit,x[test,], ncomp=17) 
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608

#(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     2153     1701     1711     1650     1498     1413
## adjCV         4440     2147     1677     1704     1615     1470     1392
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1378     1358     1356      1342      1338      1337      1337
## adjCV     1361     1342     1339      1326      1322      1321      1321
##        14 comps  15 comps  16 comps  17 comps
## CV         1338      1338      1338      1338
## adjCV      1322      1322      1322      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       25.76    31.99    61.88    65.00    68.68    73.82    78.33    81.38
## Apps    77.92    87.80    88.38    92.06    93.59    93.92    94.01    94.09
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       83.24     85.59     87.76     90.49     92.69     94.27     97.03
## Apps    94.20     94.27     94.30     94.31     94.32     94.32     94.32
##       16 comps  17 comps
## X        99.21    100.00
## Apps     94.32     94.32
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit,x[test,],ncomp=12) 
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346

#(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))

print(sort(errors))
##   lasso     pls     pcr      lm   ridge 
## 1045922 1085346 1093608 1093608 1138030