library(ISLR2)
library(ridge)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(naivebayes)
## naivebayes 0.9.7 loaded
library(class)
# Add class package for KNN
# Q1: Classification Models for Crime Rate Prediction
data(Boston)
# Create binary response variable for crime rate above or below median
Boston$CrimeAboveMedian <- as.factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))
# Logistic Regression
logistic_model <- glm(CrimeAboveMedian ~ ., data = Boston, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
##
## Call:
## glm(formula = CrimeAboveMedian ~ ., family = binomial, data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.270e+02 2.030e+05 -0.002 0.999
## crim 1.056e+03 2.021e+04 0.052 0.958
## zn 2.251e+00 6.284e+01 0.036 0.971
## indus -3.859e+00 1.542e+03 -0.003 0.998
## chas -5.407e+00 1.089e+04 0.000 1.000
## nox 1.467e+02 2.190e+05 0.001 0.999
## rm -4.152e+01 1.990e+03 -0.021 0.983
## age 4.756e-01 8.017e+01 0.006 0.995
## dis -1.335e+01 2.827e+03 -0.005 0.996
## rad -4.353e+00 3.454e+03 -0.001 0.999
## tax -1.346e-01 1.581e+02 -0.001 0.999
## ptratio 1.464e+01 6.733e+03 0.002 0.998
## lstat -9.119e-01 5.204e+02 -0.002 0.999
## medv 3.491e+00 7.710e+02 0.005 0.996
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.0146e+02 on 505 degrees of freedom
## Residual deviance: 2.8134e-05 on 492 degrees of freedom
## AIC: 28
##
## Number of Fisher Scoring iterations: 25
# LDA
lda_model <- MASS::lda(CrimeAboveMedian ~ ., data = Boston)
lda_model
## Call:
## lda(CrimeAboveMedian ~ ., data = Boston)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## crim zn indus chas nox rm age dis
## 0 0.0955715 21.525692 7.002292 0.05138340 0.4709711 6.394395 51.31028 5.091596
## 1 7.1314756 1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953 2.498489
## rad tax ptratio lstat medv
## 0 4.158103 305.7431 17.90711 9.419486 24.94941
## 1 14.940711 510.7312 19.00395 15.886640 20.11621
##
## Coefficients of linear discriminants:
## LD1
## crim 0.0057477432
## zn -0.0055783361
## indus 0.0133950314
## chas -0.0683284866
## nox 8.2352660572
## rm 0.1127191607
## age 0.0109751104
## dis 0.0431741184
## rad 0.0723695021
## tax -0.0008391622
## ptratio 0.0473594598
## lstat 0.0158822769
## medv 0.0361430310
# Naive Bayes
naive_bayes_model <- naive_bayes(CrimeAboveMedian ~ ., data = Boston)
naive_bayes_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = CrimeAboveMedian ~ ., data = Boston)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.5 0.5
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: crim (Gaussian)
## ---------------------------------------------------------------------------------
##
## crim 0 1
## mean 0.09557150 7.13147561
## sd 0.06281773 11.10912294
##
## ---------------------------------------------------------------------------------
## ::: zn (Gaussian)
## ---------------------------------------------------------------------------------
##
## zn 0 1
## mean 21.525692 1.201581
## sd 29.319808 4.798611
##
## ---------------------------------------------------------------------------------
## ::: indus (Gaussian)
## ---------------------------------------------------------------------------------
##
## indus 0 1
## mean 7.002292 15.271265
## sd 5.514454 5.439010
##
## ---------------------------------------------------------------------------------
## ::: chas (Gaussian)
## ---------------------------------------------------------------------------------
##
## chas 0 1
## mean 0.05138340 0.08695652
## sd 0.22121612 0.28232985
##
## ---------------------------------------------------------------------------------
## ::: nox (Gaussian)
## ---------------------------------------------------------------------------------
##
## nox 0 1
## mean 0.47097115 0.63841897
## sd 0.05559789 0.09870365
##
## ---------------------------------------------------------------------------------
##
## # ... and 8 more tables
##
## ---------------------------------------------------------------------------------
# KNN
knn_model <- knn.cv(train = Boston[, -14], cl = Boston$CrimeAboveMedian, k = 5)
knn_model
## [1] 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [223] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [260] 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
# Q2: Model Selection Approaches
# (a) Smallest training RSS: Best Subset, Forward Stepwise, Backward Stepwise - Not implemented in this code snippet
# (b) Smallest test RSS: Best Subset, Forward Stepwise, Backward Stepwise - Not implemented in this code snippet
# (c) True or False
# (c) (i) True
# (c) (ii) True
# (c) (iii) False
# (c) (iv) False
# (c) (v) True
# Q3: Predicting Number of Applications Received
# Load required packages
library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load College dataset
data("College")
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
trainIndex <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
trainData <- College[trainIndex, ]
testData <- College[-trainIndex, ]
# Fit linear model on the training set
lm_model <- lm(Apps ~ ., data = trainData)
summary(lm_model)
##
## Call:
## lm(formula = Apps ~ ., data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3100.8 -429.5 -57.1 262.1 7131.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.214e+03 4.545e+02 -2.671 0.007801 **
## PrivateYes -4.832e+02 1.551e+02 -3.115 0.001936 **
## Accept 1.273e+00 5.758e-02 22.115 < 2e-16 ***
## Enroll -1.480e-01 2.324e-01 -0.637 0.524520
## Top10perc 3.802e+01 6.248e+00 6.086 2.24e-09 ***
## Top25perc -1.155e+01 4.980e+00 -2.319 0.020778 *
## F.Undergrad 2.208e-02 4.054e-02 0.545 0.586221
## P.Undergrad 8.545e-02 3.602e-02 2.372 0.018046 *
## Outstate -5.581e-02 2.092e-02 -2.667 0.007882 **
## Room.Board 2.095e-01 5.370e-02 3.902 0.000108 ***
## Books 2.301e-01 2.614e-01 0.880 0.379272
## Personal 1.174e-02 6.796e-02 0.173 0.862926
## PhD -4.220e+00 5.652e+00 -0.747 0.455681
## Terminal -6.631e+00 6.043e+00 -1.097 0.272972
## S.F.Ratio 2.806e+01 1.407e+01 1.994 0.046659 *
## perc.alumni -2.720e+00 4.776e+00 -0.570 0.569244
## Expend 9.087e-02 1.251e-02 7.262 1.38e-12 ***
## Grad.Rate 1.018e+01 3.467e+00 2.937 0.003462 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 959.4 on 527 degrees of freedom
## Multiple R-squared: 0.9222, Adjusted R-squared: 0.9197
## F-statistic: 367.4 on 17 and 527 DF, p-value: < 2.2e-16
# Predict on the test set
lm_pred <- predict(lm_model, newdata = testData)
# Calculate test error for linear model (mean squared error)
lm_test_error <- mean((lm_pred - testData$Apps)^2)
# Fit ridge regression model with cross-validation for alpha selection
ridge_model <- train(Apps ~ ., data = trainData, method = "ridge",
trControl = trainControl(method = "cv"),
tuneLength = 10)
# Predict on the test set
ridge_pred <- predict(ridge_model, newdata = testData)
# Calculate test error for ridge regression model (mean squared error)
ridge_test_error <- mean((ridge_pred - testData$Apps)^2)
# Fit lasso model with cross-validation for alpha selection
lasso_model <- train(Apps ~ ., data = trainData, method = "lasso",
trControl = trainControl(method = "cv"),
tuneLength = 10)
# Predict on the test set
lasso_pred <- predict(lasso_model, newdata = testData)
# Calculate test error for lasso model (mean squared error)
lasso_test_error <- mean((lasso_pred - testData$Apps)^2)
# Number of non-zero coefficient estimates in the lasso model
num_non_zero <- sum(coef(lasso_model$finalModel) != 0)
# Print the test errors and the number of non-zero coefficients
print(paste("Linear Model Test Error:", lm_test_error))
## [1] "Linear Model Test Error: 1882073.83239865"
print(paste("Ridge Regression Test Error:", ridge_test_error))
## [1] "Ridge Regression Test Error: 1893106.5864917"
print(paste("Lasso Model Test Error:", lasso_test_error))
## [1] "Lasso Model Test Error: 2006127.86876507"
print(paste("Number of Non-Zero Coefficients in Lasso Model:", num_non_zero))
## [1] "Number of Non-Zero Coefficients in Lasso Model: 0"