library(ISLR2)
library(ridge)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(naivebayes)
## naivebayes 0.9.7 loaded
library(class)  
# Add class package for KNN

# Q1: Classification Models for Crime Rate Prediction
data(Boston)

# Create binary response variable for crime rate above or below median
Boston$CrimeAboveMedian <- as.factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))

# Logistic Regression
logistic_model <- glm(CrimeAboveMedian ~ ., data = Boston, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
## 
## Call:
## glm(formula = CrimeAboveMedian ~ ., family = binomial, data = Boston)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.270e+02  2.030e+05  -0.002    0.999
## crim         1.056e+03  2.021e+04   0.052    0.958
## zn           2.251e+00  6.284e+01   0.036    0.971
## indus       -3.859e+00  1.542e+03  -0.003    0.998
## chas        -5.407e+00  1.089e+04   0.000    1.000
## nox          1.467e+02  2.190e+05   0.001    0.999
## rm          -4.152e+01  1.990e+03  -0.021    0.983
## age          4.756e-01  8.017e+01   0.006    0.995
## dis         -1.335e+01  2.827e+03  -0.005    0.996
## rad         -4.353e+00  3.454e+03  -0.001    0.999
## tax         -1.346e-01  1.581e+02  -0.001    0.999
## ptratio      1.464e+01  6.733e+03   0.002    0.998
## lstat       -9.119e-01  5.204e+02  -0.002    0.999
## medv         3.491e+00  7.710e+02   0.005    0.996
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7.0146e+02  on 505  degrees of freedom
## Residual deviance: 2.8134e-05  on 492  degrees of freedom
## AIC: 28
## 
## Number of Fisher Scoring iterations: 25
# LDA
lda_model <- MASS::lda(CrimeAboveMedian ~ ., data = Boston)
lda_model
## Call:
## lda(CrimeAboveMedian ~ ., data = Boston)
## 
## Prior probabilities of groups:
##   0   1 
## 0.5 0.5 
## 
## Group means:
##        crim        zn     indus       chas       nox       rm      age      dis
## 0 0.0955715 21.525692  7.002292 0.05138340 0.4709711 6.394395 51.31028 5.091596
## 1 7.1314756  1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953 2.498489
##         rad      tax  ptratio     lstat     medv
## 0  4.158103 305.7431 17.90711  9.419486 24.94941
## 1 14.940711 510.7312 19.00395 15.886640 20.11621
## 
## Coefficients of linear discriminants:
##                   LD1
## crim     0.0057477432
## zn      -0.0055783361
## indus    0.0133950314
## chas    -0.0683284866
## nox      8.2352660572
## rm       0.1127191607
## age      0.0109751104
## dis      0.0431741184
## rad      0.0723695021
## tax     -0.0008391622
## ptratio  0.0473594598
## lstat    0.0158822769
## medv     0.0361430310
# Naive Bayes
naive_bayes_model <- naive_bayes(CrimeAboveMedian ~ ., data = Boston)
naive_bayes_model
## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = CrimeAboveMedian ~ ., data = Boston)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##   0   1 
## 0.5 0.5 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: crim (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## crim             0           1
##   mean  0.09557150  7.13147561
##   sd    0.06281773 11.10912294
## 
## --------------------------------------------------------------------------------- 
##  ::: zn (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## zn             0         1
##   mean 21.525692  1.201581
##   sd   29.319808  4.798611
## 
## --------------------------------------------------------------------------------- 
##  ::: indus (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## indus          0         1
##   mean  7.002292 15.271265
##   sd    5.514454  5.439010
## 
## --------------------------------------------------------------------------------- 
##  ::: chas (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## chas            0          1
##   mean 0.05138340 0.08695652
##   sd   0.22121612 0.28232985
## 
## --------------------------------------------------------------------------------- 
##  ::: nox (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## nox             0          1
##   mean 0.47097115 0.63841897
##   sd   0.05559789 0.09870365
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 8 more tables
## 
## ---------------------------------------------------------------------------------
# KNN
knn_model <- knn.cv(train = Boston[, -14], cl = Boston$CrimeAboveMedian, k = 5)
knn_model
##   [1] 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [223] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [260] 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
# Q2: Model Selection Approaches
# (a) Smallest training RSS: Best Subset, Forward Stepwise, Backward Stepwise - Not implemented in this code snippet
# (b) Smallest test RSS: Best Subset, Forward Stepwise, Backward Stepwise - Not implemented in this code snippet
# (c) True or False

# (c) (i) True
# (c) (ii) True
# (c) (iii) False
# (c) (iv) False
# (c) (v) True
# Q3: Predicting Number of Applications Received
# Load required packages
library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load College dataset
data("College")
summary(College)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
# Set seed for reproducibility
set.seed(123)

# Split the data into training and test sets
trainIndex <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
trainData <- College[trainIndex, ]
testData <- College[-trainIndex, ]
# Fit linear model on the training set
lm_model <- lm(Apps ~ ., data = trainData)
summary(lm_model)
## 
## Call:
## lm(formula = Apps ~ ., data = trainData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3100.8  -429.5   -57.1   262.1  7131.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.214e+03  4.545e+02  -2.671 0.007801 ** 
## PrivateYes  -4.832e+02  1.551e+02  -3.115 0.001936 ** 
## Accept       1.273e+00  5.758e-02  22.115  < 2e-16 ***
## Enroll      -1.480e-01  2.324e-01  -0.637 0.524520    
## Top10perc    3.802e+01  6.248e+00   6.086 2.24e-09 ***
## Top25perc   -1.155e+01  4.980e+00  -2.319 0.020778 *  
## F.Undergrad  2.208e-02  4.054e-02   0.545 0.586221    
## P.Undergrad  8.545e-02  3.602e-02   2.372 0.018046 *  
## Outstate    -5.581e-02  2.092e-02  -2.667 0.007882 ** 
## Room.Board   2.095e-01  5.370e-02   3.902 0.000108 ***
## Books        2.301e-01  2.614e-01   0.880 0.379272    
## Personal     1.174e-02  6.796e-02   0.173 0.862926    
## PhD         -4.220e+00  5.652e+00  -0.747 0.455681    
## Terminal    -6.631e+00  6.043e+00  -1.097 0.272972    
## S.F.Ratio    2.806e+01  1.407e+01   1.994 0.046659 *  
## perc.alumni -2.720e+00  4.776e+00  -0.570 0.569244    
## Expend       9.087e-02  1.251e-02   7.262 1.38e-12 ***
## Grad.Rate    1.018e+01  3.467e+00   2.937 0.003462 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 959.4 on 527 degrees of freedom
## Multiple R-squared:  0.9222, Adjusted R-squared:  0.9197 
## F-statistic: 367.4 on 17 and 527 DF,  p-value: < 2.2e-16
# Predict on the test set
lm_pred <- predict(lm_model, newdata = testData)

# Calculate test error for linear model (mean squared error)
lm_test_error <- mean((lm_pred - testData$Apps)^2)

# Fit ridge regression model with cross-validation for alpha selection
ridge_model <- train(Apps ~ ., data = trainData, method = "ridge",
                     trControl = trainControl(method = "cv"),
                     tuneLength = 10)

# Predict on the test set
ridge_pred <- predict(ridge_model, newdata = testData)

# Calculate test error for ridge regression model (mean squared error)
ridge_test_error <- mean((ridge_pred - testData$Apps)^2)

# Fit lasso model with cross-validation for alpha selection
lasso_model <- train(Apps ~ ., data = trainData, method = "lasso",
                     trControl = trainControl(method = "cv"),
                     tuneLength = 10)

# Predict on the test set
lasso_pred <- predict(lasso_model, newdata = testData)

# Calculate test error for lasso model (mean squared error)
lasso_test_error <- mean((lasso_pred - testData$Apps)^2)

# Number of non-zero coefficient estimates in the lasso model
num_non_zero <- sum(coef(lasso_model$finalModel) != 0)

# Print the test errors and the number of non-zero coefficients
print(paste("Linear Model Test Error:", lm_test_error))
## [1] "Linear Model Test Error: 1882073.83239865"
print(paste("Ridge Regression Test Error:", ridge_test_error))
## [1] "Ridge Regression Test Error: 1893106.5864917"
print(paste("Lasso Model Test Error:", lasso_test_error))
## [1] "Lasso Model Test Error: 2006127.86876507"
print(paste("Number of Non-Zero Coefficients in Lasso Model:", num_non_zero))
## [1] "Number of Non-Zero Coefficients in Lasso Model: 0"