Question 1

Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.

# Load the required packages and the Boston data set
library(pacman)
p_load(class, tidyverse, MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

We create a binary crime-level variable (crim_lvl) and visualize the relationships among the variables:

# Calculate the median crime rate
med_crime <- median(Boston$crim)
crim_lvl <- ifelse(Boston$crim > med_crime, 1, 0)

# Convert 'crim_lvl' to a factor variable
crim_lvl <- as.factor(crim_lvl)
Boston_2 <- data.frame(Boston, crim_lvl)

# Detach the 'Boston' dataset
detach(Boston)

# Visualize the relationships between variables in 'Boston_2' using a scatterplot matrix
pairs(Boston_2)

summary(Boston_2)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv       crim_lvl
##  Min.   : 5.00   0:253   
##  1st Qu.:17.02   1:253   
##  Median :21.20           
##  Mean   :22.53           
##  3rd Qu.:25.00           
##  Max.   :50.00
corrplot(cor(Boston_2[,-14]), method="square")

Based on the correlation plot above, the variables most strongly associated with the crime rate are indus, nox, age, dis, rad, tax, lstat, and medv. These may therefore make good predictors for our crim_lvl variable (crime level: high if the crime rate is above the median, low otherwise).
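
As a quick numeric complement to the plot (a small sketch whose output is not part of the original report), the correlations of each variable with crim can be sorted directly:

# Sketch: sort the absolute correlations of each variable with the raw crime rate;
# the largest values should correspond to the predictors listed above.
crim_cor <- cor(Boston_2[, -14])[, "crim"]
sort(abs(crim_cor), decreasing = TRUE)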

We use the full data set to perform a logistic regression with crime level as the response and the eight variables above as predictors.

crime_fit <- glm(crim_lvl ~ indus + nox + age + dis + rad + tax + lstat + medv, data = Boston_2, family = binomial)
summary(crime_fit)
## 
## Call:
## glm(formula = crim_lvl ~ indus + nox + age + dis + rad + tax + 
##     lstat + medv, family = binomial, data = Boston_2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.94076  -0.26496  -0.01434   0.01146   2.75863  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -27.582736   4.343637  -6.350 2.15e-10 ***
## indus        -0.068239   0.044386  -1.537  0.12420    
## nox          44.337338   7.269049   6.099 1.06e-09 ***
## age           0.013836   0.009621   1.438  0.15040    
## dis           0.292732   0.158087   1.852  0.06407 .  
## rad           0.533675   0.119105   4.481 7.44e-06 ***
## tax          -0.006313   0.002408  -2.621  0.00876 ** 
## lstat         0.039949   0.042914   0.931  0.35190    
## medv          0.052652   0.032482   1.621  0.10503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 242.01  on 497  degrees of freedom
## AIC: 260.01
## 
## Number of Fisher Scoring iterations: 8

Since nox, rad, and tax have the smallest p-values, we drop the other variables from the model and refit the logistic regression using only these three predictors.

crime_fit <- glm(crim_lvl ~ nox + rad + tax, data = Boston_2, family = binomial)
summary(crime_fit)
## 
## Call:
## glm(formula = crim_lvl ~ nox + rad + tax, family = binomial, 
##     data = Boston_2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.94777  -0.28148  -0.01433   0.00495   2.54883  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -20.075965   2.287688  -8.776  < 2e-16 ***
## nox          36.015644   4.338728   8.301  < 2e-16 ***
## rad           0.631900   0.111610   5.662  1.5e-08 ***
## tax          -0.007801   0.002184  -3.571 0.000355 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 250.34  on 502  degrees of freedom
## AIC: 258.34
## 
## Number of Fisher Scoring iterations: 8

We prepare our train and test data:

set.seed(1)
# Randomly assign roughly 70% of the suburbs to the training set
train_13 <- rbinom(506, 1, 0.7)
Boston_2 <- cbind(Boston_2, train_13)
Boston.train <- Boston_2[train_13 == 1,]
Boston.test <- Boston_2[train_13 == 0,]
# Fit the logistic model on the training data
Boston_fit <- glm(crim_lvl ~ nox + rad + tax, data = Boston.train, family = binomial)
Boston.probs <- predict(Boston_fit, Boston.test, type = "response")
Boston.probs[1:10]
##          4          6          7         15         17         18         20 
## 0.02569957 0.02569957 0.38986175 0.36048788 0.36048788 0.36048788 0.36048788 
##         21         29         35 
## 0.36048788 0.36048788 0.36048788
contrasts(crim_lvl)
##   1
## 0 0
## 1 1
Bos_pred <- rep(0, length(Boston.probs))
Bos_pred[Boston.probs > .5] <- 1
table(Bos_pred, Boston.test$crim_lvl)
##         
## Bos_pred  0  1
##        0 67 15
##        1  8 56

The logistic model we selected uses nox (nitrogen oxides concentration), rad (index of accessibility to radial highways), and tax (full-value property-tax rate per $10,000) as predictors; the other candidates were excluded because they were not statistically significant. According to the model, as the nitrogen oxides concentration increases, the predicted probability of a suburb having an above-median crime rate increases. Greater accessibility to radial highways is likewise associated with a higher predicted probability of high crime, while the tax rate enters with a small negative coefficient, so its effect is weaker than that of nox and rad.
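
To make these directions concrete, the fitted coefficients of the full-data model (crime_fit above) can be expressed as odds ratios; this is a small sketch and its output is not reproduced here:

# Sketch: odds multipliers implied by the three-predictor logistic model.
# nox is rescaled to a 0.01-unit increase because it only ranges from about 0.38 to 0.87.
exp(coef(crime_fit)[["nox"]] * 0.01)  # odds multiplier per 0.01 increase in nox
exp(coef(crime_fit)[["rad"]])         # odds multiplier per one-unit increase in rad
exp(coef(crime_fit)[["tax"]])         # odds multiplier per one-dollar increase in the tax rate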

(8 + 15) / length(Boston.probs)
## [1] 0.1575342

The test error rate of this model with a 50% threshold is about 15.8%.
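
Equivalently, the test error can be computed directly from the predictions rather than by reading counts off the confusion matrix; this small sketch should reproduce the figure above:

# Sketch: proportion of test suburbs misclassified by the logistic model.
mean(Bos_pred != Boston.test$crim_lvl)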

We will now run an LDA analysis using our set of predictors (nox, tax, and rad).

library(MASS)
lda.fit <- lda(crim_lvl ~ nox + tax + rad, data = Boston_2,
               subset = (train_13==1))
lda.fit
## Call:
## lda(crim_lvl ~ nox + tax + rad, data = Boston_2, subset = (train_13 == 
##     1))
## 
## Prior probabilities of groups:
##         0         1 
## 0.4944444 0.5055556 
## 
## Group means:
##         nox      tax       rad
## 0 0.4690051 303.3989  4.106742
## 1 0.6401758 506.0549 14.576923
## 
## Coefficients of linear discriminants:
##              LD1
## nox  9.970745177
## tax -0.001202623
## rad  0.081982185
plot(lda.fit)

lda.pred <- predict(lda.fit, Boston.test)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, Boston.test$crim_lvl)
##          
## lda.class  0  1
##         0 73 17
##         1  2 54
(17 + 2) / length(Boston.probs)
## [1] 0.130137

The test error rate of the LDA model with the default 50% posterior threshold is about 13.0%, slightly better than the logistic regression model.
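
The 50% cutoff is simply the default; if, say, false negatives were considered costlier, a different threshold could be applied to the posterior probability of the high-crime class. A small sketch (output not shown here):

# Sketch: reclassify the test suburbs using a 40% posterior cutoff for class "1".
lda.class.40 <- ifelse(lda.pred$posterior[, "1"] > 0.4, 1, 0)
table(lda.class.40, Boston.test$crim_lvl)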

We now fit a KNN model with K = 1.

# Predictor matrices for KNN; note that both are built from the training
# observations, so the KNN error rates below are training error rates.
Bos.train <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
Bos.test <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
set.seed(1)
knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=1)
table(knn.pred,Boston.train$crim_lvl)
##         
## knn.pred   0   1
##        0 176  10
##        1   2 172
# Training error rate:
(10 + 2) / nrow(Boston.train)
## [1] 0.03333333

We repeat the KNN fit with K = 3.

knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=3)
table(knn.pred,Boston.train$crim_lvl)
##         
## knn.pred   0   1
##        0 170  10
##        1   8 172
# Training error rate:
(10 + 8) / nrow(Boston.train)
## [1] 0.05

We ran two KNN models, one with k = 1 and one with k = 3; their error rates were 3.3% and 5.0%, respectively. Note, however, that because Bos.test was built from the training observations, these are training error rates rather than test error rates, so they are not directly comparable to the test errors of the logistic regression and LDA models above. Taken at face value, the KNN model with k = 1 gives the lowest error rate; a sketch of how a genuine KNN test error could be obtained follows.
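
Because nox, tax, and rad sit on very different scales, the Euclidean distances used by KNN are dominated by tax. A common refinement is to standardize the predictors, and the sketch below also builds the test matrix from Boston.test so that a genuine test error is obtained (output not shown here):

# Sketch: standardize the three predictors using the training means/SDs and
# evaluate KNN (k = 1) on the held-out test suburbs.
train.X <- scale(Boston.train[, c("nox", "tax", "rad")])
test.X  <- scale(Boston.test[, c("nox", "tax", "rad")],
                 center = attr(train.X, "scaled:center"),
                 scale  = attr(train.X, "scaled:scale"))
set.seed(1)
knn.test.pred <- knn(train.X, test.X, Boston.train$crim_lvl, k = 1)
mean(knn.test.pred != Boston.test$crim_lvl)  # test error rate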

Question 2

a) Which of the three models with k predictors has the smallest training RSS?

The model with the smallest training RSS among the three is the one produced by best subset selection. Because this approach evaluates every possible model with k predictors, it is guaranteed to find the one with the lowest training RSS. Forward and backward stepwise selection may happen to choose the same model, but in general their training RSS can only be greater than or equal to that of best subset selection, as illustrated in the sketch below.
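
As an illustration (a sketch using the leaps package, which is assumed to be available; it is not part of the original analysis), the three procedures can be compared on the Boston data from Question 1. For every model size, the best-subset RSS is less than or equal to the two stepwise values:

p_load(leaps)
# Sketch: training RSS by model size for best subset, forward, and backward
# selection when predicting the raw crime rate from the other Boston variables.
best <- regsubsets(crim ~ . - crim_lvl - train_13, data = Boston_2, nvmax = 12)
fwd  <- regsubsets(crim ~ . - crim_lvl - train_13, data = Boston_2, nvmax = 12,
                   method = "forward")
bwd  <- regsubsets(crim ~ . - crim_lvl - train_13, data = Boston_2, nvmax = 12,
                   method = "backward")
cbind(best = summary(best)$rss, forward = summary(fwd)$rss,
      backward = summary(bwd)$rss)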

b) Which of the three models with k predictors has the smallest test RSS?

It is not possible to say in general which of the three models with k predictors has the smallest test RSS. Best subset selection may overfit when the number of observations n is small relative to the number of predictors p, and either of the stepwise methods could, by chance, select a model that happens to perform better on the test set.

c) True or False: i) TRUE, ii) TRUE, iii) FALSE, iv) FALSE, v) FALSE

Question 3

We will predict the number of applications received using the other variables in the College data set.

a) Split the data set into a training set and a test set.

data("College")
summary(College)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
dim(College)
## [1] 777  18
set.seed(1)
train_3 <- rbinom(777, 1, 0.7)
col2 <- cbind(College, train_3)
Col.train <- College[train_3 == 1,]
Col.test <- College[train_3 == 0,]

b) Fit a linear model using least squares on the training set, and report the test error obtained.

attach(Col.test)
# Helper: root mean squared error
RMSE <- function(x, y){
  return(sqrt(mean((x - y)^2)))
}
col.fit <- lm(Apps ~ ., data = Col.train)
summary(col.fit)
## 
## Call:
## lm(formula = Apps ~ ., data = Col.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2913.6  -410.4   -57.3   295.6  6913.1 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.850e+02  4.541e+02  -1.288  0.19818    
## PrivateYes  -6.669e+02  1.580e+02  -4.222 2.85e-05 ***
## Accept       1.237e+00  5.892e-02  20.994  < 2e-16 ***
## Enroll      -1.943e-01  2.089e-01  -0.930  0.35260    
## Top10perc    4.691e+01  6.474e+00   7.247 1.53e-12 ***
## Top25perc   -1.447e+01  5.029e+00  -2.876  0.00418 ** 
## F.Undergrad  7.494e-02  3.610e-02   2.076  0.03839 *  
## P.Undergrad -2.268e-03  4.153e-02  -0.055  0.95648    
## Outstate    -2.849e-02  2.165e-02  -1.316  0.18876    
## Room.Board   1.523e-01  5.618e-02   2.711  0.00692 ** 
## Books        4.950e-02  2.629e-01   0.188  0.85072    
## Personal     4.800e-02  6.914e-02   0.694  0.48784    
## PhD         -8.823e+00  5.500e+00  -1.604  0.10929    
## Terminal    -2.546e-01  5.981e+00  -0.043  0.96606    
## S.F.Ratio    5.011e+00  1.439e+01   0.348  0.72790    
## perc.alumni -5.429e+00  4.679e+00  -1.160  0.24645    
## Expend       6.563e-02  1.381e-02   4.751 2.62e-06 ***
## Grad.Rate    8.967e+00  3.279e+00   2.735  0.00646 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 973.4 on 527 degrees of freedom
## Multiple R-squared:  0.9272, Adjusted R-squared:  0.9248 
## F-statistic: 394.7 on 17 and 527 DF,  p-value: < 2.2e-16
Col.pred <- predict(col.fit, Col.test, type = "response")
RMSE(Col.pred, Col.test$Apps)
## [1] 1341.404

The test RMSE of the least-squares fit is about 1341; this serves as the baseline for the regularized models below.

c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-7
set.seed(1)
train.mat <- as.matrix(Col.train[,-1])  # drops Private only; the Apps column itself remains
test.mat <- as.matrix(Col.test[,-1])
cv_fit <- cv.glmnet(x = train.mat, y = Col.train$Apps, alpha = 0, nfolds = 10)
optimal_lambda <- cv_fit$lambda.min
optimal_lambda
## [1] 354.7011
# Fit a ridge regression model on the training set with the chosen lambda
ridge_model <- glmnet(x = train.mat, y = Col.train$Apps, alpha = 0, lambda = optimal_lambda)

# Make predictions on the test set
test_pred <- predict(ridge_model, newx = test.mat)
RMSE(test_pred, Col.test$Apps)
## [1] 818.0597

The test RMSE of the ridge regression fit with λ chosen by cross-validation is about 818, lower than the least-squares test RMSE of about 1341. One caveat: train.mat and test.mat were built as Col.train[,-1] and Col.test[,-1], which drop the Private factor but leave the response Apps itself among the predictors, so the regularized fits here do not use the same predictor set as the linear model in part b). A sketch of an alternative design-matrix construction follows; the figures reported in this document come from the code as written.
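
A minimal sketch of that alternative construction, assuming model.matrix() is used so that Apps is excluded and Private is dummy-coded (output not shown here):

# Sketch: rebuild the predictor matrices without the response column and refit ridge.
x.train <- model.matrix(Apps ~ ., data = Col.train)[, -1]  # drop the intercept column
x.test  <- model.matrix(Apps ~ ., data = Col.test)[, -1]
cv_alt  <- cv.glmnet(x = x.train, y = Col.train$Apps, alpha = 0, nfolds = 10)
ridge_alt_pred <- predict(cv_alt, newx = x.test, s = "lambda.min")
RMSE(ridge_alt_pred, Col.test$Apps)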

d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

cv_fit <- cv.glmnet(x = train.mat, y = Col.train$Apps, alpha = 1, nfolds = 10)
lasso_lambda <- cv_fit$lambda.min
lasso_lambda
## [1] 103.3972
# Fit a lasso model on the training set with the chosen lambda
lasso_model <- glmnet(x = train.mat, y = Col.train$Apps, alpha = 1, lambda = lasso_lambda)

# Make predictions on the test set
test_pred <- predict(lasso_model, newx = test.mat)
RMSE(test_pred, Col.test$Apps)
## [1] 132.2373

The test RMSE of the lasso fit with λ chosen by cross-validation is about 132, far lower than both the least-squares and ridge fits. The same caveat applies: because Apps itself is included in train.mat and test.mat, the lasso can lean heavily on the response column, which likely explains much of the unusually low error. A sketch for reporting the number of non-zero coefficient estimates, which part d) also asks for, follows.
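
A minimal sketch for counting the non-zero coefficient estimates at the chosen λ (output not shown here):

# Sketch: count the non-zero lasso coefficients, excluding the intercept.
lasso_coefs <- predict(lasso_model, type = "coefficients", s = lasso_lambda)
sum(lasso_coefs != 0) - 1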