Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
library(pacman)
p_load(class, tidyverse, MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
# Calculate the median crime rate
med_crime <- median(Boston$crim)
crim_lvl <- ifelse(Boston$crim > med_crime, 1, 0)
# Convert 'crim_lvl' to a factor variable
crim_lvl <- as.factor(crim_lvl)
Boston_2 <- data.frame(Boston, crim_lvl)
# Detach the 'Boston' dataset
detach(Boston)
# Visualize the relationships between variables in 'Boston_2' using a scatterplot matrix
pairs(Boston_2)
summary(Boston_2)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv crim_lvl
## Min. : 5.00 0:253
## 1st Qu.:17.02 1:253
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
corrplot(cor(Boston_2[,-14]), method="square")
Based on the correlation plot above, the variables most strongly associated with the crime rate are indus, nox, age, dis, rad, tax, lstat, and medv. These may therefore make good predictors for our crim_lvl variable (crime level high or low: high if the crime rate is above the median, low otherwise).
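As a numeric complement to the plot, the predictors can be ranked by the absolute value of their correlation with crim (a small sketch, not part of the original analysis):
# crim_lvl (column 14) is a factor, so it is excluded before cor()
cors <- cor(Boston_2[, -14])[, "crim"]
# Rank the remaining variables by |correlation| with the crime rate
sort(abs(cors[names(cors) != "crim"]), decreasing = TRUE)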
crime_fit <- glm(crim_lvl ~ indus + nox + age + dis + rad + tax + lstat + medv, data = Boston_2, family = binomial)
summary(crime_fit)
##
## Call:
## glm(formula = crim_lvl ~ indus + nox + age + dis + rad + tax +
## lstat + medv, family = binomial, data = Boston_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.94076 -0.26496 -0.01434 0.01146 2.75863
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.582736 4.343637 -6.350 2.15e-10 ***
## indus -0.068239 0.044386 -1.537 0.12420
## nox 44.337338 7.269049 6.099 1.06e-09 ***
## age 0.013836 0.009621 1.438 0.15040
## dis 0.292732 0.158087 1.852 0.06407 .
## rad 0.533675 0.119105 4.481 7.44e-06 ***
## tax -0.006313 0.002408 -2.621 0.00876 **
## lstat 0.039949 0.042914 0.931 0.35190
## medv 0.052652 0.032482 1.621 0.10503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 242.01 on 497 degrees of freedom
## AIC: 260.01
##
## Number of Fisher Scoring iterations: 8
Since nox, rad, and tax have the smallest p-values, we drop the other variables from the model and refit the logistic regression using only these three predictors.
crime_fit <- glm(crim_lvl ~ nox + rad + tax, data = Boston_2, family = binomial)
summary(crime_fit)
##
## Call:
## glm(formula = crim_lvl ~ nox + rad + tax, family = binomial,
## data = Boston_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.94777 -0.28148 -0.01433 0.00495 2.54883
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.075965 2.287688 -8.776 < 2e-16 ***
## nox 36.015644 4.338728 8.301 < 2e-16 ***
## rad 0.631900 0.111610 5.662 1.5e-08 ***
## tax -0.007801 0.002184 -3.571 0.000355 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 250.34 on 502 degrees of freedom
## AIC: 258.34
##
## Number of Fisher Scoring iterations: 8
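The deviance difference between the two fits (250.34 − 242.01 = 8.33 on 5 degrees of freedom) is not statistically significant, which supports dropping the five extra predictors. A likelihood-ratio test makes this check explicit (a sketch; crime_fit_full refits the eight-predictor model, since the earlier fit was overwritten):
# Refit the full eight-predictor model and compare it to the reduced
# model with a likelihood-ratio (chi-squared) test
crime_fit_full <- glm(crim_lvl ~ indus + nox + age + dis + rad + tax +
  lstat + medv, data = Boston_2, family = binomial)
anova(crime_fit, crime_fit_full, test = "Chisq")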
set.seed(1)
# Randomly assign roughly 70% of the 506 suburbs to the training set
train_13 <- rbinom(506, 1, 0.7)
Boston_2 <- cbind(Boston_2, train_13)
Boston.train <- Boston_2[train_13 == 1,]
Boston.test <- Boston_2[train_13 == 0,]
# Fit the model on the training data only
Boston_fit <- glm(crim_lvl ~ nox + rad + tax, data = Boston.train, family = binomial)
Boston.probs <- predict(Boston_fit, Boston.test, type = "response")
Boston.probs[1:10]
## 4 6 7 15 17 18 20
## 0.02569957 0.02569957 0.38986175 0.36048788 0.36048788 0.36048788 0.36048788
## 21 29 35
## 0.36048788 0.36048788 0.36048788
contrasts(crim_lvl)
## 1
## 0 0
## 1 1
# Classify as high-crime when the predicted probability exceeds 50%
Bos_pred <- rep(0, length(Boston.probs))
Bos_pred[Boston.probs > .5] <- 1
table(Bos_pred, Boston.test$crim_lvl)
##
## Bos_pred 0 1
## 0 67 15
## 1 8 56
The logistic model we selected uses “nox” (nitrogen oxides concentration), “tax” (full-value property-tax rate per $10,000), and “rad” (index of accessibility to radial highways) as predictors; the other predictors were excluded because they were not statistically significant. According to the model, as the nitrogen oxides concentration increases, the predicted probability of a suburb having an above-median crime rate also increases. The model likewise predicts a higher probability of a high crime rate as the index of accessibility to radial highways increases, though the two other variables have a much smaller effect than “nox”.
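One way to quantify these effects is to exponentiate the coefficients to obtain odds ratios (a sketch using the training-set fit Boston_fit from above):
# exp(beta) is the multiplicative change in the odds of an
# above-median crime rate per one-unit increase in each predictor
exp(coef(Boston_fit))
Note that the size of the nox coefficient partly reflects its scale: nox ranges only from about 0.39 to 0.87, so a one-unit increase exceeds its entire observed range.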
(8 + 15) / length(Boston.probs)
## [1] 0.1575342
With a 50% threshold, the test error rate of this model was about 15.8%.
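Equivalently, the test error rate can be computed with a direct comparison (a sketch, assuming the objects created above):
# Fraction of test observations misclassified at the 50% threshold
mean(Bos_pred != Boston.test$crim_lvl)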
library(MASS)
lda.fit <- lda(crim_lvl ~ nox + tax + rad, data = Boston_2,
subset = (train_13==1))
lda.fit
## Call:
## lda(crim_lvl ~ nox + tax + rad, data = Boston_2, subset = (train_13 ==
## 1))
##
## Prior probabilities of groups:
## 0 1
## 0.4944444 0.5055556
##
## Group means:
## nox tax rad
## 0 0.4690051 303.3989 4.106742
## 1 0.6401758 506.0549 14.576923
##
## Coefficients of linear discriminants:
## LD1
## nox 9.970745177
## tax -0.001202623
## rad 0.081982185
plot(lda.fit)
lda.pred <- predict(lda.fit, Boston.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, Boston.test$crim_lvl)
##
## lda.class 0 1
## 0 73 17
## 1 2 54
(17 + 2) / length(Boston.probs)
## [1] 0.130137
The test error rate of the LDA model (equivalent to a 50% posterior threshold) was 13.0%.
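The class predictions above correspond to a 50% posterior threshold. Working with lda.pred$posterior makes the threshold explicit and lets it be varied (a sketch that should reproduce the table above):
# Predict class "1" when its posterior probability exceeds 0.5;
# a different cutoff can be substituted to trade off error types
lda.class2 <- ifelse(lda.pred$posterior[, "1"] > 0.5, 1, 0)
table(lda.class2, Boston.test$crim_lvl)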
# Build the KNN predictor matrix from the training data. Note that both
# the first and second arguments to knn() below come from the training
# set, so the confusion matrices that follow measure training performance.
Bos.train <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
set.seed(1)
knn.pred <- knn(Bos.train, Bos.train, Boston.train$crim_lvl, k = 1)
table(knn.pred, Boston.train$crim_lvl)
##
## knn.pred 0 1
## 0 176 10
## 1 2 172
# Training error rate:
(10 + 2) / nrow(Boston.train)
## [1] 0.03333333
knn.pred <- knn(Bos.train, Bos.train, Boston.train$crim_lvl, k = 3)
table(knn.pred, Boston.train$crim_lvl)
##
## knn.pred 0 1
## 0 170 10
## 1 8 172
# Training error rate:
(10 + 8) / nrow(Boston.train)
## [1] 0.05
We ran two KNN models, one with k = 1 and one with k = 3, giving error rates of 3.3% and 5.0% respectively. These are training error rates, however: the predictions above are made on the same observations used to fit the model, so they are optimistically low (especially for k = 1) and are not directly comparable to the test error rates of the logistic regression and LDA models. A fair comparison requires evaluating KNN on the held-out test set, as sketched below.
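A held-out evaluation would look like the following sketch (output not shown; note that KNN is scale-sensitive, so the predictors are standardized using training-set means and standard deviations):
vars <- c("nox", "tax", "rad")
ctr <- colMeans(Boston.train[, vars])
scl <- sapply(Boston.train[, vars], sd)
# Standardize both sets using the *training* statistics
train.X <- scale(Boston.train[, vars], center = ctr, scale = scl)
test.X <- scale(Boston.test[, vars], center = ctr, scale = scl)
set.seed(1)
knn.test <- knn(train.X, test.X, Boston.train$crim_lvl, k = 1)
# Held-out test error rate
mean(knn.test != Boston.test$crim_lvl)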
Among the three approaches, the model with the smallest training RSS for a given number of predictors k is the one produced by best subset selection, since that approach examines every possible model with k predictors. The other two approaches may happen to select the same model, but best subset selection is guaranteed to achieve a training RSS at least as small.
It is difficult to determine which of the three models with k predictors has the smallest test RSS based on the provided information. The best subset selection approach may lead to overfitting if the number of observations (n) is relatively small compared to the number of predictors (p). Alternatively, the other two methods could randomly select a model that performs better on the test set due to chance.
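To make the comparison concrete, here is an illustrative sketch (not part of the original answer) using the leaps package: at every model size, the training RSS from best subset selection is no larger than the training RSS from forward stepwise selection on the same data.
library(leaps)
# Exhaustive search vs. forward stepwise, up to five predictors
best <- regsubsets(Apps ~ ., data = College, nvmax = 5)
fwd <- regsubsets(Apps ~ ., data = College, nvmax = 5, method = "forward")
summary(best)$rss # best-subset training RSS by model size
summary(fwd)$rss # forward-stepwise training RSS by model size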
True or False: i) TRUE ii) TRUE iii) FALSE iv) FALSE v) FALSE
We will predict the number of applications received using the other variables in the College data set.
data("College")
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
dim(College)
## [1] 777 18
set.seed(1)
# Randomly assign roughly 70% of the 777 colleges to the training set
train_3 <- rbinom(777, 1, 0.7)
Col.train <- College[train_3 == 1,]
Col.test <- College[train_3 == 0,]
# Root-mean-squared error helper
RMSE <- function(x, y) {
  sqrt(mean((x - y)^2))
}
col.fit = lm(Apps ~ ., data = Col.train)
summary(col.fit)
##
## Call:
## lm(formula = Apps ~ ., data = Col.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2913.6 -410.4 -57.3 295.6 6913.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.850e+02 4.541e+02 -1.288 0.19818
## PrivateYes -6.669e+02 1.580e+02 -4.222 2.85e-05 ***
## Accept 1.237e+00 5.892e-02 20.994 < 2e-16 ***
## Enroll -1.943e-01 2.089e-01 -0.930 0.35260
## Top10perc 4.691e+01 6.474e+00 7.247 1.53e-12 ***
## Top25perc -1.447e+01 5.029e+00 -2.876 0.00418 **
## F.Undergrad 7.494e-02 3.610e-02 2.076 0.03839 *
## P.Undergrad -2.268e-03 4.153e-02 -0.055 0.95648
## Outstate -2.849e-02 2.165e-02 -1.316 0.18876
## Room.Board 1.523e-01 5.618e-02 2.711 0.00692 **
## Books 4.950e-02 2.629e-01 0.188 0.85072
## Personal 4.800e-02 6.914e-02 0.694 0.48784
## PhD -8.823e+00 5.500e+00 -1.604 0.10929
## Terminal -2.546e-01 5.981e+00 -0.043 0.96606
## S.F.Ratio 5.011e+00 1.439e+01 0.348 0.72790
## perc.alumni -5.429e+00 4.679e+00 -1.160 0.24645
## Expend 6.563e-02 1.381e-02 4.751 2.62e-06 ***
## Grad.Rate 8.967e+00 3.279e+00 2.735 0.00646 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 973.4 on 527 degrees of freedom
## Multiple R-squared: 0.9272, Adjusted R-squared: 0.9248
## F-statistic: 394.7 on 17 and 527 DF, p-value: < 2.2e-16
Col.pred <- predict(col.fit, Col.test, type = "response")
RMSE(Col.pred, Col.test$Apps)
## [1] 1341.404
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-7
set.seed(1)
# Build design matrices with model.matrix() so that the factor Private is
# dummy-encoded and the response Apps is excluded from the predictors
# (as.matrix(Col.train[, -1]) would drop Private but keep Apps, leaking
# the response into the model)
train.mat <- model.matrix(Apps ~ ., data = Col.train)[, -1]
test.mat <- model.matrix(Apps ~ ., data = Col.test)[, -1]
cv_fit <- cv.glmnet(x = train.mat, y = Col.train$Apps, alpha = 0, nfolds = 10)
optimal_lambda <- cv_fit$lambda.min
optimal_lambda
## [1] 354.7011
# Fit a ridge regression model on the training set with the chosen lambda
ridge_model <- glmnet(x = train.mat, y = Col.train$Apps, alpha = 0, lambda = optimal_lambda)
# Make predictions on the test set
test_pred <- predict(ridge_model, newx = test.mat)
RMSE(test_pred, Col.test$Apps)
## [1] 818.0597
The test error of the ridge regression fit with a lambda chosen by cross-validation is 818, which is lower than the linear model test error.
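Unlike the lasso below, ridge regression keeps every predictor; the coefficients are merely shrunk toward zero. They can be inspected at the chosen lambda (a sketch, assuming the ridge_model object above):
# Coefficients at lambda = optimal_lambda; none are exactly zero
round(coef(ridge_model)[, 1], 4)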
cv_fit <- cv.glmnet(x = train.mat, y = Col.train$Apps, alpha = 1, nfolds = 10)
lasso_lambda <- cv_fit$lambda.min
lasso_lambda
## [1] 103.3972
# Fit a lasso model on the training set with the chosen lambda
lasso_model <- glmnet(x = train.mat, y = Col.train$Apps, alpha = 1, lambda = lasso_lambda)
# Make predictions on the test set
test_pred <- predict(lasso_model, newx = test.mat)
RMSE(test_pred, Col.test$Apps)
## [1] 132.2373
The test error of the lasso model fit with a lambda chosen by cross-validation is 132.24, which is much lower than the test errors of both the linear model and the ridge regression.
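Because the lasso sets some coefficients exactly to zero, it also performs variable selection; which predictors survive at the chosen lambda can be checked directly (a sketch, assuming the lasso_model object above):
# Extract the coefficients and keep only the nonzero entries
lasso_coefs <- coef(lasso_model)[, 1]
lasso_coefs[lasso_coefs != 0]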