Using the Boston data set, fit classification models in order to predict
whether a given census tract has a crime rate above or below the median.
Explore logistic regression, LDA, naive Bayes, and KNN models using various
subsets of the predictors. Describe your findings.
# Load the required packages and the dataset
library(MASS)        # Boston data and lda()
library(ISLR)        # Default and College data (used in the later exercises)
library(tidyverse)   # dplyr, tidyr, ggplot2, tibble
library(rsample)     # initial_split()
library(caret)       # confusionMatrix()
library(e1071)       # naiveBayes()
library(class)       # knn()
library(glmnet)      # cv.glmnet() for ridge and lasso
library(pls)         # pcr() and plsr()
data("Boston")
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Create a binary response variable indicating whether the crime rate is above or below the median
Boston$crime_factor <- ifelse(Boston$crim > median(Boston$crim), "High", "Low")
# Convert to factor
Boston$crime_factor <- factor(Boston$crime_factor, levels = c("Low", "High"))
# Scatterplot
Boston %>%
dplyr::select(crim, crime_factor, rad, nox, tax, age, dis, indus) %>%
gather(Variable, value, -crim, -crime_factor) %>%
mutate(Variable = stringr::str_to_title(Variable)) %>%
ggplot(aes(value, crim)) +
geom_point(aes(col = crime_factor)) +
facet_wrap(~ Variable, scales = 'free') +
geom_smooth(method = 'lm', formula = y ~ poly(x, 3), se = FALSE) +
guides(col = FALSE) +
labs(title = 'Scatterplots for each strong predictor')
Based on the exploratory plots, I selected age, dis, indus, nox, rad, and tax as the strongest candidate predictors and plotted each of them against the response crim. The scatterplots suggest that the relationships are nonlinear and roughly polynomial, so the models below use cubic polynomial terms for these predictors.
# Splitting the data
set.seed(610)
boston_split <- initial_split(Boston, prop=0.8,strata= crime_factor)
boston_training <- training(boston_split)
boston_testing <- testing(boston_split)
glm_model <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) +
poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3),
data = boston_training, family = "binomial")
summary(glm_model)
##
## Call:
## glm(formula = crime_factor ~ poly(rad, 3) + poly(nox, 3) + poly(tax,
## 3) + poly(age, 3) + poly(dis, 3) + poly(indus, 3), family = "binomial",
## data = boston_training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.790e+14 3.339e+06 -113518879 <2e-16 ***
## poly(rad, 3)1 8.745e+16 4.751e+08 184071758 <2e-16 ***
## poly(rad, 3)2 8.225e+15 1.075e+08 76514466 <2e-16 ***
## poly(rad, 3)3 1.953e+15 7.773e+07 25122280 <2e-16 ***
## poly(nox, 3)1 1.888e+16 1.771e+08 106597240 <2e-16 ***
## poly(nox, 3)2 1.054e+15 1.016e+08 10377240 <2e-16 ***
## poly(nox, 3)3 2.177e+15 9.908e+07 21975041 <2e-16 ***
## poly(tax, 3)1 -6.941e+16 4.130e+08 -168075367 <2e-16 ***
## poly(tax, 3)2 -3.403e+16 1.859e+08 -183065250 <2e-16 ***
## poly(tax, 3)3 1.628e+16 8.421e+07 193318513 <2e-16 ***
## poly(age, 3)1 -1.028e+15 1.234e+08 -8335643 <2e-16 ***
## poly(age, 3)2 2.885e+15 7.958e+07 36254178 <2e-16 ***
## poly(age, 3)3 1.990e+15 7.152e+07 27820397 <2e-16 ***
## poly(dis, 3)1 -4.984e+15 1.606e+08 -31035200 <2e-16 ***
## poly(dis, 3)2 3.258e+15 9.902e+07 32900446 <2e-16 ***
## poly(dis, 3)3 1.957e+15 8.154e+07 23994946 <2e-16 ***
## poly(indus, 3)1 1.115e+16 1.411e+08 78989311 <2e-16 ***
## poly(indus, 3)2 4.318e+15 1.171e+08 36885916 <2e-16 ***
## poly(indus, 3)3 8.514e+15 1.210e+08 70376384 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.06 on 403 degrees of freedom
## Residual deviance: 2811.40 on 385 degrees of freedom
## AIC: 2849.4
##
## Number of Fisher Scoring iterations: 25
glm_fit <- predict(glm_model,type="response", newdata=boston_testing)
predict_binary_glm <- rep("Low", length(glm_fit))
predict_binary_glm[glm_fit > .5] <- "High"
predict_binary_glm <- predict_binary_glm %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
colnames(predict_binary_glm) <- c("predicted_value", "actual_value")
predict_binary_glm$predicted_value <- as.factor(predict_binary_glm$predicted_value)
predict_binary_glm$actual_value <- as.factor(predict_binary_glm$actual_value)
confusion_glm <- confusionMatrix(predict_binary_glm$predicted_value, predict_binary_glm$actual_value)
confusion_glm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 49 4
## High 2 47
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8824
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9608
## Specificity : 0.9216
## Pos Pred Value : 0.9245
## Neg Pred Value : 0.9592
## Prevalence : 0.5000
## Detection Rate : 0.4804
## Detection Prevalence : 0.5196
## Balanced Accuracy : 0.9412
##
## 'Positive' Class : Low
##
The confusion matrix indicates an accuracy of 94.12%, i.e. 94.12% of the test-set predictions match the true class, so the logistic regression model performs very well.
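To check how these numbers are derived, the headline metrics can be recomputed directly from the counts in the table above (with "Low" as the positive class; the counts below are copied from the printed matrix):
# Recompute accuracy, sensitivity and specificity from the confusion-matrix counts
tp <- 49; fn <- 2; fp <- 4; tn <- 47  # positive class = "Low"
(tp + tn) / (tp + tn + fp + fn)       # accuracy    = 0.9412
tp / (tp + fn)                        # sensitivity = 0.9608
tn / (tn + fp)                        # specificity = 0.9216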
lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) +
poly(tax, 3) + poly(age, 3) + poly(dis, 3)+ poly(indus, 3),
data = boston_training)
lda_model
## Call:
## lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + poly(tax, 3) +
## poly(age, 3) + poly(dis, 3) + poly(indus, 3), data = boston_training)
##
## Prior probabilities of groups:
## Low High
## 0.5 0.5
##
## Group means:
## poly(rad, 3)1 poly(rad, 3)2 poly(rad, 3)3 poly(nox, 3)1 poly(nox, 3)2
## Low -0.02986818 0.00527937 -0.00410086 -0.035856 0.01135621
## High 0.02986818 -0.00527937 0.00410086 0.035856 -0.01135621
## poly(nox, 3)3 poly(tax, 3)1 poly(tax, 3)2 poly(tax, 3)3 poly(age, 3)1
## Low 0.003725802 -0.02961653 0.002564994 -0.0006208593 -0.03045625
## High -0.003725802 0.02961653 -0.002564994 0.0006208593 0.03045625
## poly(age, 3)2 poly(age, 3)3 poly(dis, 3)1 poly(dis, 3)2 poly(dis, 3)3
## Low -0.009153263 0.001180473 0.02998473 -0.01131224 0.00113431
## High 0.009153263 -0.001180473 -0.02998473 0.01131224 -0.00113431
## poly(indus, 3)1 poly(indus, 3)2 poly(indus, 3)3
## Low -0.02966437 0.01012727 0.01050137
## High 0.02966437 -0.01012727 -0.01050137
##
## Coefficients of linear discriminants:
## LD1
## poly(rad, 3)1 54.0984813
## poly(rad, 3)2 3.3023799
## poly(rad, 3)3 1.6940323
## poly(nox, 3)1 24.9539062
## poly(nox, 3)2 -9.7129156
## poly(nox, 3)3 0.9273337
## poly(tax, 3)1 -45.9158270
## poly(tax, 3)2 -20.7927520
## poly(tax, 3)3 9.3380422
## poly(age, 3)1 5.0154409
## poly(age, 3)2 5.0754930
## poly(age, 3)3 0.6445356
## poly(dis, 3)1 4.3726330
## poly(dis, 3)2 -2.6098746
## poly(dis, 3)3 4.6884000
## poly(indus, 3)1 7.0953914
## poly(indus, 3)2 8.0702091
## poly(indus, 3)3 -2.6227889
plot(lda_model)
predict_lda <- predict(lda_model, newdata = boston_testing)$class
predict_result_lda <- predict_lda %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
colnames(predict_result_lda) <- c("predicted_value", "actual_value")
confusion_lda <- confusionMatrix(predict_result_lda$predicted_value, predict_result_lda$actual_value)
confusion_lda
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 46 1
## High 5 50
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8824
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9020
## Specificity : 0.9804
## Pos Pred Value : 0.9787
## Neg Pred Value : 0.9091
## Prevalence : 0.5000
## Detection Rate : 0.4510
## Detection Prevalence : 0.4608
## Balanced Accuracy : 0.9412
##
## 'Positive' Class : Low
##
The confusion matrix again indicates an accuracy of 94.12%. Compared with the logistic regression model, LDA has better specificity (98.04% vs. 92.16%) but worse sensitivity (90.20% vs. 96.08%). Overall, the two models perform about the same, with identical test accuracy.
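To line the two fits up side by side, the same metrics can be pulled straight from the caret objects created above (a small sketch; nothing new is fitted):
# Compare logistic regression and LDA using the stored confusionMatrix objects
rbind(
  logistic = c(confusion_glm$overall["Accuracy"], confusion_glm$byClass[c("Sensitivity", "Specificity")]),
  lda      = c(confusion_lda$overall["Accuracy"], confusion_lda$byClass[c("Sensitivity", "Specificity")])
)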
naivebayes_model <- naiveBayes(crime_factor ~ rad + nox+ tax + age + indus,
data = boston_training)
naivebayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Low High
## 0.5 0.5
##
## Conditional probabilities:
## rad
## Y [,1] [,2]
## Low 4.168317 1.645554
## High 14.475248 9.592798
##
## nox
## Y [,1] [,2]
## Low 0.4709698 0.05651158
## High 0.6326337 0.09464589
##
## tax
## Y [,1] [,2]
## Low 306.1089 83.91098
## High 503.1485 168.82912
##
## age
## Y [,1] [,2]
## Low 50.87376 24.98412
## High 85.07129 18.87520
##
## indus
## Y [,1] [,2]
## Low 6.998465 5.471512
## High 15.143713 5.522585
predict_naivebayes <- predict(naivebayes_model, type= "class", newdata=boston_testing)
predict_result_naivebayes <- predict_naivebayes %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
colnames(predict_result_naivebayes) <- c("predicted_value", "actual_value")
confusion_naivebayes <- confusionMatrix(predict_result_naivebayes$predicted_value, predict_result_naivebayes$actual_value)
confusion_naivebayes
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 47 11
## High 4 40
##
## Accuracy : 0.8529
## 95% CI : (0.7691, 0.9153)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 8.267e-14
##
## Kappa : 0.7059
##
## Mcnemar's Test P-Value : 0.1213
##
## Sensitivity : 0.9216
## Specificity : 0.7843
## Pos Pred Value : 0.8103
## Neg Pred Value : 0.9091
## Prevalence : 0.5000
## Detection Rate : 0.4608
## Detection Prevalence : 0.5686
## Balanced Accuracy : 0.8529
##
## 'Positive' Class : Low
##
The naive Bayes model is the weakest so far, with an accuracy of only 85.29% (sensitivity 92.16%, specificity 78.43%). The drop in accuracy comes from the low specificity: the model misclassifies a relatively large number of High-crime tracts as Low.
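To see where those misclassifications come from, the class posteriors can also be inspected (a brief sketch; e1071's predict.naiveBayes returns a probability matrix when type = "raw"):
# Posterior class probabilities for the first few test tracts
head(predict(naivebayes_model, newdata = boston_testing, type = "raw"))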
variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')
x_training <- boston_training[, variables]
y_training <- boston_training$crime_factor
x_testing <- boston_testing[, variables]
acc <- list()
for (i in 1:20) {
knn_pred <- knn(train = x_training, test = x_testing, cl = y_training, k = i)
acc[as.character(i)] = mean(knn_pred == boston_testing$crime_factor)
}
acc <- unlist(acc)
tibble(acc = acc) %>%
mutate(k = row_number()) %>%
ggplot(aes(k, acc)) +
geom_col(aes(fill = k == which.max(acc))) +
labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy for different values of K') +
scale_x_continuous(breaks = 1:20) +
scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
digits = 3)) +
geom_hline(yintercept = max(acc), lty = 2) +
coord_cartesian(ylim = c(min(acc), max(acc))) +
guides(fill = FALSE)
#final model
knn_final <- knn(train = x_training, test = x_testing, cl = y_training, k = 1)
head(knn_final)
## [1] Low Low Low High Low Low
## Levels: Low High
predict_result_knn <- knn_final %>% bind_cols(boston_testing %>% dplyr::select(crime_factor))
colnames(predict_result_knn) <- c("predicted_value", "actual_value")
confusion_knn <- confusionMatrix(predict_result_knn$predicted_value, predict_result_knn$actual_value)
confusion_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 48 1
## High 3 50
##
## Accuracy : 0.9608
## 95% CI : (0.9026, 0.9892)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9216
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9412
## Specificity : 0.9804
## Pos Pred Value : 0.9796
## Neg Pred Value : 0.9434
## Prevalence : 0.5000
## Detection Rate : 0.4706
## Detection Prevalence : 0.4804
## Balanced Accuracy : 0.9608
##
## 'Positive' Class : Low
##
Running K from 1 to 20 shows that K = 1 gives the highest test accuracy, 96.08%, with a sensitivity of 94.12% and a specificity of 98.04%. By these measures, KNN with K = 1 is the best of the four models considered here.
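The choice of K can also be made programmatically rather than read off the plot (a small sketch reusing the acc vector and the training/testing matrices defined above):
# Select the K with the highest test-set accuracy and refit
best_k <- as.integer(names(acc)[which.max(acc)])
best_k
knn_best <- knn(train = x_training, test = x_testing, cl = y_training, k = best_k)
mean(knn_best == boston_testing$crime_factor)  # should match the accuracy reported above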
Consider a logistic regression model that predicts default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis. First, fit a logistic regression model that uses income and balance to predict default.
attach(Default)
set.seed(106)
fit.glm <- glm(default ~ income+balance, data=Default, family="binomial")
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
train <- sample(dim(Default)[1], dim(Default)[1]/2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train,]$default)
## [1] 0.0248
We have a 2.48% test error rate with the validation set approach.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0256
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0254
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0222
We see that the validation estimate of the test error rate can vary, depending on precisely which observations are included in the training set and which are included in the validation set.
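One way to make that variability visible is to wrap the split/fit/predict steps in a helper and repeat it several times; val_set_error below is a hypothetical helper added for illustration, not part of the original analysis:
# Repeat the validation set approach and look at the spread of the error estimates
val_set_error <- function() {
  train <- sample(nrow(Default), nrow(Default) / 2)
  fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
  probs <- predict(fit, newdata = Default[-train, ], type = "response")
  pred <- ifelse(probs > 0.5, "Yes", "No")
  mean(pred != Default[-train, ]$default)
}
replicate(5, val_set_error())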
Now consider a logistic regression model that predicts default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0262
It doesn’t seem that adding the “student” dummy variable leads to a reduction in the validation set estimate of the test error rate.
In this exercise, we predict the number of applications received (Apps) using the other variables in the College data set.
sample_split <- function(dataset, split_ratio, seed = NULL) {
set.seed(seed)
train_subset <- sample(nrow(dataset), size = floor(nrow(dataset) * split_ratio))
return(list(
train = dataset[train_subset, ],
test = dataset[-train_subset, ]
))
}
college_split <- sample_split(College, 0.7, seed = 601)
ols_model <- lm(Apps ~ . , data = college_split$train)
ols_predictions <- predict(ols_model, college_split$test)
# RMSE function
calc_rmse <- function(y, y_hat) {
return(sqrt(mean((y - y_hat)^2)))
}
ols_rmse <- calc_rmse(college_split$test$Apps, ols_predictions)
ols_rmse
## [1] 1188.832
Test RMSE for OLS is: 1188.832
train_matrix <- model.matrix(Apps ~ . , data = college_split$train)
test_matrix <- model.matrix(Apps ~ . , data = college_split$test)
grid <- 10 ^ seq(4, -2, length = 100)
ridge_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
ridge_predictions <- predict(ridge_model, test_matrix, s = ridge_model$lambda.min)
ridge_rmse <- calc_rmse(college_split$test$Apps, ridge_predictions)
ridge_rmse
## [1] 1188.815
Test RMSE for ridge model is: 1188.815
lasso_model <- cv.glmnet(train_matrix, college_split$train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
lasso_predictions <- predict(lasso_model, test_matrix, s = lasso_model$lambda.min)
lasso_rmse <- calc_rmse(college_split$test$Apps, lasso_predictions)
lasso_rmse
## [1] 1188.799
Test RMSE for lasso model is: 1188.799
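Both penalized models make their test predictions at the penalty selected by cross-validation, which can be read off the cv.glmnet objects directly (a quick check, no new fitting):
# Cross-validated penalties used for the ridge and lasso predictions above
ridge_model$lambda.min
lasso_model$lambda.min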
predict(lasso_model, s = lasso_model$lambda.min, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -601.38112596
## (Intercept) .
## PrivateYes -321.41076249
## Accept 1.70685797
## Enroll -1.35810665
## Top10perc 45.45111099
## Top25perc -15.88692306
## F.Undergrad 0.09983028
## P.Undergrad 0.01567394
## Outstate -0.09222852
## Room.Board 0.11816525
## Books -0.04056782
## Personal 0.05792203
## PhD -5.56419852
## Terminal -5.43103094
## S.F.Ratio 21.59623616
## perc.alumni 1.87814752
## Expend 0.10806715
## Grad.Rate 8.13596816
pcr_model <- pcr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")
pcr_predictions <- predict(pcr_model, college_split$test, ncomp = 10)
pcr_rmse <- calc_rmse(college_split$test$Apps, pcr_predictions)
pcr_rmse
## [1] 1556.821
Test RMSE for PCR model is: 1556.821
pls_model <- plsr(Apps ~ . , data = college_split$train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")
pls_predictions <- predict(pls_model, college_split$test, ncomp = 10)
pls_rmse <- calc_rmse(college_split$test$Apps, pls_predictions)
pls_rmse
## [1] 1178.361
Test RMSE for PLS model is: 1178.361
# Function to compute R-squared
calc_r2 <- function(y, y_hat) {
y_bar <- mean(y)
rss <- sum((y - y_hat)^2)
tss <- sum((y - y_bar)^2)
return(1 - (rss / tss))
}
# Test R-squared for all models
ols_r2 <- calc_r2(college_split$test$Apps, ols_predictions)
ridge_r2 <- calc_r2(college_split$test$Apps, ridge_predictions)
lasso_r2 <- calc_r2(college_split$test$Apps, lasso_predictions)
pcr_r2 <- calc_r2(college_split$test$Apps, pcr_predictions)
pls_r2 <- calc_r2(college_split$test$Apps, pls_predictions)
barplot(c(ols_r2, ridge_r2, lasso_r2, pcr_r2, pls_r2),
names.arg = c("OLS", "Ridge", "Lasso", "PCR", "PLS"))
The plot shows that the test \(R^2\) for every model except PCR is around 0.9, with PLS slightly higher than the others. PCR has a smaller test \(R^2\) of less than 0.8. All models except PCR therefore predict the number of college applications with high accuracy.
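For a compact summary, the test metrics computed above can be collected into a single table (a small sketch reusing the *_rmse and *_r2 objects defined earlier):
# Gather the test RMSE and test R-squared of all five models
data.frame(
  model = c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
  rmse  = c(ols_rmse, ridge_rmse, lasso_rmse, pcr_rmse, pls_rmse),
  r2    = c(ols_r2, ridge_r2, lasso_r2, pcr_r2, pls_r2)
)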