(16)Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
#Loading necessary libraries
library(MASS) # Boston dataset, LDA, and QDA
library(class) # KNN
library(e1071) # Naïve Bayes
library(ggplot2) # visualization
library(caret) # splitting the data
#Loading data
data("Boston")
#Median crime rate
crim_median = median(Boston$crim)
#Creating binary values
Boston$crim01 = ifelse(Boston$crim > crim_median, 1, 0)
table(Boston$crim01) # the data is evenly split
##
## 0 1
## 253 253
#Scatterplot matrix to examine relationships
pairs(Boston[, c("crim", "indus", "nox", "rm", "age", "tax", "lstat")])
#Boxplots showcasing key variables by crime-rate group
ggplot(Boston, aes(x = as.factor(crim01), y = indus)) + #indus
geom_boxplot()
ggplot(Boston, aes(x = as.factor(crim01), y = tax)) + #tax
geom_boxplot()
ggplot(Boston, aes(x = as.factor(crim01), y = lstat)) + #lstat
geom_boxplot()
#`indus`, `tax`, and `lstat` appear to be strongly associated with crime rate.
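As a quick numeric check (a sketch; simple correlations of the candidate predictors with the binary response), one could also run:
#Correlation of candidate predictors with the binary response
cor(Boston[, c("indus", "tax", "lstat")], Boston$crim01)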
Splitting the data into test and training sets
set.seed(100) # reproducibility
train_index <- createDataPartition(Boston$crim01, p = 0.7, list = FALSE) # 70% training, 30% test
# Create training and test sets
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
# Define predictor variables identified in the exploratory plots above
predictors <- c("indus", "tax", "lstat")
Logistic regression
glm.fit = glm(crim01 ~ indus + tax + lstat, data = train, family = binomial)
glm.probs = predict(glm.fit, test, type = "response")
glm.pred = ifelse(glm.probs > 0.5, 1, 0)
table(glm.pred, test$crim01)
##
## glm.pred 0 1
## 0 64 20
## 1 11 55
mean(glm.pred == test$crim01)
## [1] 0.7933333
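Accuracy alone can hide class-specific errors. Since caret is already loaded, its confusionMatrix() can also report sensitivity and specificity; a minimal sketch, assuming the 0/1 factor levels line up:
confusionMatrix(factor(glm.pred, levels = c(0, 1)),
                factor(test$crim01, levels = c(0, 1)),
                positive = "1")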
LDA
lda.fit = lda(crim01 ~ indus + tax + lstat, data = train)
lda.pred = predict(lda.fit, test)
lda.class = lda.pred$class
table(lda.class, test$crim01)
##
## lda.class 0 1
## 0 65 21
## 1 10 54
mean(lda.class == test$crim01)
## [1] 0.7933333
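MASS also provides qda(), so a quadratic decision boundary is an easy extra comparison; a sketch (not run above, so results may differ):
qda.fit = qda(crim01 ~ indus + tax + lstat, data = train)
qda.class = predict(qda.fit, test)$class # predicted classes from QDA
table(qda.class, test$crim01)
mean(qda.class == test$crim01)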
Naive Bayes
nb.fit = naiveBayes(as.factor(crim01) ~ indus + tax + lstat, data = train)
nb.pred = predict(nb.fit, test)
table(nb.pred, test$crim01)
##
## nb.pred 0 1
## 0 69 19
## 1 6 56
mean(nb.pred == test$crim01)
## [1] 0.8333333
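If posterior class probabilities are wanted (for example, to move the threshold away from 0.5), predict() on a naiveBayes fit accepts type = "raw"; a minimal sketch:
nb.probs = predict(nb.fit, test, type = "raw") # per-class posterior probabilities
nb.pred.cutoff = ifelse(nb.probs[, "1"] > 0.5, 1, 0) # custom cutoff on the posterior for class 1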
KNN with different values for K
train.X = as.matrix(train[, predictors])
test.X = as.matrix(test[, predictors])
train.Y = train$crim01
test.Y = test$crim01
knn.pred = knn(train.X, test.X, train.Y, k = 1)
mean(knn.pred == test.Y)
## [1] 0.9333333
knn.pred = knn(train.X, test.X, train.Y, k = 3)
mean(knn.pred == test.Y)
## [1] 0.9333333
knn.pred = knn(train.X, test.X, train.Y, k = 5)
mean(knn.pred == test.Y)
## [1] 0.9133333
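Because KNN is distance-based, predictors on very different scales (tax takes much larger values than indus or lstat) can dominate the distance calculation. A sketch of the same fit with standardized predictors, scaling the test set by the training means and standard deviations:
train.X.sc = scale(train.X) # center and scale using training data
test.X.sc = scale(test.X,
                  center = attr(train.X.sc, "scaled:center"),
                  scale = attr(train.X.sc, "scaled:scale"))
knn.pred.sc = knn(train.X.sc, test.X.sc, train.Y, k = 3)
mean(knn.pred.sc == test.Y)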
Findings: KNN achieved the highest test accuracy, about 0.93 with K = 1 or K = 3 (0.91 with K = 5). Naive Bayes followed at 0.83, while logistic regression and LDA both reached about 0.79. Using indus, tax, and lstat as predictors, the non-parametric KNN classifier outperformed the linear methods on this split.