(16)Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
#Loading necessary libraries
library(MASS) # Boston dataset, LDA, and QDA
library(class) # KNN
library(e1071) # Naïve Bayes
library(ggplot2) # visualization
library(caret) # splitting the data
#Loading data
data("Boston")
#Median crime rate
crim_median = median(Boston$crim)
#Creating binary values
Boston$crim01 = ifelse(Boston$crim > crim_median, 1, 0)
table(Boston$crim01) # the data is evenly split
##
## 0 1
## 253 253
#Scatterplot matrix to examine relationships
pairs(Boston[, c("crim", "indus", "nox", "rm", "age", "tax", "lstat")])
#Boxplots showcasing key variables by crime-rate group
ggplot(Boston, aes(x = as.factor(crim01), y = indus)) + #indus
geom_boxplot()
ggplot(Boston, aes(x = as.factor(crim01), y = tax)) + #tax
geom_boxplot()
ggplot(Boston, aes(x = as.factor(crim01), y = lstat)) + #lstat
geom_boxplot()
#`indus`, `tax`, and `lstat` appear to be strongly associated with crime rate.
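As a quick numeric check (a sketch; simple correlations of the candidate predictors with the binary response), one could also run:
#Correlation of candidate predictors with the binary response
cor(Boston[, c("indus", "tax", "lstat")], Boston$crim01)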
Splitting the data into test and training sets
set.seed(100) # reproducibility
train_index <- createDataPartition(Boston$crim01, p = 0.7, list = FALSE) # 70% training, 30% test
# Create training and test sets
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
# Define predictor variables identified in the exploratory plots above
predictors <- c("indus", "tax", "lstat")
Logistic regression
glm.fit = glm(crim01 ~ indus + tax + lstat, data = train, family = binomial)
glm.probs = predict(glm.fit, test, type = "response")
glm.pred = ifelse(glm.probs > 0.5, 1, 0)
table(glm.pred, test$crim01)
##
## glm.pred 0 1
## 0 64 20
## 1 11 55
mean(glm.pred == test$crim01)
## [1] 0.7933333
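Accuracy alone can hide class-specific errors. Since caret is already loaded, its confusionMatrix() can also report sensitivity and specificity; a minimal sketch, assuming the 0/1 factor levels line up:
confusionMatrix(factor(glm.pred, levels = c(0, 1)),
                factor(test$crim01, levels = c(0, 1)),
                positive = "1")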
LDA
lda.fit = lda(crim01 ~ indus + tax + lstat, data = train)
lda.pred = predict(lda.fit, test)
lda.class = lda.pred$class
table(lda.class, test$crim01)
##
## lda.class 0 1
## 0 65 21
## 1 10 54
mean(lda.class == test$crim01)
## [1] 0.7933333
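MASS also provides qda(), so a quadratic decision boundary is an easy extra comparison; a sketch (not run above, so results may differ):
qda.fit = qda(crim01 ~ indus + tax + lstat, data = train)
qda.class = predict(qda.fit, test)$class # predicted classes from QDA
table(qda.class, test$crim01)
mean(qda.class == test$crim01)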
Naive Bayes
nb.fit = naiveBayes(as.factor(crim01) ~ indus + tax + lstat, data = train)
nb.pred = predict(nb.fit, test)
table(nb.pred, test$crim01)
##
## nb.pred 0 1
## 0 69 19
## 1 6 56
mean(nb.pred == test$crim01)
## [1] 0.8333333
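If posterior class probabilities are wanted (for example, to move the threshold away from 0.5), predict() on a naiveBayes fit accepts type = "raw"; a minimal sketch:
nb.probs = predict(nb.fit, test, type = "raw") # per-class posterior probabilities
nb.pred.cutoff = ifelse(nb.probs[, "1"] > 0.5, 1, 0) # custom cutoff on the posterior for class 1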
KNN with different values for K
train.X = as.matrix(train[, predictors])
test.X = as.matrix(test[, predictors])
train.Y = train$crim01
test.Y = test$crim01
knn.pred = knn(train.X, test.X, train.Y, k = 1)
mean(knn.pred == test.Y)
## [1] 0.9333333
knn.pred = knn(train.X, test.X, train.Y, k = 3)
mean(knn.pred == test.Y)
## [1] 0.9333333
knn.pred = knn(train.X, test.X, train.Y, k = 5)
mean(knn.pred == test.Y)
## [1] 0.9133333
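Because KNN is distance-based, predictors on very different scales (tax takes much larger values than indus or lstat) can dominate the distance calculation. A sketch of the same fit with standardized predictors, scaling the test set by the training means and standard deviations:
train.X.sc = scale(train.X) # center and scale using training data
test.X.sc = scale(test.X,
                  center = attr(train.X.sc, "scaled:center"),
                  scale = attr(train.X.sc, "scaled:scale"))
knn.pred.sc = knn(train.X.sc, test.X.sc, train.Y, k = 3)
mean(knn.pred.sc == test.Y)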
Findings: KNN achieved the highest test accuracy, about 0.93 with K = 1 or K = 3 (0.91 with K = 5). Naive Bayes followed at 0.83, while logistic regression and LDA both reached about 0.79. Using indus, tax, and lstat as predictors, the non-parametric KNN classifier outperformed the linear methods on this split.