Assignment #3

Chapter 04 (page 189): 13, 14, 16

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.5.3

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

library(class)
library(naivebayes)

## Warning: package 'naivebayes' was built under R version 4.5.3

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

data(Weekly)
str(Weekly)

## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

Numerical and graphical summaries

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

pairs(Weekly)

cor(Weekly[, -9])

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

plot(Weekly$Year, Weekly$Volume,
     xlab = "Year",
     ylab = "Volume",
     main = "Weekly Volume Over Time")

The pairwise plots do not show very strong relationships among most of the lag variables and the response. However, volume appears to increase over time, suggesting a clear time trend.

Logistic regression using full data

glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)

summary(glm_full)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Based on the logistic regression output, Lag2 is usually the only predictor that appears statistically significant. This suggests that the return from two weeks ago has some relationship with the direction of the market this week.

Confusion matrix and accuracy

glm_probs <- predict(glm_full, type = "response")

glm_pred <- rep("Down", length(glm_probs))
glm_pred[glm_probs > 0.5] <- "Up"

table(glm_pred, Weekly$Direction)

##         
## glm_pred Down  Up
##     Down   54  48
##     Up    430 557

mean(glm_pred == Weekly$Direction)

## [1] 0.5610652

Create training and test sets

train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008

Weekly_train <- Weekly[train, ]
Weekly_test <- Weekly[test, ]

Direction_test <- Weekly$Direction[test]

Logistic regression using Lag2 only

glm_lag2 <- glm(Direction ~ Lag2,
                data = Weekly_train,
                family = binomial)

glm_lag2_probs <- predict(glm_lag2,
                          Weekly_test,
                          type = "response")

glm_lag2_pred <- rep("Down", length(glm_lag2_probs))
glm_lag2_pred[glm_lag2_probs > 0.5] <- "Up"

table(glm_lag2_pred, Direction_test)

##              Direction_test
## glm_lag2_pred Down Up
##          Down    9  5
##          Up     34 56

mean(glm_lag2_pred == Direction_test)

## [1] 0.625

LDA using Lag2

lda_fit <- lda(Direction ~ Lag2, data = Weekly_train)
lda_pred <- predict(lda_fit, Weekly_test)
table(lda_pred$class, Direction_test)

##       Direction_test
##        Down Up
##   Down    9  5
##   Up     34 56

mean(lda_pred$class == Direction_test)

## [1] 0.625

QDA using Lag2

qda_fit <- qda(Direction ~ Lag2, data = Weekly_train)
qda_pred <- predict(qda_fit, Weekly_test)
table(qda_pred$class, Direction_test)

##       Direction_test
##        Down Up
##   Down    0  0
##   Up     43 61

mean(qda_pred$class == Direction_test)

## [1] 0.5865385

KNN with K = 1

train_X <- as.matrix(Weekly_train$Lag2)
test_X <- as.matrix(Weekly_test$Lag2)
train_Direction <- Weekly_train$Direction
set.seed(1)
knn_pred <- knn(train_X, test_X, train_Direction, k = 1)
table(knn_pred, Direction_test)

##         Direction_test
## knn_pred Down Up
##     Down   21 30
##     Up     22 31

mean(knn_pred == Direction_test)

## [1] 0.5

Naive Bayes

nb_fit <- naive_bayes(Direction ~ Lag2, data = Weekly_train)
nb_pred <- predict(nb_fit, Weekly_test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

table(nb_pred, Direction_test)

##        Direction_test
## nb_pred Down Up
##    Down    0  0
##    Up     43 61

mean(nb_pred == Direction_test)

## [1] 0.5865385

Best method

logistic regression and LDA using Lag2 usually perform best, with an accuracy around 62.5%.

#14

data(Auto)
Auto <- na.omit(Auto)

Create binary variable mpg01

mpg01 <- rep(0, nrow(Auto))
mpg01[Auto$mpg > median(Auto$mpg)] <- 1
Auto2 <- data.frame(Auto, mpg01)
summary(as.factor(Auto2$mpg01))

##   0   1 
## 196 196

Graphical exploration

pairs(Auto2[, c("mpg01", "cylinders", "displacement","horsepower", "weight", "acceleration","year", "origin")])

Graphical exploration

par(mfrow = c(2, 3))

boxplot(cylinders ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Cylinders",
        main = "Cylinders by mpg01")

boxplot(displacement ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Displacement",
        main = "Displacement by mpg01")

boxplot(horsepower ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Horsepower",
        main = "Horsepower by mpg01")

boxplot(weight ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Weight",
        main = "Weight by mpg01")

boxplot(year ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Year",
        main = "Year by mpg01")

boxplot(origin ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Origin",
        main = "Origin by mpg01")

par(mfrow = c(1, 1))

The boxplots suggest that cylinders, displacement, horsepower, weight, and year are strongly associated with mpg01. Cars with high gas mileage tend to have fewer cylinders, lower displacement, lower horsepower, lower weight, and newer model years.

Split into training and test sets

train <- Auto2$year < 76
test <- Auto2$year >= 76
Auto_train <- Auto2[train, ]
Auto_test <- Auto2[test, ]
mpg01_test <- Auto2$mpg01[test]

lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
lda_pred <- predict(lda_fit, Auto_test)
table(lda_pred$class, mpg01_test)

##    mpg01_test
##       0   1
##   0  63   7
##   1  11 131

mean(lda_pred$class != mpg01_test)

## [1] 0.08490566

qda_fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
qda_pred <- predict(qda_fit, Auto_test)
table(qda_pred$class, mpg01_test)

##    mpg01_test
##       0   1
##   0  69  17
##   1   5 121

mean(qda_pred$class != mpg01_test)

## [1] 0.1037736

Logistic regression

glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train,family = binomial)
glm_probs <- predict(glm_fit, Auto_test, type = "response")
glm_pred <- rep(0, length(glm_probs))
glm_pred[glm_probs > 0.5] <- 1
table(glm_pred, mpg01_test)

##         mpg01_test
## glm_pred   0   1
##        0  67   9
##        1   7 129

mean(glm_pred != mpg01_test)

## [1] 0.0754717

Naive Bayes

nb_fit <- naive_bayes(as.factor(mpg01) ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
nb_pred <- predict(nb_fit, Auto_test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

table(nb_pred, mpg01_test)

##        mpg01_test
## nb_pred   0   1
##       0  67  15
##       1   7 123

mean(nb_pred != mpg01_test)

## [1] 0.1037736

KNN with several K values

train_X <- Auto_train[, c("cylinders", "displacement", "horsepower", "weight", "year")]
test_X <- Auto_test[, c("cylinders", "displacement", "horsepower", "weight", "year")]
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X, center = attr(train_X_scaled, "scaled:center"), scale = attr(train_X_scaled, "scaled:scale"))

train_y <- as.factor(Auto_train$mpg01)
test_y <- as.factor(Auto_test$mpg01)

set.seed(1)

knn_results <- data.frame(K = numeric(),
                          Test_Error = numeric())

for(k in c(1, 3, 5, 7, 9, 11, 15, 20)) {
  
  knn_pred <- knn(train_X_scaled,
                  test_X_scaled,
                  train_y,
                  k = k)
  
  test_error <- mean(knn_pred != test_y)
  
  knn_results <- rbind(knn_results,
                       data.frame(K = k,
                                  Test_Error = test_error))
}

knn_results

##    K Test_Error
## 1  1 0.09433962
## 2  3 0.10377358
## 3  5 0.10377358
## 4  7 0.10377358
## 5  9 0.10377358
## 6 11 0.10377358
## 7 15 0.10377358
## 8 20 0.10377358

best_k <- knn_results[which.min(knn_results$Test_Error), ]
best_k

##   K Test_Error
## 1 1 0.09433962

Final comparison

lda_error <- mean(lda_pred$class != mpg01_test)
qda_error <- mean(qda_pred$class != mpg01_test)
glm_error <- mean(glm_pred != mpg01_test)
nb_error <- mean(nb_pred != mpg01_test)
knn_best_error <- min(knn_results$Test_Error)

comparison <- data.frame(
  Method = c("LDA", "QDA", "Logistic Regression", "Naive Bayes", "Best KNN"),
  Test_Error = c(lda_error, qda_error, glm_error, nb_error, knn_best_error)
)
comparison

##                Method Test_Error
## 1                 LDA 0.08490566
## 2                 QDA 0.10377358
## 3 Logistic Regression 0.07547170
## 4         Naive Bayes 0.10377358
## 5            Best KNN 0.09433962

Logistic regression is the best fit due to the lowest test error score.

Boston Classification

data(Boston)

Create a binary response variable

crime01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston2 <- data.frame(Boston, crime01)
summary(as.factor(Boston2$crime01))

##   0   1 
## 253 253

Explore the data

par(mfrow=c(2,3))

plot(Boston2$lstat, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="LSTAT", ylab="crime01")

plot(Boston2$rm, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="RM", ylab="crime01")

plot(Boston2$nox, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="NOX", ylab="crime01")

plot(Boston2$dis, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="DIS", ylab="crime01")

plot(Boston2$tax, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="TAX", ylab="crime01")

plot(Boston2$ptratio, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="PTRATIO", ylab="crime01")

par(mfrow=c(1,1))

The scatterplots suggest that lstat, rm, nox, dis, tax, and ptratio are associated with whether the crime rate is above or below the median.

Training and Test Sets

set.seed(1)

train <- sample(1:nrow(Boston2), nrow(Boston2)/2)

Boston.train <- Boston2[train, ]
Boston.test <- Boston2[-train, ]

crime.test <- Boston2$crime01[-train]

Logistic Regression

glm.fit <- glm(crime01 ~ lstat + rm + nox + dis + tax + ptratio,data = Boston.train, family = binomial)
glm.probs <- predict(glm.fit, Boston.test, type="response")
glm.pred <- ifelse(glm.probs > .5,1,0)
table(glm.pred, crime.test)

##         crime.test
## glm.pred   0   1
##        0 110  19
##        1  16 108

glm.error <- mean(glm.pred != crime.test)
glm.error

## [1] 0.1383399

LDA

lda.fit <- lda(crime01 ~ lstat + rm + nox + dis + tax + ptratio,data = Boston.train)
lda.pred <- predict(lda.fit, Boston.test)
table(lda.pred$class, crime.test)

##    crime.test
##       0   1
##   0 114  37
##   1  12  90

lda.error <- mean(lda.pred$class != crime.test)
lda.error

## [1] 0.1936759

Naive Bayes

nb.fit <- naive_bayes(as.factor(crime01) ~ lstat + rm + nox + dis + tax + ptratio, data = Boston.train)
nb.pred <- predict(nb.fit, Boston.test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

table(nb.pred, crime.test)

##        crime.test
## nb.pred   0   1
##       0 109  24
##       1  17 103

nb.error <- mean(nb.pred != crime.test)
nb.error

## [1] 0.1620553

KNN

train.X <- Boston.train[,c("lstat","rm","nox","dis","tax","ptratio")]
test.X  <- Boston.test[,c("lstat","rm","nox","dis","tax","ptratio")]
train.X <- scale(train.X)
test.X <- scale(test.X, center=attr(train.X,"scaled:center"), scale=attr(train.X,"scaled:scale"))
train.Y <- as.factor(Boston.train$crime01)
test.Y <- as.factor(crime.test)

set.seed(1)

knn.results <- data.frame(K=numeric(), Test_Error=numeric())

for(k in c(1,3,5,7,9,11,15,20)){

pred <- knn(train.X,test.X,train.Y,k=k)

error <- mean(pred!=test.Y)
knn.results <- rbind(knn.results, data.frame(K=k, Test_Error=error))

}
knn.results

##    K Test_Error
## 1  1 0.09090909
## 2  3 0.07905138
## 3  5 0.07509881
## 4  7 0.08695652
## 5  9 0.12648221
## 6 11 0.12648221
## 7 15 0.13043478
## 8 20 0.14624506

best.knn <- knn.results[which.min(knn.results$Test_Error),]
best.knn

##   K Test_Error
## 3 5 0.07509881

Compare Methods

comparison <- data.frame(

Method=c("Logistic Regression","LDA","Naive Bayes","Best KNN"),

Test_Error=c(glm.error, lda.error, nb.error, min(knn.results$Test_Error))
)

comparison

##                Method Test_Error
## 1 Logistic Regression 0.13833992
## 2                 LDA 0.19367589
## 3         Naive Bayes 0.16205534
## 4            Best KNN 0.07509881

Conclusions

KNN is the best fit due to the lowest test score.

Assignment #3

Eric Cantu ugq789

2026-06-25