Chapter 04 (page 189): 13, 14, 16

library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.5.3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.5.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
data(Weekly)
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
  1. Numerical and graphical summaries
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly)

cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
plot(Weekly$Year, Weekly$Volume,
     xlab = "Year",
     ylab = "Volume",
     main = "Weekly Volume Over Time")

The pairwise plots do not show very strong relationships among most of the lag variables and the response. However, volume appears to increase over time, suggesting a clear time trend.

  1. Logistic regression using full data
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)

summary(glm_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Based on the logistic regression output, Lag2 is usually the only predictor that appears statistically significant. This suggests that the return from two weeks ago has some relationship with the direction of the market this week.

  1. Confusion matrix and accuracy
glm_probs <- predict(glm_full, type = "response")

glm_pred <- rep("Down", length(glm_probs))
glm_pred[glm_probs > 0.5] <- "Up"

table(glm_pred, Weekly$Direction)
##         
## glm_pred Down  Up
##     Down   54  48
##     Up    430 557
mean(glm_pred == Weekly$Direction)
## [1] 0.5610652

Create training and test sets

train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008

Weekly_train <- Weekly[train, ]
Weekly_test <- Weekly[test, ]

Direction_test <- Weekly$Direction[test]
  1. Logistic regression using Lag2 only
glm_lag2 <- glm(Direction ~ Lag2,
                data = Weekly_train,
                family = binomial)

glm_lag2_probs <- predict(glm_lag2,
                          Weekly_test,
                          type = "response")

glm_lag2_pred <- rep("Down", length(glm_lag2_probs))
glm_lag2_pred[glm_lag2_probs > 0.5] <- "Up"

table(glm_lag2_pred, Direction_test)
##              Direction_test
## glm_lag2_pred Down Up
##          Down    9  5
##          Up     34 56
mean(glm_lag2_pred == Direction_test)
## [1] 0.625
  1. LDA using Lag2
lda_fit <- lda(Direction ~ Lag2, data = Weekly_train)
lda_pred <- predict(lda_fit, Weekly_test)
table(lda_pred$class, Direction_test)
##       Direction_test
##        Down Up
##   Down    9  5
##   Up     34 56
mean(lda_pred$class == Direction_test)
## [1] 0.625
  1. QDA using Lag2
qda_fit <- qda(Direction ~ Lag2, data = Weekly_train)
qda_pred <- predict(qda_fit, Weekly_test)
table(qda_pred$class, Direction_test)
##       Direction_test
##        Down Up
##   Down    0  0
##   Up     43 61
mean(qda_pred$class == Direction_test)
## [1] 0.5865385
  1. KNN with K = 1
train_X <- as.matrix(Weekly_train$Lag2)
test_X <- as.matrix(Weekly_test$Lag2)
train_Direction <- Weekly_train$Direction
set.seed(1)
knn_pred <- knn(train_X, test_X, train_Direction, k = 1)
table(knn_pred, Direction_test)
##         Direction_test
## knn_pred Down Up
##     Down   21 30
##     Up     22 31
mean(knn_pred == Direction_test)
## [1] 0.5
  1. Naive Bayes
nb_fit <- naive_bayes(Direction ~ Lag2, data = Weekly_train)
nb_pred <- predict(nb_fit, Weekly_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
table(nb_pred, Direction_test)
##        Direction_test
## nb_pred Down Up
##    Down    0  0
##    Up     43 61
mean(nb_pred == Direction_test)
## [1] 0.5865385
  1. Best method

logistic regression and LDA using Lag2 usually perform best, with an accuracy around 62.5%.

#14

data(Auto)
Auto <- na.omit(Auto)
  1. Create binary variable mpg01
mpg01 <- rep(0, nrow(Auto))
mpg01[Auto$mpg > median(Auto$mpg)] <- 1
Auto2 <- data.frame(Auto, mpg01)
summary(as.factor(Auto2$mpg01))
##   0   1 
## 196 196
  1. Graphical exploration
pairs(Auto2[, c("mpg01", "cylinders", "displacement","horsepower", "weight", "acceleration","year", "origin")])

  1. Graphical exploration
par(mfrow = c(2, 3))

boxplot(cylinders ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Cylinders",
        main = "Cylinders by mpg01")

boxplot(displacement ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Displacement",
        main = "Displacement by mpg01")

boxplot(horsepower ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Horsepower",
        main = "Horsepower by mpg01")

boxplot(weight ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Weight",
        main = "Weight by mpg01")

boxplot(year ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Year",
        main = "Year by mpg01")

boxplot(origin ~ mpg01, data = Auto2,
        xlab = "mpg01",
        ylab = "Origin",
        main = "Origin by mpg01")

par(mfrow = c(1, 1))

The boxplots suggest that cylinders, displacement, horsepower, weight, and year are strongly associated with mpg01. Cars with high gas mileage tend to have fewer cylinders, lower displacement, lower horsepower, lower weight, and newer model years.

  1. Split into training and test sets
train <- Auto2$year < 76
test <- Auto2$year >= 76
Auto_train <- Auto2[train, ]
Auto_test <- Auto2[test, ]
mpg01_test <- Auto2$mpg01[test]
  1. LDA
lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
lda_pred <- predict(lda_fit, Auto_test)
table(lda_pred$class, mpg01_test)
##    mpg01_test
##       0   1
##   0  63   7
##   1  11 131
mean(lda_pred$class != mpg01_test)
## [1] 0.08490566
  1. QDA
qda_fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
qda_pred <- predict(qda_fit, Auto_test)
table(qda_pred$class, mpg01_test)
##    mpg01_test
##       0   1
##   0  69  17
##   1   5 121
mean(qda_pred$class != mpg01_test)
## [1] 0.1037736
  1. Logistic regression
glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Auto_train,family = binomial)
glm_probs <- predict(glm_fit, Auto_test, type = "response")
glm_pred <- rep(0, length(glm_probs))
glm_pred[glm_probs > 0.5] <- 1
table(glm_pred, mpg01_test)
##         mpg01_test
## glm_pred   0   1
##        0  67   9
##        1   7 129
mean(glm_pred != mpg01_test)
## [1] 0.0754717
  1. Naive Bayes
nb_fit <- naive_bayes(as.factor(mpg01) ~ cylinders + displacement + horsepower + weight + year, data = Auto_train)
nb_pred <- predict(nb_fit, Auto_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
table(nb_pred, mpg01_test)
##        mpg01_test
## nb_pred   0   1
##       0  67  15
##       1   7 123
mean(nb_pred != mpg01_test)
## [1] 0.1037736
  1. KNN with several K values
train_X <- Auto_train[, c("cylinders", "displacement", "horsepower", "weight", "year")]
test_X <- Auto_test[, c("cylinders", "displacement", "horsepower", "weight", "year")]
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X, center = attr(train_X_scaled, "scaled:center"), scale = attr(train_X_scaled, "scaled:scale"))

train_y <- as.factor(Auto_train$mpg01)
test_y <- as.factor(Auto_test$mpg01)
set.seed(1)

knn_results <- data.frame(K = numeric(),
                          Test_Error = numeric())

for(k in c(1, 3, 5, 7, 9, 11, 15, 20)) {
  
  knn_pred <- knn(train_X_scaled,
                  test_X_scaled,
                  train_y,
                  k = k)
  
  test_error <- mean(knn_pred != test_y)
  
  knn_results <- rbind(knn_results,
                       data.frame(K = k,
                                  Test_Error = test_error))
}

knn_results
##    K Test_Error
## 1  1 0.09433962
## 2  3 0.10377358
## 3  5 0.10377358
## 4  7 0.10377358
## 5  9 0.10377358
## 6 11 0.10377358
## 7 15 0.10377358
## 8 20 0.10377358
best_k <- knn_results[which.min(knn_results$Test_Error), ]
best_k
##   K Test_Error
## 1 1 0.09433962

Final comparison

lda_error <- mean(lda_pred$class != mpg01_test)
qda_error <- mean(qda_pred$class != mpg01_test)
glm_error <- mean(glm_pred != mpg01_test)
nb_error <- mean(nb_pred != mpg01_test)
knn_best_error <- min(knn_results$Test_Error)

comparison <- data.frame(
  Method = c("LDA", "QDA", "Logistic Regression", "Naive Bayes", "Best KNN"),
  Test_Error = c(lda_error, qda_error, glm_error, nb_error, knn_best_error)
)
comparison
##                Method Test_Error
## 1                 LDA 0.08490566
## 2                 QDA 0.10377358
## 3 Logistic Regression 0.07547170
## 4         Naive Bayes 0.10377358
## 5            Best KNN 0.09433962

Logistic regression is the best fit due to the lowest test error score.

Boston Classification

data(Boston)
  1. Create a binary response variable
crime01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston2 <- data.frame(Boston, crime01)
summary(as.factor(Boston2$crime01))
##   0   1 
## 253 253

Explore the data

par(mfrow=c(2,3))

plot(Boston2$lstat, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="LSTAT", ylab="crime01")

plot(Boston2$rm, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="RM", ylab="crime01")

plot(Boston2$nox, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="NOX", ylab="crime01")

plot(Boston2$dis, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="DIS", ylab="crime01")

plot(Boston2$tax, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="TAX", ylab="crime01")

plot(Boston2$ptratio, Boston2$crime01,
     pch=19, col="steelblue",
     xlab="PTRATIO", ylab="crime01")

par(mfrow=c(1,1))

The scatterplots suggest that lstat, rm, nox, dis, tax, and ptratio are associated with whether the crime rate is above or below the median.

Training and Test Sets

set.seed(1)

train <- sample(1:nrow(Boston2), nrow(Boston2)/2)

Boston.train <- Boston2[train, ]
Boston.test <- Boston2[-train, ]

crime.test <- Boston2$crime01[-train]

Logistic Regression

glm.fit <- glm(crime01 ~ lstat + rm + nox + dis + tax + ptratio,data = Boston.train, family = binomial)
glm.probs <- predict(glm.fit, Boston.test, type="response")
glm.pred <- ifelse(glm.probs > .5,1,0)
table(glm.pred, crime.test)
##         crime.test
## glm.pred   0   1
##        0 110  19
##        1  16 108
glm.error <- mean(glm.pred != crime.test)
glm.error
## [1] 0.1383399

LDA

lda.fit <- lda(crime01 ~ lstat + rm + nox + dis + tax + ptratio,data = Boston.train)
lda.pred <- predict(lda.fit, Boston.test)
table(lda.pred$class, crime.test)
##    crime.test
##       0   1
##   0 114  37
##   1  12  90
lda.error <- mean(lda.pred$class != crime.test)
lda.error
## [1] 0.1936759

Naive Bayes

nb.fit <- naive_bayes(as.factor(crime01) ~ lstat + rm + nox + dis + tax + ptratio, data = Boston.train)
nb.pred <- predict(nb.fit, Boston.test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
table(nb.pred, crime.test)
##        crime.test
## nb.pred   0   1
##       0 109  24
##       1  17 103
nb.error <- mean(nb.pred != crime.test)
nb.error
## [1] 0.1620553

KNN

train.X <- Boston.train[,c("lstat","rm","nox","dis","tax","ptratio")]
test.X  <- Boston.test[,c("lstat","rm","nox","dis","tax","ptratio")]
train.X <- scale(train.X)
test.X <- scale(test.X, center=attr(train.X,"scaled:center"), scale=attr(train.X,"scaled:scale"))
train.Y <- as.factor(Boston.train$crime01)
test.Y <- as.factor(crime.test)
set.seed(1)

knn.results <- data.frame(K=numeric(), Test_Error=numeric())

for(k in c(1,3,5,7,9,11,15,20)){

pred <- knn(train.X,test.X,train.Y,k=k)

error <- mean(pred!=test.Y)
knn.results <- rbind(knn.results, data.frame(K=k, Test_Error=error))

}
knn.results
##    K Test_Error
## 1  1 0.09090909
## 2  3 0.07905138
## 3  5 0.07509881
## 4  7 0.08695652
## 5  9 0.12648221
## 6 11 0.12648221
## 7 15 0.13043478
## 8 20 0.14624506
best.knn <- knn.results[which.min(knn.results$Test_Error),]
best.knn
##   K Test_Error
## 3 5 0.07509881

Compare Methods

comparison <- data.frame(

Method=c("Logistic Regression","LDA","Naive Bayes","Best KNN"),

Test_Error=c(glm.error, lda.error, nb.error, min(knn.results$Test_Error))
)

comparison
##                Method Test_Error
## 1 Logistic Regression 0.13833992
## 2                 LDA 0.19367589
## 3         Naive Bayes 0.16205534
## 4            Best KNN 0.07509881

Conclusions

KNN is the best fit due to the lowest test score.