library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: ggplot2
library(MASS)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x dplyr::select() masks MASS::select()
library(class)
data(Weekly)
pairs(Weekly[,-9])
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
At first glance there is very little relationship among the lag variables; the one notable correlation is between Year and Volume (0.84).
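To eyeball that Year and Volume relationship, a quick scatter plot (a minimal sketch using the ggplot2 already attached above) shows trading volume rising over the years:
# sanity check: plot Volume against Year to see the trend behind the 0.84 correlation
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.4) +
  ggtitle("Volume by Year")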
glm_full <- glm(Direction ~. -Year -Today, data = Weekly, family = "binomial")
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ . - Year - Today, family = "binomial",
## data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
predict_glm <- predict(glm_full, type = "response")
predict_glm <- ifelse(predict_glm <0.5, "Down", "Up")
predict_glm<- factor(predict_glm)
confusionMatrix(predict_glm,Weekly$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9207
## Specificity : 0.1116
## Pos Pred Value : 0.5643
## Neg Pred Value : 0.5294
## Prevalence : 0.5556
## Detection Rate : 0.5115
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.5161
##
## 'Positive' Class : Up
##
Specificity is low (0.1116), i.e. the model does a poor job of predicting the negative class ("Down"): it labels over 90% of weeks as "Up".
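Raising the classification threshold is one way to trade sensitivity for specificity; a sketch (the 0.55 cutoff here is an arbitrary illustration, not a tuned value):
# recompute the fitted probabilities and re-threshold at 0.55 instead of 0.5
prob_glm <- predict(glm_full, type = "response")
predict_055 <- factor(ifelse(prob_glm < 0.55, "Down", "Up"))
confusionMatrix(predict_055, Weekly$Direction, positive = "Up")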
train <- Weekly[Weekly$Year <= 2008,]
test <- Weekly[Weekly$Year >2008,]
glm_train_test <- glm(Direction ~ Lag2, data = train, family ='binomial')
predict_test <- predict(glm_train_test, newdata = test, type = "response")
predict_test <- ifelse(predict_test <0.5, "Down", "Up")
predict_test <- factor(predict_test)
confusionMatrix(predict_test,test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Accuracy rises to 62.5% from 56.1%, but we can't confirm the model is genuinely better at prediction because the test sample (104 weeks) is very small.
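As a rough check on that claim, the exact binomial 95% interval for the 65 correct predictions out of 104 (this should reproduce the CI printed above) is wide and still includes 56.1%:
# 9 + 56 = 65 correct predictions out of 14 + 90 = 104 test weeks
binom.test(x = 65, n = 104)$conf.int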
lda_lag2 <- lda(Direction ~ Lag2, data = train)
predict_lda <- predict(lda_lag2, newdata = test)
confusionMatrix(predict_lda$class,test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Accuracy didn't change; LDA yields the same confusion matrix as the logistic model, so the assessment is the same.
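Since the confusion matrices are identical, the two models presumably agree on every test week; a quick check (a sketch comparing the two prediction vectors):
# fraction of test weeks where LDA and the logistic model make the same call
mean(predict_lda$class == predict_test)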
qda_lag2 <- qda(Direction ~ Lag2, data = train)
predict_qda <- predict(qda_lag2, newdata = test)
confusionMatrix(predict_qda$class,test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Accuracy fell to 58.7%. QDA predicts "Up" for every observation, as indicated by the specificity of 0.
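This can be confirmed from the QDA posterior probabilities; a sketch (predict() on a qda fit returns a list with a posterior matrix, one column per class):
# if QDA calls every week "Up", the minimum posterior for "Up" is above 0.5
min(predict_qda$posterior[, "Up"])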
Repeat (d) using KNN with K = 1.
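The KNN code for this part is missing from the output above; a minimal sketch using class::knn (the package is already attached) on the same Lag2 train/test split:
# KNN with K = 1 on Lag2; set.seed for reproducibility since knn() can break ties at random
set.seed(123)
knn_weekly <- knn(train = as.matrix(train$Lag2),
                  test = as.matrix(test$Lag2),
                  cl = train$Direction,
                  k = 1)
confusionMatrix(knn_weekly, test$Direction, positive = "Up")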
Which of these methods appears to provide the best results on this data?
LDA and logistic regression achieve the same test accuracy and nearly identical results overall.
data(Auto)
Auto$mpg01 <- factor(as.numeric(Auto$mpg > median(Auto$mpg)))
Auto$origin <- factor(Auto$origin)
Auto_new <- subset(Auto , select = -c(mpg,name))
glimpse(Auto_new)
## Rows: 392
## Columns: 8
## $ cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, ...
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383,...
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170,...
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, ...
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8....
## $ year <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70...
## $ origin <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, ...
## $ mpg01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
g1 <- ggplot(Auto_new, aes(x = mpg01, y = cylinders, col = mpg01)) +
geom_jitter() +
theme(legend.position = "none") +
ggtitle("Cylinders vs mpg01 - Jitter Plot")
g2 <- ggplot(Auto_new, aes(x = mpg01, y = displacement, fill = mpg01)) +
geom_boxplot() +
theme(legend.position = "none") +
ggtitle("Displacement vs mpg01 - Boxplot")
g3 <- ggplot(Auto_new, aes(x = mpg01, y = horsepower, fill = mpg01)) +
geom_boxplot() +
theme(legend.position = "none") +
ggtitle("Horsepower vs mpg01 - Boxplot")
g4 <- ggplot(Auto_new, aes(x = origin, fill = mpg01)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
theme(axis.title.y = element_blank()) +
ggtitle("Origin vs mpg01 - Bar Plot")
g1
g2
g3
g4
Interesting find: American muscle-car gas guzzlers are not a myth, as the last graph shows. By and large, American cars had below-median mpg.
pairs(Auto_new)
set.seed(123)
index <- sample(1:nrow(Auto_new), 0.75 * nrow(Auto_new), replace = FALSE)
train <- Auto_new[index,]
test <- Auto_new[-index,]
lda_auto <- lda(mpg01 ~cylinders+displacement+weight+horsepower, train)
lda_auto_predict <- predict(lda_auto, test)
mean(lda_auto_predict$class != test$mpg01)
## [1] 0.1122449
qda_auto <-train(mpg01 ~cylinders+displacement+weight+horsepower, train, method ="qda")
qda_auto_predict <- predict(qda_auto, test)
mean(qda_auto_predict!= test$mpg01)
## [1] 0.1020408
glm_auto <- train(mpg01 ~ cylinders+displacement+weight+horsepower, data= train, method = 'glm',family= 'binomial')
glm_auto_predict <- predict(glm_auto, test)
mean(glm_auto_predict != test$mpg01)
## [1] 0.122449
knn_auto <- train(mpg01 ~ cylinders + displacement + weight + horsepower,
data = train,
method = "knn",
preProcess = c("center", "scale"),
tuneGrid = expand.grid(k = seq(1, 80, 1)))
knn_auto
## k-Nearest Neighbors
##
## 294 samples
## 4 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 294, 294, 294, 294, 294, 294, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8819013 0.7632568
## 2 0.8846526 0.7688534
## 3 0.8846636 0.7687603
## 4 0.8900369 0.7795305
## 5 0.8921409 0.7837935
## 6 0.8970076 0.7934615
## 7 0.8989449 0.7973644
## 8 0.8994178 0.7983132
## 9 0.9042827 0.8080220
## 10 0.9031777 0.8056565
## 11 0.9061507 0.8115993
## 12 0.9068505 0.8130570
## 13 0.9053898 0.8100616
## 14 0.9049939 0.8092661
## 15 0.9047906 0.8088153
## 16 0.9047834 0.8087705
## 17 0.9036274 0.8065016
## 18 0.9033074 0.8058610
## 19 0.9022294 0.8037107
## 20 0.9018545 0.8029481
## 21 0.9011246 0.8014918
## 22 0.9007362 0.8007183
## 23 0.9014882 0.8022741
## 24 0.9014882 0.8022741
## 25 0.9014882 0.8022741
## 26 0.9014882 0.8022741
## 27 0.9014882 0.8022741
## 28 0.9014882 0.8022741
## 29 0.9014882 0.8022741
## 30 0.9014882 0.8022741
## 31 0.9011144 0.8015016
## 32 0.9011144 0.8015016
## 33 0.9011144 0.8015016
## 34 0.9011144 0.8015016
## 35 0.9011144 0.8015016
## 36 0.9011144 0.8015016
## 37 0.9007222 0.8007282
## 38 0.9007222 0.8007282
## 39 0.9007222 0.8007282
## 40 0.8999982 0.7992827
## 41 0.8999982 0.7992827
## 42 0.9003619 0.8000057
## 43 0.9000079 0.7992958
## 44 0.8999982 0.7992856
## 45 0.8996443 0.7985757
## 46 0.8996443 0.7985728
## 47 0.8999982 0.7992827
## 48 0.8999982 0.7992827
## 49 0.8996443 0.7985728
## 50 0.8996443 0.7985728
## 51 0.8996443 0.7985728
## 52 0.8988997 0.7970684
## 53 0.8988997 0.7970684
## 54 0.8992806 0.7978528
## 55 0.8992806 0.7978528
## 56 0.8992806 0.7978528
## 57 0.8992806 0.7978528
## 58 0.8988997 0.7970684
## 59 0.8985519 0.7963938
## 60 0.8985519 0.7963938
## 61 0.8981815 0.7956606
## 62 0.8981815 0.7956606
## 63 0.8981815 0.7956606
## 64 0.8981815 0.7956606
## 65 0.8981815 0.7956606
## 66 0.8981815 0.7956606
## 67 0.8981815 0.7956606
## 68 0.8981815 0.7956606
## 69 0.8981815 0.7956606
## 70 0.8981815 0.7956606
## 71 0.8981815 0.7956606
## 72 0.8974792 0.7942662
## 73 0.8974602 0.7942602
## 74 0.8978714 0.7950373
## 75 0.8978523 0.7950313
## 76 0.8974602 0.7942602
## 77 0.8974602 0.7942602
## 78 0.8974984 0.7943211
## 79 0.8971062 0.7935260
## 80 0.8974298 0.7941999
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 12.
knn_auto_predicted <- predict(knn_auto, test, type = "raw")
mean(knn_auto_predicted != test$mpg01)
## [1] 0.1122449
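For a side-by-side comparison, the four test error rates can be collected into one vector (a sketch reusing the prediction objects above; the values are those printed earlier):
# test error on the same 25% holdout; lower is better
c(LDA = mean(lda_auto_predict$class != test$mpg01),
  QDA = mean(qda_auto_predict != test$mpg01),
  GLM = mean(glm_auto_predict != test$mpg01),
  KNN = mean(knn_auto_predicted != test$mpg01))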
data(Boston)
Boston$crim <- factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))
Train/test split:
set.seed(123)
index = sample(1:nrow(Boston),.75*nrow(Boston),replace=FALSE)
train_boston = Boston[index,]
test_boston = Boston[-index,]
GLM:
glm_boston <- glm(crim ~.,train_boston, family = "binomial")
glm_boston_predict <-predict(glm_boston,test_boston, type = 'response')
glm_boston_predict <- ifelse(glm_boston_predict <0.5, 0, 1)
mean(glm_boston_predict != test_boston$crim)
## [1] 0.1259843
LDA:
lda_boston <- lda(crim ~ .,train_boston)
pred.lda_boston <- predict(lda_boston, test_boston)
mean(pred.lda_boston$class != test_boston$crim)
## [1] 0.1811024
KNN:
train.X <- subset(train_boston, select = -crim)
test.X <- subset(test_boston, select = -crim)
train_crim <- subset(train_boston, select = crim)
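The output stops before the actual KNN call; a minimal completion (a sketch with an arbitrary k = 5; the predictors are scaled with the training means and SDs since KNN is distance-based):
# scale train/test with the training statistics, then classify
train_scaled <- scale(train.X)
test_scaled <- scale(test.X,
                     center = attr(train_scaled, "scaled:center"),
                     scale = attr(train_scaled, "scaled:scale"))
set.seed(123)
knn_boston <- knn(train_scaled, test_scaled, cl = train_crim$crim, k = 5)
mean(knn_boston != test_boston$crim)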