library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
attach(Weekly)
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Based on our pairwise correlation plot, there appears to be no real relationship between variables outside of Year and Volume.
Weekly_logreg =glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(Weekly_logreg)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Based on our logistic regression results, Lag2 is the only statistically significant variable.
Weekly.prob=predict(Weekly_logreg,type='response')
Weekly.pred=rep("Down",1089)
Weekly.pred[Weekly.prob>0.5]="Up"
table(Weekly.pred,Direction)
## Direction
## Weekly.pred Down Up
## Down 54 48
## Up 430 557
Based on the results of our confusion matrix, we have 54 True Negatives and 557 True Positives, then 48 False Negatives and 430 False Positives. The model predicts “Up” more often than “Down”. The model appears biased towards “Up” due to the ratio of False Positives to False Negatives. The model is 56.1% accurate but this may just be because of its “Up” bias so we don’t feel great about it.
train=(Year<2009)
Weekly.2009=Weekly[!train,]
Direction.2009=Direction[!train]
Weekly_logreg2=glm(Direction~Lag2,data=Weekly,family=binomial,subset=train)
Weekly.prob2=predict(Weekly_logreg2,Weekly.2009,type='response')
Weekly.pred2=rep("Down",104)
Weekly.pred2[Weekly.prob2>0.5]="Up"
table(Weekly.pred2,Direction.2009)
## Direction.2009
## Weekly.pred2 Down Up
## Down 9 5
## Up 34 56
Based on the results of our confusion matrix using the training data, we see that prediction accuracy increases by a decent margin from 56.1% to 62.5%.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
Weeklylda_fit<-lda(Direction~Lag2, data=Weekly,family=binomial, subset=train)
Weeklylda_pred<-predict(Weeklylda_fit, Weekly.2009)
table(Weeklylda_pred$class, Direction.2009)
## Direction.2009
## Down Up
## Down 9 5
## Up 34 56
Using LDA didn’t improve or hurt the model. Accuracy remained at 62.5%
Weeklyqda_fit<-qda(Direction~Lag2, data=Weekly,subset=train)
Weeklyqda_pred<-predict(Weeklyqda_fit, Weekly.2009)
table(Weeklyqda_pred$class, Direction.2009)
## Direction.2009
## Down Up
## Down 0 0
## Up 43 61
mean(Weeklyqda_pred$class==Direction.2009)
## [1] 0.5865385
Using QDA, model accuracy as at 58.7%. It appears it didn’t attempt to predict “Down”.
library(class)
Week_train <- as.matrix(Lag2[train])
Week_test <- as.matrix(Lag2[!train])
train_Direction <- Direction[train]
set.seed(1)
Weekknn_pred=knn(Week_train,Week_test,train_Direction,k=1)
table(Weekknn_pred,Direction.2009)
## Direction.2009
## Weekknn_pred Down Up
## Down 21 30
## Up 22 31
mean(Weekknn_pred == Direction.2009)
## [1] 0.5
Using KNN with K = 1, accuracy is at 50%
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
Bayes.fit=naiveBayes(Direction~Lag2,data=Weekly,subset=train)
Bayes.pred=predict(Bayes.fit,Weekly.2009)
table(Bayes.pred,Direction.2009)
## Direction.2009
## Bayes.pred Down Up
## Down 0 0
## Up 43 61
mean(Bayes.pred == Direction.2009)
## [1] 0.5865385
Using Bayes, accuracy is at 58.65%.
LDA at 62.5%.
library(ISLR2)
attach(Auto)
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
Auto_mpg01 <- data.frame(Auto, mpg01)
pairs(Auto_mpg01[,-9])
par(mfrow=c(2,2))
boxplot(cylinders~mpg01,main="cylinders vs mpg01")
boxplot(displacement~mpg01,main="displacement vs mpg01")
boxplot(horsepower~mpg01,main="horsepower vs mpg01")
boxplot(weight~mpg01,main= "weight vs mpg01")
Weight, displacement, horsepower, and cylinders all seem to impact mpg.
set.seed(2)
train_x <- sample(1:nrow(Auto_mpg01), 0.8*nrow(Auto_mpg01))
test_x <- Auto_mpg01[,-train_x]
library(MASS)
lda_autofit <- lda(mpg01 ~ cylinders + displacement + weight, data=Auto_mpg01)
lda_autopred <-predict(lda_autofit, test_x)$class
table(lda_autopred, test_x$mpg01)
##
## lda_autopred 0 1
## 0 168 13
## 1 28 183
mean(lda_autopred == test_x$mpg01)
## [1] 0.8954082
The LDA model had a test accuracy of 89.5% predicting mpg when using cylinders, displacement, and weight.
qda_autofit <- qda(mpg01 ~ cylinders + displacement + weight, data=Auto_mpg01)
qda_autopred <- predict(qda_autofit, test_x)$class
table(qda_autopred, test_x$mpg01)
##
## qda_autopred 0 1
## 0 175 17
## 1 21 179
mean(qda_autopred == test_x$mpg01)
## [1] 0.9030612
QDA model had a test accuracy of 90.3%, better than LDA model.
autoglm_fit <- glm(mpg01 ~ cylinders + displacement + weight, family=binomial, data=Auto_mpg01)
glm_autoprobs <- predict(autoglm_fit,test_x,type="response")
glm_autopred <- rep(0,nrow(test_x))
glm_autopred[glm_autoprobs > 0.50]=1
table(glm_autopred, test_x$mpg01)
##
## glm_autopred 0 1
## 0 170 15
## 1 26 181
mean(glm_autopred == test_x$mpg01)
## [1] 0.8954082
The results of logistic regression using the variables which seemed the most associated with mpg01 were 89.5% accuracy.
library(e1071)
Bayes_autofit <- naiveBayes(mpg01~ cylinders + displacement + weight, data=Auto_mpg01)
Bayes_autoclass <- predict(Bayes_autofit, test_x)
## Warning in predict.naiveBayes(Bayes_autofit, test_x): Type mismatch between
## training and new data for variable 'cylinders'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(Bayes_autofit, test_x): Type mismatch between
## training and new data for variable 'displacement'. Did you use factors with
## numeric labels for training, and numeric values for new data?
table(Bayes_autoclass, test_x$mpg01)
##
## Bayes_autoclass 0 1
## 0 165 16
## 1 31 180
mean(Bayes_autoclass == test_x$mpg01)
## [1] 0.880102
Bayes model had an accuracy of 88.01%.
library(class)
set.seed(3)
idx <- sample(1:nrow(Auto), 0.8*nrow(Auto))
train_auto <- Auto[idx,]
test_auto <- Auto[-idx,]
y_train <- train_auto$mpg
x_train <- train_auto[,-1]
x_test <- test_auto[,-1]
x_train <- x_train[,-8]
x_test <- x_test[,-8]
y_test<- ifelse(test_auto$mpg>=23,1,0)
high_low <- ifelse(y_train>=23,1,0)
knn_pred <- knn (train=x_train, test=x_test, cl=high_low, k=1)
table(knn_pred, y_test)
## y_test
## knn_pred 0 1
## 0 33 3
## 1 9 34
mean(knn_pred == y_test)
## [1] 0.8481013
set.seed(3)
knn_pred <- knn (train=x_train, test=x_test, cl=high_low, k=29)
table(knn_pred, y_test)
## y_test
## knn_pred 0 1
## 0 34 3
## 1 8 34
mean(knn_pred == y_test)
## [1] 0.8607595
KNN results are 84.81% with K = 1 and 86.08% with K = 29.
library(ISLR2)
attach(Boston)
crim01=rep(0,length(crim))
crim01[crim>median(crim)]=1
Boston=data.frame(Boston,crim01)
train=1:(length(crim)/2)
test=(length(crim) / 2 + 1):length(crim)
Boston.train=Boston[train, ]
Boston.test=Boston[test, ]
crim01.test=crim01[test]
glm.fit=glm(crim01~.-crim01-crim,data=Boston,family=binomial,subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit)
##
## Call:
## glm(formula = crim01 ~ . - crim01 - crim, family = binomial,
## data = Boston, subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -91.319906 19.490273 -4.685 2.79e-06 ***
## zn -0.815573 0.193373 -4.218 2.47e-05 ***
## indus 0.354172 0.173862 2.037 0.04164 *
## chas 0.167396 0.991922 0.169 0.86599
## nox 93.706326 21.202008 4.420 9.88e-06 ***
## rm -4.719108 1.788765 -2.638 0.00833 **
## age 0.048634 0.024199 2.010 0.04446 *
## dis 4.301493 0.979996 4.389 1.14e-05 ***
## rad 3.039983 0.719592 4.225 2.39e-05 ***
## tax -0.006546 0.007855 -0.833 0.40461
## ptratio 1.430877 0.359572 3.979 6.91e-05 ***
## black -0.017552 0.006734 -2.606 0.00915 **
## lstat 0.190439 0.086722 2.196 0.02809 *
## medv 0.598533 0.185514 3.226 0.00125 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.367 on 252 degrees of freedom
## Residual deviance: 69.568 on 239 degrees of freedom
## AIC: 97.568
##
## Number of Fisher Scoring iterations: 10
glm.probs=predict(glm.fit,Boston.test,type = "response")
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>0.5]=1
table(glm.pred,crim01.test)
## crim01.test
## glm.pred 0 1
## 0 68 24
## 1 22 139
mean(glm.pred==crim01.test)
## [1] 0.8181818
Logistic regression has a test error of 81.81%.
glm.fit=glm(crim01~.-crim01-crim-dis-nox-rad,data=Boston, family=binomial,subset=train)
summary(glm.fit)
##
## Call:
## glm(formula = crim01 ~ . - crim01 - crim - dis - nox - rad, family = binomial,
## data = Boston, subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.405817 4.572663 -1.182 0.237125
## zn -0.097635 0.051679 -1.889 0.058855 .
## indus 0.053784 0.034927 1.540 0.123588
## chas 1.089458 0.597100 1.825 0.068064 .
## rm -0.062265 0.674096 -0.092 0.926405
## age 0.027282 0.010678 2.555 0.010622 *
## tax 0.009573 0.002864 3.343 0.000829 ***
## ptratio 0.209979 0.112209 1.871 0.061302 .
## black -0.019665 0.007686 -2.558 0.010513 *
## lstat 0.063938 0.047871 1.336 0.181662
## medv 0.110295 0.061176 1.803 0.071402 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 186.22 on 242 degrees of freedom
## AIC: 208.22
##
## Number of Fisher Scoring iterations: 8
glm.probs=predict(glm.fit,Boston.test,type="response")
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>0.5]=1
table(glm.pred, crim01.test)
## crim01.test
## glm.pred 0 1
## 0 85 30
## 1 5 133
mean(glm.pred==crim01.test)
## [1] 0.8616601
Logistic regression based on crim, dis, nox, and rad has a test error of 86.17%.
lda.fit=lda(crim01~.-crim01-crim,data=Boston,subset=train)
lda.pred=predict(lda.fit,Boston.test)
table(lda.pred$class,crim01.test)
## crim01.test
## 0 1
## 0 80 24
## 1 10 139
mean(lda.pred$class==crim01.test)
## [1] 0.8656126
LDA has a test error of 86.56%
lda.fit=lda(crim01~.-crim01-crim-dis-nox-rad,data=Boston,subset=train)
lda.pred=predict(lda.fit,Boston.test)
table(lda.pred$class,crim01.test)
## crim01.test
## 0 1
## 0 85 30
## 1 5 133
mean(lda.pred$class==crim01.test)
## [1] 0.8616601
LDA based on crim, dis, nox, and rad has a test error raete of 86.17%
train.X=cbind(zn,indus,chas,nox,rm,age,dis,rad,tax, ptratio,black,lstat,medv)[train,]
test.X=cbind(zn,indus,chas,nox,rm,age,dis,rad,tax, ptratio,black,lstat,medv)[test,]
train.crim01=crim01[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.crim01,k=1)
table(knn.pred,crim01.test)
## crim01.test
## knn.pred 0 1
## 0 85 111
## 1 5 52
mean(knn.pred==crim01.test)
## [1] 0.541502
knn.pred=knn(train.X,test.X,train.crim01,k=10)
table(knn.pred,crim01.test)
## crim01.test
## knn.pred 0 1
## 0 83 23
## 1 7 140
mean(knn.pred==crim01.test)
## [1] 0.8814229
knn.pred=knn(train.X,test.X,train.crim01,k=10)
table(knn.pred,crim01.test)
## crim01.test
## knn.pred 0 1
## 0 83 22
## 1 7 141
mean(knn.pred==crim01.test)
## [1] 0.8853755
knn.pred=knn(train.X,test.X,train.crim01,k=100)
table(knn.pred,crim01.test)
## crim01.test
## knn.pred 0 1
## 0 86 119
## 1 4 44
mean(knn.pred==crim01.test)
## [1] 0.513834
KNN at K = 10 scored the highest in test error at 88.54%