install.packages("ISLR2")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ISLR2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\Rtmp0USj3t\downloaded_packages
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(ggplot2)
#Problem 13 part a)
library(knitr)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)

cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
#13a) From the pairs plot and correlation matrix above, the only strong relationship is between Volume and Year (correlation of about 0.84); the lag variables are essentially uncorrelated with each other and with Today. The Volume trend is also shown in the plot below.
plot(Weekly$Volume, ylab = "Shares traded")

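#Since ggplot2 is loaded above, the Volume trend can also be drawn against Year directly. A minimal sketch re-using the Weekly data from ISLR2 (output not shown):
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.4) +                     # one point per week
  geom_smooth(method = "loess", se = FALSE) +   # smoothed trend line
  labs(y = "Shares traded", title = "Weekly trading volume over time")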
#Problem 13 part b)
Weekly_fits<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly, family=binomial)
summary(Weekly_fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
#Based on the logistic regression, Lag2 is the only statistically significant predictor (p = 0.0296); none of the other lags or Volume are significant at the 5% level.
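#To make the significance claim explicit, the coefficient p-values can be pulled directly from the fitted model; a minimal sketch re-using Weekly_fits (output not shown):
coef(summary(Weekly_fits))[, "Pr(>|z|)"]   # Lag2 is the only predictor with p < 0.05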
#Problem 13 Part c)
weekly_probs <- predict(Weekly_fits, type = "response")
weekly_pred <- rep("Down", nrow(Weekly))
weekly_pred[weekly_probs > .5] <- "Up"
table(weekly_pred, Weekly$Direction)
##
## weekly_pred Down Up
## Down 54 48
## Up 430 557
mean(weekly_pred == Weekly$Direction)
## [1] 0.5610652
#Problem 13 Part c)
#Based on the confusion matrix, the model predicts the weekly direction correctly about 56.11% of the time, but these are predictions on the same data used to fit the model, so this is training accuracy rather than test accuracy. The errors are also lopsided: the model predicts Up for most weeks, so it is right 557/605 (about 92%) of the time when the market actually rises, but only 54/484 (about 11%) of the time when it falls.
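#A minimal sketch computing these class-specific rates from the training confusion matrix (output not shown):
conf_train <- table(weekly_pred, Weekly$Direction)
conf_train["Up", "Up"] / sum(conf_train[, "Up"])       # 557/605: correct rate in weeks the market rose
conf_train["Down", "Down"] / sum(conf_train[, "Down"]) # 54/484: correct rate in weeks the market fell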
#Problem 13 Part d)
trainRows <- (Weekly$Year < 2009)
train <- Weekly[trainRows,]
test <- Weekly[!trainRows,]
glm.wkfit2 <- glm(Direction~Lag2,data=train,family=binomial)
wkfitprobs <- predict(glm.wkfit2, newdata = test, type = "response")  # predicted probabilities rather than log-odds
wkfit_2010.Preds <- rep('Down',length=nrow(test))
wkfit_2010.Preds[wkfitprobs > .5] = 'Up'
table(wkfit_2010.Preds,test$Direction)
##
## wkfit_2010.Preds Down Up
## Down 9 5
## Up 34 56
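#Part (d) also asks for the overall fraction of correct predictions on the held-out 2009-2010 data; a one-line check (output not shown):
mean(wkfit_2010.Preds == test$Direction)   # test-set accuracy of the Lag2-only logistic model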
#Problem 13 Part e)
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.2
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
train <- (Weekly$Year < 2009)
Weekly_train <- Weekly[train, ]
Weekly_2009 <- Weekly[!train, ]
Weeklylda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
Weeklylda_pred <- predict(Weeklylda_fit, newdata = Weekly_2009)
Direction_2009 <- Weekly_2009$Direction
table(Weeklylda_pred$class, Direction_2009)
## Direction_2009
## Down Up
## Down 9 5
## Up 34 56
mean(Weeklylda_pred$class == Direction_2009)
## [1] 0.625
#Problem 13 Part f)
Weeklyqda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
Weeklyqda_pred <- predict(Weeklyqda_fit, Weekly_2009)$class
table(Weeklyqda_pred, Direction_2009)
## Direction_2009
## Weeklyqda_pred Down Up
## Down 0 0
## Up 43 61
mean(Weeklyqda_pred == Direction_2009)
## [1] 0.5865385
#Problem 13 Part g)
library(class)
train <- (Weekly$Year < 2009)
Week_train <- as.matrix(Weekly$Lag2[train])
Week_test <- as.matrix(Weekly$Lag2[!train])
train_Direction <- Weekly$Direction[train]
Direction_2009 <- Weekly$Direction[!train]
set.seed(1)
Weekknn_pred <- knn(Week_train, Week_test, train_Direction, k=1)
table(Weekknn_pred, Direction_2009)
## Direction_2009
## Weekknn_pred Down Up
## Down 21 30
## Up 22 31
mean(Weekknn_pred == Direction_2009)
## [1] 0.5
#Problem 13 Part h)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
weeklynb_fit <- naiveBayes(Direction~Lag2 ,data=Weekly ,subset=train)
weeklynb_fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
weeklynb_class <- predict(weeklynb_fit ,Weekly_2009)
table(weeklynb_class ,Direction_2009)
## Direction_2009
## weeklynb_class Down Up
## Down 0 0
## Up 43 61
mean (weeklynb_class == Direction_2009)
## [1] 0.5865385
#Problem 13 Part i)
#Based on the results of the models above, logistic regression on Lag2 and LDA perform best, each classifying about 62.5% of the 2009-2010 test weeks correctly. QDA and naive Bayes reach 58.65% only by predicting Up for every week, and KNN with k = 1 does no better than chance (50%).
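#For reference, the test accuracies of the five classifiers can be collected in one place; a minimal sketch re-using the prediction objects created above (output not shown):
c(logistic    = mean(wkfit_2010.Preds == Direction_2009),
  lda         = mean(Weeklylda_pred$class == Direction_2009),
  qda         = mean(Weeklyqda_pred == Direction_2009),
  knn_k1      = mean(Weekknn_pred == Direction_2009),
  naive_bayes = mean(weeklynb_class == Direction_2009))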
#Problem 13 Part j)
set.seed(1)
Weekknn_pred2 <- knn(Week_train,Week_test,train_Direction,k=15)
table(Weekknn_pred2,Direction_2009)
## Direction_2009
## Weekknn_pred2 Down Up
## Down 20 20
## Up 23 41
mean(Weekknn_pred2 == Direction_2009)
## [1] 0.5865385
#Problem 13 Part j)
library(MASS)
train <- (Weekly$Year < 2009)
Weekly_train <- Weekly[train, ]
Weekly_2009 <- Weekly[!train, ]
Weeklyqda_fit2 <- qda(Direction ~ I(Lag2^2), data = Weekly, subset = train)
Weeklyqda_pred2 <- predict(Weeklyqda_fit2, newdata = Weekly_2009)$class
Direction_2009 <- Weekly_2009$Direction
table(Weeklyqda_pred2, Direction_2009)
## Direction_2009
## Weeklyqda_pred2 Down Up
## Down 4 6
## Up 39 55
mean(Weeklyqda_pred2 == Direction_2009)
## [1] 0.5673077
#Problem 13 Part j)
#Increasing k from 1 to 15 raised the KNN test accuracy from 50% to 58.65%. Replacing Lag2 with Lag2^2 in the QDA model lowered test accuracy from 58.65% to 56.73%. Neither variation beats logistic regression or LDA on Lag2 alone.
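#To see how sensitive KNN is to the choice of k, several values can be tried in a loop; a minimal sketch (the grid of k values is arbitrary; output not shown):
set.seed(1)
k_grid <- c(1, 5, 10, 15, 25, 50)
knn_acc <- sapply(k_grid, function(k)
  mean(knn(Week_train, Week_test, train_Direction, k = k) == Direction_2009))
setNames(knn_acc, k_grid)   # test accuracy for each k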
#Problem 14 Part a)
library(ISLR2)
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
Auto_mpg01 <- data.frame(Auto, mpg01)
#Problem 14 Part b)
pairs(Auto_mpg01[,-9])

par(mfrow=c(2,2))
boxplot(cylinders ~ mpg01, data = Auto_mpg01, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto_mpg01, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto_mpg01, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto_mpg01, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto_mpg01, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto_mpg01, main = "Year vs mpg01")
cor(Auto_mpg01[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000

#Problem 14 Part b)
#The pairs plot and the correlation table above show that cylinders, displacement, horsepower and weight have the strongest (negative) associations with mpg01, so these are the most useful variables for predicting mpg01.
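#One way to rank the candidate predictors is to sort their absolute correlations with mpg01; a minimal sketch using the numeric columns of Auto_mpg01 (output not shown):
num_cols <- sapply(Auto_mpg01, is.numeric)   # drop the non-numeric name column
sort(abs(cor(Auto_mpg01[, num_cols])[, "mpg01"]), decreasing = TRUE)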
#Problem 14 Part c)
set.seed(2)
train_idx <- sample(1:nrow(Auto_mpg01), 0.8 * nrow(Auto_mpg01))  # indices of the training rows (80%)
train_auto <- Auto_mpg01[train_idx, ]   # training set
test_auto <- Auto_mpg01[-train_idx, ]   # held-out test set
#Problem 14 Part d)
library(MASS)
lda_autofit <- lda(mpg01 ~ cylinders + displacement + weight, data = train_auto)
lda_autopred <-predict(lda_autofit, test_auto)$class
table(lda_autopred, test_auto$mpg01)
##
## lda_autopred 0 1
## 0 168 13
## 1 28 183
mean(lda_autopred == test_auto$mpg01)
## [1] 0.8954082
#Problem 14 Part d)
#The LDA model classified 89.54% of the test observations correctly, i.e. a test error of about 10.46%.
#Problem 14 Part e)
qda_autofit <- qda(mpg01 ~ cylinders + displacement + weight, data = train_auto)
qda_autopred <- predict(qda_autofit, test_auto)$class
table(qda_autopred, test_auto$mpg01)
##
## qda_autopred 0 1
## 0 175 17
## 1 21 179
mean(qda_autopred == test_auto$mpg01)
## [1] 0.9030612
#Problem 14 Part e)
#The QDA model classified 90.31% of the test observations correctly, i.e. a test error of about 9.69%.
#Problem 14 Part f)
autoglm_fit <- glm(mpg01 ~ cylinders + displacement + weight, family = binomial, data = train_auto)
glm_autoprobs <- predict(autoglm_fit,test_auto,type="response")
glm_autopred <- rep(0,nrow(test_auto))
glm_autopred[glm_autoprobs > 0.50]=1
table(glm_autopred, test_auto$mpg01)
##
## glm_autopred 0 1
## 0 170 15
## 1 26 181
mean(glm_autopred == test_auto$mpg01)
## [1] 0.8954082
#Problem 14 Part f)
#The logistic regression model classified 89.54% of the test observations correctly, i.e. a test error of about 10.46%.
#Problem 14 Part g)
Auto_mpg01$cylinders <- as.numeric(as.character(Auto_mpg01$cylinders))
Auto_mpg01$displacement <- as.numeric(as.character(Auto_mpg01$displacement))
Auto_mpg01$weight <- as.numeric(as.character(Auto_mpg01$weight))
library(e1071)
nb_autofit <- naiveBayes(mpg01 ~ cylinders + displacement + weight, data = train_auto)
nb_autoclass <- predict(nb_autofit, test_auto)
## Warning in predict.naiveBayes(nb_autofit, test_auto): Type mismatch between
## training and new data for variable 'cylinders'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(nb_autofit, test_auto): Type mismatch between
## training and new data for variable 'displacement'. Did you use factors with
## numeric labels for training, and numeric values for new data?
table(nb_autoclass, test_auto$mpg01)
##
## nb_autoclass 0 1
## 0 165 16
## 1 31 180
mean(nb_autoclass == test_auto$mpg01)
## [1] 0.880102
#Problem 14 Part g)
#The naive Bayes model classified 88.01% of the test observations correctly, i.e. a test error of about 11.99%.
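#For comparison, the four mpg01 classifiers can be lined up on the same test set; a minimal sketch re-using the predictions above (output not shown):
c(lda         = mean(lda_autopred == test_auto$mpg01),
  qda         = mean(qda_autopred == test_auto$mpg01),
  logistic    = mean(glm_autopred == test_auto$mpg01),
  naive_bayes = mean(nb_autoclass == test_auto$mpg01))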
#Problem 14 Part h)
library(class)
set.seed(3)
idx <- sample(1:nrow(Auto), 0.8*nrow(Auto))
train_auto <- Auto[idx,]
test_auto <- Auto[-idx,]
y_train <- train_auto$mpg
x_train <- train_auto[,-1]
x_test <- test_auto[,-1]
x_train <- x_train[,-8]
x_test <- x_test[,-8]
y_test <- ifelse(test_auto$mpg >= 23, 1, 0)   # 23 approximates the median mpg (22.75) used to define mpg01
high_low <- ifelse(y_train >= 23, 1, 0)       # class labels for the training rows
knn_pred <- knn (train=x_train, test=x_test, cl=high_low, k=1)
table(knn_pred, y_test)
## y_test
## knn_pred 0 1
## 0 33 3
## 1 9 34
#Problem 14 Part h)
mean(knn_pred == y_test)
## [1] 0.8481013
set.seed(3)
knn_pred <- knn (train=x_train, test=x_test, cl=high_low, k=29)
table(knn_pred, y_test)
## y_test
## knn_pred 0 1
## 0 34 3
## 1 8 34
mean(knn_pred == y_test)
## [1] 0.8607595
#Problem 14 Part h)
#With k = 1, the KNN model classified 84.81% of the test observations correctly; increasing to k = 29 improved this to 86.08%, i.e. a test error of about 13.92%.
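#Rather than comparing only k = 1 and k = 29, a small loop can trace test accuracy across a range of k; a minimal sketch (the grid is arbitrary; output not shown):
set.seed(3)
k_vals <- seq(1, 49, by = 4)
acc_by_k <- sapply(k_vals, function(k)
  mean(knn(train = x_train, test = x_test, cl = high_low, k = k) == y_test))
setNames(acc_by_k, k_vals)   # test accuracy for each k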
#Problem 16
library(ISLR2)
attach(Boston)
Crime <- rep(0, length(crim))
Crime[crim > median(crim)] <- 1
Boston01 <- data.frame(Boston, Crime)
#Problem 16
train <- 1:(dim(Boston01)[1]/2)
test <- (dim(Boston01)[1]/2 + 1):dim(Boston01)[1]
Boston_train <- Boston01[train, ]
Boston_test <- Boston01[test, ]
Crime_test <- Crime[test]
cor(Boston01)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## Crime 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## Crime -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv Crime
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## Crime -0.35121093 0.4532627 -0.2630167 1.00000000
#Problem 16
set.seed(1)
glm_boston <-glm(Crime~ nox+rad+tax+indus+chas, data = Boston_train, family = 'binomial')
Boston_probs <- predict(glm_boston, Boston_test, type = 'response')
Boston_pred <- rep(0, length(Boston_probs))
Boston_pred[Boston_probs > 0.5] = 1
table(Boston_pred, Crime_test)
## Crime_test
## Boston_pred 0 1
## 0 75 6
## 1 15 157
mean(Boston_pred == Crime_test)
## [1] 0.916996
#Problem 16
#The logistic regression model gives a test accuracy of 91.70% using the predictors nox, rad, tax, indus and chas.
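#Beyond overall accuracy, the confusion matrix above shows the logistic model misses 6 of the high-crime suburbs and flags 15 low-crime ones; a minimal sketch computing these class-specific error rates (output not shown):
conf_boston <- table(Boston_pred, Crime_test)
conf_boston["1", "0"] / sum(conf_boston[, "0"])   # 15/90: false positives among low-crime suburbs
conf_boston["0", "1"] / sum(conf_boston[, "1"])   # 6/163: false negatives among high-crime suburbs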
#Problem 16 LDA
library(MASS)
lda_boston <-lda(Crime~ nox+rad+tax+indus+chas, data= Boston_train)
Bostonlda_pred <- predict(lda_boston, Boston_test)
table(Bostonlda_pred$class, Crime_test)
## Crime_test
## 0 1
## 0 80 18
## 1 10 145
mean(Bostonlda_pred$class == Crime_test)
## [1] 0.8893281
#Problem 16 LDA
#The LDA model using the predictors nox, rad, tax, indus and chas gives a test accuracy of 88.93%.
#Problem 16 QDA
qda_boston <-qda(Crime~ nox*rad+tax+indus*chas, data= Boston_train)
Bostonqda_pred <- predict(qda_boston, Boston_test)
table(Bostonqda_pred$class, Crime_test)
## Crime_test
## 0 1
## 0 83 130
## 1 7 33
mean(Bostonqda_pred$class == Crime_test)
## [1] 0.458498
#Problem 16 QDA
#The QDA model using nox, rad, tax, indus and chas (with nox:rad and indus:chas interaction terms) gives a test accuracy of only 45.85%, far below the other models on this split.
#Problem 16 KNN
library(class)
set.seed(1)
train_knn <- cbind(indus,nox,age,dis,rad,tax)[train,]
test_knn <- cbind(indus,nox,age,dis,rad,tax)[test,]
Crime_train <- Crime[train]   # class labels for the training rows (not the test rows)
Bostonknn_pred <- knn(train_knn, test_knn, Crime_train, k = 10)
table(Bostonknn_pred, Crime_test)
## Crime_test
## Bostonknn_pred 0 1
## 0 44 8
## 1 46 155
mean(Bostonknn_pred == Crime_test)
## [1] 0.7865613
#Problem 16 KNN
#The KNN model using the predictors indus, nox, age, dis, rad and tax with k = 10 gives a test accuracy of 78.66%.
#Overall, logistic regression leads the accuracy results, followed by LDA and then KNN; the QDA model with interaction terms has by far the lowest test accuracy.
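#A compact way to line up the four Boston classifiers is to collect their test accuracies; a minimal sketch re-using the prediction objects above (output not shown):
c(logistic = mean(Boston_pred == Crime_test),
  lda      = mean(Bostonlda_pred$class == Crime_test),
  qda      = mean(Bostonqda_pred$class == Crime_test),
  knn_k10  = mean(Bostonknn_pred == Crime_test))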