install.packages("ISLR2")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ISLR2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\Rtmp0USj3t\downloaded_packages
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(ggplot2)
#Problem 13 Part a)
library(knitr)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly)

cor(Weekly[,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
#13a) From the scatterplot matrix and the correlation table above, the only strong pairwise relationship is between Volume and Year (correlation of about 0.84); all other pairwise correlations are near zero. The Volume trend is also visible in the plot below.
plot(Weekly$Volume, ylab = "Shares traded")
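#Since ggplot2 is already loaded, the same trend can be drawn with a smoother
#overlaid (a sketch; the loess smoother is an illustrative choice, not part of
#the original analysis):
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess") +
  labs(y = "Shares traded")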

#Problem 13 Part b)
Weekly_fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(Weekly_fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
#Based on the logistic regression output, Lag2 is the only predictor that is statistically significant at the 5% level (p = 0.0296).
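#The p-values behind that statement can be pulled out of the summary object
#directly (a small sketch):
summary(Weekly_fits)$coefficients[, "Pr(>|z|)"]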
#Problem 13 Part c)
weekly_probs <- predict(Weekly_fits, type = "response")
weekly_pred <- rep("Down", 1089)
weekly_pred[weekly_probs >.5]= "Up"
table(weekly_pred, Weekly$Direction)
##            
## weekly_pred Down  Up
##        Down   54  48
##        Up    430 557
mean(weekly_pred == Weekly$Direction)
## [1] 0.5610652
#Problem 13 Part c)
#Based on the confusion matrix, the model predicts the weekly trend correctly about 56.11% of the time overall. It predicts "Up" for most weeks, so it catches most Up weeks (557/605) but very few Down weeks (54/484).
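#A sketch making the class-specific rates behind that overall figure explicit
#(the approximate values in the comments are computed from the table above):
conf_mat <- table(weekly_pred, Weekly$Direction)
conf_mat["Up", "Up"] / sum(conf_mat[, "Up"])       # fraction of Up weeks caught, ~92%
conf_mat["Down", "Down"] / sum(conf_mat[, "Down"]) # fraction of Down weeks caught, ~11%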
#Problem 13 Part d)
trainRows <- (Weekly$Year < 2009)
train <- Weekly[trainRows,]
test <- Weekly[!trainRows,]

glm.wkfit2 <- glm(Direction ~ Lag2, data = train, family = binomial)
wkfitprobs <- predict(glm.wkfit2, newdata = test, type = "response")
wkfit_2010.Preds <- rep('Down', nrow(test))
wkfit_2010.Preds[wkfitprobs > .5] <- 'Up'
table(wkfit_2010.Preds, test$Direction)
##                 
## wkfit_2010.Preds Down Up
##             Down    9  5
##             Up     34 56
mean(wkfit_2010.Preds == test$Direction)
## [1] 0.625
#The held-out weeks (2009-2010) are predicted correctly 62.5% of the time.
#Problem 13 Part e)
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.2
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
train <- (Weekly$Year < 2009)  
Weekly_train <- Weekly[train, ] 
Weekly_2009 <- Weekly[!train, ]  

Weeklylda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)

Weeklylda_pred <- predict(Weeklylda_fit, newdata = Weekly_2009)

Direction_2009 <- Weekly_2009$Direction

table(Weeklylda_pred$class, Direction_2009)
##       Direction_2009
##        Down Up
##   Down    9  5
##   Up     34 56
mean(Weeklylda_pred$class == Direction_2009)
## [1] 0.625
#Problem 13 Part f)
Weeklyqda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
Weeklyqda_pred <- predict(Weeklyqda_fit, Weekly_2009)$class
table(Weeklyqda_pred, Direction_2009)
##               Direction_2009
## Weeklyqda_pred Down Up
##           Down    0  0
##           Up     43 61
mean(Weeklyqda_pred == Direction_2009)
## [1] 0.5865385
#Problem 13 Part g)
library(class)

train <- (Weekly$Year < 2009)  

Week_train <- as.matrix(Weekly$Lag2[train])  
Week_test <- as.matrix(Weekly$Lag2[!train])  

train_Direction <- Weekly$Direction[train]  
Direction_2009 <- Weekly$Direction[!train] 

set.seed(1)

Weekknn_pred <- knn(Week_train, Week_test, train_Direction, k=1)

table(Weekknn_pred, Direction_2009)
##             Direction_2009
## Weekknn_pred Down Up
##         Down   21 30
##         Up     22 31
mean(Weekknn_pred == Direction_2009)
## [1] 0.5
#Problem 13 Part h)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
weeklynb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
weeklynb_fit
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485
weeklynb_class <- predict(weeklynb_fit, Weekly_2009)
table(weeklynb_class, Direction_2009)
##               Direction_2009
## weeklynb_class Down Up
##           Down    0  0
##           Up     43 61
mean(weeklynb_class == Direction_2009)
## [1] 0.5865385
#Problem 13 Part i)
#Based on the models above, logistic regression and LDA perform best on the held-out data (62.5% accuracy each); naive Bayes and QDA follow at 58.65%, and KNN with k=1 is worst at 50%.
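#A bookkeeping sketch collecting the held-out accuracies reported above for a
#side-by-side comparison:
weekly_accs <- c(logistic = 0.6250, LDA = 0.6250, QDA = 0.5865,
                 NB = 0.5865, KNN1 = 0.5000)
sort(weekly_accs, decreasing = TRUE)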
#Problem 13 Part j)
set.seed(1)
Weekknn_pred2 <- knn(Week_train, Week_test, train_Direction, k = 15)
table(Weekknn_pred2, Direction_2009)
##              Direction_2009
## Weekknn_pred2 Down Up
##          Down   20 20
##          Up     23 41
mean(Weekknn_pred2 == Direction_2009)
## [1] 0.5865385
#Problem 13 Part j)
library(MASS)

train <- (Weekly$Year < 2009)  
Weekly_train <- Weekly[train, ] 
Weekly_2009 <- Weekly[!train, ]
Weeklyqda_fit2 <- qda(Direction ~ I(Lag2^2), data = Weekly, subset = train)

Weeklyqda_pred2 <- predict(Weeklyqda_fit2, newdata = Weekly_2009)$class

Direction_2009 <- Weekly_2009$Direction  

table(Weeklyqda_pred2, Direction_2009)
##                Direction_2009
## Weeklyqda_pred2 Down Up
##            Down    4  6
##            Up     39 55
mean(Weeklyqda_pred2 == Direction_2009)
## [1] 0.5673077
#Problem 13 Part j)
#Increasing K from 1 to 15 raised the KNN test accuracy to 58.65%. Replacing Lag2 with Lag2^2 in the QDA model lowered the test accuracy to 56.73%.
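#To probe the choice of K more systematically, a sketch looping over an
#arbitrary grid of K values (reuses the train/test matrices from part g):
set.seed(1)
for (k in c(1, 5, 15, 30, 50)) {
  pred <- knn(Week_train, Week_test, train_Direction, k = k)
  cat("k =", k, "accuracy =", round(mean(pred == Direction_2009), 4), "\n")
}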
#Problem 14 Part a)
library(ISLR2)
attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
Auto_mpg01 <- data.frame(Auto, mpg01)
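#Quick sanity check (a sketch): the median split should leave the two classes
#roughly balanced.
table(Auto_mpg01$mpg01)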
#Problem 14 Part b)
pairs(Auto_mpg01[,-9])

par(mfrow=c(2,2))
boxplot(cylinders ~ mpg01, data = Auto_mpg01, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto_mpg01, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto_mpg01, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto_mpg01, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto_mpg01, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto_mpg01, main = "Year vs mpg01")
cor(Auto_mpg01[, -9]) 
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

#Problem 14 Part b)
#The scatterplot matrix, boxplots, and correlation table show that cylinders, displacement, horsepower, and weight are the variables most useful for predicting mpg01; each has an absolute correlation with mpg01 above 0.65.
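#To rank the candidates numerically rather than by eye, a sketch pulling the
#mpg01 column out of the correlation matrix (column 9, the name factor, is
#dropped as above):
mpg01_cors <- cor(Auto_mpg01[, -9])[, "mpg01"]
sort(abs(mpg01_cors), decreasing = TRUE)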
#Problem 14 Part c)
set.seed(2)
train_auto <- sample(1:nrow(Auto_mpg01), 0.8*nrow(Auto_mpg01))
test_auto <- Auto_mpg01[-train_auto, ]  # hold out the rows not sampled for training
#Problem 14 Part d)
library(MASS)
lda_autofit <- lda(mpg01 ~ cylinders + displacement + weight, data = Auto_mpg01[train_auto, ])
lda_autopred <- predict(lda_autofit, test_auto)$class
table(lda_autopred, test_auto$mpg01)
##             
## lda_autopred   0   1
##            0 168  13
##            1  28 183
mean(lda_autopred == test_auto$mpg01)
## [1] 0.8954082
#Problem 14 Part d)
#The model obtained a test accuracy of 89.54%, i.e. a test error of 10.46%.
#Problem 14 Part e)
qda_autofit <- qda(mpg01 ~ cylinders + displacement + weight, data = Auto_mpg01[train_auto, ])
qda_autopred <- predict(qda_autofit, test_auto)$class
table(qda_autopred, test_auto$mpg01)
##             
## qda_autopred   0   1
##            0 175  17
##            1  21 179
mean(qda_autopred == test_auto$mpg01)
## [1] 0.9030612
#Problem 14 Part e)
#The model obtained a test accuracy of 90.31%, i.e. a test error of 9.69%.
#Problem 14 Part f)
autoglm_fit <- glm(mpg01 ~ cylinders + displacement + weight, family = binomial, data = Auto_mpg01[train_auto, ])
glm_autoprobs <- predict(autoglm_fit, test_auto, type = "response")
glm_autopred <- rep(0, nrow(test_auto))
glm_autopred[glm_autoprobs > 0.5] <- 1
table(glm_autopred, test_auto$mpg01)
##             
## glm_autopred   0   1
##            0 170  15
##            1  26 181
mean(glm_autopred == test_auto$mpg01)
## [1] 0.8954082
#Problem 14 Part f)
#The model obtained a test accuracy of 89.54%, i.e. a test error of 10.46%.
#Problem 14 Part g)
Auto_mpg01$cylinders <- as.numeric(as.character(Auto_mpg01$cylinders))
Auto_mpg01$displacement <- as.numeric(as.character(Auto_mpg01$displacement))
Auto_mpg01$weight <- as.numeric(as.character(Auto_mpg01$weight))
library(e1071)
nb_autofit <- naiveBayes(mpg01 ~ cylinders + displacement + weight, data = Auto_mpg01[train_auto, ])
nb_autoclass <- predict(nb_autofit, test_auto)
## Warning in predict.naiveBayes(nb_autofit, test_auto): Type mismatch between
## training and new data for variable 'cylinders'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(nb_autofit, test_auto): Type mismatch between
## training and new data for variable 'displacement'. Did you use factors with
## numeric labels for training, and numeric values for new data?
table(nb_autoclass, test_auto$mpg01)
##             
## nb_autoclass   0   1
##            0 165  16
##            1  31 180
mean(nb_autoclass == test_auto$mpg01)
## [1] 0.880102
#Problem 14 Part g)
#The naive Bayes model obtained a test accuracy of 88.01%, i.e. a test error of 11.99%.
#Problem 14 Part h)
library(class)
set.seed(3)
idx <- sample(1:nrow(Auto), 0.8*nrow(Auto))
train_auto <- Auto[idx,]
test_auto <- Auto[-idx,]
y_train <- train_auto$mpg
x_train <- train_auto[, -1]  # drop mpg (the response)
x_test <- test_auto[, -1]
x_train <- x_train[, -8]     # drop the name column
x_test <- x_test[, -8]
y_test <- ifelse(test_auto$mpg >= 23, 1, 0)  # 23 approximates median(Auto$mpg)
high_low <- ifelse(y_train >= 23, 1, 0)
knn_pred <- knn(train = x_train, test = x_test, cl = high_low, k = 1)
table(knn_pred, y_test)
##         y_test
## knn_pred  0  1
##        0 33  3
##        1  9 34
#Problem 14 Part h)
mean(knn_pred == y_test)
## [1] 0.8481013
set.seed(3)
knn_pred <- knn(train = x_train, test = x_test, cl = high_low, k = 29)
table(knn_pred, y_test)
##         y_test
## knn_pred  0  1
##        0 34  3
##        1  8 34
mean(knn_pred == y_test)
## [1] 0.8607595
#Problem 14 Part h)
#With k=1 the test accuracy was 84.81%; increasing K to 29 improved it to 86.08%.
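#KNN is sensitive to predictor scale (weight is in the thousands while
#cylinders is single digits), so standardizing the predictors is often worth
#trying; a sketch reusing the split above (whether it helps here would need to
#be checked by running it):
x_train_sc <- scale(x_train)
x_test_sc <- scale(x_test, center = attr(x_train_sc, "scaled:center"),
                   scale = attr(x_train_sc, "scaled:scale"))
set.seed(3)
knn_pred_sc <- knn(train = x_train_sc, test = x_test_sc, cl = high_low, k = 29)
mean(knn_pred_sc == y_test)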
#Problem 16
library(ISLR2)
attach(Boston)
Crime <- rep(0, length(crim))
Crime[crim > median(crim)] <- 1
Boston01 <- data.frame(Boston, Crime)
#Problem 16
train <- 1:(dim(Boston01)[1]/2)
test <- (dim(Boston01)[1]/2 + 1):dim(Boston01)[1]
Boston_train <- Boston01[train, ]
Boston_test <- Boston01[test, ]
Crime_test <- Crime[test]
cor(Boston01)
##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## Crime    0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
## Crime   -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128  0.2535684
##               black      lstat       medv       Crime
## crim    -0.38506394  0.4556215 -0.3883046  0.40939545
## zn       0.17552032 -0.4129946  0.3604453 -0.43615103
## indus   -0.35697654  0.6037997 -0.4837252  0.60326017
## chas     0.04878848 -0.0539293  0.1752602  0.07009677
## nox     -0.38005064  0.5908789 -0.4273208  0.72323480
## rm       0.12806864 -0.6138083  0.6953599 -0.15637178
## age     -0.27353398  0.6023385 -0.3769546  0.61393992
## dis      0.29151167 -0.4969958  0.2499287 -0.61634164
## rad     -0.44441282  0.4886763 -0.3816262  0.61978625
## tax     -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio -0.17738330  0.3740443 -0.5077867  0.25356836
## black    1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat   -0.36608690  1.0000000 -0.7376627  0.45326273
## medv     0.33346082 -0.7376627  1.0000000 -0.26301673
## Crime   -0.35121093  0.4532627 -0.2630167  1.00000000
#Problem 16
set.seed(1)
glm_boston <- glm(Crime ~ nox + rad + tax + indus + chas, data = Boston_train, family = binomial)
Boston_probs <- predict(glm_boston, Boston_test, type = "response")
Boston_pred <- rep(0, length(Boston_probs))
Boston_pred[Boston_probs > 0.5] <- 1
table(Boston_pred, Crime_test)
##            Crime_test
## Boston_pred   0   1
##           0  75   6
##           1  15 157
mean(Boston_pred == Crime_test)
## [1] 0.916996
#Problem 16
#The logistic regression model gives a test accuracy of 91.70% using the predictors nox, rad, tax, indus, and chas.
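#The 0.5 cutoff above is a modeling choice rather than a requirement; a sketch
#(with an arbitrary threshold grid) of how the held-out accuracy moves with the cutoff:
sapply(c(0.3, 0.4, 0.5, 0.6, 0.7),
       function(t) mean(as.numeric(Boston_probs > t) == Crime_test))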
#Problem 16 LDA
library(MASS)
lda_boston <- lda(Crime ~ nox + rad + tax + indus + chas, data = Boston_train)
Bostonlda_pred <- predict(lda_boston, Boston_test)
table(Bostonlda_pred$class, Crime_test)
##    Crime_test
##       0   1
##   0  80  18
##   1  10 145
mean(Bostonlda_pred$class == Crime_test)
## [1] 0.8893281
#Problem 16 LDA
#The LDA model using the predictors nox, rad, tax, indus, and chas gives a test accuracy of 88.93%.
#Problem 16 QDA
qda_boston <- qda(Crime ~ nox * rad + tax + indus * chas, data = Boston_train)
Bostonqda_pred <- predict(qda_boston, Boston_test)
table(Bostonqda_pred$class, Crime_test)
##    Crime_test
##       0   1
##   0  83 130
##   1   7  33
mean(Bostonqda_pred$class == Crime_test)
## [1] 0.458498
#Problem 16 QDA
#The QDA model using nox, rad, tax, indus, and chas (with nox:rad and indus:chas interaction terms) gives a test accuracy of only 45.85%.
#Problem 16 KNN
library(class)
set.seed(1)
train_knn <- cbind(indus, nox, age, dis, rad, tax)[train, ]
test_knn <- cbind(indus, nox, age, dis, rad, tax)[test, ]
Crime_train <- Crime[train]  # class labels for the training rows
Bostonknn_pred <- knn(train_knn, test_knn, Crime_train, k = 10)
table(Bostonknn_pred, Crime_test)
##               Crime_test
## Bostonknn_pred   0   1
##              0  44   8
##              1  46 155
mean(Bostonknn_pred == Crime_test)
## [1] 0.7865613
#Problem 16 KNN
#The KNN model using indus, nox, age, dis, rad, and tax with k=10 gives a test accuracy of 78.66%.
#Overall, logistic regression and LDA lead the test-accuracy results, followed by KNN; the QDA model had the lowest test accuracy.