weekly = ISLR2::Weekly
Loading the data from the ISLR2 package, then producing some numerical and graphical summaries of it.
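The rest of the code assumes a setup chunk along these lines (a plausible reconstruction, not shown in the original). Note that attaching MASS masks dplyr's select(), which is presumably why select() is always called with an explicit dplyr:: prefix below.
library(dplyr)   # provides the %>% pipe
library(MASS)    # lda() and qda()
library(class)   # knn()
library(e1071)   # naiveBayes()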
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
str(weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
weekly_corr = weekly %>%
  dplyr::select(-Direction)  # drop the factor Direction so cor() works below
plot(weekly_corr)
corrplot::corrplot(cor(weekly_corr))
Looking at the numbers and graphs, the lag variables all have very similar ranges (roughly -18 to 12), and every variable is numeric except our target, Direction. Direction is also reasonably balanced, at 484 Down weeks to 605 Up. In the correlation plot, the one strong relationship is the positive correlation between Volume and Year.
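As a quick numeric check of that visual impression (the value was not part of the original output, so it is omitted here):
cor(weekly$Year, weekly$Volume)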
Time to fit a logistic regression to the data.
weekly_glm = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                 data = weekly, family = binomial)
summary(weekly_glm)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
At an alpha level of 0.05, Lag2 is the only significant predictor, with a p-value of 0.0296.
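We can pull the same information out of the coefficient table programmatically (a small sketch that just reproduces what we read off the summary above):
coefs = coef(summary(weekly_glm))
coefs[coefs[, 'Pr(>|z|)'] < 0.05, , drop = FALSE]  # keeps (Intercept) and Lag2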
Time to make the confusion matrix to assess accuracy.
pred1 = predict(weekly_glm, type = 'response') > 0.5  # TRUE = predicted Up
tab1 <- table(Predicted = pred1, Actual = weekly$Direction)
tab1
## Actual
## Predicted Down Up
## FALSE 54 48
## TRUE 430 557
sum(diag(tab1))/sum(tab1)
## [1] 0.5610652
We get an accuracy of 0.561 for this logistic regression model, barely above the 605/1089 ≈ 0.556 we would get by always predicting Up. The confusion matrix shows why: the model labels almost every week as Up, so 430 of the 484 Down weeks are misclassified as uptrends.
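We can quantify this from tab1 directly (its rows are the logical predictions, FALSE = Down, TRUE = Up):
tab1['FALSE', 'Down'] / sum(tab1[, 'Down'])  # 54/484, only ~11% of Down weeks caught
tab1['TRUE', 'Up'] / sum(tab1[, 'Up'])       # 557/605, ~92% of Up weeks caught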
Now we refit using only Lag2, with a train/test split based on year: train on 1990-2008, test on 2009-2010.
weekly_train = weekly$Year < 2009  # logical index: TRUE for the training years
weekly_glm2 = glm(Direction ~ Lag2, data = weekly[weekly_train, ], family = binomial)
summary(weekly_glm2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = weekly[weekly_train,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
pred2 = predict(weekly_glm2, weekly[!weekly_train, ], type = 'response') > 0.5
tab2 <- table(Predicted = pred2, Actual = weekly[!weekly_train,]$Direction)
tab2
## Actual
## Predicted Down Up
## FALSE 9 5
## TRUE 34 56
sum(diag(tab2))/sum(tab2)
## [1] 0.625
On the two held-out test years, this model predicts the weekly trend with an accuracy of 0.625, roughly an 11% relative improvement over the previous model with multiple predictors and no test split.
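For context, the trivial always-Up classifier would score the following on the same test weeks (61 Up out of 104, per the column totals of tab2):
mean(weekly[!weekly_train, ]$Direction == 'Up')  # 61/104, about 0.587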
weekly_lda = lda(Direction ~ Lag2, data = weekly[weekly_train, ])
pred3 = predict(weekly_lda, weekly[!weekly_train, ])$class
tab3 <- table(Predicted = pred3, Actual = weekly[!weekly_train,]$Direction)
tab3
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
sum(diag(tab3))/sum(tab3)
## [1] 0.625
LDA also has an accuracy of 0.625, with exactly the same confusion matrix as the logistic regression.
weekly_qda = qda(Direction ~ Lag2, data = weekly[weekly_train, ])
pred4 = predict(weekly_qda, weekly[!weekly_train, ])$class
tab4 <- table(Predicted = pred4, Actual = weekly[!weekly_train,]$Direction)
tab4
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
sum(diag(tab4))/sum(tab4)
## [1] 0.5865385
QDA accuracy: 0.587. Note from the confusion matrix that QDA predicts Up for every single test week.
set.seed(1)
weekly_knn = knn(weekly[weekly_train, 'Lag2', drop = F],
                 weekly[!weekly_train, 'Lag2', drop = F],
                 weekly$Direction[weekly_train])  # k defaults to 1
tab5 <- table(weekly_knn, weekly[!weekly_train,]$Direction)
tab5
##
## weekly_knn Down Up
## Down 21 30
## Up 22 31
sum(diag(tab5))/sum(tab5)
## [1] 0.5
KNN with k = 1 has an accuracy of 0.5, no better than a coin flip.
weekly_bayes = naiveBayes(Direction ~ Lag2, data = weekly, subset = weekly_train)
pred6 = predict(weekly_bayes, weekly[!weekly_train,], type = 'class')
tab6 <- table(pred6, weekly[!weekly_train,]$Direction)
tab6
##
## pred6 Down Up
## Down 0 0
## Up 43 61
sum(diag(tab6))/sum(tab6)
## [1] 0.5865385
Naive Bayes also predicts Up for every test week, matching QDA's accuracy of 0.587.
The two models with the best test accuracy are LDA and logistic regression, each at 0.625 (once we used only Lag2 as a predictor). Next, some experiments with other predictors and values of k, starting from the helper sketched below.
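Since these experiments just swap predictors through the same fit/predict/score pipeline, a small hypothetical helper (test_acc is my own name, not part of the original analysis) makes each LDA trial a one-liner:
test_acc = function(formula) {
  fit = lda(formula, data = weekly[weekly_train, ])
  pred = predict(fit, weekly[!weekly_train, ])$class
  mean(pred == weekly[!weekly_train, ]$Direction)  # test-set accuracy
}
test_acc(Direction ~ Lag2)  # should reproduce the 0.625 above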
weekly_lda = lda(Direction ~ Lag2 + Volume, data = weekly[weekly_train, ])
pred = predict(weekly_lda, weekly[!weekly_train, ])$class
tab <- table(Predicted = pred, Actual = weekly[!weekly_train,]$Direction)
tab
## Actual
## Predicted Down Up
## Down 20 25
## Up 23 36
sum(diag(tab))/sum(tab)
## [1] 0.5384615
Merely adding Volume dropped the accuracy from 0.625 to 0.538, a loss of almost 0.1!
set.seed(1)
weekly_knn = knn(weekly[weekly_train, 'Lag2', drop = F],
                 weekly[!weekly_train, 'Lag2', drop = F],
                 weekly$Direction[weekly_train], k = 10)
tab <- table(weekly_knn, weekly[!weekly_train,]$Direction)
tab
##
## weekly_knn Down Up
## Down 17 21
## Up 26 40
sum(diag(tab))/sum(tab)
## [1] 0.5480769
Increasing k to 10 raised the accuracy from 0.500 to 0.548, a gain of about 0.048.
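Rather than trying k values one at a time, we could sweep a grid (a sketch; the resulting accuracies depend on the seed and were not part of the original output):
set.seed(1)
k_acc = sapply(1:15, function(k) {
  pk = knn(weekly[weekly_train, 'Lag2', drop = F],
           weekly[!weekly_train, 'Lag2', drop = F],
           weekly$Direction[weekly_train], k = k)
  mean(pk == weekly[!weekly_train, ]$Direction)
})
which.max(k_acc)  # best k on this particular split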
auto = ISLR2::Auto
mpg01 = rep(0, length(auto$mpg))
mpg01[auto$mpg > median(auto$mpg)] = 1  # 1 = above-median mpg
auto = data.frame(auto, mpg01)
summary(auto$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
View(auto)  # open the data in the interactive viewer
auto_no_name = auto %>%
  dplyr::select(-name)  # drop the character column name so cor() works
corrplot::corrplot(cor(auto_no_name))
plot(auto_no_name)
From the plots, mpg01 is positively correlated with mpg (by construction) and slightly with origin, and negatively correlated with cylinders, displacement, horsepower, and weight.
set.seed(1)
index = sample(1:nrow(auto), 0.8*nrow(auto))  # 80/20 train/test split
auto_train = auto[index,]
auto_test = auto[-index,]
I will use cylinders, displacement, horsepower, and weight, since they appear to have the strongest associations with mpg01.
auto_lda = lda(mpg01 ~ cylinders + displacement + weight + horsepower,
               data = auto_train)
pred = predict(auto_lda, auto_test)
tab = table(pred$class, auto_test$mpg01)
tab
##
## 0 1
## 0 35 0
## 1 7 37
sum(diag(tab))/sum(tab)
## [1] 0.9113924
mean(pred$class != auto_test$mpg01)
## [1] 0.08860759
This LDA model, using an 80/20 train/test split, achieves an accuracy of 91.1%, i.e. an error rate of 8.9%.
auto_qda = qda(mpg01 ~ cylinders + displacement + weight + horsepower,
               data = auto_train)
pred = predict(auto_qda, auto_test)
tab = table(pred$class, auto_test$mpg01)
tab
##
## 0 1
## 0 37 2
## 1 5 35
sum(diag(tab))/sum(tab)
## [1] 0.9113924
mean(pred$class != auto_test$mpg01)
## [1] 0.08860759
QDA gives the same accuracy and error rate as LDA (91.1% and 8.9%), though the confusion matrices differ slightly.
auto_glm = glm(mpg01 ~ cylinders + displacement + weight + horsepower,
               data = auto_train, family = binomial())
pred = predict(auto_glm, auto_test, type = 'response') > 0.5
tab = table(Predicted = pred, Actual = auto_test$mpg01)
tab
## Actual
## Predicted 0 1
## FALSE 38 1
## TRUE 4 36
sum(diag(tab))/sum(tab)
## [1] 0.9367089
1 - (sum(diag(tab))/sum(tab))
## [1] 0.06329114
Logistic regression gives an accuracy of 93.7% and an error rate of 6.3%, better than our prior models.
auto_bayes = naiveBayes(mpg01 ~ cylinders + displacement + weight + horsepower,
                        data = auto_train)
pred = predict(auto_bayes, auto_test, type = 'class')
tab = table(Predicted = pred, Actual = auto_test$mpg01)
tab
## Actual
## Predicted 0 1
## 0 37 1
## 1 5 36
sum(diag(tab))/sum(tab)
## [1] 0.9240506
1 - (sum(diag(tab))/sum(tab))
## [1] 0.07594937
Naive Bayes gives an accuracy of 92.4% and an error rate of 7.6%.
set.seed(1)
auto_train = auto[index, c(2, 3, 4, 5)]   # cylinders, displacement, horsepower, weight
auto_test <- auto[-index, c(2, 3, 4, 5)]
auto_knn <- knn(auto_train, auto_test, auto$mpg01[index], k = 1)
tab <- table(auto_knn, auto$mpg01[-index])
accuracy <- sum(diag(tab)) / sum(tab)
tab
##
## auto_knn 0 1
## 0 36 4
## 1 6 33
accuracy
## [1] 0.8734177
1-accuracy
## [1] 0.1265823
set.seed(1)
finding_k = sapply(1:10, function(i) {
  auto_knn = knn(auto_train, auto_test, auto$mpg01[index], k = i)
  mean(auto_knn != auto$mpg01[-index])
})
names(finding_k) = 1:10
finding_k[which.min(finding_k)]
## 6
## 0.07594937
The best value is k = 6, which gives an accuracy of 92.4% and an error of 7.6%, better than k = 1's 87.3% accuracy and 12.7% error.
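One caveat with KNN here: the predictors sit on very different scales (weight is in the thousands while cylinders is single digits), so weight dominates the Euclidean distance. A sketch of a standardized version follows; its results were not run in the original, so none are claimed:
auto_scaled = scale(auto[, c(2, 3, 4, 5)])  # center and scale each predictor
auto_knn_sc = knn(auto_scaled[index, ], auto_scaled[-index, ],
                  auto$mpg01[index], k = 6)
mean(auto_knn_sc != auto$mpg01[-index])  # test error with standardized features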
boston = ISLR2::Boston
morecrim = rep(0, length(boston$crim))
morecrim[boston$crim > median(boston$crim)] = 1  # 1 = above-median crime rate
boston = data.frame(boston, morecrim)
summary(boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv morecrim
## Min. : 5.00 Min. :0.0
## 1st Qu.:17.02 1st Qu.:0.0
## Median :21.20 Median :0.5
## Mean :22.53 Mean :0.5
## 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :50.00 Max. :1.0
corrplot::corrplot(cor(boston))
Potential predictors, based on the larger circles in the morecrim row of the correlation plot: indus, nox, age, dis, rad, and tax.
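To put numbers behind those circles, we can sort the morecrim column of the correlation matrix (a quick check; the values were not printed in the original):
sort(cor(boston)[, 'morecrim'], decreasing = TRUE)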
set.seed(1)
# note: nrow(auto) (392) is carried over from the Auto exercise rather than
# nrow(boston) (506), so only rows 1:392 can be sampled into training and the
# remaining 193 rows form the test set; the results below reflect this split
index = sample(1:nrow(auto), 0.8*nrow(auto))
boston_train = boston[index,]
boston_test = boston[-index,]
boston_glm = glm(morecrim ~ indus + nox + age + dis + rad + tax,
                 data = boston_train, family = binomial)
pred = predict(boston_glm, boston_test, type = 'response') > 0.5
tab = table(Predicted = pred, Actual = boston_test$morecrim)
tab
## Actual
## Predicted 0 1
## FALSE 52 4
## TRUE 13 124
sum(diag(tab))/sum(tab)
## [1] 0.9119171
1 - (sum(diag(tab))/sum(tab))
## [1] 0.0880829
Logistic regression accuracy: 91.2%; error: 8.8%.
boston_lda = lda(morecrim ~ indus + nox + age + dis + rad + tax, data = boston_train)
pred = predict(boston_lda, boston_test)
tab = table(pred$class, boston_test$morecrim)
tab
##
## 0 1
## 0 49 11
## 1 16 117
sum(diag(tab))/sum(tab)
## [1] 0.8601036
mean(pred$class != boston_test$morecrim)
## [1] 0.1398964
LDA accuracy: 86.0%; error: 14.0%.
boston_bayes = naiveBayes(morecrim ~ indus + nox + age + dis + rad + tax,
                          data = boston_train)
pred = predict(boston_bayes, boston_test, type = 'class')
tab <- table(pred, boston_test$morecrim)
tab
##
## pred 0 1
## 0 47 12
## 1 18 116
sum(diag(tab))/sum(tab)
## [1] 0.8445596
1-(sum(diag(tab))/sum(tab))
## [1] 0.1554404
Naive Bayes accuracy: 84.5%; error: 15.5%.
set.seed(1)
boston_train = boston[index, c(3, 5, 7, 8, 9, 10)]   # indus, nox, age, dis, rad, tax
boston_test = boston[-index, c(3, 5, 7, 8, 9, 10)]
boston_knn = knn(boston_train, boston_test, boston$morecrim[index], k = 1)
tab = table(boston_knn, boston$morecrim[-index])
accuracy = sum(diag(tab)) / sum(tab)
tab
##
## boston_knn 0 1
## 0 54 6
## 1 11 122
accuracy
## [1] 0.9119171
1-accuracy
## [1] 0.0880829
With k = 1: accuracy 91.2%, error 8.8%. Time to loop over k to find the best value.
set.seed(1)
finding_k = sapply(1:30, function(i) {
  boston_knn = knn(boston_train, boston_test, boston$morecrim[index], k = i)
  mean(boston_knn != boston$morecrim[-index])
})
names(finding_k) = 1:30  # one name per k value tested
finding_k[which.min(finding_k)]
## 3
## 0.08290155
set.seed(1)
boston_train = boston[index,c(3,5,7,8,9,10)]
boston_test = boston[-index, c(3,5,7,8,9,10)]
boston_knn = knn(boston_train, boston_test, boston$morecrim[index], k = 3)
tab = table(boston_knn, boston$morecrim[-index])
accuracy = sum(diag(tab)) / sum(tab)
tab
##
## boston_knn 0 1
## 0 56 7
## 1 9 121
accuracy
## [1] 0.9170984
1-accuracy
## [1] 0.08290155
The best k is 3, giving a slightly improved accuracy of 91.7% and a lower error rate of 8.3%.
The best model for the Boston data was the final KNN model with k = 3, at 91.7% accuracy and an 8.3% error rate. The next best were logistic regression and KNN with k = 1, both at 91.2% accuracy.
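A closing caveat: every comparison above rests on a single 80/20 split (and, as noted, one whose training rows can only come from the first 392 of 506). A 5-fold cross-validation sketch for the k = 3 KNN, using only base R plus class (results not verified):
set.seed(1)
folds = sample(rep(1:5, length.out = nrow(boston)))  # random fold labels
cv_err = sapply(1:5, function(f) {
  tr = folds != f  # train on four folds, test on the fifth
  pk = knn(boston[tr, c(3, 5, 7, 8, 9, 10)],
           boston[!tr, c(3, 5, 7, 8, 9, 10)],
           boston$morecrim[tr], k = 3)
  mean(pk != boston$morecrim[!tr])
})
mean(cv_err)  # CV estimate of the test error rate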