#Beginning Question10

library(ISLR)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
#(A.)
pairs(Weekly[,1:8])

cor(Weekly[,1:8])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
#Both the scatterplot matrix and the correlation matrix show a strong positive correlation between Year and Volume (0.84); the lag variables are essentially uncorrelated with each other and with Today.

#(B.) & (C.)
logistic_fit = glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family = binomial)
summary(logistic_fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
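#Lag2 is the only predictor significant at the 5% level (p = 0.030); none of the other lags or Volume appear to be statistically significant.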
logistic_probs = predict(logistic_fit, type = "response")
logistic_preds = rep("Down", 1089)
logistic_preds[logistic_probs>0.5] = "Up"

attach(Weekly)
table(logistic_preds, Direction)
##               Direction
## logistic_preds Down  Up
##           Down   54  48
##           Up    430 557
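mean(logistic_preds == Direction)   # (54+557)/1089 = 0.561 overall, barely better than always guessing Up
#When the market goes up the model is right 557/605 = 92% of the time, but when it goes down it is right only 54/484 = 11% of the time: it almost always predicts Up.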
#(D.)
train = (Year<2009)

Test = Weekly[!train ,]
Test_Direction=Direction[!train]
logistic_fit2 = glm(Direction~Lag2, data = Weekly, family = binomial, subset= train)

logistic_probs2 = predict(logistic_fit2,Test, type = "response")
logistic_preds2 = rep("Down", 104)
logistic_preds2[logistic_probs2>0.5] = "Up"

table(logistic_preds2, Test_Direction)
##                Test_Direction
## logistic_preds2 Down Up
##            Down    9  5
##            Up     34 56
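mean(logistic_preds2 == Test_Direction)   # (9+56)/104 = 0.625, i.e. 62.5% correct on the 2009-2010 hold-out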
#(E.)
library(MASS)
lda_fit = lda(Direction~Lag2, data = Weekly, subset = train)
lda_pred = predict(lda_fit, Test)
lda_class = lda_pred$class

table(lda_class,Test_Direction)
##          Test_Direction
## lda_class Down Up
##      Down    9  5
##      Up     34 56
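#LDA produces exactly the same confusion matrix as the logistic regression: 62.5% correct.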
#(F.)
qda_fit = qda(Direction~Lag2, data = Weekly, subset = train)
qda_pred = predict(qda_fit,Test)
qda_class = qda_pred$class
table(qda_class, Test_Direction)
##          Test_Direction
## qda_class Down Up
##      Down    0  0
##      Up     43 61
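#QDA predicts Up for every week in the test set, giving 61/104 = 58.7% correct.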
#(G.)
library(class)

set.seed(1)
train_X = Weekly[train,3]
test_X = Weekly[!train,3]
train_direction = Direction[train]

dim(train_X) = c(985,1)
dim(test_X) = c(104,1)

knn_pred = knn(train_X, test_X, train_direction, k=1)
table(knn_pred, Test_Direction)
##         Test_Direction
## knn_pred Down Up
##     Down   21 30
##     Up     22 31
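mean(knn_pred == Test_Direction)   # (21+31)/104 = 0.5, no better than a coin flip
#(H.) Logistic regression and LDA perform best on the held-out data (62.5% correct), ahead of QDA (58.7%) and KNN with K=1 (50%).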
#(I.)
knn_pred2 = knn(train_X, test_X, train_direction, k=3)
table(knn_pred2, Test_Direction)
##          Test_Direction
## knn_pred2 Down Up
##      Down   16 19
##      Up     27 42
knn_pred3 = knn(train_X, test_X, train_direction, k=6)
table(knn_pred3, Test_Direction)
##          Test_Direction
## knn_pred3 Down Up
##      Down   15 20
##      Up     28 41
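#Raising K to 3 gives (16+42)/104 = 55.8% correct and K=6 gives (15+41)/104 = 53.8%, still behind logistic regression and LDA.
#A quick sketch (reusing train_X, test_X and train_direction from above) to scan several values of K; knn() breaks distance ties at random, so the counts can vary slightly between runs:
for (k in c(1, 3, 5, 10, 25)) {
  pred = knn(train_X, test_X, train_direction, k=k)
  cat("K =", k, "accuracy =", round(mean(pred == Test_Direction), 3), "\n")
}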
lda_fit2 = lda(Direction~Lag1+Lag2+Lag3+Lag4+Lag5, data=Weekly, subset=train)

lda_pred2 = predict(lda_fit2,Test)
lda_class2 = lda_pred2$class

table(lda_class2, Test_Direction)
##           Test_Direction
## lda_class2 Down Up
##       Down    9 13
##       Up     34 48
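#Using all five lag variables in LDA does worse than Lag2 alone: (9+48)/104 = 54.8% correct.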
logistic_fit3 = glm(Direction~Lag2+ I(Lag2^2), data=Weekly, family=binomial, subset=train)

logistic_probs3 = predict(logistic_fit3,Test, type="response")
logistic_preds3 = rep("Down", 104)
logistic_preds3[logistic_probs3>0.5] = "Up"
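#Assess the quadratic Lag2 model the same way:
table(logistic_preds3, Test_Direction)
mean(logistic_preds3 == Test_Direction)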

logistic_fit4 = glm(Direction~Lag2 + I(Lag1^2), data=Weekly, family=binomial, subset=train)

logistic_probs4 = predict(logistic_fit4,Test, type="response")
logistic_preds4 = rep("Down", 104)
logistic_preds4[logistic_probs4>0.5] = "Up"

table(logistic_preds4,Test_Direction)
##                Test_Direction
## logistic_preds4 Down Up
##            Down    8  2
##            Up     35 59
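#Adding I(Lag1^2) alongside Lag2 gives the best test-set result so far: (8+59)/104 = 64.4% correct.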
#Ending Question10
#Beginning Question11

library(ISLR)

#(A.)
df = Auto
median_mpg = median(df$mpg)
#mpg01 is 1 when mpg is above its median and 0 otherwise (vectorized; no row-by-row loop needed)
df$mpg01 = ifelse(df$mpg > median_mpg, 1, 0)

#Helper: move the named columns to the end of the data frame, so the non-numeric name column does not break pairs() and cor() on columns 1:9
movetolast = function(data, move) {
  data[c(setdiff(names(data), move), move)]
}

#(B.)
df = movetolast(df, c("name"))
pairs(df[,1:9])

cor(df[,1:9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000
#mpg01 is strongly positively correlated with mpg (0.84) and strongly negatively correlated with cylinders (-0.76), displacement (-0.75), weight (-0.76) and horsepower (-0.67).

#(C.)
require(caTools)
## Loading required package: caTools
set.seed(123)
sample_data = sample.split(df$mpg,SplitRatio =0.70)
train2 = subset(df, sample_data==TRUE)
test2 = subset(df, sample_data==FALSE)

#(D.)
lda_fit3 = lda(mpg01~cylinders+displacement+horsepower+weight, data=train2)
lda_pred3 = predict(lda_fit3,test2)
predictions = lda_pred3$class
actual = test2$mpg01

table(predictions, actual)
##            actual
## predictions  0  1
##           0 48  4
##           1  8 44
mean(predictions!=actual)
## [1] 0.1153846
#The test error of this model is 11.5%.

#(E.)
qda_fit2 = qda(mpg01~cylinders+displacement+horsepower+weight, data=train2)
qda_pred2 = predict(qda_fit2,test2)
predictions = qda_pred2$class

table(predictions, actual)
##            actual
## predictions  0  1
##           0 50  4
##           1  6 44
mean(predictions!=actual)
## [1] 0.09615385
#The QDA model has a test error of 9.6%.

#(F.)
logistic_fit5 = glm(mpg01~cylinders+displacement+horsepower+weight, data=train2, family=binomial)

logistic_probs5 = predict(logistic_fit5,test2, type="response")
logistic_preds5 = rep(0,length(test2$mpg01))
logistic_preds5[logistic_probs5>0.5] = 1

table(logistic_preds5, actual)
##                actual
## logistic_preds5  0  1
##               0 50  4
##               1  6 44
mean(logistic_preds5!=actual)
## [1] 0.09615385
#The logistic model has a 9.6% test error rate.

#(G.)
train2_matrix = data.matrix(train2[,c("cylinders","displacement","weight","horsepower")])
test2_matrix = data.matrix(test2[,c("cylinders","displacement","weight","horsepower")])
train2_y = data.matrix(train2$mpg01)
test2_y = data.matrix(test2$mpg01)

knn_pred4 = knn(train2_matrix, test2_matrix, train2_y, k=1)
table(knn_pred4, test2_y)
##          test2_y
## knn_pred4  0  1
##         0 45  9
##         1 11 39
mean(knn_pred4!=test2_y)
## [1] 0.1923077
#KNN with K=1 has a test error of 19.2%.

knn_pred5 = knn(train2_matrix, test2_matrix, train2_y, k=3)
table(knn_pred5, test2_y)
##          test2_y
## knn_pred5  0  1
##         0 45  5
##         1 11 43
mean(knn_pred5!=test2_y)
## [1] 0.1538462
#KNN with K=3 has a test error of 15.4%.

knn_pred6 = knn(train2_matrix, test2_matrix, train2_y, k=10)
table(knn_pred6, test2_y)
##          test2_y
## knn_pred6  0  1
##         0 45  4
##         1 11 44
mean(knn_pred6!=test2_y)
## [1] 0.1442308
#K=10 gives a further slight improvement, with a test error of 14.4%. Overall, QDA and logistic regression (9.6%) perform best here, ahead of LDA (11.5%) and all three KNN fits.
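#A quick sketch (reusing the matrices above) to scan a wider range of K; set a seed first because knn() breaks distance ties at random:
set.seed(1)
for (k in c(1, 3, 5, 10, 20)) {
  pred = knn(train2_matrix, test2_matrix, train2_y, k=k)
  cat("K =", k, "test error =", round(mean(pred != test2_y), 3), "\n")
}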

#Ending Question11
#Beginning Question13

library(ISLR)
library(MASS)
library(class)
require(caTools)
boston_df = Boston

median_crim = median(Boston$crim)
boston_df$crim01 = with(Boston, ifelse(crim > median_crim, 1, 0))

cor(boston_df$crim01,boston_df)
##           crim        zn     indus       chas       nox         rm       age
## [1,] 0.4093955 -0.436151 0.6032602 0.07009677 0.7232348 -0.1563718 0.6139399
##             dis       rad       tax   ptratio      black     lstat       medv
## [1,] -0.6163416 0.6197862 0.6087413 0.2535684 -0.3512109 0.4532627 -0.2630167
##      crim01
## [1,]      1
set.seed(123)
boston_sample = sample.split(boston_df$crim01, SplitRatio =0.80)
boston_train = subset(boston_df, boston_sample==TRUE)
boston_test = subset(boston_df, boston_sample==FALSE)

boston_lr = glm(crim01~.-chas-crim , data=boston_train, family=binomial)
summary(boston_lr)
## 
## Call:
## glm(formula = crim01 ~ . - chas - crim, family = binomial, data = boston_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8978  -0.1705  -0.0002   0.0025   3.5340  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -39.431670   7.164710  -5.504 3.72e-08 ***
## zn           -0.102660   0.038683  -2.654 0.007958 ** 
## indus        -0.069107   0.047124  -1.467 0.142511    
## nox          48.381844   8.123415   5.956 2.59e-09 ***
## rm            0.084088   0.798160   0.105 0.916096    
## age           0.023671   0.013566   1.745 0.081014 .  
## dis           0.820828   0.255752   3.209 0.001330 ** 
## rad           0.631930   0.164647   3.838 0.000124 ***
## tax          -0.005362   0.002838  -1.889 0.058826 .  
## ptratio       0.334091   0.133126   2.510 0.012087 *  
## black        -0.007506   0.005230  -1.435 0.151230    
## lstat         0.076788   0.051698   1.485 0.137462    
## medv          0.149509   0.077409   1.931 0.053430 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 560.06  on 403  degrees of freedom
## Residual deviance: 169.97  on 391  degrees of freedom
## AIC: 195.97
## 
## Number of Fisher Scoring iterations: 9
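#zn, nox, dis, rad and ptratio are clearly significant at the 5% level; the reduced model fitted below keeps only these five predictors.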
boston_probs = predict(boston_lr, boston_test, type="response")
boston_preds = rep(0,length(boston_test$crim01))
boston_preds[boston_probs>0.5] = 1
actual = boston_test$crim01
table(boston_preds, actual)
##             actual
## boston_preds  0  1
##            0 46  5
##            1  5 46
mean(boston_preds!=actual)
## [1] 0.09803922
#Test error rate of 9.8%. 

boston_lr2 = glm(crim01~zn+nox+dis+rad+ptratio, data=boston_train,family=binomial)
boston_probs2 = predict(boston_lr2, boston_test, type="response")
boston_preds2 = rep(0,length(boston_test$crim01))
boston_preds2[boston_probs2>0.5] = 1
actual =boston_test$crim01
table(boston_preds2, actual)
##              actual
## boston_preds2  0  1
##             0 45  9
##             1  6 42
mean(boston_preds2!=actual)
## [1] 0.1470588
#Test error rises to 14.7% when using this subset.

boston_lda = lda(crim01~.-chas-crim ,data=boston_train)
boston_preds2 = predict(boston_lda, boston_test)
table(boston_preds2$class, actual)
##    actual
##      0  1
##   0 49 12
##   1  2 39
mean(boston_preds2$class!=actual)
## [1] 0.1372549
#Test error rate of 13.7%.

boston_qda = qda(crim01~.-chas-crim , data=boston_train)
boston_preds3 = predict(boston_qda, boston_test)
table(boston_preds3$class, actual)
##    actual
##      0  1
##   0 50  9
##   1  1 42
mean(boston_preds3$class!=actual)
## [1] 0.09803922
#Test error rate of 9.8%.

#Predictor matrices for KNN; the response crim01 must be excluded from the predictors
boston_train2 = data.matrix(subset(boston_train, select=-c(crim,chas,crim01)))
boston_test2 = data.matrix(subset(boston_test, select=-c(crim,chas,crim01)))
train2_y = boston_train$crim01
test2_y = boston_test$crim01

boston_knn1 = knn(boston_train2, boston_test2, train2_y,k=1)
table(boston_knn1, test2_y)
##            test2_y
## boston_knn1  0  1
##           0 47  1
##           1  4 50
mean(boston_knn1!=test2_y)
## [1] 0.04901961
#Test error rate of 4.9%.

boston_knn2 = knn(boston_train2, boston_test2, train2_y, k=3)
table(boston_knn2, test2_y)
##            test2_y
## boston_knn2  0  1
##           0 45  1
##           1  6 50
mean(boston_knn2!=test2_y)
## [1] 0.06862745
#A slightly higher test error rate of 6.9%.

boston_knn3 = knn(boston_train2, boston_test2, train2_y, k=10)
table(boston_knn3, test2_y)
##            test2_y
## boston_knn3  0  1
##           0 43  5
##           1  8 46
mean(boston_knn3!=test2_y)
## [1] 0.127451
#Test error rate of 12.7%, so performance degrades as K grows here.
#QDA and the full logistic regression (both 9.8%) perform better than LDA (13.7%) but worse than KNN with K=1 (4.9%), which gives the lowest test error of all the models tried.
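#knn() uses Euclidean distance, so predictors with large ranges (e.g. tax) dominate. A sketch (reusing the objects above) that standardizes with training-set means and SDs before refitting:
means = colMeans(boston_train2)
sds = apply(boston_train2, 2, sd)
boston_knn_std = knn(scale(boston_train2, center=means, scale=sds),
                     scale(boston_test2, center=means, scale=sds),
                     train2_y, k=1)
mean(boston_knn_std != test2_y)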

#Ending Question13