library(ISLR)
library(MASS)
library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

Question 10

Part A.

data("Weekly")
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly)

cor(Weekly[, -9])  # drop the qualitative Direction column (9)
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

Year and Volume appear to have a strong positive relationship: their correlation is about 0.84, while every other pairwise correlation is close to zero.
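
A scatterplot makes this relationship easy to see directly (a minimal sketch using base graphics):

plot(Weekly$Year, Weekly$Volume, xlab = "Year", ylab = "Volume",
     main = "Weekly trading volume by year")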

Part B.

# Drop Year (column 1) and Today (column 8); model Direction on the five lags plus Volume.
m12 = glm(Direction ~ ., data = Weekly[, -c(1,8)], family = binomial)
summary(m12)
## 
## Call:
## glm(formula = Direction ~ ., family = binomial, data = Weekly[, 
##     -c(1, 8)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only Lag2 is statistically significant, with a p-value of about 0.03; none of the other predictors reaches the 5% level.
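
To isolate that row of the coefficient table and interpret the estimate on the odds scale, a quick sketch:

coef(summary(m12))["Lag2", ]  # estimate, std. error, z value, p-value for Lag2
exp(coef(m12)["Lag2"])        # about 1.06: a unit increase in Lag2 raises the odds of Up by roughly 6%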

Part C.

m12.probs = predict(m12, type = "response")
m12.pred = rep("Down", length(m12.probs))
m12.pred[m12.probs > 0.5] = "Up"
table(m12.pred, Weekly$Direction)
##         
## m12.pred Down  Up
##     Down   54  48
##     Up    430 557
(54+557)/(54+557+48+430)
## [1] 0.5610652
557/(557+48) 
## [1] 0.9206612
54/(430+54)
## [1] 0.1115702

Overall, 56.11% of the predictions are correct. When the market actually goes up, the model predicts Up 92.07% of the time, but when the market goes down it predicts Down only 11.16% of the time; the model is heavily biased toward predicting Up.
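
Since caret is already loaded, confusionMatrix() can cross-check these hand calculations; with positive = "Up", its Sensitivity is the 92.07% figure and its Specificity the 11.16% figure:

confusionMatrix(factor(m12.pred, levels = levels(Weekly$Direction)),
                Weekly$Direction, positive = "Up")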

Part D.

train <- (Weekly$Year < 2009)  # 1990-2008 for training
Weekly0910 <- Weekly[!train, ]  # 2009-2010 held out for testing
Direction0910 <- Weekly$Direction[!train]

m13 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(m13)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
m13probs <- predict(m13, Weekly0910, type = "response")
m13preds <- rep("Down", length(m13probs))
m13preds[m13probs > 0.5] <- "Up"
table(m13preds, Direction0910)
##         Direction0910
## m13preds Down Up
##     Down    9  5
##     Up     34 56
(9+56)/(9+5+34+56)
## [1] 0.625
56/(56+5) 
## [1] 0.9180328
9/(9+34)
## [1] 0.2093023

The model predicts 62.5% of the test observations correctly. When the market goes up, it is right 91.80% of the time; when the market goes down, only 20.93% of the time.
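
The same three numbers recur in every part that follows, so a small helper function (hypothetical, not part of the original workflow) makes them easy to spot-check:

# Accuracy, plus the fraction of true Up and true Down weeks predicted correctly.
perf <- function(pred, truth) {
  tab <- table(factor(pred, levels = levels(truth)), truth)
  c(accuracy    = sum(diag(tab)) / sum(tab),
    up_recall   = tab["Up", "Up"] / sum(tab[, "Up"]),
    down_recall = tab["Down", "Down"] / sum(tab[, "Down"]))
}
perf(m13preds, Direction0910)  # should reproduce 0.625, 0.918, 0.209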

Part E.

mlda = lda(Direction ~ Lag2, data = Weekly, subset = train)

mlda
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
pred.lda <- predict(mlda, Weekly0910)
table(pred.lda$class, Direction0910)
##       Direction0910
##        Down Up
##   Down    9  5
##   Up     34 56
(9+56)/(9+5+34+56)
## [1] 0.625
56/(56+5)
## [1] 0.9180328
9/(9+34)
## [1] 0.2093023

LDA produces exactly the same confusion matrix as the logistic regression in Part D: 62.5% accuracy overall, 91.80% when the market goes up, and 20.93% when it goes down.
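
The identical confusion matrix is no accident: with a single predictor, LDA and logistic regression tend to produce very similar fitted probabilities. A quick check of how far apart they are on the test set:

summary(abs(pred.lda$posterior[, "Up"] - m13probs))  # differences should be small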

Part F.

mqda = qda(Direction ~ Lag2, data = Weekly, subset = train)

mqda
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
pred.qda <- predict(mqda, Weekly0910)
table(pred.qda$class, Direction0910)
##       Direction0910
##        Down Up
##   Down    0  0
##   Up     43 61
(0+61)/(0+0+43+61)
## [1] 0.5865385
61/(61 +0)
## [1] 1
0/(0+43)
## [1] 0

QDA is correct 58.65% of the time, but only because it predicts Up for every test week: it is right 100% of the time when the market goes up and 0% of the time when it goes down.
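
The confusion matrix shows QDA never predicts Down on this test set, which can be confirmed directly:

table(pred.qda$class)               # all 104 test weeks classified as Up
range(pred.qda$posterior[, "Up"])   # the posterior for Up never falls below 0.5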

Part G.

train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, Direction0910)
##         Direction0910
## pred.knn Down Up
##     Down   21 30
##     Up     22 31
(21+31)/(21+30+22+31)
## [1] 0.5
31/(31+30)
## [1] 0.5081967
21/(21+22)
## [1] 0.4883721

KNN with k = 1 predicts 50% of the test observations correctly, no better than chance. When the market goes up it is right 50.82% of the time; when the market goes down, 48.84% of the time.

Part H.

Logistic regression and LDA tie for the best test accuracy at 62.5%. QDA (58.65%) in turn outperforms KNN with k = 1 (50%).
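
Collecting the test accuracies computed above makes the ranking explicit:

test.acc <- c(logistic = 0.6250, lda = 0.6250, qda = 0.5865, knn1 = 0.5000)
sort(test.acc, decreasing = TRUE)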

Part I.

# Interaction term only (Lag1:Lag2), with no main effects.
glm.fit = glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
table(glm.pred, Direction0910)
##         Direction0910
## glm.pred Down Up
##     Down    1  1
##     Up     42 60
(1+60)/(1+1+42+60)
## [1] 0.5865385
lda.fit2 = lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
lda.pred2 = predict(lda.fit2, Weekly0910)
table(lda.pred2$class, Direction0910)
##       Direction0910
##        Down Up
##   Down    0  1
##   Up     43 60
(0+60)/(0+1+43+60)
## [1] 0.5769231
qda.fit = qda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
qda.class = predict(qda.fit, Weekly0910)$class
table(qda.class, Direction0910)
##          Direction0910
## qda.class Down Up
##      Down   16 32
##      Up     27 29
(16+29)/(16+32+27+29)
## [1] 0.4326923
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, Direction0910)
##         Direction0910
## knn.pred Down Up
##     Down   17 18
##     Up     26 43
(17+43)/(18+17+26+43)
## [1] 0.5769231

The logistic model with the Lag1:Lag2 interaction predicts 58.65% of the test observations correctly; the LDA model with the same interaction predicts 57.69% correctly; the QDA model predicts 43.27% correctly; and KNN with k = 10 predicts 57.69% correctly. None of these variations beats the Lag2-only logistic regression and LDA models from Parts D and E (62.5%).

Question 11

Part A.

mpg01 = rep(0, length(Auto$mpg))
mpg01[Auto$mpg > median(Auto$mpg)] = 1  # 1 when mpg is above its median, 0 otherwise
Auto2 = data.frame(Auto, mpg01)

Part B.

pairs(Auto2)

boxplot(cylinders ~ mpg01, data = Auto2, main = "Cylinders vs mpg01")

boxplot(displacement ~ mpg01, data = Auto2, main = "Displacement vs mpg01")

boxplot(horsepower ~ mpg01, data = Auto2, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto2, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto2, main = "Acceleration vs mpg01")

The boxplots suggest that cylinders, displacement, horsepower, and weight are each strongly associated with mpg01, while the relationship with acceleration appears weaker.
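
The correlations of mpg01 with the numeric variables back up the boxplots (a quick sketch; column 9 is the qualitative name variable, so it is dropped before calling cor()):

round(cor(Auto2[, -9])["mpg01", ], 2)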

Part C.

train = (Auto2$year %% 2 == 0)  # even model years form the training set
test = !train
Auto.train = Auto2[train, ]
Auto.test = Auto2[test, ]
mpg01.test = mpg01[test]

Part D.

lda.fit = lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2, 
    subset = train)
lda.pred = predict(lda.fit, Auto.test)
mean(lda.pred$class != mpg01.test)
## [1] 0.1263736

There is a 12.64% test error rate.

Part E.

qda.fit = qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2, 
    subset = train)
qda.pred = predict(qda.fit, Auto.test)
mean(qda.pred$class != mpg01.test)
## [1] 0.1318681

There is a 13.19% test error rate.

Part F.

glm.fit = glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2, 
    family = binomial, subset = train)
glm.probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.1208791

There is a 12.09% test error rate.

Part G.

train.X = cbind(Auto$cylinders, Auto$weight, Auto$displacement, Auto$horsepower)[train, ]
test.X = cbind(Auto$cylinders, Auto$weight, Auto$displacement, Auto$horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)

knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1648352
knn.pred = knn(train.X, test.X, train.mpg01, k = 7)
mean(knn.pred != mpg01.test)
## [1] 0.1483516

With k = 1 the test error rate is 15.38%, with k = 10 it is 16.48%, and with k = 7 it is 14.84%. Of the values tried, k = 7 gives the lowest test error rate, so it performs best.
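
Rather than trying values of k one at a time, a short scan over k (a sketch; knn() breaks distance ties at random, so the exact numbers depend on the seed):

set.seed(1)
err.by.k <- sapply(1:15, function(k)
  mean(knn(train.X, test.X, train.mpg01, k = k) != mpg01.test))
which.min(err.by.k)  # the k with the lowest test error under this seed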

Question 13

data("Boston")
attach(Boston)
crime01 = rep(0, length(crim))
crime01[crim > median(crim)] = 1
Boston = data.frame(Boston, crime01)

train = 1:(dim(Boston)[1]/2)  # first half of the rows for training
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]  # second half held out (not a random split)
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crime01.test = crime01[test]
glm.fit = glm(crime01 ~ . - crime01 - crim, data = Boston, family = binomial, 
    subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != crime01.test)
## [1] 0.1818182

Logistic regression has a test error rate of 18.18%.

lda.fit = lda(crime01 ~ . - crime01 - crim - chas - tax, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
mean(lda.pred$class != crime01.test)
## [1] 0.1225296

This LDA model has a 12.25% test error rate.

train.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, 
    lstat, medv)[train, ]
test.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, 
    lstat, medv)[test, ]
train.crime01 = crime01[train]
set.seed(1)

knn.pred = knn(train.X, test.X, train.crime01, k = 1)
mean(knn.pred != crime01.test)
## [1] 0.458498

This KNN model with k = 1 has a 45.85% test error rate.

knn.pred = knn(train.X, test.X, train.crime01, k = 5)
mean(knn.pred != crime01.test)
## [1] 0.1699605

This KNN model with k = 5 has a 16.99% test error rate.

knn.pred = knn(train.X, test.X, train.crime01, k = 7)
mean(knn.pred != crime01.test)
## [1] 0.1146245

This KNN model with k = 7 has an 11.46% test error rate, the lowest of the three values of k tried.
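
KNN is sensitive to the scale of its inputs, and these predictors have very different ranges (e.g. tax versus nox), so a hypothetical variant that standardizes the columns with scale() may rank the k values differently:

X <- scale(cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,
                 black, lstat, medv))
set.seed(1)
knn.pred.std <- knn(X[train, ], X[test, ], train.crime01, k = 7)
mean(knn.pred.std != crime01.test)  # error rate with standardized predictors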