Questions 10, 11, 13
10.
a) It looks like there is a log-based relationship between Year and Volume.
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
b) Only Lag2 is statistically significant.
m<- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, weekly, family="binomial")
summary(m)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
c) The regression model is pretty good at predicting in the “Up” direction, but there are a large number of false negatives in the “Down” direction (i.e. overpredicting “Up”).
m_probs <- predict(m, type = "response")
m_predict <- ifelse(m_probs >0.5, "up","down")
c_matrix<- table(weekly$Direction, m_predict)
c_matrix
## m_predict
## down up
## Down 54 430
## Up 48 557
d)
train_df = filter(weekly, Year<=2008)
test_df= filter(weekly, Year>2008)
p<- glm(Direction ~ Lag2, data = train_df, family = "binomial")
p_probs<- predict(p, type = "response", newdata = test_df)
p_preds<- ifelse(p_probs>0.5, "up", "down")
p_matrix<- table(test_df$Direction, p_preds)
p_matrix
## p_preds
## down up
## Down 9 34
## Up 5 56
acc = (9+56)/sum(p_matrix)
acc
## [1] 0.625
e)
e<- lda(Direction ~ Lag2, data = train_df)
e
## Call:
## lda(Direction ~ Lag2, data = train_df)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
e_preds<- predict(e, newdata = test_df)
e_matrix<- table(test_df$Direction, e_preds$class)
e_matrix
##
## Down Up
## Down 9 34
## Up 5 56
acc = (9+56)/sum(e_matrix)
acc
## [1] 0.625
f)
f<- qda(Direction ~ Lag2, data = train_df)
f_preds<- predict(f, newdata = test_df)
f_matrix<- table(test_df$Direction, f_preds$class)
f_matrix
##
## Down Up
## Down 0 43
## Up 0 61
f_acc= (0)/104
f_acc
## [1] 0
g)
g<- knn(train = data.frame(train_df$Lag2), test =data.frame(test_df$Lag2), cl = train_df$Direction, k = 1)
g_matrix<- table(test_df$Direction, g)
g_matrix
## g
## Down Up
## Down 21 22
## Up 29 32
g_acc= (21+32)/sum(g_matrix)
g_acc
## [1] 0.5096154
h) The logistic regression and LDA produce the highest accuracy.
i) LDA accuracy improves slightly by adding the other Lag variables.
j<- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, data = train_df)
j
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, data = train_df)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag3 Lag4 Lag5
## Down 0.289444444 -0.03568254 0.17080045 0.15925624 0.21409297
## Up -0.009213235 0.26036581 0.08404044 0.09220956 0.04548897
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.27604838
## Lag2 0.26309706
## Lag3 -0.04233861
## Lag4 -0.11994755
## Lag5 -0.15335694
j_probs<- predict(j, newdata = test_df)
j_matrix<- table(test_df$Direction, j_probs$class)
j_matrix
##
## Down Up
## Down 9 34
## Up 13 48
acc = (9+56)/sum(j_matrix)
acc
## [1] 0.625
11.
a)
data(Auto)
Auto$mpg01<- ifelse(Auto$mpg>median(Auto$mpg), 1,0)
b) Displacement, horsepower, and weight seem to be the three biggest contributors to predicting mpg.
pairs(Auto)
noname<- Auto[,c(1:8,10)]
cor(noname)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
c)
bound <- floor((nrow(Auto)/4)*3)
df <- Auto[sample(nrow(Auto)), ]
train_df <- df[1:bound, ]
test_df <- df[(bound+1):nrow(df), ]
d) The accuracy metric (using displacement, weight, and horsepower as predictors) is 0.89, so the error rate is ~0.11.
l<- lda(mpg01 ~ displacement + horsepower + weight, data = train_df)
l_preds<- predict(l, newdata = test_df)
l_matrix<- table(test_df$mpg01, l_preds$class)
l_matrix
##
## 0 1
## 0 37 8
## 1 2 51
acc = (41+46)/sum(l_matrix)
acc
## [1] 0.8877551
e) The accuracy metric is 0.91, so the error rate is ~0.09.
q<- qda(mpg01 ~ displacement + horsepower + weight, data = train_df)
q_preds<- predict(q, newdata = test_df)
q_matrix<- table(test_df$mpg01, q_preds$class)
q_matrix
##
## 0 1
## 0 37 8
## 1 4 49
q_acc= (44+45)/sum(q_matrix)
q_acc
## [1] 0.9081633
f) The logistic regression had an accuracy of 0.9, and a subsequent error rate of ~0.1.
p<- glm(mpg01 ~ displacement + horsepower + weight, data = train_df, family = "binomial")
p_probs<- predict(p, type = "response", newdata = test_df)
p_preds<- ifelse(p_probs>0.5, "up", "down")
p_matrix<- table(test_df$mpg01, p_preds)
p_matrix
## p_preds
## down up
## 0 40 5
## 1 5 48
acc = (43+45)/sum(p_matrix)
acc
## [1] 0.8979592
g) The KNN did not classify any points incorrectly, achieving an error rate of 0.0.
k<- knn(train = data.frame(train_df$mpg01), test =data.frame(test_df$mpg01), cl = train_df$mpg01, k = 1)
k_matrix<- table(test_df$mpg01, k)
k_matrix
## k
## 0 1
## 0 45 0
## 1 0 53
k_acc= (50+48)/sum(k_matrix)
k_acc
## [1] 1
13. When we use all variables except “chas” in the models, we can obtain highly accurate results. The logistic regression had a 0.99 accuracy, the LDA had a 0.83 accuracy, and the knn did not have any incorrect classifications. Because these metrics were so high, I think there might be some over-fitting for this dataset.
Log Regression on Boston data Accuracy = 0.99
data("Boston")
Boston$median<- ifelse(Boston$crim> median(Boston$crim), 1,0)
bound <- floor((nrow(Boston)/4)*3)
df <- Boston[sample(nrow(Boston)), ]
df<- df[,c(1:3,5:15)]
df.train <- df[1:bound, ]
df.test <- df[(bound+1):nrow(df), ]
logreg<- glm(as.factor(median) ~ ., data = df.train, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logreg_probs <- predict(logreg, df.test, type="response")
logreg_preds <- ifelse(logreg_probs>.5, "1","0")
logreg_matrix<- table(df.test$median, logreg_preds)
logreg_matrix
## logreg_preds
## 0 1
## 0 62 1
## 1 0 64
acc = (59+67)/sum(logreg_matrix)
LDA on Boston dataset Accuracy = 0.835
l<- lda(as.factor(median) ~ ., data = df.train)
l_preds<- predict(l, newdata = df.test)
l_matrix<- table(df.test$median, l_preds$class)
l_matrix
##
## 0 1
## 0 62 1
## 1 12 52
acc = (59+47)/sum(l_matrix)
acc
## [1] 0.8346457
KNN on Boston dataset Accuracy = 1.0
k<- knn(train = data.frame(df.train$median), test =data.frame(df.test$median), cl = df.train$median, k = 1)
k_matrix<- table(df.test$median, k)
k_matrix
## k
## 0 1
## 0 63 0
## 1 0 64
k_acc= (60+67)/sum(k_matrix)
k_acc
## [1] 1