1a)
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ISLR)
options(digits = 3)
data <- na.omit(Auto)
pairs(data[,1:7])
Mpg appears to have a negative association with the power-related variables (cylinders, displacement, horsepower, and weight) and a positive association with year.
1b)
cor(data[,1:7])
## mpg cylinders displacement horsepower weight acceleration
## mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423
## cylinders -0.778 1.000 0.951 0.843 0.898 -0.505
## displacement -0.805 0.951 1.000 0.897 0.933 -0.544
## horsepower -0.778 0.843 0.897 1.000 0.865 -0.689
## weight -0.832 0.898 0.933 0.865 1.000 -0.417
## acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000
## year 0.581 -0.346 -0.370 -0.416 -0.309 0.290
## year
## mpg 0.581
## cylinders -0.346
## displacement -0.370
## horsepower -0.416
## weight -0.309
## acceleration 0.290
## year 1.000
Mpg is negatively correlated with cylinders, displacement, horsepower, and weight, and positively correlated with acceleration and year. These numbers confirm the associations suggested by the scatterplots in part a).
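As a quick check of this ordering, the mpg column of the correlation matrix can be extracted and sorted (a small sketch reusing the data object from part a):
sort(cor(data[,1:7])[, "mpg"])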
1c)
model <- lm(mpg~cylinders+displacement+horsepower+weight+acceleration+year, data = data)
summary(model)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.69 -2.39 -0.08 2.03 14.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.45e+01 4.76e+00 -3.05 0.0024 **
## cylinders -3.30e-01 3.32e-01 -0.99 0.3212
## displacement 7.68e-03 7.36e-03 1.04 0.2973
## horsepower -3.91e-04 1.38e-02 -0.03 0.9775
## weight -6.79e-03 6.70e-04 -10.14 <2e-16 ***
## acceleration 8.53e-02 1.02e-01 0.84 0.4038
## year 7.53e-01 5.26e-02 14.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.44 on 385 degrees of freedom
## Multiple R-squared: 0.809, Adjusted R-squared: 0.806
## F-statistic: 272 on 6 and 385 DF, p-value: <2e-16
The overall F-test is highly significant, so the predictors jointly have an effect on mpg. However, only weight and year have individually significant coefficients; the remaining coefficients are not significantly different from 0. The year coefficient suggests that, holding the other predictors fixed, a car one year newer has on average about 0.753 higher mpg.
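As a rough sanity check on that interpretation, a 95% confidence interval for the year coefficient can be pulled directly from the fitted model (a small sketch; the exact bounds come from running it):
confint(model, "year")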
1d)
par(mfrow=c(2,2))
plot(model)
Observation 14 has unusually high leverage. Overall, the fit seems reasonable, but the residuals-vs-fitted plot shows some curvature, suggesting the linear model may be missing a nonlinear relationship.
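To confirm the high-leverage point flagged in the diagnostic plots, the hat values of the fitted model can be inspected directly (a small sketch; the average leverage (p + 1)/n is included for comparison):
hv <- hatvalues(model)     # leverage of each observation
which.max(hv)              # observation with the highest leverage
hv[which.max(hv)]          # its leverage value
mean(hv)                   # average leverage, (p + 1)/n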
1e)
model_int <- lm(mpg~cylinders*year, data = data)
summary(model_int)
##
## Call:
## lm(formula = mpg ~ cylinders * year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.216 -2.579 -0.156 2.257 15.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.6178 15.1028 -4.08 5.5e-05 ***
## cylinders 5.5104 2.7370 2.01 0.045 *
## year 1.3405 0.1991 6.73 6.0e-11 ***
## cylinders:year -0.1135 0.0365 -3.11 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.13 on 388 degrees of freedom
## Multiple R-squared: 0.722, Adjusted R-squared: 0.72
## F-statistic: 336 on 3 and 388 DF, p-value: <2e-16
Fitting a model with cylinders, year, and their interaction gives some surprising results at first glance. The marginal effect of year remains positive, but the cylinders main effect is now positive. That main effect, however, is the effect of cylinders at year = 0, far outside the observed data. The interaction coefficient is significant and negative, so the implied cylinders slope, 5.51 - 0.1135 * year, is negative over the observed years, meaning cylinders still reduces mpg overall.
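To make that concrete, the implied cylinders slope can be evaluated at the endpoints of the observed range of model years (a small sketch reusing model_int and data); with the estimates above it comes out to roughly -2.4 at year 70 and -3.8 at year 82, negative throughout the sample:
b <- coef(model_int)
b["cylinders"] + b["cylinders:year"] * range(data$year)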
1f)
model_nonlinear <- lm(mpg~log(cylinders) + I(weight^2) + I(sqrt(displacement)), data = data)
summary(model_nonlinear)
##
## Call:
## lm(formula = mpg ~ log(cylinders) + I(weight^2) + I(sqrt(displacement)),
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.295 -2.841 -0.483 2.339 17.460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.22e+01 1.90e+00 22.23 < 2e-16 ***
## log(cylinders) 1.19e+00 2.37e+00 0.50 0.62
## I(weight^2) -4.62e-07 1.06e-07 -4.34 1.8e-05 ***
## I(sqrt(displacement)) -1.21e+00 2.47e-01 -4.90 1.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.36 on 388 degrees of freedom
## Multiple R-squared: 0.69, Adjusted R-squared: 0.687
## F-statistic: 288 on 3 and 388 DF, p-value: <2e-16
Not much new here. I used log(cylinders), weight^2, and sqrt(displacement) as predictors in the regression. While the weight and displacement terms still come out significant and negative, log(cylinders) does not: it ends up with a positive but insignificant coefficient.
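For a quick side-by-side of this fit against the linear model from part c), the adjusted R-squared values (about 0.806 and 0.687 in the summaries above) can be compared directly:
summary(model)$adj.r.squared
summary(model_nonlinear)$adj.r.squared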
2a)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.20 Min. :-18.20 Min. :-18.20
## 1st Qu.:1995 1st Qu.: -1.15 1st Qu.: -1.15 1st Qu.: -1.16
## Median :2000 Median : 0.24 Median : 0.24 Median : 0.24
## Mean :2000 Mean : 0.15 Mean : 0.15 Mean : 0.15
## 3rd Qu.:2005 3rd Qu.: 1.41 3rd Qu.: 1.41 3rd Qu.: 1.41
## Max. :2010 Max. : 12.03 Max. : 12.03 Max. : 12.03
## Lag4 Lag5 Volume Today Direction
## Min. :-18.20 Min. :-18.20 Min. :0.09 Min. :-18.20 Down:484
## 1st Qu.: -1.16 1st Qu.: -1.17 1st Qu.:0.33 1st Qu.: -1.15 Up :605
## Median : 0.24 Median : 0.23 Median :1.00 Median : 0.24
## Mean : 0.15 Mean : 0.14 Mean :1.57 Mean : 0.15
## 3rd Qu.: 1.41 3rd Qu.: 1.41 3rd Qu.:2.05 3rd Qu.: 1.41
## Max. : 12.03 Max. : 12.03 Max. :9.33 Max. : 12.03
pairs(Weekly)
Most of the pairwise relationships do not look particularly strong, but Year and Volume appear to be very highly correlated.
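A quick numeric check of that impression (a one-line sketch; the exact value comes from running it):
cor(Weekly$Year, Weekly$Volume)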
2b)
logit <- glm(Direction~., data = Weekly[,c(2:7,9)], family = "binomial")
summary(logit)
##
## Call:
## glm(formula = Direction ~ ., family = "binomial", data = Weekly[,
## c(2:7, 9)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.695 -1.256 0.991 1.085 1.458
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2669 0.0859 3.11 0.0019 **
## Lag1 -0.0413 0.0264 -1.56 0.1181
## Lag2 0.0584 0.0269 2.18 0.0296 *
## Lag3 -0.0161 0.0267 -0.60 0.5469
## Lag4 -0.0278 0.0265 -1.05 0.2937
## Lag5 -0.0145 0.0264 -0.55 0.5833
## Volume -0.0227 0.0369 -0.62 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500
##
## Number of Fisher Scoring iterations: 4
Lag2 is statistically significant at the 5% level, but none of the other lags, nor Volume, is significant.
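To put the Lag2 estimate on a more interpretable scale, it can be exponentiated into an odds ratio; from the coefficient above this is roughly exp(0.0584) ≈ 1.06, i.e. about a 6% increase in the odds of an Up week per one-unit increase in Lag2:
exp(coef(logit)["Lag2"])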
2c)
log_predict <- predict(logit, Weekly, type = "response")
logit.pred <- ifelse(log_predict > .5, "Up", "Down")
pred.table <- table(logit.pred, Weekly$Direction)
pred.table
##
## logit.pred Down Up
## Down 54 48
## Up 430 557
sum(diag(pred.table))/sum(pred.table)
## [1] 0.561
The overall prediction accuracy is 56.1%, and the confusion matrix shows when the logistic regression is right and wrong. For Down predictions it is correct 54/102 ≈ 52.9% of the time, and for Up predictions 557/987 ≈ 56.4% of the time. Overall, this is not a very good model, as it barely beats a coin flip, which would be accurate 50% of the time in the long run. This is not surprising, since the model does not include very strong predictors.
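The per-prediction accuracies quoted above can be read straight off the confusion matrix (the rows of pred.table are the predicted classes):
diag(pred.table) / rowSums(pred.table)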
2d)
train <- Weekly %>% filter(Year >= 1990 & Year <= 2008)
test <- Weekly %>% filter(Year > 2008)
logit2 <- glm(Direction~Lag2, data = train, family = "binomial")
summary(logit2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.54 -1.26 1.02 1.09 1.37
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2033 0.0643 3.16 0.0016 **
## Lag2 0.0581 0.0287 2.02 0.0430 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1355
##
## Number of Fisher Scoring iterations: 4
log2_pred <- predict(logit2, test, type = "response")
log2.pred <- ifelse(log2_pred > .5, "Up", "Down")
pred.table2 <- table(log2.pred, test$Direction)
pred.table2
##
## log2.pred Down Up
## Down 9 5
## Up 34 56
sum(diag(pred.table2))/sum(pred.table2)
## [1] 0.625
We get an improved accuracy of 0.625, with 9/14 Down predictions and 56/90 Up predictions correct.
2e)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda_model <- lda(Direction~Lag2, data = train)
lda_pred <- predict(lda_model, test)$class
table(lda_pred, test$Direction)
##
## lda_pred Down Up
## Down 9 5
## Up 34 56
sum(diag(table(lda_pred, test$Direction)))/sum(table(lda_pred, test$Direction))
## [1] 0.625
LDA gives exactly the same confusion matrix and prediction accuracy of 0.625.
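To check that the two methods are not just tied on accuracy but actually agree observation by observation, the predictions can be compared directly (a value of 1 would mean identical classifications):
mean(as.character(lda_pred) == log2.pred)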
2f)
qda_model <- qda(Direction~Lag2, data = train)
qda_pred <- predict(qda_model, test)$class
table(qda_pred, test$Direction)
##
## qda_pred Down Up
## Down 0 0
## Up 43 61
sum(diag(table(qda_pred, test$Direction)))/sum(table(qda_pred, test$Direction))
## [1] 0.587
QDA predicts Up for every test observation, so its accuracy of 0.587 is lower than that of the second logistic regression model and LDA.
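Because QDA predicts Up for every test week, its accuracy is simply the proportion of Up weeks in the test set, which can be verified directly:
mean(test$Direction == "Up")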
2g)
library(class)
xtrain <- as.matrix(train$Lag2)
xtest <- as.matrix(test$Lag2)
knn_pred <- knn(xtrain, xtest, train$Direction, k=1)
table(knn_pred, test$Direction)
##
## knn_pred Down Up
## Down 21 30
## Up 22 31
sum(diag(table(knn_pred, test$Direction)))/sum(table(knn_pred, test$Direction))
## [1] 0.5
2h) Logistic regression and LDA produced the best results (0.625 accuracy), with QDA somewhat lower at 0.587 and KNN with k = 1 no better than flipping a coin at 0.500.
2i)
#k = 5
knn_pred <- knn(xtrain, xtest, train$Direction, k=5)
table(knn_pred, test$Direction)
##
## knn_pred Down Up
## Down 15 22
## Up 28 39
sum(diag(table(knn_pred, test$Direction)))/sum(table(knn_pred, test$Direction))
## [1] 0.519
#k = 10
knn_pred <- knn(xtrain, xtest, train$Direction, k=10)
table(knn_pred, test$Direction)
##
## knn_pred Down Up
## Down 17 20
## Up 26 41
sum(diag(table(knn_pred, test$Direction)))/sum(table(knn_pred, test$Direction))
## [1] 0.558
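Rather than rerunning knn() by hand for each value of k, a compact sketch that scans several values (reusing xtrain and xtest from part g; the grid of k values here is arbitrary, and a seed is set because knn() breaks distance ties at random) is:
set.seed(1)
k_values <- c(1, 3, 5, 10, 20, 50)
acc <- sapply(k_values, function(k) {
  mean(knn(xtrain, xtest, train$Direction, k = k) == test$Direction)
})
data.frame(k = k_values, accuracy = acc)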
#logit with lag1^2 and 2
logit3 <- glm(Direction~I(Lag1^2)+Lag2, data = train, family = "binomial")
summary(logit3)
##
## Call:
## glm(formula = Direction ~ I(Lag1^2) + Lag2, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.59 -1.26 1.01 1.09 1.34
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.18162 0.06862 2.65 0.0081 **
## I(Lag1^2) 0.00408 0.00458 0.89 0.3734
## Lag2 0.06509 0.02979 2.19 0.0289 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1349.7 on 982 degrees of freedom
## AIC: 1356
##
## Number of Fisher Scoring iterations: 4
log3_pred <- predict(logit3, test, type = "response")
log3.pred <- ifelse(log3_pred > .5, "Up", "Down")
pred.table3 <- table(log3.pred, test$Direction)
pred.table3
##
## log3.pred Down Up
## Down 8 2
## Up 35 59
sum(diag(pred.table3))/sum(pred.table3)
## [1] 0.644
The best performing model I found was the logistic regression with Lag1^2 and Lag2, which obtained 64.4% classification accuracy on the test set.