pairs(Weekly)
cor(Weekly[, -9])  # drop column 9 (Direction), which is qualitative
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
plot(Volume)
plot(log(Volume))
There appears to be a strong positive correlation between Volume and Year (r ≈ 0.84). Volume is heavily right-skewed, so it could benefit from a transformation before being used as a predictor; taking the log of Volume gives a much more evenly spread variable to work with.
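A quick side-by-side view makes the effect of the transformation easy to see (a minimal sketch, assuming Weekly is attached as in the rest of this analysis):
par(mfrow = c(1, 2))
plot(Volume, main = 'Raw Volume')        # heavily right-skewed, fans out over time
plot(log(Volume), main = 'log(Volume)')  # much closer to a steady trend
par(mfrow = c(1, 1))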
logist.full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(logist.full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Of the predictors, only Lag2 is statistically significant, and only moderately so (p ≈ 0.03).
full.probs <- predict(logist.full, type = 'response')
full.pred <- rep('Down', nrow(Weekly))  # 1089 weeks
full.pred[full.probs > 0.5] <- 'Up'
table(full.pred, Direction)
## Direction
## full.pred Down Up
## Down 54 48
## Up 430 557
mean(full.pred == Direction)
## [1] 0.5610652
The percentage of correct predictions (on the same data used to fit the model) is approximately 56.1%.
The majority of the mistakes involve predicting that the market will go up when it actually goes down: 430 of the 478 errors.
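Turning the confusion matrix above into rates makes this concrete (plain arithmetic on the counts in the table):
557 / (557 + 48)  # true 'Up' weeks called correctly ~ 92% of the time
54 / (54 + 430)   # true 'Down' weeks called correctly only ~ 11% of the time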
training <- (Year <= 2008)              # training period: 1990-2008
Weekly.Held <- Weekly[!training, ]      # held-out weeks: 2009-2010
Direction.Held <- Direction[!training]
log.train.fit <- glm(Direction ~ Lag2, data = Weekly, subset = training, family = 'binomial')
log.train.prob <- predict(log.train.fit, Weekly.Held, type = 'response')
log.train.pred <- rep('Down', nrow(Weekly.Held))  # 104 held-out weeks
log.train.pred[log.train.prob > 0.5] <- 'Up'
table(log.train.pred, Direction.Held)
## Direction.Held
## log.train.pred Down Up
## Down 9 5
## Up 34 56
mean(log.train.pred == Direction.Held)
## [1] 0.625
Refitting the model on 1990-2008 with only the significant predictor, Lag2, and holding out the last two years (2009-2010) as test data gives a model that is correct 62.5% of the time on the held-out weeks.
Most of its errors again come from predicting that the market will go up when it actually goes down (34 of the 39 errors).
lda.train.fit <- lda(Direction ~ Lag2, data = Weekly, subset = training)
plot(lda.train.fit)
lda.train.pred <- predict(lda.train.fit, Weekly.Held)
names(lda.train.pred)
## [1] "class" "posterior" "x"
lda.train.class <- lda.train.pred$class
table(lda.train.class, Direction.Held)
## Direction.Held
## lda.train.class Down Up
## Down 9 5
## Up 34 56
mean(lda.train.class == Direction.Held)
## [1] 0.625
Using LDA we get exactly the same confusion matrix as with logistic regression; the model is again accurate 62.5% of the time.
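We can also check directly that the two classifiers agree week by week, not just in aggregate (a one-line sketch using the objects created above):
mean(lda.train.class == log.train.pred)  # expected to be 1: the two models make identical calls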
qda.train.fit <- qda(Direction ~ Lag2, data = Weekly, subset = training)
qda.train.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = training)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.train.class <- predict(qda.train.fit, Weekly.Held)$class
table(qda.train.class, Direction.Held)
## Direction.Held
## qda.train.class Down Up
## Down 0 0
## Up 43 61
mean(qda.train.class == Direction.Held)
## [1] 0.5865385
Using quadratic discriminant analysis, accuracy decreases to 58.65%.
This model predicts that the market will go up 100% of the time, so it is not really discriminating at all on this data set.
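Because QDA never predicts 'Down' here, its accuracy is simply the share of 'Up' weeks in the test period:
mean(Direction.Held == 'Up')  # 61/104 = 0.5865, matching the QDA accuracy above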
# knn() takes matrix inputs, so build one-column matrices of the Lag2 predictor
training.Weekly <- as.matrix(Lag2[training])
test.Weekly <- as.matrix(Lag2[!training])
training.Direction <- Direction[training]
set.seed(1)
knn.pred <- knn(training.Weekly, test.Weekly, training.Direction, k = 1)
table(knn.pred, Direction.Held)
## Direction.Held
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Direction.Held)
## [1] 0.5
Using kNN with k = 1 gives a model that is correct only 50% of the time.
While this is less accurate than the other models, its errors are more evenly balanced between the two directions.
Both logistic regression and linear discriminant analysis give the highest test accuracy, at 62.5%.
All of the models differ in the kinds of errors they make, though. Deciding which model is "best" may depend on which direction of error is preferable: wrongly predicting that the market will go up when it actually goes down could lose a considerable amount of money, so depending on who the predictions are for, a slightly less accurate model whose errors lean in the less costly direction might be preferable.
In truth, the low accuracy of all of these models suggests that previous weeks' returns are not a reliable basis for predicting whether the market will move up or down.
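One way to quantify the costly error (calling 'Up' when the market actually goes down), using the counts from the confusion matrices above:
# share of the 43 true 'Down' test weeks that each model called 'Up'
c(logistic = 34/43, lda = 34/43, qda = 43/43, knn1 = 22/43)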
#k=2
set.seed(1)
knn.pred <- knn(training.Weekly, test.Weekly, training.Direction, k = 2)
table(knn.pred, Direction.Held)
## Direction.Held
## knn.pred Down Up
## Down 19 27
## Up 24 34
mean(knn.pred == Direction.Held)
## [1] 0.5096154
#k=3
set.seed(1)
knn.pred <- knn(training.Weekly, test.Weekly, training.Direction, k = 3)
table(knn.pred, Direction.Held)
## Direction.Held
## knn.pred Down Up
## Down 16 20
## Up 27 41
mean(knn.pred == Direction.Held)
## [1] 0.5480769
#k=4
set.seed(1)
knn.pred <- knn(training.Weekly, test.Weekly, training.Direction, k = 4)
table(knn.pred, Direction.Held)
## Direction.Held
## knn.pred Down Up
## Down 20 17
## Up 23 44
mean(knn.pred == Direction.Held)
## [1] 0.6153846
#k=5
set.seed(1)
knn.pred <- knn(training.Weekly, test.Weekly, training.Direction, k = 5)
table(knn.pred, Direction.Held)
## Direction.Held
## knn.pred Down Up
## Down 16 21
## Up 27 40
mean(knn.pred == Direction.Held)
## [1] 0.5384615
Increasing k in the kNN model, we find that k = 4 gives the best accuracy, with predictions correct 61.54% of the time; at k = 5 the accuracy starts to decrease again.
Since k = 4 gives almost the same accuracy as the logistic regression and LDA models while keeping a better balance in the direction of its errors, kNN with k = 4 may be the best method for modeling this particular data set.
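The same search can be written as a compact loop (a sketch; the seed is reset before each fit so the results match the individual runs above):
for (k in 1:5) {
  set.seed(1)
  pred <- knn(training.Weekly, test.Weekly, training.Direction, k = k)
  cat('k =', k, 'accuracy =', round(mean(pred == Direction.Held), 4), '\n')
}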
logist.exp <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + log(Volume), data = Weekly, family = binomial)
summary(logist.exp)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## log(Volume), family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6922 -1.2600 0.9928 1.0847 1.4665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22562 0.06224 3.625 0.000289 ***
## Lag1 -0.04127 0.02637 -1.565 0.117578
## Lag2 0.05834 0.02679 2.178 0.029433 *
## Lag3 -0.01607 0.02663 -0.603 0.546213
## Lag4 -0.02790 0.02643 -1.055 0.291218
## Lag5 -0.01457 0.02636 -0.553 0.580433
## log(Volume) -0.05133 0.05607 -0.915 0.359988
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1485.9 on 1082 degrees of freedom
## AIC: 1499.9
##
## Number of Fisher Scoring iterations: 4
exp.probs <- predict(logist.exp, type = 'response')
exp.pred <- rep('Down', nrow(Weekly))
exp.pred[exp.probs > 0.5] <- 'Up'  # threshold this model's own probabilities
table(exp.pred, Direction)
## Direction
## exp.pred Down Up
## Down 54 48
## Up 430 557
mean(exp.pred == Direction)
## [1] 0.5610652
As noted in part (a), Volume looks like it would benefit from a log transformation. Testing this transformation on the overall logistic model (fit to all of the data), it does reduce the p-value for Volume (0.36 versus 0.54), but not nearly enough to make it a significant variable, and the fitted class predictions are essentially unchanged.
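The AIC values reported in the two summaries tell the same story, with the log transformation improving the fit only marginally:
c(raw = AIC(logist.full), logged = AIC(logist.exp))  # 1500.4 vs 1499.9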
lda.exp.fit <- lda(Direction ~ Lag2 + Lag3, data = Weekly, subset = training)
lda.exp.pred <- predict(lda.exp.fit, Weekly.Held)
names(lda.exp.pred)
## [1] "class" "posterior" "x"
lda.exp.class <- lda.exp.pred$class
table(lda.exp.class, Direction.Held)
## Direction.Held
## lda.exp.class Down Up
## Down 8 4
## Up 35 57
mean(lda.exp.class == Direction.Held)
## [1] 0.625
While additional variables do not seem to improve the model, it is interesting that adding Lag3 shifts a couple of individual predictions (the confusion matrix changes slightly) yet leaves the overall accuracy unchanged at 62.5%.
summary(mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
# median = 22.75
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto2 <- data.frame(Auto,mpg01)
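A quick sanity check that the median split produces balanced classes:
table(mpg01)  # splitting at the median gives a near-even split (196 per class here, barring ties)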
par(mfrow = c(2,3))
boxplot(cylinders ~ mpg01, main = 'Cyl/mpg01', col = c('slateblue2','darkslategray3'))
boxplot(displacement ~ mpg01, main = 'Disp/mpg01', col = c('hotpink3','darkslategray3'))
boxplot(horsepower ~ mpg01, main = 'HP/mpg01', col = c('gold1','darkslategray3'))
boxplot(weight ~ mpg01, main = 'Wt/mpg01', col = c('palegreen1','darkslategray3'))
boxplot(acceleration ~ mpg01, main = 'Acc/mpg01', col = c('thistle4','darkslategray3'))
boxplot(year ~ mpg01, main = 'Year/mpg01', col = c('mistyrose','darkslategray3'))
All six of the plotted variables show visible separation between the two mpg01 groups and could be useful for predicting high versus low gas mileage (mpg itself is excluded, since mpg01 is derived from it); cylinders, displacement, horsepower, and weight separate the classes most clearly.
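The correlations with mpg01 back up the boxplots (column 9 is the non-numeric name variable, so it is dropped):
round(cor(Auto2[, -9])['mpg01', ], 2)  # cylinders, displacement, horsepower, and weight correlate strongly and negatively with mpg01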
set.seed(21)
set.training <- sample(nrow(Auto2), nrow(Auto2) * 0.7)  # 70/30 train/test split
training.auto <- Auto2[set.training, ]
test.auto <- Auto2[-set.training, ]
dim(training.auto)
## [1] 274 10
dim(test.auto)
## [1] 118 10
lda.Auto.fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = training.auto)
lda.Auto.pred <- predict(lda.Auto.fit, test.auto)
table(lda.Auto.pred$class, test.auto$mpg01, dnn = c('Predicted MPG Cat', "Actual MPG Cat"))
## Actual MPG Cat
## Predicted MPG Cat 0 1
## 0 52 1
## 1 14 51
mean(lda.Auto.pred$class != test.auto$mpg01)
## [1] 0.1271186
The test error rate for the LDA model is 12.71%.
qda.Auto.fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = training.auto)
qda.Auto.pred <- predict(qda.Auto.fit, test.auto)
table(qda.Auto.pred$class, test.auto$mpg01, dnn = c('Predicted MPG Cat', "Actual MPG Cat"))
## Actual MPG Cat
## Predicted MPG Cat 0 1
## 0 55 3
## 1 11 49
mean(qda.Auto.pred$class != test.auto$mpg01)
## [1] 0.1186441
The test error rate for the QDA model is 11.86%.
logi.Auto.fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = training.auto, family = 'binomial')
#summary(logi.Auto.fit)
logi.Auto.pred <- predict(logi.Auto.fit, test.auto, type = 'response')
logi.Auto.class <- rep(0, nrow(test.auto))
logi.Auto.class[logi.Auto.pred > 0.5] <- 1
table(logi.Auto.class, test.auto$mpg01,dnn = c('Predicted MPG Cat', "Actual MPG Cat"))
## Actual MPG Cat
## Predicted MPG Cat 0 1
## 0 56 6
## 1 10 46
mean(logi.Auto.class != test.auto$mpg01)
## [1] 0.1355932
The test error rate for logistic regression is 13.56%.
# Keep only the predictor columns (2:7 = cylinders through year); mpg01 is the
# response and must stay out of the kNN feature matrix
training.auto.knn <- training.auto[, 2:7]
test.auto.knn <- test.auto[, 2:7]
training.mpg.knn <- training.auto$mpg01
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=1)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=1 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=1 Predicted 0 1
## 0 53 3
## 1 13 49
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1355932
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=2)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=2 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=2 Predicted 0 1
## 0 50 5
## 1 16 47
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1779661
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=3)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=3 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=3 Predicted 0 1
## 0 50 3
## 1 16 49
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1610169
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=4)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=4 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=4 Predicted 0 1
## 0 51 1
## 1 15 51
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1355932
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=5)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=5 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=5 Predicted 0 1
## 0 50 1
## 1 16 51
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1440678
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=6)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=6 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=6 Predicted 0 1
## 0 52 1
## 1 14 51
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1271186
set.seed(1)
knn.auto.pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k=7)
table(knn.auto.pred, test.auto$mpg01, dnn = c('k=7 Predicted', "Actual MPG Cat"))
## Actual MPG Cat
## k=7 Predicted 0 1
## 0 51 2
## 1 15 50
mean(knn.auto.pred != test.auto$mpg01)
## [1] 0.1440678
k = 6 gives the lowest kNN test error rate, 12.71%, matching the LDA model above.
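The same search over k, written as a loop (seed reset each time, as in the individual runs above):
for (k in 1:7) {
  set.seed(1)
  pred <- knn(training.auto.knn, test.auto.knn, training.mpg.knn, k = k)
  cat('k =', k, 'test error =', round(mean(pred != test.auto$mpg01), 4), '\n')
}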
crime01 <- rep(0, length(crim))
crime01[crim > median(crim)] <- 1
Boston2 <- data.frame(Boston, crime01)
set.seed(21)
set.training.Boston <- sample(nrow(Boston2), nrow(Boston2) * 0.7)  # 70/30 train/test split
training.Boston <- Boston2[set.training.Boston, ]
test.Boston <- Boston2[-set.training.Boston, ]
dim(training.Boston)
## [1] 354 15
dim(test.Boston)
## [1] 152 15
#Checking for significant variables
logi.Boston.exp <- glm(crime01 ~ . - crim, data = training.Boston, family = 'binomial')
summary(logi.Boston.exp)
##
## Call:
## glm(formula = crime01 ~ . - crim, family = "binomial", data = training.Boston)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1727 -0.1827 0.0000 0.0054 3.3116
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.371994 9.765189 -1.369 0.170889
## zn -0.081448 0.038285 -2.127 0.033384 *
## indus -0.112759 0.056635 -1.991 0.046483 *
## chas 1.117820 0.840720 1.330 0.183650
## nox 46.735010 8.910699 5.245 1.56e-07 ***
## rm -0.166443 0.937410 -0.178 0.859071
## age 0.016902 0.013801 1.225 0.220698
## dis 0.532352 0.257082 2.071 0.038382 *
## rad 0.610121 0.172841 3.530 0.000416 ***
## tax -0.005554 0.003552 -1.564 0.117917
## ptratio 0.399709 0.157735 2.534 0.011275 *
## black -0.063328 0.021067 -3.006 0.002646 **
## lstat 0.033245 0.056040 0.593 0.553025
## medv 0.127053 0.084904 1.496 0.134539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.02 on 353 degrees of freedom
## Residual deviance: 147.07 on 340 degrees of freedom
## AIC: 175.07
##
## Number of Fisher Scoring iterations: 9
Based on the summary above, nox and rad stand out as by far the most significant predictors, so the reduced model keeps only those two.
logi.Boston.fit <- glm(crime01 ~ nox + rad, data = training.Boston, family = 'binomial')
logi.Boston.pred <- predict(logi.Boston.fit, test.Boston, type = 'response')
logi.Boston.class <- rep(0, nrow(test.Boston))
logi.Boston.class[logi.Boston.pred > 0.5] <- 1
table(logi.Boston.class, test.Boston$crime01,dnn = c('Predicted Crime Cat', "Actual Crime Cat"))
## Actual Crime Cat
## Predicted Crime Cat 0 1
## 0 74 10
## 1 10 58
mean(logi.Boston.class == test.Boston$crime01)
## [1] 0.8684211
The logistic regression model has an accuracy rate of 86.84%.
lda.Boston.fit <- lda(crime01 ~ nox + rad, data = training.Boston)
lda.Boston.pred <- predict(lda.Boston.fit, test.Boston)
table(lda.Boston.pred$class, test.Boston$crime01, dnn = c('Predicted Crime Cat', "Actual Crime Cat"))
## Actual Crime Cat
## Predicted Crime Cat 0 1
## 0 81 13
## 1 3 55
mean(lda.Boston.pred$class == test.Boston$crime01)
## [1] 0.8947368
The LDA model has an accuracy rate of 89.47%.
# Keep only the predictors nox and rad (columns 5 and 9); crime01 is the
# response and must stay out of the kNN feature matrix
training.Boston.knn <- training.Boston[, c(5, 9)]
test.Boston.knn <- test.Boston[, c(5, 9)]
training.crime.knn <- training.Boston$crime01
set.seed(1)
knn.Boston.pred <- knn(training.Boston.knn, test.Boston.knn, training.crime.knn, k=1)
table(knn.Boston.pred, test.Boston$crime01, dnn = c('k=1 Predicted', "Actual Crime Cat"))
## Actual Crime Cat
## k=1 Predicted 0 1
## 0 84 0
## 1 0 68
mean(knn.Boston.pred == test.Boston$crime01)
## [1] 1
set.seed(1)
knn.Boston.pred <- knn(training.Boston.knn, test.Boston.knn, training.crime.knn, k=2)
table(knn.Boston.pred, test.Boston$crime01, dnn = c('k=2 Predicted', "Actual Crime Cat"))
## Actual Crime Cat
## k=2 Predicted 0 1
## 0 84 1
## 1 0 67
mean(knn.Boston.pred == test.Boston$crime01)
## [1] 0.9934211
set.seed(1)
knn.Boston.pred <- knn(training.Boston.knn, test.Boston.knn, training.crime.knn, k=3)
table(knn.Boston.pred, test.Boston$crime01, dnn = c('k=3 Predicted', "Actual Crime Cat"))
## Actual Crime Cat
## k=3 Predicted 0 1
## 0 84 1
## 1 0 67
mean(knn.Boston.pred == test.Boston$crime01)
## [1] 0.9934211
set.seed(1)
knn.Boston.pred <- knn(training.Boston.knn, test.Boston.knn, training.crime.knn, k=4)
table(knn.Boston.pred, test.Boston$crime01, dnn = c('k=4 Predicted', "Actual Crime Cat"))
## Actual Crime Cat
## k=4 Predicted 0 1
## 0 84 1
## 1 0 67
mean(knn.Boston.pred == test.Boston$crime01)
## [1] 0.9934211
With these predictors, kNN with k = 1 gives the best overall accuracy for the Boston data, classifying every test observation correctly; larger values of k introduce only a single error. Accuracy this close to perfect on held-out data is unusual, and suggests that nox and rad separate the high- and low-crime tracts in this sample almost completely.