Chapter 04 (page 168): 10, 11, 13
library(GGally)
# pairwise graph
ggpairs(Weekly, upper = "blank", ggplot2::aes(colour = Year))
### Correlation Matrix
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
The correlation matrix shows that most pairs of variables are essentially uncorrelated. The only substantial correlation is between Year and Volume, with a value of 0.8419.
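As a quick programmatic check, one can list the variable pairs whose absolute correlation exceeds 0.5. This is a minimal sketch; the helper objects cors and high are introduced here only for illustration.
# (sketch) list the predictor pairs with absolute correlation above 0.5
cors = cor(Weekly[, -9])
cors[lower.tri(cors, diag = TRUE)] = NA   # keep each pair only once
high = which(abs(cors) > 0.5, arr.ind = TRUE)
data.frame(var1 = rownames(cors)[high[, 1]],
           var2 = colnames(cors)[high[, 2]],
           correlation = cors[high])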
# scatterplot
ggplot(data = Weekly, mapping = aes(x = Year, y = Volume)) +
geom_point(colour = "skyblue")
A closer look at the scatterplot confirms the positive association between Year and Volume: the Volume of shares traded (average number of daily shares traded, in billions) increases steadily between 1990 and 2010.
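To quantify the trend rather than relying on the plot alone, one could average Volume within each Year. A small sketch (output not shown):
# (sketch) average weekly trading Volume by Year, to quantify the upward trend
aggregate(Volume ~ Year, data = Weekly, FUN = mean)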
# logistic regression model
glm.fit_Weekly = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary (glm.fit_Weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The only predictor that appears to be statistically significant is Lag2, with a p-value below 0.05. So the percentage return from two weeks ago is significant in our logistic regression model.
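A quick way to confirm this from the fitted object is to extract the p-value column of the coefficient table. A sketch; pvals_Weekly is a helper name introduced here:
# (sketch) extract the p-values and keep those below 0.05
pvals_Weekly = coef(summary(glm.fit_Weekly))[, "Pr(>|z|)"]
pvals_Weekly[pvals_Weekly < 0.05]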
# predict the probabilities
glm.probs_Weekly = predict.glm(glm.fit_Weekly, Weekly, type = "response")
# replicate the word "Down" 1089 times
glm.pred_Weekly = rep("Down", 1089)
# if the probability is more than 0.5 then set the Direction to Up
glm.pred_Weekly[glm.probs_Weekly > 0.5] = "Up"
# confusion matrix
table(glm.pred_Weekly, Weekly$Direction)
##
## glm.pred_Weekly Down Up
## Down 54 48
## Up 430 557
# fraction of correct predictions: (correct Down + correct Up) / total observations
(557 + 54) / 1089
## [1] 0.5610652
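The same fraction can be computed directly from the prediction and response vectors rather than hard-coding the counts. A one-line sketch:
# (sketch) equivalent accuracy computation without hard-coded counts
mean(glm.pred_Weekly == Weekly$Direction)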
The result shows that the logistic regression model correctly predicts the movement of the stock market 56.11% of the time, so the training error rate is 100% - 56.11% = 43.89%. Note also from the confusion matrix that the model predicts Up for the vast majority of weeks. The training error rate is usually overly optimistic: it tends to underestimate the test error rate. To obtain a more honest assessment, we can fit the logistic regression model on part of the data and evaluate it on the held-out weeks.
# logical vector: TRUE for the training years 1990 to 2008
train_Weekly = (Weekly$Year <= 2008)
test_Weekly = (!train_Weekly)
# held-out data for the years 2009 and 2010
Weekly.2008 = Weekly[test_Weekly , ]
Direction.2008 = Direction[test_Weekly]
# dimensions of the held-out subset Weekly.2008
dim(Weekly.2008)
## [1] 104 9
The output shows that the held-out set contains 104 observations, the weeks from 2009 and 2010. Next we fit a logistic regression model on the 1990 to 2008 training weeks with Lag2 as the only predictor.
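A quick sanity check of the split (sketch; output not shown):
# (sketch) confirm which calendar years fall in the held-out set
table(Weekly$Year[test_Weekly])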
# fit a logistic regression model on the subset train_Weekly with Lag2 as the predictor
glm.fit.subset_Weekly = glm(Direction ~ Lag2,
data = Weekly,
family = binomial,
subset = train_Weekly)
# predict the probabilities on subset Weekly.2008
glm.probs.subset_Weekly = predict (glm.fit.subset_Weekly, Weekly.2008, type = "response")
# replicate the word "Down" 104 times
glm.pred.subset_Weekly = rep("Down", 104)
# if the probability is more than 0.5 then set the Direction to Up
glm.pred.subset_Weekly[glm.probs.subset_Weekly > 0.5] = "Up"
# confusion matrix
table(glm.pred.subset_Weekly, Direction.2008)
## Direction.2008
## glm.pred.subset_Weekly Down Up
## Down 9 5
## Up 34 56
# overall fraction of correct predictions: (correct Down + correct Up) / total observations
(9 + 56) / 104
## [1] 0.625
The new model with only Lag2 as the predictor correctly predicts the movement of the stock market 62.5% of the time on the held-out 2009-2010 weeks, which is better than the 56.11% the previous model achieved on its own training data.
# fit an LDA model using observations from 1990 to 2008
library(MASS)   # provides lda() and qda()
lda.fit_Weekly = lda(Direction ~ Lag2, data = Weekly, subset = train_Weekly)
lda.fit_Weekly
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train_Weekly)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
The output shows that 44.77% of the training observations correspond to weeks when the stock market went down and 55.23% to weeks when it went up.
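These priors are simply the class proportions among the training weeks, which can be verified directly. A one-line sketch:
# (sketch) class proportions in the training years match the LDA priors
prop.table(table(Weekly$Direction[train_Weekly]))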
# prediction
lda.pred_Weekly = predict(lda.fit_Weekly, Weekly.2008)
lda.class = lda.pred_Weekly$class
# confusion matrix
table(lda.class , Direction.2008)
## Direction.2008
## lda.class Down Up
## Down 9 5
## Up 34 56
# correct predictions rate
mean(lda.class == Direction.2008)
## [1] 0.625
# applying 50% threshold
sum(lda.pred_Weekly$posterior[, 1] >= .5)
## [1] 14
sum(lda.pred_Weekly$posterior[, 1] < .5)
## [1] 90
The linear discriminant analysis (LDA) model with Lag2 as the only predictor correctly predicts the movement of the stock market 62.5% of the time on the test years. Its confusion matrix is identical to that of the logistic regression model above, so the two methods give the same accuracy.
Applying the 50% threshold to the posterior probabilities shows that only 14 of the 104 test weeks have a posterior probability of Down of at least 0.5; for the remaining 90 weeks the classifier predicts Up, matching the row totals of the confusion matrix above (a quick check follows).
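A short sketch checking that the 50% posterior threshold reproduces the class predictions; both lines should return 14:
# (sketch) weeks with P(Down) >= 0.5 are exactly the weeks classified as Down
sum(lda.pred_Weekly$posterior[, "Down"] >= 0.5)
sum(lda.class == "Down")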
qda.fit = qda(Direction ~ Lag2 , data = Weekly , subset = train_Weekly)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train_Weekly)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class_Weekly = predict(qda.fit , Weekly.2008)$class
# confusion matrix
table(qda.class_Weekly , Direction.2008)
## Direction.2008
## qda.class_Weekly Down Up
## Down 0 0
## Up 43 61
# correct predictions rate
mean(qda.class_Weekly == Direction.2008)
## [1] 0.5865385
The output shows that quadratic discriminant analysis is accurate 58.65% of the time on the test years, but only because it predicts Up for every single test week; it never predicts Down.
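A one-line check of this (sketch; output not shown):
# (sketch) QDA never predicts Down on the test years
table(qda.class_Weekly)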
library(class)
# matrix with predictors of the training data
train_Weekly.X = as.matrix(cbind(Weekly$Lag2)[train_Weekly, ])
# matrix with predictors of the data for which we wish to make predictions
test_Weekly.X = as.matrix(cbind(Weekly$Lag2)[test_Weekly, ])
train_Weekly.Direction = Direction[train_Weekly]
#dimensions of training data set
dim(train_Weekly.X)
## [1] 985 1
set.seed(1)
knn.pred_Weekly = knn(train_Weekly.X, test_Weekly.X, train_Weekly.Direction , k = 1)
# confusion matrix
table(knn.pred_Weekly, Direction.2008)
## Direction.2008
## knn.pred_Weekly Down Up
## Down 21 30
## Up 22 31
#test error rate
mean(knn.pred_Weekly != Direction.2008)
## [1] 0.5
The result shows that KNN with K = 1 correctly predicts the direction of the market only 50% of the time, which is no better than random guessing.
### (h) Which of these methods appears to provide the best results on this data?
Logistic regression and LDA make identical predictions on the held-out years and share the highest accuracy, 62.5%, so they perform best on this data; a short summary follows. QDA is third at 58.65%, and KNN with K = 1 is the worst of the four at 50%.
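A short sketch collecting the four test accuracies in one place, reusing the prediction vectors computed above:
# (sketch) side-by-side test accuracies of the four classifiers
data.frame(method = c("Logistic (Lag2)", "LDA", "QDA", "KNN, K = 1"),
           accuracy = c(mean(glm.pred.subset_Weekly == Direction.2008),
                        mean(lda.class == Direction.2008),
                        mean(qda.class_Weekly == Direction.2008),
                        mean(knn.pred_Weekly == Direction.2008)))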
# QDA fit
qda.fit_weekly2 = qda(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5 , data = Weekly , subset = train_Weekly)
qda.class_Weekly2 = predict(qda.fit_weekly2 , Weekly.2008)$class
# confusion matrix
table(qda.class_Weekly2 , Direction.2008)
## Direction.2008
## qda.class_Weekly2 Down Up
## Down 10 23
## Up 33 38
# correct predictions rate
mean(qda.class_Weekly2 == Direction.2008)
## [1] 0.4615385
# LDA fit
lda.fit_Weekly2 = lda(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5 , data = Weekly , subset = train_Weekly)
# prediction
lda.pred_Weekly2 = predict(lda.fit_Weekly2, Weekly.2008)
lda.class2 = lda.pred_Weekly2$class
# confusion matrix
table(lda.class2 , Direction.2008)
## Direction.2008
## lda.class2 Down Up
## Down 9 13
## Up 34 48
# correct predictions rate
mean(lda.class2 == Direction.2008)
## [1] 0.5480769
# KNN fit with K = 10
predictors.set = cbind(Lag1, Lag2, Lag3, Lag4, Lag5)
# matrix with predictors of the training data
train_Weekly2.X = as.matrix(predictors.set)[train_Weekly, ]
# matrix with predictors of the data for which we wish to make predictions
test_Weekly2.X = as.matrix(predictors.set)[test_Weekly, ]
knn.pred_Weekly2 = knn(train_Weekly2.X, test_Weekly2.X, train_Weekly.Direction , k = 10)
# confusion matrix
table(knn.pred_Weekly2, Direction.2008)
## Direction.2008
## knn.pred_Weekly2 Down Up
## Down 15 20
## Up 28 41
#test error rate
mean(knn.pred_Weekly2 != Direction.2008)
## [1] 0.4615385
With Lag1 through Lag5 as predictors, QDA predicts correctly 46.15% of the time, LDA 54.81% of the time, and KNN with K = 10 53.85% of the time (its test error rate is 46.15%). All three are worse than the 62.5% obtained using Lag2 alone, which indicates that adding the other lag variables hurts test accuracy.
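For reference, the three accuracies can be collected with a short sketch, reusing the objects fitted above:
# (sketch) test accuracies with Lag1 through Lag5 as predictors
c(QDA = mean(qda.class_Weekly2 == Direction.2008),
  LDA = mean(lda.class2 == Direction.2008),
  KNN_10 = mean(knn.pred_Weekly2 == Direction.2008))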
# Binary variable mpg01
# 1 = above median, 0 = below median
mpg01 <- ifelse(mpg > median(mpg), yes = 1, no = 0)
# data set containing mpg01 and other Auto variables
Auto <- data.frame(Auto[1:9], mpg01)
cor(Auto[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
# pairwise graphs
ggpairs(Auto[, -9], upper = "blank", ggplot2::aes(colour = mpg01))
ggpairs(Auto[, -c(1, 2, 7, 8, 9)], upper = "blank", ggplot2::aes(colour = mpg01))
The predictors displacement, horsepower, weight, and acceleration seem most likely to be useful in predicting mpg01 (a quick check of the correlations with mpg01 follows).
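As a complementary check, one might rank the quantitative variables by the absolute value of their correlation with mpg01. A minimal sketch (output not shown):
# (sketch) rank the quantitative variables by |correlation| with mpg01
sort(abs(cor(Auto[, -9])[, "mpg01"]), decreasing = TRUE)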
# split the data: training set = even model years, test set = odd model years
set.seed(1)
train_Auto = (Auto$year %% 2 == 0)
test_Auto = (!train_Auto)
# Testing set
Auto.78 = Auto[test_Auto, ]
mpg01.78 = Auto$mpg01[test_Auto]
# dimensions of subset Auto.78
dim(Auto.78)
## [1] 182 10
lda.fit_Auto = lda(mpg01 ~ displacement + horsepower+ weight + acceleration, data = Auto, subset = train_Auto)
# prediction
lda.pred_Auto = predict(lda.fit_Auto, Auto.78)
lda.class_Auto = lda.pred_Auto$class
# confusion matrix
table(lda.class_Auto , mpg01.78)
## mpg01.78
## lda.class_Auto 0 1
## 0 83 8
## 1 17 74
#test error rate
mean(lda.class_Auto != mpg01.78)
## [1] 0.1373626
The test error of the model is 13.74%.
qda.fit_Auto=qda(mpg01 ~ displacement + horsepower+ weight + acceleration,data = Auto, subset = train_Auto)
qda.fit_Auto
## Call:
## qda(mpg01 ~ displacement + horsepower + weight + acceleration,
## data = Auto, subset = train_Auto)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## displacement horsepower weight acceleration
## 0 271.7396 133.14583 3604.823 14.47500
## 1 111.6623 77.92105 2314.763 16.62895
qda.class_Auto = predict(qda.fit_Auto, Auto.78)$class
# confusion matrix
table(qda.class_Auto, mpg01.78)
## mpg01.78
## qda.class_Auto 0 1
## 0 85 9
## 1 15 73
#test error rate
mean(qda.class_Auto != mpg01.78)
## [1] 0.1318681
The test error of the model is 13.19%.
# logistic regression model
glm.fit_Auto = glm(mpg01 ~ displacement + horsepower+ weight + acceleration, data = Auto, subset = train_Auto, family = binomial)
summary(glm.fit_Auto)
##
## Call:
## glm(formula = mpg01 ~ displacement + horsepower + weight + acceleration,
## family = binomial, data = Auto, subset = train_Auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.58551 -0.03929 0.08284 0.27080 2.04949
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.961548 4.717535 4.019 5.84e-05 ***
## displacement -0.021290 0.009638 -2.209 0.0272 *
## horsepower -0.075496 0.034087 -2.215 0.0268 *
## weight -0.001710 0.001463 -1.169 0.2425
## acceleration -0.211668 0.209091 -1.012 0.3114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.577 on 209 degrees of freedom
## Residual deviance: 84.914 on 205 degrees of freedom
## AIC: 94.914
##
## Number of Fisher Scoring iterations: 8
The only predictors that appear to be statistically significant are displacement and horsepower, each with a p-value below 0.05.
# predict the probabilities (note: on the full Auto data set, not only the test set)
glm.probs_Auto = predict.glm(glm.fit_Auto, Auto, type = "response")
# replicate the word "0" 392 times
glm.pred_Auto = rep("0", 392)
# if the probability is more than 0.5 then set mpg01 to 1
glm.pred_Auto[glm.probs_Auto > 0.5] = "1"
# confusion matrix
table(glm.pred_Auto, mpg01)
## mpg01
## glm.pred_Auto 0 1
## 0 173 18
## 1 23 178
# error rate on the full data set (all 392 observations, training and test combined)
mean(glm.pred_Auto != mpg01)
## [1] 0.1045918
The error rate of the logistic regression model on the full data set is 10.46%. Note that the predictions above were made on all 392 observations rather than only the held-out odd-year test set, so this is not a true test error; a sketch of the test-set-only computation follows.
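A minimal sketch of how the test error would be computed on Auto.78 alone, using the model fitted above. The helper names glm.probs_Auto.test and glm.pred_Auto.test are introduced here, and the resulting value is not reported in the original.
# (sketch) error rate on the held-out odd-year test set only
glm.probs_Auto.test = predict(glm.fit_Auto, Auto.78, type = "response")
glm.pred_Auto.test = ifelse(glm.probs_Auto.test > 0.5, 1, 0)
mean(glm.pred_Auto.test != mpg01.78)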
library(class)
train_Auto.X = cbind(horsepower, weight)[train_Auto , ]
test_Auto.X = cbind(horsepower, weight)[!train_Auto , ]
train_Auto.mpg01 = Auto$mpg01 [train_Auto]
set.seed(1)
knn.pred_Auto = knn(train_Auto.X, test_Auto.X, train_Auto.mpg01 , k = 30)
# confusion matrix
table(knn.pred_Auto, mpg01.78)
## mpg01.78
## knn.pred_Auto 0 1
## 0 83 8
## 1 17 74
#test error rate
mean(knn.pred_Auto != mpg01.78)
## [1] 0.1373626
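Before settling on K = 30, one would typically compare several values of K on the same training/test split. A minimal sketch of such a comparison, reusing the matrices defined above; the specific K values are illustrative and the results are not shown in the original.
# (sketch) test error of KNN for several values of K
k.values = c(1, 5, 10, 30, 50, 100)
sapply(k.values, function(k) {
  set.seed(1)
  mean(knn(train_Auto.X, test_Auto.X, train_Auto.mpg01, k = k) != mpg01.78)
})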
Among the values of K compared, K = 30 performs best on this split; the test error of the model is 13.74%.
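The Boston analysis that follows uses a binary response crim01, but its construction does not appear in this excerpt. Presumably it is built analogously to mpg01; a hedged sketch of that missing step:
# Binary variable crim01: 1 = crime rate above the median, 0 = below
# (assumed construction; this step is not shown in the original excerpt)
crim01 <- ifelse(Boston$crim > median(Boston$crim), yes = 1, no = 0)
Boston <- data.frame(Boston, crim01)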
ggpairs(Boston, upper = "blank", ggplot2::aes(colour = crim01))
ggpairs(Boston[, c(3, 5, 9, 10, 14)], upper = "blank", ggplot2::aes(colour = crim01))
library(caret)
## Loading required package: lattice
# split data
inTrain = createDataPartition(Boston$crim, p=0.75, list = FALSE)
train_Boston = Boston[inTrain,]
test_Boston = Boston[-inTrain,]
# logistic regression model
glm.fit_Boston = glm(crim01 ~ nox + rad + tax, data = train_Boston, family = binomial)
summary(glm.fit_Boston)
##
## Call:
## glm(formula = crim01 ~ nox + rad + tax, family = binomial, data = train_Boston)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.95592 -0.38968 -0.01501 0.00634 2.55026
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.561717 2.532405 -7.725 1.12e-14 ***
## nox 34.344256 4.685388 7.330 2.30e-13 ***
## rad 0.673422 0.126528 5.322 1.02e-07 ***
## tax -0.007205 0.002240 -3.217 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 526.79 on 379 degrees of freedom
## Residual deviance: 203.66 on 376 degrees of freedom
## AIC: 211.66
##
## Number of Fisher Scoring iterations: 8
The significant variables are nitrogen oxides concentration (nox), index of accessibility to radial highways (rad), and full-value property-tax rate per $10,000 (tax).
### Confusion Matrix and Test Error
# predict the probabilities
glm.probs_Boston = predict.glm(glm.fit_Boston, test_Boston, type = "response")
# replicate the word "0" 506 times
glm.pred_Boston = ifelse(glm.probs_Boston>median(glm.probs_Boston), "1", "0")
# confusion matrix
table(glm.pred_Boston, test_Boston$crim01)
##
## glm.pred_Boston 0 1
## 0 56 7
## 1 7 56
# correct predictions rate
mean(glm.pred_Boston == test_Boston$crim01)
## [1] 0.8888889
# test error rate
mean(glm.pred_Boston != test_Boston$crim01)
## [1] 0.1111111
The test error of the logistic regression model on the held-out observations is 11.11% (accuracy 88.89%).
lda.fit_Boston = lda(crim01 ~ nox + rad + tax, data = train_Boston)
lda.fit_Boston
## Call:
## lda(crim01 ~ nox + rad + tax, data = train_Boston)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## nox rad tax
## 0 0.4742453 4.168421 307.0211
## 1 0.6309421 14.442105 500.6895
##
## Coefficients of linear discriminants:
## LD1
## nox 9.845777061
## rad 0.091745363
## tax -0.001577726
lda.pred_Boston = predict(lda.fit_Boston, test_Boston)
lda.class_Boston = lda.pred_Boston$class
# confusion matrix
table(lda.class_Boston, test_Boston$crim01)
##
## lda.class_Boston 0 1
## 0 61 11
## 1 2 52
# correct predictions rate
mean(lda.class_Boston == test_Boston$crim01)
## [1] 0.8968254
#test error rate
mean(lda.class_Boston != test_Boston$crim01)
## [1] 0.1031746
library(class)
train_Boston.X = as.matrix(train_Boston[, c(4, 8,9)])
test_Boston.X = as.matrix(test_Boston[, c(4, 8,9)])
set.seed(1)
knn.pred_Boston = knn(train_Boston.X, test_Boston.X, train_Boston$crim01 , k = 1)
# confusion matrix
table(knn.pred_Boston, test_Boston$crim01)
##
## knn.pred_Boston 0 1
## 0 61 1
## 1 2 62
# correct predictions rate
mean(knn.pred_Boston == test_Boston$crim01)
## [1] 0.9761905
#test error rate
mean(knn.pred_Boston != test_Boston$crim01)
## [1] 0.02380952
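For a side-by-side view, the three Boston test error rates computed above can be collected in one place. A short sketch reusing the objects already created:
# (sketch) test error rates of the three Boston classifiers
c(logistic = mean(glm.pred_Boston != test_Boston$crim01),
  lda = mean(lda.class_Boston != test_Boston$crim01),
  knn_1 = mean(knn.pred_Boston != test_Boston$crim01))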