# Packages used below: ISLR (data), MASS (lda/qda), caret, ROCR, tree,
# randomForest, gbm, ipred (bagging), dplyr (%>%), ggplot2
library(ISLR); library(MASS); library(caret); library(ROCR); library(tree)
library(randomForest); library(gbm); library(ipred); library(dplyr); library(ggplot2)
Weekly <- Weekly  # materialise the ISLR data set in the workspace
complete.cases(Weekly) %>% sum()
## [1] 1089
All 1,089 rows of the data are complete; there are no missing values.
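A per-column check (a quick sketch in base R) confirms the same thing:
colSums(is.na(Weekly))  # zero NAs expected in every column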
The next table shows the output of the full logistic regression:
glm(formula = Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Weekly,
family = binomial) %>% summary()
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
As we can see, only Lag2 and the intercept are significant at the 5% level.
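On the odds scale the Lag2 estimate is easier to read: exp(0.0584) is about 1.06, i.e., a one-unit increase in Lag2 raises the odds of an Up week by roughly 6%. A minimal sketch, refitting the same model under a hypothetical name:
full_logit <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
                  data = Weekly, family = binomial)
exp(coef(full_logit))  # odds ratios; Lag2 comes out around 1.06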
Next we split the sample into a training period (1990-2008) and a test period (2009-2010), and fit a reduced model with Lag2 as the only predictor:
test <- subset(Weekly, Year>2008)
train <- subset(Weekly, Year<=2008)
# Reduced model with Lag2 only; note that this fit uses the
# 104 test-period weeks, not the training years
logistic_reg <- glm(formula = Direction~Lag2, data = test,
                    family = binomial)
logistic_reg %>% summary()
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5767 -1.3052 0.9242 1.0419 1.2978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.32377 0.20136 1.608 0.108
## Lag2 0.08562 0.06707 1.277 0.202
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 141.04 on 103 degrees of freedom
## Residual deviance: 139.37 on 102 degrees of freedom
## AIC: 143.37
##
## Number of Fisher Scoring iterations: 4
Now neither Lag2 nor the intercept is significant. The reason is not the smaller set of control variables (with fewer predictors Lag2 would, if anything, look stronger); the fit above uses only the 104 test-period weeks, so the standard errors are far larger.
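For comparison, a minimal sketch of the same reduced model fit on the 1990-2008 training period instead (output omitted here):
glm(formula = Direction~Lag2, data = train,
    family = binomial) %>% summary()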
To check how the 0's and 1's are coded we use the contrasts() function:
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
Since Up is coded as 1, predict(..., type = "response") returns P(Direction = Up), which justifies the 0.5 cutoff used below. Next we produce confusion matrices for the logistic regression, LDA and QDA models, respectively:
pred_logit_test <- predict(logistic_reg, newdata = test,
type = "response")
ind <- rep("Down", nrow(test))
ind[pred_logit_test >= .5] = "Up"
# caret::confusionMatrix(data, reference); the observed classes are
# passed first here, so the rows of the table are the true Directions
confusion_matrix_log <- confusionMatrix(test$Direction,
                                        ind %>% as.factor())
lda_reg <- lda(formula = Direction~Lag2, data = test)
pred_lda_test <- predict(lda_reg, newdata = test)  # returns class + posterior
ind <- rep("Down", nrow(test))
ind[pred_lda_test$posterior[,2] >= .5] = "Up"
confusion_matrix_lda <- confusionMatrix(test$Direction,
ind %>% as.factor())
qda_reg <- qda(formula = Direction~Lag2, data = test)
pred_qda_test <- predict(qda_reg, newdata = test)  # returns class + posterior
ind_2 <- rep("Down", nrow(test))
ind_2[pred_qda_test$posterior[,2] >= .5] = "Up"
confusion_matrix_qda <- confusionMatrix(test$Direction,
ind_2 %>% as.factor())
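As a sanity check, the raw accuracies can also be computed directly from the predicted labels (a sketch; note that ind was overwritten above and now holds the LDA labels):
mean(ind == test$Direction)    # lda accuracy
mean(ind_2 == test$Direction)  # qda accuracy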
confusion_matrix_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 8 35
## Up 4 57
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.8846
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1348
##
## Mcnemar's Test P-Value : 1.556e-06
##
## Sensitivity : 0.66667
## Specificity : 0.61957
## Pos Pred Value : 0.18605
## Neg Pred Value : 0.93443
## Prevalence : 0.11538
## Detection Rate : 0.07692
## Detection Prevalence : 0.41346
## Balanced Accuracy : 0.64312
##
## 'Positive' Class : Down
##
confusion_matrix_qda
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 34
## Up 4 57
##
## Accuracy : 0.6346
## 95% CI : (0.5345, 0.7269)
## No Information Rate : 0.875
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1602
##
## Mcnemar's Test P-Value : 2.546e-06
##
## Sensitivity : 0.69231
## Specificity : 0.62637
## Pos Pred Value : 0.20930
## Neg Pred Value : 0.93443
## Prevalence : 0.12500
## Detection Rate : 0.08654
## Detection Prevalence : 0.41346
## Balanced Accuracy : 0.65934
##
## 'Positive' Class : Down
##
confusion_matrix_lda
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 8 35
## Up 4 57
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.8846
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1348
##
## Mcnemar's Test P-Value : 1.556e-06
##
## Sensitivity : 0.66667
## Specificity : 0.61957
## Pos Pred Value : 0.18605
## Neg Pred Value : 0.93443
## Prevalence : 0.11538
## Detection Rate : 0.07692
## Detection Prevalence : 0.41346
## Balanced Accuracy : 0.64312
##
## 'Positive' Class : Down
##
As we can see, all three classifiers perform poorly, not much better than random guessing. The logistic regression and the LDA also produce identical confusion matrices, as expected: with a single predictor both fit a linear decision boundary in Lag2. Next we show the results as ROC curves with the corresponding AUC values, and we also plot accuracy as a function of the cutoff:
pred_qda <- prediction(pred_qda_test$posterior[,2],test$Direction)
roc_qda <- performance(pred_qda, measure = "tpr",
x.measure = "fpr")
plot(roc_qda)
abline(a=0, b= 1)
performance(pred_qda, measure = "acc") %>% plot()
auc_qda <- performance(pred_qda, measure = "auc")
auc_qda@y.values
## [[1]]
## [1] 0.546321
pred_lda <- prediction(pred_lda_test$posterior[,2],test$Direction)
roc_lda <- performance(pred_lda, measure = "tpr",
x.measure = "fpr")
plot(roc_lda)
abline(a=0, b= 1)
performance(pred_lda, measure = "acc") %>% plot()
auc_lda <- performance(pred_lda, measure = "auc")
auc_lda@y.values
## [[1]]
## [1] 0.546321
pred_log <- prediction(pred_logit_test,test$Direction)
roc_log <- performance(pred_log, measure = "tpr",
x.measure = "fpr")
plot(roc_log)
abline(a=0, b= 1)
performance(pred_log, measure = "acc") %>% plot()
auc_log <- performance(pred_log, measure = "auc")
auc_log@y.values
## [[1]]
## [1] 0.546321
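The three AUC values can also be collected side by side from the same ROCR objects (a sketch):
sapply(list(logistic = pred_log, lda = pred_lda, qda = pred_qda),
       function(p) performance(p, measure = "auc")@y.values[[1]])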
By accuracy, QDA does marginally better than the logistic regression and LDA (0.635 vs. 0.625), while all three AUCs are essentially identical at about 0.546; none of the models predicts the test-period direction well.
To get a first impression of the data, let us look at the histogram of Sales:
Carseats <- Carseats  # materialise the ISLR data set
hist(Carseats$Sales)
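summary() gives the matching numerical picture (a quick sketch):
summary(Carseats$Sales)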
Next we grow a regression tree to predict Sales; we show both the summary and the tree itself (the labels are hard to read):
train <- sample(1:nrow(Carseats), 250)  # random split; results vary without set.seed()
tree.carseats <- tree(Sales~., data=Carseats, subset = train)
summary(tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Age" "Education"
## [6] "Advertising"
## Number of terminal nodes: 17
## Residual mean deviance: 2.809 = 654.5 / 233
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.25600 -0.91780 -0.09675 0.00000 1.11100 3.81200
plot(tree.carseats)
text(tree.carseats, pretty = 0)
# Predict Sales on the held-out rows; for a regression tree the default
# predict() output is the fitted value (type = "tree" would return a tree
# object, whose $where slot holds node indices, not predictions)
tree.pred <- predict(tree.carseats, Carseats[-train,])
MSE <- mean((Carseats$Sales[-train] - tree.pred)^2)
This gives the test MSE of the basic, unpruned tree.
Next we cross-validate the tree; the plot shows the deviance as a function of tree size:
cv.carseats <- cv.tree(tree.carseats)
plot(cv.carseats)
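The size minimising the cross-validated deviance can also be read off programmatically (a sketch; cv.tree returns matching size and dev vectors):
cv.carseats$size[which.min(cv.carseats$dev)]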
Now we can see the pruned tree:
prune.carseats <- prune.tree(tree.carseats, best = 12)
plot(prune.carseats)
text(prune.carseats, pretty=0)
tree.pred <- predict(prune.carseats, Carseats[-train,])
MSE.cv <- mean((Carseats$Sales[-train] - tree.pred)^2)
The pruned tree's test MSE can now be compared with the unpruned one; pruning to 12 terminal nodes gives a much simpler tree while typically changing the test error only slightly.
Next we will try to improve performance with a random forest, boosting and bagging.
First we look at the random forest summary and the importance of each variable:
random_forest <- randomForest(Sales~.,data = Carseats, subset = train)
random_forest
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 3.348719
## % Var explained: 59.93
importance(random_forest)
## IncNodePurity
## CompPrice 190.50593
## Income 169.34829
## Advertising 165.49109
## Population 136.24074
## Price 482.25998
## ShelveLoc 441.75233
## Age 249.50959
## Education 86.84339
## Urban 22.14311
## US 23.71668
random_forest %>% varImpPlot()
pred_rf <- predict(random_forest, Carseats[-train,])
test_err <- with(Carseats[-train,], mean( (Sales-pred_rf)^2 ))
The test MSE of the random forest is 2.37.
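The regression default is mtry = p/3, i.e., three of the ten predictors at each split, as the summary above shows. A minimal sketch of tuning mtry on the same split (assuming the train vector defined earlier):
mse_by_mtry <- sapply(1:10, function(m) {
  fit <- randomForest(Sales~., data = Carseats, subset = train, mtry = m)
  mean((Carseats$Sales[-train] - predict(fit, Carseats[-train,]))^2)
})
plot(1:10, mse_by_mtry, type = "b", xlab = "mtry", ylab = "Test MSE")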
Now we will implement boosting:
boost_cars <- gbm(Sales~., data = Carseats[train,],
distribution = "gaussian", n.trees = 10000,
shrinkage = 0.01, interaction.depth = 4)
summary(boost_cars)
## var rel.inf
## Price Price 27.5949690
## ShelveLoc ShelveLoc 21.6605986
## CompPrice CompPrice 14.0202369
## Age Age 11.4507513
## Income Income 7.7984456
## Advertising Advertising 7.6542877
## Population Population 5.7623563
## Education Education 2.8613128
## Urban Urban 0.8410468
## US US 0.3559951
Boosting requires more tuning decisions than a random forest (number of trees, shrinkage, interaction depth), which makes it the harder method to apply well.
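One way to reduce that burden is gbm's built-in cross-validation (a sketch; cv.folds and gbm.perf are part of the gbm package):
boost_cv <- gbm(Sales~., data = Carseats[train,],
                distribution = "gaussian", n.trees = 10000,
                shrinkage = 0.01, interaction.depth = 4, cv.folds = 5)
gbm.perf(boost_cv, method = "cv")  # returns the CV-optimal number of trees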
To see how the most important variables behave, we also draw partial dependence plots of Sales against each of them:
plot(boost_cars, i = "Price")
plot(boost_cars, i = "ShelveLoc")
plot(boost_cars, i = "Age")
Before turning to bagging, we evaluate the boosted model over a sequence of tree counts (100 to 10,000 by 100) and pick the best number of trees by test MSE:
n.trees <- seq(from = 100, to = 10000, by = 100)
predmat <- predict(boost_cars,
newdata = Carseats[-train,], n.trees = n.trees)
dim(predmat)
## [1] 150 100
boost_err <- with(Carseats[-train,], apply( (predmat - Sales)^2,
2, mean) )
plot(n.trees, boost_err, pch = 23,
ylab = "Mean Squared Error", xlab = "# Trees",
main = "Boosting Test Error")
abline(h = test_err, col = "red")  # random-forest test MSE as a reference line
As we can see, the boosting test error levels off at roughly 800 trees.
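The minimising tree count can also be read off directly (a sketch):
n.trees[which.min(boost_err)]
Finally, we implement bagging: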
bagging_cars <- bagging(Sales~.,data = Carseats,subset = train)
pred_bag <- predict(bagging_cars, Carseats[-train,])
test_err_bag <- with(Carseats[-train,], mean( (Sales-pred_bag)^2 ))
test_err_bag
## [1] 2.160169
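Bagging can equivalently be run as a random forest that considers all predictors at every split (a sketch using mtry = 10, the full predictor count):
bag_rf <- randomForest(Sales~., data = Carseats, subset = train,
                       mtry = ncol(Carseats) - 1)
mean((Carseats$Sales[-train] - predict(bag_rf, Carseats[-train,]))^2)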
As a closing chord, we plot the bagging prediction errors against the three most important variables to see where the predictions are most accurate:
plotting <- cbind(pred_bag,Carseats[-train,])
plotting %>% ggplot() +
geom_point(aes(x = Price, y = pred_bag - Sales, color = Age))+
facet_wrap(~ShelveLoc)
For all three variables the predictions are most accurate in the middle of the range.