library(ISLR)
library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dat <- Weekly
attach(dat)
str(dat)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
head(dat)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
dat[!complete.cases(dat),]
## [1] Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume
## [8] Today Direction
## <0 rows> (or 0-length row.names)
There are no missing values, and there is only one factor variable, Direction; all other variables are numeric. For the binomial and quasibinomial families the response (in our case Direction) can also be specified as a factor, but here I recode Direction as a 0/1 indicator:
dat$Direction <- ifelse(dat$Direction=="Up",1,0)
glm_model <- glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = dat, family = "binomial")
summary(glm_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only the Lag2 predictor is significant at the 5% level, and the intercept is significant at the 1% level.
First, I create the train and test sets and fit the new glm model:
train_dat <- subset(dat, Year<2009)
summary(train_dat$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1990 1994 1999 1999 2004 2008
test_dat <- subset(dat, Year>=2009)
summary(test_dat$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2009 2009 2010 2010 2010 2010
glm2 <- glm(formula = Direction ~ Lag2, data = train_dat, family = "binomial")
summary(glm2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train_dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
Second, predict Direction_hat:
Direction_hat <- predict(glm2,type="response",newdata = test_dat)
We need to choose a threshold in order to compute the confusion matrix with respect to the values in the test set, as in the classification pdf, p. 15. Here is a plot of the prediction's accuracy, sensitivity and specificity for different thresholds in the train set:
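The confusion_design object used below is not shown above; here is a minimal sketch of how such a threshold grid could be built on the training set, under the assumption that row 1 holds the threshold, row 2 the specificity, row 3 the sensitivity and row 4 the accuracy (the last three being what the plot displays):
train_probs <- predict(glm2, type = "response")  # fitted probabilities on the train set
thresholds <- seq(0.4, 0.65, by = 0.001)
confusion_design <- sapply(thresholds, function(th) {
  pred <- ifelse(train_probs >= th, 1, 0)
  TP <- sum(pred == 1 & train_dat$Direction == 1)
  TN <- sum(pred == 0 & train_dat$Direction == 0)
  FP <- sum(pred == 1 & train_dat$Direction == 0)
  FN <- sum(pred == 0 & train_dat$Direction == 1)
  c(threshold = th,
    specificity = TN / (TN + FP),
    sensitivity = TP / (TP + FN),
    accuracy = (TP + TN) / length(pred))
})
# Plot the three metrics against the threshold
matplot(confusion_design[1, ], t(confusion_design[2:4, ]), type = "l", lty = 1,
        col = c("red", "blue", "black"), xlab = "Threshold", ylab = "Rate")
legend("bottomleft", legend = c("Specificity", "Sensitivity", "Accuracy"),
       col = c("red", "blue", "black"), lty = 1)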
We can see that a threshold around 0.5 yields a very low specificity (a high chance of a type 1 error) together with high power (a low chance of a type 2 error). If we assume a loss-averse decision-maker, who cares more about losing money than about gaining it, I will require the power (sensitivity) to stay above 10% and, given this decision rule, minimize the type 1 error (maximize the specificity):
restricted_conf <- which(confusion_design[3,] > 0.1)                 # thresholds with sensitivity above 10%
max_index <- which.max(confusion_design[2, restricted_conf])         # maximize specificity within that set
chosen_threshold <- confusion_design[1, restricted_conf[max_index]]  # map back to the original column
The maximizer is 0.592
confusion_lgm2 <- table(test_dat$Direction, ifelse(Direction_hat>=chosen_threshold, 1, 0))
confusion_lgm2
##
## 0 1
## 0 39 4
## 1 47 14
lda_train <- lda(formula = Direction ~ Lag2, data = train_dat)
test_hat_lda <- predict(lda_train, newdata = test_dat, type = "response")
confusion_lda <- table(test_dat$Direction, test_hat_lda$class)
confusion_lda
##
## 0 1
## 0 9 34
## 1 5 56
qda_train <- qda(formula = Direction ~ Lag2, data = train_dat)
test_hat_qda <- predict(qda_train, newdata = test_dat, type = "response")
confusion_qda <- table(test_dat$Direction, test_hat_qda$class)
confusion_qda
##
## 0 1
## 0 0 43
## 1 0 61
conf_matrix_prop <- function(conf_matrix) {
TP <- conf_matrix[2,2]
FP <- conf_matrix[1,2]
FN <- conf_matrix[2,1]
TN <- conf_matrix[1,1]
False_Positive_Rate <- FP / (TN + FP) # 1 - Specificity (type 1 error)
True_Positive_Rate <- TP / (TP + FN) # Sensitivity (or Power)
Positive_Predictive_Value <- TP / (FP + TP) # Precision
Negative_Predictive_Value <- TN / (TN + FN)
Accuracy <- (TP + TN) / (TP + TN + FP + FN)
Error_Rate <- (FP + FN) / (TP + TN + FP + FN)
result <- c(False_Positive_Rate, True_Positive_Rate, Positive_Predictive_Value, Negative_Predictive_Value, Accuracy, Error_Rate)
names(result) <- c("False Positive Rate", "True Positive Rate", "Positive Predictive Value", "Negative Predictive Value", "Accuracy", "Error Rate")
return(result)
}
list(Logistic = confusion_lgm2, "Logistic Properties" = conf_matrix_prop(confusion_lgm2), LDA = confusion_lda, "LDA Properties" = conf_matrix_prop(confusion_lda), QDA = confusion_qda, "QDA Properties" = conf_matrix_prop(confusion_qda))
## $Logistic
##
## 0 1
## 0 39 4
## 1 47 14
##
## $`Logistic Properties`
## False Positive Rate True Positive Rate
## 0.09302326 0.22950820
## Positive Predictive Value Negative Predictive Value
## 0.77777778 0.45348837
## Accuracy Error Rate
## 0.50961538 0.49038462
##
## $LDA
##
## 0 1
## 0 9 34
## 1 5 56
##
## $`LDA Properties`
## False Positive Rate True Positive Rate
## 0.7906977 0.9180328
## Positive Predictive Value Negative Predictive Value
## 0.6222222 0.6428571
## Accuracy Error Rate
## 0.6250000 0.3750000
##
## $QDA
##
## 0 1
## 0 0 43
## 1 0 61
##
## $`QDA Properties`
## False Positive Rate True Positive Rate
## 1.0000000 1.0000000
## Positive Predictive Value Negative Predictive Value
## 0.5865385 NaN
## Accuracy Error Rate
## 0.5865385 0.4134615
Let us compare the models:
In terms of the false positive rate, the Logit model with the hand-chosen cut-off is the best, as this was the main consideration. On the other hand, the Logit model does not capture the true positive rate, nor does it reach a relatively good accuracy level. These two latter considerations appear to be best in the LDA model (which in our case is essentially equivalent to the Logit model, only with a threshold of 0.5). The QDA model is too optimistic, as it predicts only positive values; its accuracy is higher than that of the Logit, but it merely reflects the proportion of increases in the stock market.
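As a quick sanity check (a sketch, not part of the original output), applying the default 0.5 cut-off to the logistic test-set predictions should give essentially the same confusion matrix as the LDA:
# Logistic predictions with the default 0.5 cut-off, for comparison with LDA
table(test_dat$Direction, ifelse(Direction_hat >= 0.5, 1, 0))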
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loaded gbm 2.1.5
I’ll now import the Carseats data and, as usual, inspect it, examine its structure and check for missing values:
carseats <- Carseats
head(carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1 9.50 138 73 11 276 120 Bad 42
## 2 11.22 111 48 16 260 83 Good 65
## 3 10.06 113 35 10 269 80 Medium 59
## 4 7.40 117 100 4 466 97 Medium 55
## 5 4.15 141 64 3 340 128 Bad 38
## 6 10.81 124 113 13 501 72 Bad 78
## Education Urban US
## 1 17 Yes Yes
## 2 10 Yes Yes
## 3 12 Yes Yes
## 4 14 Yes Yes
## 5 13 Yes No
## 6 16 No Yes
str(carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
which(is.na(carseats))
## integer(0)
The dataset contains sales of child car seats at 400 different stores. It has 400 observations and 11 variables (8 numeric, two 2-level factors, one 3-level factor), and no missing values were found.
set.seed(1234)
index <- sample(1:nrow(carseats), size=.25*nrow(carseats))
test_carseats <-carseats[index,]
train_carseats <- carseats[-index,]
First, I’ll grow the tree:
tree <- tree(Sales ~ ., data = train_carseats)
summary(tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train_carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Income" "Advertising"
## Number of terminal nodes: 16
## Residual mean deviance: 2.502 = 710.6 / 284
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.17600 -0.95880 0.05711 0.00000 1.09300 4.82600
The median residual is quite low and the mean residual is zero, which means the model fits the training data quite well (but since no pruning was done, we might suspect overfitting).
Now, I’ll plot the tree:
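The plotting code is not shown above; a minimal sketch using the tree package's standard plot/text methods:
plot(tree)              # draw the tree structure
text(tree, pretty = 0)  # add the split labels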
Calculating the MSE:
Sales_hat_tree <- predict(tree, test_carseats)
MSE_calc <- function(real, prediction) {
result <- mean((prediction-real)^2)
return(result)
}
MSE_calc(test_carseats$Sales, Sales_hat_tree)
## [1] 4.601248
Having grown a large tree with 16 terminal nodes and without pruning it, this MSE might be the result of overfitting due to high variance (though it seems quite low; we'll check this in the next step).
I use 5-fold CV:
cv_tree <- cv.tree(tree, K = 5)
final_terminal_nodes = cv_tree$size[which.min(cv_tree$dev)]
And now a plot of the results:
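The plotting chunk is again not shown; a minimal sketch of the CV deviance against tree size:
plot(cv_tree$size, cv_tree$dev, type = "b",
     xlab = "Number of terminal nodes", ylab = "CV deviance")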
The minimizer is 16, which is the entire tree, as we saw before. Even so, it does not seem wise to use the whole tree, so I will pay a little in terms of deviance and choose a smaller tree.
The MSE of the new pruned tree:
tree_pruned <- prune.tree(tree, best = final_terminal_nodes-2)
Sales_hat_tree_pruned <- predict(tree_pruned, test_carseats)
MSE_calc(test_carseats$Sales,Sales_hat_tree_pruned)
## [1] 4.662673
And as we can see, we now have a slightly higher MSE, since we used a smaller number of terminal nodes than the minimizer suggests.
bagging <- randomForest(formula = Sales ~ . , data = train_carseats, mtry = 10, importance = TRUE)
boosting <- gbm(formula = Sales ~ . , data = train_carseats, distribution = "gaussian")
random_forests <- randomForest(formula = Sales ~ . , data = train_carseats, mtry = round(sqrt(10)), importance = TRUE)
The estimated importance matrix in the Bagging and Random-Forests algorithms:
importance(bagging)
## %IncMSE IncNodePurity
## CompPrice 29.8093142 247.507183
## Income 11.7841328 123.366473
## Advertising 23.4279182 173.188155
## Population 0.3713072 82.000631
## Price 69.8841572 682.493607
## ShelveLoc 76.5424938 798.031784
## Age 18.0602949 178.466993
## Education 3.7076643 62.568718
## Urban -0.1027630 8.574776
## US 2.4236214 9.743416
varImpPlot(bagging)
importance(random_forests)
## %IncMSE IncNodePurity
## CompPrice 16.47133700 214.80245
## Income 7.32172757 173.53240
## Advertising 21.95601694 236.21475
## Population -1.67405982 154.42987
## Price 40.52500070 538.44117
## ShelveLoc 50.39145703 591.38907
## Age 14.32716162 239.93417
## Education 2.95366221 102.57291
## Urban -0.06836643 17.80520
## US 4.43416719 23.83377
varImpPlot(random_forests)
summary.gbm(boosting)
## var rel.inf
## ShelveLoc ShelveLoc 37.442475
## Price Price 32.615621
## CompPrice CompPrice 10.096298
## Advertising Advertising 9.175640
## Age Age 7.656317
## Income Income 3.013648
## Population Population 0.000000
## Education Education 0.000000
## Urban Urban 0.000000
## US US 0.000000
plot(summary.gbm(boosting))
ShelveLoc and Price appear to be the most important features in all three models: permuting them yields the highest %IncMSE (increase in MSE) in the bagging and random forest models, and they have the highest relative influence in the boosting model.
The MSEs:
Sales_hat_bagging <- predict(bagging, test_carseats)
Sales_hat_boosting <- predict(boosting, test_carseats, n.trees = boosting$n.trees)
Sales_hat_random_forests <- predict(random_forests, test_carseats)
MSE_calc(test_carseats$Sales,Sales_hat_bagging)
## [1] 2.618152
MSE_calc(test_carseats$Sales,Sales_hat_boosting)
## [1] 2.47121
MSE_calc(test_carseats$Sales,Sales_hat_random_forests)
## [1] 3.023096
% Var explained:
random_forests
##
## Call:
## randomForest(formula = Sales ~ ., data = train_carseats, mtry = round(sqrt(10)), importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 2.766603
## % Var explained: 65.65
bagging
##
## Call:
## randomForest(formula = Sales ~ ., data = train_carseats, mtry = 10, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.479161
## % Var explained: 69.22
And as you can see, the boosting MSE is the lowest (each new tree helps correct the errors made by the previously trained trees), while the output above shows that the bagging model explains a larger share of the (out-of-bag) variation than the random forest. The advantage of the random forest is that, beyond training each tree on a random sample of the data, only a random subset of predictors is considered at each split, which decorrelates the trees and makes the model more robust and less likely to overfit the training data.
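As a final illustration (a sketch, not part of the original analysis; the object names mtry_grid and mse_by_mtry are hypothetical), one could trace how the test MSE varies with mtry, the number of predictors tried at each split; mtry = 10 corresponds to bagging, and smaller values give increasingly decorrelated trees:
# Test-set MSE as a function of mtry (sketch)
set.seed(1234)
mtry_grid <- 1:10
mse_by_mtry <- sapply(mtry_grid, function(m) {
  rf <- randomForest(Sales ~ ., data = train_carseats, mtry = m)
  MSE_calc(test_carseats$Sales, predict(rf, test_carseats))
})
plot(mtry_grid, mse_by_mtry, type = "b", xlab = "mtry", ylab = "Test MSE")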
THE END.