library(ISLR)
library(caret)
## Warning: package 'caret' was built under R version 4.0.2
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)

Exercise 3

Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of \(\hat{p}_{m1}\). The x-axis should display \(\hat{p}_{m1}\), ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy.

Hint: In a setting with two classes, \(\hat{p}_{m1} = 1 - \hat{p}_{m2}\). You could make this plot by hand, but it will be much easier to make in R.
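
With two classes, writing \(p = \hat{p}_{m1}\) (so that \(\hat{p}_{m2} = 1 - p\)), the three quantities reduce to

\[ G = 2p(1-p), \qquad D = -p\log p - (1-p)\log(1-p), \qquad E = 1 - \max(p,\, 1-p), \]

which is exactly what the code below computes (natural logarithms are used for the entropy).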

p <- seq(0, 1, 0.01)
Gini <- 2 * p * (1 - p)
Entropy <- -(p * log(p) + (1 - p) * log(1 - p))
Entropy[is.nan(Entropy)] <- 0  # 0*log(0) evaluates to NaN; the limit of -p*log(p) as p -> 0 is 0
Class_Error <- 1 - pmax(p, 1 - p)
df <- data.frame(p, Gini, Entropy, Class_Error)
df1 <- data.frame(df[1], stack(df[2:ncol(df)]))
ggplot(df1, aes(x = p, y = values, color = ind)) + 
  geom_line(size = 2) + 
  labs(title = "Gini Index, Entropy and Classification Error") + 
  ylab("Value") + 
  scale_color_brewer(palette = "Set1", name = "", 
                     labels = c(Gini = "Gini Index", Entropy = "Entropy", 
                                Class_Error = "Classification Error")) + 
  theme_bw()

Exercise 8

In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.

(a) Split the data set into a training set and a test set.

set.seed(1)
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
dim(Carseats)
## [1] 400  11
inTrain <- createDataPartition(Carseats$Sales, p = 0.5, list = FALSE)
train <-  Carseats[inTrain,]
test <- Carseats[-inTrain,]
dim(train)
## [1] 201  11

(b) Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?

The fitted tree has 17 terminal nodes and uses 6 of the predictors: ShelveLoc, Price, Income, Age, CompPrice, and Advertising. The test MSE is 4.540765.

library(tree)
## Warning: package 'tree' was built under R version 4.0.2
tree_carseats <- tree(Sales ~ ., data = train)
summary(tree_carseats)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "Age"         "CompPrice"  
## [6] "Advertising"
## Number of terminal nodes:  17 
## Residual mean deviance:  2.272 = 418 / 184 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.76200 -0.95950  0.08568  0.00000  1.00600  3.55000
plot(tree_carseats)
text(tree_carseats, pretty = 0)

pred_carseats <- predict(tree_carseats, test)
mean((test$Sales - pred_carseats)^2)
## [1] 4.540765

(c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?

Cross-validation suggested a tree of size 10 as the best level of complexity. Pruning to 10 terminal nodes did not improve the test MSE; it increased to 4.700748.

cv_carseats <- cv.tree(tree_carseats, FUN = prune.tree)
# Create data frame for ggplot
dfcv_carseats <- data.frame(Size = cv_carseats$size, cv_dev = cv_carseats$dev,
                            K = cv_carseats$k)
ggplot(dfcv_carseats, aes(Size, cv_dev)) + geom_line(color = "red", size = 1.5) + 
  ylab("CV Deviation") + theme_bw()

pruned_carseats <- prune.tree(tree_carseats, best = 10)
plot(pruned_carseats)
text(pruned_carseats, pretty = 0)

pred_pruned <- predict(pruned_carseats, test)
mean((test$Sales - pred_pruned)^2)
## [1] 4.700748

(d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.

The test MSE is 2.558038. Judging by %IncMSE (and IncNodePurity), the most important variables are Price, ShelveLoc, CompPrice, Age, and Advertising.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(1)
bag_carseats <- randomForest(Sales ~ ., data = train, mtry = 10, importance = TRUE)
bag_pred <- predict(bag_carseats, test)
mean((test$Sales - bag_pred)^2)
## [1] 2.558038
importance(bag_carseats)
##                %IncMSE IncNodePurity
## CompPrice   21.1831118    163.914465
## Income      11.5870916     96.312530
## Advertising 16.6612943    121.613296
## Population  -0.5761569     48.296635
## Price       49.8770064    423.588145
## ShelveLoc   48.6842318    366.493993
## Age         17.4970778    138.454912
## Education   -1.0620839     53.131551
## Urban       -1.9187461      7.154914
## US           2.7621595      9.673692
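
These importance scores can also be visualized with randomForest's varImpPlot(), which plots %IncMSE and IncNodePurity side by side for the bagged fit above:

varImpPlot(bag_carseats)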

(e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.

The test MSE is 2.646104. Price, ShelveLocGood, CompPrice, ShelveLocMedium, and Age are the 5 most important variables. As m, the number of variables considered at each split, increases toward 11, the cross-validated RMSE decreases. There are 11 candidate variables rather than 10 because caret dummy-encodes the three-level ShelveLoc factor into two indicator variables, ShelveLocGood and ShelveLocMedium.

library(caret)
set.seed(1)
rf_carseats <- train(Sales ~ ., data = train, method = "rf", trControl = 
                       trainControl("cv", number = 10), importance = TRUE)
rf_carseats
## Random Forest 
## 
## 201 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 181, 181, 181, 180, 181, 181, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    1.945632  0.6131705  1.589400
##    6    1.734240  0.6477697  1.394678
##   11    1.727223  0.6328361  1.394093
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 11.
rf_carseats$bestTune
##   mtry
## 3   11
rf_carseats$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 11
## 
##           Mean of squared residuals: 2.770705
##                     % Var explained: 62.27
rf_pred <- predict(rf_carseats, test)
mean((test$Sales - rf_pred)^2)
## [1] 2.646104
plot(varImp(rf_carseats))
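
As a cross-check outside caret, a plain randomForest() fit with a smaller m (the usual regression default of roughly p/3, here mtry = 3) could be compared on the same train/test split; a minimal sketch, whose test MSE will differ somewhat from the values above:

set.seed(1)
rf_small <- randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)
mean((test$Sales - predict(rf_small, test))^2)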

Exercise 9

This problem involves the OJ data set which is part of the ISLR package.

(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.

library(tree)
set.seed(1)
summary(OJ)
##  Purchase WeekofPurchase     StoreID        PriceCH         PriceMM     
##  CH:653   Min.   :227.0   Min.   :1.00   Min.   :1.690   Min.   :1.690  
##  MM:417   1st Qu.:240.0   1st Qu.:2.00   1st Qu.:1.790   1st Qu.:1.990  
##           Median :257.0   Median :3.00   Median :1.860   Median :2.090  
##           Mean   :254.4   Mean   :3.96   Mean   :1.867   Mean   :2.085  
##           3rd Qu.:268.0   3rd Qu.:7.00   3rd Qu.:1.990   3rd Qu.:2.180  
##           Max.   :278.0   Max.   :7.00   Max.   :2.090   Max.   :2.290  
##      DiscCH            DiscMM         SpecialCH        SpecialMM     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.05186   Mean   :0.1234   Mean   :0.1477   Mean   :0.1617  
##  3rd Qu.:0.00000   3rd Qu.:0.2300   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :0.50000   Max.   :0.8000   Max.   :1.0000   Max.   :1.0000  
##     LoyalCH          SalePriceMM     SalePriceCH      PriceDiff       Store7   
##  Min.   :0.000011   Min.   :1.190   Min.   :1.390   Min.   :-0.6700   No :714  
##  1st Qu.:0.325257   1st Qu.:1.690   1st Qu.:1.750   1st Qu.: 0.0000   Yes:356  
##  Median :0.600000   Median :2.090   Median :1.860   Median : 0.2300            
##  Mean   :0.565782   Mean   :1.962   Mean   :1.816   Mean   : 0.1465            
##  3rd Qu.:0.850873   3rd Qu.:2.130   3rd Qu.:1.890   3rd Qu.: 0.3200            
##  Max.   :0.999947   Max.   :2.290   Max.   :2.090   Max.   : 0.6400            
##    PctDiscMM        PctDiscCH       ListPriceDiff       STORE      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.140   1st Qu.:0.000  
##  Median :0.0000   Median :0.00000   Median :0.240   Median :2.000  
##  Mean   :0.0593   Mean   :0.02731   Mean   :0.218   Mean   :1.631  
##  3rd Qu.:0.1127   3rd Qu.:0.00000   3rd Qu.:0.300   3rd Qu.:3.000  
##  Max.   :0.4020   Max.   :0.25269   Max.   :0.440   Max.   :4.000
dim(OJ)
## [1] 1070   18
train <- sample(dim(OJ)[1], size = 800)
oj_train <- OJ[train,]
oj_test <- OJ[-train,]
dim(oj_test)
## [1] 270  18

(b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?

The tree uses 5 variables: LoyalCH, PriceDiff, SpecialCH, ListPriceDiff, and PctDiscMM. The training error rate is 0.1588 and the tree has 9 terminal nodes.

tree_oj <- tree(Purchase ~ ., data = oj_train)
summary(tree_oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800

(c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.

Take terminal node 8 as an example. It contains the 59 training observations with LoyalCH (brand loyalty to Citrus Hill) below 0.0356, reached after the earlier splits on LoyalCH at 0.5036 and 0.2809. Its deviance is 10.14 and its predicted class is MM: the estimated probability of purchasing Minute Maid is 98.3%, versus 1.7% for Citrus Hill (CH).

tree_oj
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196197 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196197 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *

(d) Create a plot of the tree, and interpret the results.

LoyalCH is the most important variable: it is used for the root split, both second-level splits, and one of the third-level splits. PriceDiff and ListPriceDiff appear at the third level, while SpecialCH and PctDiscMM appear at the fourth level.

plot(tree_oj)
text(tree_oj, pretty = 0)

(e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?

The test error rate is 0.1704.

tree_pred <- predict(tree_oj, oj_test, type = "class")
confusionMatrix(oj_test$Purchase, tree_pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 160   8
##         MM  38  64
##                                           
##                Accuracy : 0.8296          
##                  95% CI : (0.7794, 0.8725)
##     No Information Rate : 0.7333          
##     P-Value [Acc > NIR] : 0.0001259       
##                                           
##                   Kappa : 0.6154          
##                                           
##  Mcnemar's Test P-Value : 1.904e-05       
##                                           
##             Sensitivity : 0.8081          
##             Specificity : 0.8889          
##          Pos Pred Value : 0.9524          
##          Neg Pred Value : 0.6275          
##              Prevalence : 0.7333          
##          Detection Rate : 0.5926          
##    Detection Prevalence : 0.6222          
##       Balanced Accuracy : 0.8485          
##                                           
##        'Positive' Class : CH              
## 
1-0.8296
## [1] 0.1704
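
Equivalently, the test error rate can be computed directly from the predictions instead of from the rounded accuracy; from the confusion matrix it is (8 + 38)/270 ≈ 0.1704:

mean(tree_pred != oj_test$Purchase)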

(f) Apply the cv.tree() function to the training set in order to determine the optimal tree size.

cv_oj <- cv.tree(tree_oj, FUN = prune.misclass)
cv_oj
## $size
## [1] 9 8 7 4 2 1
## 
## $dev
## [1] 150 150 149 158 172 315
## 
## $k
## [1]       -Inf   0.000000   3.000000   4.333333  10.500000 151.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

(g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.

# create data frame for ggplot
cv_oj1 <- data.frame(Size = cv_oj$size, cv_error = cv_oj$dev/800)
ggplot(cv_oj1, aes(Size, cv_error))+ geom_line(color = "purple", size = 1.5) + 
  ylab("CV Classification Error Rate") + 
  scale_x_continuous(breaks = c(0:10)) + theme_bw()

(h) Which tree size corresponds to the lowest cross-validated classification error rate?

The tree with size 7 has the lowest cross-validated classification error rate.
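
This can also be read off the cv.tree() output from part (f) programmatically:

cv_oj$size[which.min(cv_oj$dev)]
## [1] 7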

(i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.

prune_oj <- prune.tree(tree_oj, best = 7)
summary(prune_oj)
## 
## Classification tree:
## snip.tree(tree = tree_oj, nodes = c(10L, 4L))
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "ListPriceDiff" "PctDiscMM"    
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7748 = 614.4 / 793 
## Misclassification error rate: 0.1625 = 130 / 800

(j) Compare the training error rates between the pruned and unpruned trees. Which is higher?

The training error for the pruned tree (0.1625) is higher than the error rate of the unpruned tree (0.1588).

(k) Compare the test error rates between the pruned and unpruned trees. Which is higher?

The test error rate of the unpruned tree (0.1704) is higher than the error rate of the pruned tree (0.163).

ojprune_pred <- predict(prune_oj, oj_test, type = "class")
confusionMatrix(oj_test$Purchase, ojprune_pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 160   8
##         MM  36  66
##                                          
##                Accuracy : 0.837          
##                  95% CI : (0.7875, 0.879)
##     No Information Rate : 0.7259         
##     P-Value [Acc > NIR] : 1.185e-05      
##                                          
##                   Kappa : 0.6336         
##                                          
##  Mcnemar's Test P-Value : 4.693e-05      
##                                          
##             Sensitivity : 0.8163         
##             Specificity : 0.8919         
##          Pos Pred Value : 0.9524         
##          Neg Pred Value : 0.6471         
##              Prevalence : 0.7259         
##          Detection Rate : 0.5926         
##    Detection Prevalence : 0.6222         
##       Balanced Accuracy : 0.8541         
##                                          
##        'Positive' Class : CH             
## 
1-0.837
## [1] 0.163