Quick Question
Compute the AUC of the CART model from the previous video, using the following command in your R console:
What is the AUC?
stevens = read.csv("stevens.csv")
# Split the data
library(caTools)
set.seed(3000)
spl = sample.split(stevens$Reverse, SplitRatio = 0.7)
Train = subset(stevens, spl==TRUE)
Test = subset(stevens, spl==FALSE)
library(rpart)
library(rpart.plot)
# CART model
StevensTree = rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method="class", minbucket=25)
prp(StevensTree)
# Make predictions
PredictCART = predict(StevensTree, newdata = Test, type = "class")
table(Test$Reverse, PredictCART)
##    PredictCART
##      0  1
##   0 41 36
##   1 22 71
(41+71)/(41+36+22+71)
## [1] 0.6588235
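As an aside, you can get the same accuracy without transcribing the table, by comparing the predictions to the actual outcomes directly. A minimal sketch (this works here because Reverse is coded 0/1, matching the factor levels of PredictCART):
# Accuracy = fraction of test observations whose predicted class matches the actual outcome
mean(PredictCART == Test$Reverse)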
# ROC curve
library(ROCR)
library(AUC)
PredictROC = predict(StevensTree, newdata = Test)
PredictROC
##             0         1
## 1   0.3035714 0.6964286
## 3   0.3035714 0.6964286
## 4   0.4000000 0.6000000
## 6   0.4000000 0.6000000
## 8   0.4000000 0.6000000
## ...
## (170 rows in all, one per test-set observation: column "0" is the predicted
##  probability of outcome 0 and column "1" the predicted probability of outcome 1)
pred = prediction(PredictROC[,2], Test$Reverse)
perf = performance(pred, "tpr", "fpr")
plot(perf)
as.numeric(performance(pred, "auc")@y.values)
## [1] 0.6927105
Now, recall that in Video 4, our tree had 7 splits. Let’s see how this changes if we change the value of minbucket.
First build a CART model that is similar to the one we built in Video 4, except change the minbucket parameter to 5. Plot the tree.
How many splits does the tree have?
StevensTree = rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method="class", minbucket=5)
prp(StevensTree)
Now build a CART model that is similar to the one we built in Video 4, except change the minbucket parameter to 100. Plot the tree.
How many splits does the tree have?
StevensTree = rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method="class", minbucket=100)
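Before plotting, note that you can also count splits directly on the fitted object rather than reading them off the prp plot by eye: rpart stores one row per node in its frame component and labels leaves "<leaf>". A minimal sketch (works for any of the trees above):
# Number of splits = number of internal (non-leaf) nodes in the fitted tree
sum(StevensTree$frame$var != "<leaf>")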
prp(StevensTree)
Quick Question
IMPORTANT NOTE: When creating random forest models, you might still get different answers from the ones you see here even if you set the random seed. This has to do with different operating systems and the random forest implementation.
Let’s see what happens if we set the seed to two different values and create two different random forest models.
First, set the seed to 100, and then re-build the random forest model, exactly like we did in the previous video (Video 5). Then make predictions on the test set. What is the accuracy of the model on the test set?
set.seed(100)
library(randomForest)
# Convert outcome to factor
Train$Reverse = as.factor(Train$Reverse)
Test$Reverse = as.factor(Test$Reverse)
# Build the random forest model (with Reverse as a factor, randomForest does classification)
StevensForest = randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, ntree=200, nodesize=25 )
# Make predictions
PredictForest = predict(StevensForest, newdata = Test)
table(Test$Reverse, PredictForest)
##    PredictForest
##      0  1
##   0 43 34
##   1 19 74
(43+74)/(43+74+34+19)
## [1] 0.6882353
Now, set the seed to 200, and then re-build the random forest model, exactly like we did in the previous video (Video 5). Then make predictions on the test set. What is the accuracy of this model on the test set?
set.seed(200)
StevensForest = randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, ntree=200, nodesize=25 )
# Make predictions
PredictForest = predict(StevensForest, newdata = Test)
table(Test$Reverse, PredictForest)
##    PredictForest
##      0  1
##   0 44 33
##   1 17 76
(44+76)/(44+33+17+76)
## [1] 0.7058824
Explanation
You can create the models and compute the accuracies with the following commands in R:
set.seed(100)
StevensForest = randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, ntree=200, nodesize=25)
PredictForest = predict(StevensForest, newdata = Test)
table(Test$Reverse, PredictForest)
and then repeat, but with set.seed(200) first. As we see here, the random component of the random forest method can change the accuracy. For a stable dataset the accuracy will not change very much, but a noisy dataset can be significantly affected by the random samples.
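To see this seed sensitivity more systematically, you could re-fit the forest under several seeds and compare the test-set accuracies. A sketch (the seed values and the variable name accuracies are arbitrary choices, and exact numbers will vary by operating system and randomForest version):
# Re-fit the same forest under four different seeds and collect the test-set accuracies
accuracies = sapply(c(100, 200, 300, 400), function(s) {
  set.seed(s)
  rf = randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, ntree = 200, nodesize = 25)
  mean(predict(rf, newdata = Test) == Test$Reverse)
})
summary(accuracies)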
Quick Question
Plot the tree that we created using cross-validation. How many splits does it have?
# Install cross-validation packages
#install.packages("caret")
library(caret)
#install.packages("e1071")
library(e1071)
# Define cross-validation experiment
numFolds = caret::trainControl( method = "cv", number = 10 )
cpGrid = expand.grid( .cp = seq(0.01,0.5,0.01))
# Perform the cross validation
caret::train(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
data = Train,
method = "rpart",
trControl = numFolds,
tuneGrid = cpGrid )
## CART
##
## 396 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 357, 356, 357, 356, 357, 356, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01 0.6433974 0.267916905
## 0.02 0.6359615 0.248964339
## 0.03 0.6208974 0.225208239
## 0.04 0.6333974 0.258053232
## 0.05 0.6436538 0.283134471
## 0.06 0.6436538 0.283134471
## 0.07 0.6436538 0.283134471
## 0.08 0.6436538 0.283134471
## 0.09 0.6436538 0.283134471
## 0.10 0.6436538 0.283134471
## 0.11 0.6436538 0.283134471
## 0.12 0.6436538 0.283134471
## 0.13 0.6436538 0.283134471
## 0.14 0.6436538 0.283134471
## 0.15 0.6436538 0.283134471
## 0.16 0.6436538 0.283134471
## 0.17 0.6436538 0.283134471
## 0.18 0.6436538 0.283134471
## 0.19 0.6436538 0.283134471
## 0.20 0.6061538 0.188881247
## 0.21 0.5808333 0.122201772
## 0.22 0.5605769 0.063162246
## 0.23 0.5479487 0.021739130
## 0.24 0.5453846 0.009090909
## 0.25 0.5453846 0.000000000
## ...
## (cp values from 0.25 through 0.50 all give the same result: accuracy 0.5453846, Kappa 0)
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.19.
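Note that the printout selects cp = 0.19 while the model below hard-codes cp = 0.18; the cross-validated accuracy is flat from cp = 0.05 through 0.19, so either choice yields the same one-split tree. If you would rather not transcribe the value by hand, you can save the train() result and pull the chosen cp out of it. A sketch, assuming you assign the result to a variable (cvFit is a name introduced here, not from the original):
# Save the caret result instead of just printing it, then extract the chosen cp
cvFit = caret::train(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
cvFit$bestTune$cp  # the cross-validated cp (0.19 in the run above)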
# Create a new CART model
StevensTreeCV = rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
data = Train,
method="class",
cp = 0.18)
prp(StevensTreeCV)
# Make predictions
PredictCV = predict(StevensTreeCV, newdata = Test, type = "class")
table(Test$Reverse, PredictCV)
##    PredictCV
##      0  1
##   0 59 18
##   1 29 64
(59+64)/(59+18+29+64)
## [1] 0.7235294
Explanation
If you follow the R commands from the previous video, you can plot the tree with prp(StevensTreeCV). The tree with the best accuracy only has one split! When we were picking different minbucket parameters before, it seemed like this tree was probably not doing a good job of fitting the data. However, this tree with one split gives us the best out-of-sample accuracy. This reminds us that sometimes the simplest models are the best!
Quick Question
# Read in the data
Claims = read.csv("ClaimsData.csv")
str(Claims)
## 'data.frame': 458005 obs. of 16 variables:
## $ age : int 85 59 67 52 67 68 75 70 67 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
# Percentage of patients in each cost bucket
table(Claims$bucket2009)/nrow(Claims)
##
##           1           2           3           4           5
## 0.671267781 0.190170413 0.089466272 0.043324855 0.005770679
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(Claims$bucket2009, SplitRatio = 0.6)
ClaimsTrain = subset(Claims, spl==TRUE)
ClaimsTest = subset(Claims, spl==FALSE)
What is the average age of patients in the training set, ClaimsTrain?
mean(ClaimsTrain$age)
## [1] 72.63773
What proportion of people in the training set (ClaimsTrain) had at least one diagnosis code for diabetes?
sum(ClaimsTrain$diabetes >= 1)/nrow(ClaimsTrain)
## [1] 0.3808983
table(ClaimsTrain$diabetes)/nrow(ClaimsTrain)
##
##         0         1
## 0.6191017 0.3808983
Quick Question
Suppose that instead of the baseline method discussed in the previous video, we used the baseline method of predicting the most frequent outcome for all observations. This new baseline method would predict cost bucket 1 for everyone.
What would the accuracy of this baseline method be on the test set?
table(ClaimsTest$bucket2009)/nrow(ClaimsTest)
##
##           1           2           3           4           5
## 0.671269964 0.190172596 0.089464089 0.043323763 0.005769588
What would the penalty error of this baseline method be on the test set?
table(ClaimsTest$bucket2009)
##
##      1      2      3      4      5
## 122978  34840  16390   7937   1057
(34840*2+16390*4+7937*6+1057*8)/nrow(ClaimsTest)
## [1] 1.044301
Explanation
To compute the accuracy, you can create a table of the variable ClaimsTest$bucket2009: table(ClaimsTest$bucket2009). According to the table output, this baseline method would get 122978 observations correct, and all other observations wrong. So the accuracy of this baseline method is 122978/nrow(ClaimsTest) = 0.67127. For the penalty error, since this baseline method predicts 1 for all observations, it would have a penalty error of (0*122978 + 2*34840 + 4*16390 + 6*7937 + 8*1057)/nrow(ClaimsTest) = 1.044301.
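The same penalty error can be computed from a penalty matrix rather than by hand. A sketch, assuming the 5x5 penalty matrix used in the D2Hawkeye lecture (rows = actual bucket, columns = predicted bucket):
# Penalty matrix from the lecture; the baseline predicts bucket 1 for everyone,
# so only the first column of the matrix applies
PenaltyMatrix = matrix(c(0,1,2,3,4, 2,0,1,2,3, 4,2,0,1,2, 6,4,2,0,1, 8,6,4,2,0), byrow = TRUE, nrow = 5)
actual = as.vector(table(ClaimsTest$bucket2009))
sum(actual * PenaltyMatrix[, 1]) / nrow(ClaimsTest)  # 1.044301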
Understanding why people vote
In August 2006, three researchers (Alan Gerber and Donald Green of Yale University, and Christopher Larimer of the University of Northern Iowa) carried out a large-scale field experiment in Michigan, USA, to test the hypothesis that one of the reasons people vote is social, or extrinsic, pressure. To quote the first paragraph of their 2008 research paper:
Among the most striking features of a democratic political system is the participation of millions of voters in elections. Why do large numbers of people vote, despite the fact that … “the casting of a single vote is of no significance where there is a multitude of electors”? One hypothesis is adherence to social norms. Voting is widely regarded as a citizen duty, and citizens worry that others will think less of them if they fail to participate in elections. Voters’ sense of civic duty has long been a leading explanation of vote turnout…
In this homework problem we will use both logistic regression and classification trees to analyze the data they collected.
The data
The researchers randomly assigned about 344,000 voters to groups: about 191,000 voters formed a “control” group, and the rest were assigned to one of four “treatment” groups. These five groups correspond to five binary variables in the dataset.
“Civic Duty” (variable civicduty): group members were sent a letter that simply said “DO YOUR CIVIC DUTY - VOTE!”
“Hawthorne Effect” (variable hawthorne): group members were sent a letter with the “Civic Duty” message plus the additional message “YOU ARE BEING STUDIED”, and they were informed that their voting behavior would be examined by means of public records.
“Self” (variable self): group members received the “Civic Duty” message as well as the recent voting record of everyone in that household, and a message stating that another message would be sent after the election with updated records.
“Neighbors” (variable neighbors): group members were given the same message as the “Self” group, except the message showed not only the household voting records but also those of neighbors - maximizing social pressure.
“Control” (variable control): group members were not sent anything, and represent the typical voting situation.
Additional variables include sex (0 for male, 1 for female), yob (year of birth), and the dependent variable voting (1 if they voted, 0 otherwise).
We will first get familiar with the data. Load the CSV file gerber.csv into R. What proportion of people in this dataset voted in this election?
gerber <- read.csv("gerber.csv")
str(gerber)
## 'data.frame': 344084 obs. of 8 variables:
## $ sex : int 0 1 1 1 0 1 0 0 1 0 ...
## $ yob : int 1941 1947 1982 1950 1951 1959 1956 1981 1968 1967 ...
## $ voting : int 0 0 1 1 1 1 1 0 0 0 ...
## $ hawthorne: int 0 0 1 1 1 0 0 0 0 0 ...
## $ civicduty: int 1 1 0 0 0 0 0 0 0 0 ...
## $ neighbors: int 0 0 0 0 0 0 0 0 0 0 ...
## $ self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ control : int 0 0 0 0 0 1 1 1 1 1 ...
sum(gerber$voting) / nrow(gerber)
## [1] 0.3158996
Which of the four “treatment groups” had the largest percentage of people who actually voted (voting = 1)?
nrow(subset(gerber, voting==1 & hawthorne==1))/sum(gerber$hawthorne)
## [1] 0.3223746
tapply(gerber$voting, gerber$civicduty, mean)
##         0         1
## 0.3160698 0.3145377
tapply(gerber$voting, gerber$hawthorne, mean)
##         0         1
## 0.3150909 0.3223746
tapply(gerber$voting, gerber$self, mean)
##         0         1
## 0.3122446 0.3451515
tapply(gerber$voting, gerber$neighbors, mean)
##         0         1
## 0.3081505 0.3779482
Build a logistic regression model for voting using the four treatment group variables as the independent variables (civicduty, hawthorne, self, and neighbors). Use all the data to build the model (DO NOT split the data into a training set and testing set). Which of the following coefficients are significant in the logistic regression model? Select all that apply.
LogModel <- glm(voting ~ hawthorne + civicduty + neighbors + self, data = gerber, family = binomial)
summary(LogModel)
##
## Call:
## glm(formula = voting ~ hawthorne + civicduty + neighbors + self,
## family = binomial, data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9744 -0.8691 -0.8389 1.4586 1.5590
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.863358 0.005006 -172.459 < 2e-16 ***
## hawthorne 0.120477 0.012037 10.009 < 2e-16 ***
## civicduty 0.084368 0.012100 6.972 3.12e-12 ***
## neighbors 0.365092 0.011679 31.260 < 2e-16 ***
## self 0.222937 0.011867 18.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 429238 on 344083 degrees of freedom
## Residual deviance: 428090 on 344079 degrees of freedom
## AIC: 428100
##
## Number of Fisher Scoring iterations: 4
Using a threshold of 0.3, what is the accuracy of the logistic regression model? (When making predictions, you don’t need to use the newdata argument since we didn’t split our data.)
# Predictions on the full dataset (we did not split the data)
predictLog= predict(LogModel, type="response")
# Confusion matrix with threshold of 0.3
table(gerber$voting, as.numeric(predictLog > 0.3))
##
##          0      1
##   0 134513 100875
##   1  56730  51966
(134513 + 51966)/(134513+100875+56730+51966)
## [1] 0.5419578
Using a threshold of 0.5, what is the accuracy of the logistic regression model?
table(gerber$voting, as.numeric(predictLog > 0.5))
##
##          0
##   0 235388
##   1 108696
(235388+0)/(108696+235388)
## [1] 0.6841004
Note that the confusion matrix has only a “0” column: no predicted probability exceeds 0.5, so at this threshold the model predicts “did not vote” for every observation.
Compare your previous two answers to the percentage of people who did not vote (the baseline accuracy) and compute the AUC of the model. What is happening here?
table(gerber$voting)
##
##      0      1
## 235388 108696
235388/(235388+108696)
## [1] 0.6841004
# AUC
library(ROCR)
ROCRpred = prediction(predictLog, gerber$voting)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.5308461
Explanation
Even though all of our variables are significant, our model does not improve over the baseline model of just predicting that someone will not vote, and the AUC is low. So while the treatment groups do make a difference, this is a weak predictive model.
We will now try out trees. Build a CART tree for voting using all data and the same four treatment variables we used before. Don’t set the option method=“class” - we are actually going to create a regression tree here. We are interested in building a tree to explore the fraction of people who vote, or the probability of voting. We’d like CART to split our groups if they have different probabilities of voting. If we used method=‘class’, CART would only split if one of the groups had a probability of voting above 50% and the other had a probability of voting less than 50% (since the predicted outcomes would be different). However, with regression trees, CART will split even if both groups have probability less than 50%.
Leave all the parameters at their default values. You can use the following command in R to build the tree:
library(rpart)
library(rpart.plot)
CARTmodel = rpart(voting ~ civicduty + hawthorne + self + neighbors, data=gerber)
prp(CARTmodel)
Plot the tree. What happens, and if relevant, why?
Answer: No variables are used (the tree is only a root node) - none of the variables make a big enough effect to be split on.
Explanation
If you plot the tree, with prp(CARTmodel), you should just see one leaf! There are no splits in the tree, because none of the variables make a big enough effect to be split on.
Now build the tree using the command:
CARTmodel2 = rpart(voting ~ civicduty + hawthorne + self + neighbors, data=gerber, cp=0.0)
to force the complete tree to be built. Then plot the tree. What do you observe about the order of the splits?
prp(CARTmodel2)
Explanation
We saw in Problem 1 that the highest fraction of voters was in the Neighbors group, followed by the Self group, followed by the Hawthorne group, and lastly the Civic Duty group. And we see here that the tree detects this trend.
Using only the CART tree plot, determine what fraction (a number between 0 and 1) of “Civic Duty” people voted:
Answer: 0.31
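As a cross-check on reading the plot, you can ask the fitted tree directly for the prediction of a hypothetical observation in the “Civic Duty” group (all other treatment variables set to 0). A minimal sketch:
# Regression tree prediction = fraction voting in the matching leaf (about 0.3145, i.e. 0.31)
predict(CARTmodel2, newdata = data.frame(civicduty = 1, hawthorne = 0, self = 0, neighbors = 0))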
Make a new tree that includes the “sex” variable, again with cp = 0.0. Notice that sex appears as a split that is of secondary importance to the treatment group.
In the control group, which gender is more likely to vote?
CARTmodel3= rpart(voting ~ civicduty + hawthorne + self + neighbors + sex, data=gerber, cp=0.0)
prp(CARTmodel3)
Explanation
You can see that there is a split on the “sex” variable after every treatment variable split. For the control group, which corresponds to the bottom left, sex = 0 (male) corresponds to a higher voting percentage. For the civic duty group, which corresponds to the bottom right, sex = 0 (male) corresponds to a higher voting percentage.
We know trees can handle “nonlinear” relationships, e.g. “in the ‘Civic Duty’ group and female”, but as we will see in the next few questions, it is possible to do the same for logistic regression. First, let's explore a bit more what trees can tell us.
Let’s just focus on the “Control” treatment group. Create a regression tree using just the “control” variable, then create another tree with the “control” and “sex” variables, both with cp=0.0.
In the “control” only tree, what is the absolute value of the difference in the predicted probability of voting between being in the control group versus being in a different group? You can use the absolute value function to get the answer, i.e. abs(Control Prediction - Non-Control Prediction). Add the argument “digits = 6” to the prp command to get a more accurate estimate.
CARTcontrol = rpart(voting ~ control, data=gerber, cp=0.0)
CARTsex = rpart(voting ~ control + sex, data=gerber, cp=0.0)
prp(CARTcontrol, digits=6)
abs(0.296638 - 0.34)
## [1] 0.043362
Now, using the second tree (with control and sex), determine who is affected more by NOT being in the control group (being in any of the four treatment groups):
prp(CARTsex, digits = 6)
Explanation
The first split says that if control = 1, go left. Then, if sex = 1 (female) predict 0.290456, and if sex = 0 (male) predict 0.302795. On the other side of the tree, where control = 0, if sex = 1 (female) predict 0.334176, and if sex = 0 (male) predict 0.345818. So for women, not being in the control group increases the fraction voting by 0.04372. For men, not being in the control group increases the fraction voting by 0.04302. So men and women are affected about the same.
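You can also recover all four leaf predictions at once instead of reading them off the plot. A sketch (the variable grid is a name introduced here for illustration):
# All four (control, sex) combinations and their predicted voting fractions
grid = expand.grid(control = c(0, 1), sex = c(0, 1))
cbind(grid, pred = predict(CARTsex, newdata = grid))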
Going back to logistic regression now, create a model using “sex” and “control”. Interpret the coefficient for “sex”:
LogModelSex = glm(voting ~ control + sex, data=gerber, family="binomial")
summary(LogModelSex)##
## Call:
## glm(formula = voting ~ control + sex, family = "binomial", data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9220 -0.9012 -0.8290 1.4564 1.5717
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.635538 0.006511 -97.616 < 2e-16 ***
## control -0.200142 0.007364 -27.179 < 2e-16 ***
## sex -0.055791 0.007343 -7.597 3.02e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 429238 on 344083 degrees of freedom
## Residual deviance: 428443 on 344081 degrees of freedom
## AIC: 428449
##
## Number of Fisher Scoring iterations: 4
Answer: Coefficient is negative, reflecting that women are less likely to vote
Explanation
If you look at the summary of the model, you can see that the coefficient for the “sex” variable is -0.055791. This means that women are less likely to vote, since women have a larger value in the sex variable, and a negative coefficient means that larger values are predictive of 0.
The regression tree calculated the percentage voting exactly for every one of the four possibilities (Man, Not Control), (Man, Control), (Woman, Not Control), (Woman, Control). Logistic regression has attempted to do the same, although it wasn't able to do as well because it can't model exactly the joint possibility of being a woman and in the control group.
We can quantify this precisely. Create the following dataframe (this contains all of the possible values of sex and control), and evaluate your logistic regression using the predict function (where “LogModelSex” is the name of your logistic regression model that uses both control and sex):
Possibilities = data.frame(sex=c(0,0,1,1),control=c(0,1,0,1))
predict(LogModelSex, newdata=Possibilities, type="response")
##         1         2         3         4
## 0.3462559 0.3024455 0.3337375 0.2908065
Answer: 0.00035
Explanation
The CART tree predicts 0.290456 for the (Woman, Control) case, and the logistic regression model predicts 0.2908065. So the absolute difference, to five decimal places, is 0.00035.
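As a quick arithmetic check in R:
round(abs(0.290456 - 0.2908065), 5)  # 0.00035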
So the difference is not too big for this dataset, but it is there. We're going to add a new term to our logistic regression now: the combination of the “sex” and “control” variables - so if this new variable is 1, that means the person is a woman AND in the control group. We can do that with the following command:
LogModel2 = glm(voting ~ sex + control + sex:control, data=gerber, family="binomial")
summary(LogModel2)
##
## Call:
## glm(formula = voting ~ sex + control + sex:control, family = "binomial",
## data = gerber)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9213 -0.9019 -0.8284 1.4573 1.5724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.637471 0.007603 -83.843 < 2e-16 ***
## sex -0.051888 0.010801 -4.804 1.55e-06 ***
## control -0.196553 0.010356 -18.980 < 2e-16 ***
## sex:control -0.007259 0.014729 -0.493 0.622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 429238 on 344083 degrees of freedom
## Residual deviance: 428442 on 344080 degrees of freedom
## AIC: 428450
##
## Number of Fisher Scoring iterations: 4
Answer: If a person is a woman and in the control group, the chance that she voted goes down.
Explanation
This coefficient is negative, so that means that a value of 1 in this variable decreases the chance of voting. This variable will have value 1 if the person is a woman and in the control group.
Run the same code as before to calculate the average for each group:
predict(LogModel2, newdata=Possibilities, type="response")
##         1         2         3         4
## 0.3458183 0.3027947 0.3341757 0.2904558
Now what is the difference between the logistic regression model and the CART model for the (Woman, Control) case? Again, give your answer with five numbers after the decimal point.
Explanation
The logistic regression model now predicts 0.2904558 for the (Woman, Control) case, so there is now a very small difference (practically zero) between CART and logistic regression.
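Again as an arithmetic check:
round(abs(0.290456 - 0.2904558), 5)  # 0, i.e. 0.00000 to five decimal places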
This example has shown that trees can capture nonlinear relationships that logistic regression cannot, but that we can sometimes get around this by using variables that are the combination of two variables. Should we always include all possible interaction terms of the independent variables when building a logistic regression model?
Explanation
We should not use all possible interaction terms in a logistic regression model due to overfitting. Even in this simple problem, we have four treatment groups and two values for sex. If we have an interaction term for every treatment variable with sex, we will double the number of variables. In smaller data sets, this could quickly lead to overfitting.
One of the earliest applications of the predictive analytics methods we have studied so far in this class was to automatically recognize letters, which post office machines use to sort mail. In this problem, we will build a model that uses statistics of images of four letters in the Roman alphabet – A, B, P, and R – to predict which letter a particular image corresponds to.
Note that this is a multiclass classification problem. We have mostly focused on binary classification problems (e.g., predicting whether an individual voted or not, whether the Supreme Court will affirm or reverse a case, whether or not a person is at risk for a certain disease, etc.). In this problem, we have more than two classifications that are possible for each observation, like in the D2Hawkeye lecture.
The file letters_ABPR.csv contains 3116 observations, each of which corresponds to a certain image of one of the four letters A, B, P and R. The images came from 20 different fonts, which were then randomly distorted to produce the final images; each such distorted image is represented as a collection of pixels, each of which is “on” or “off”. For each such distorted image, we have available certain statistics of the image in terms of these pixels, as well as which of the four letters the image is. This data comes from the UCI Machine Learning Repository.
This dataset contains the following 17 variables:
letter = the letter that the image corresponds to (A, B, P or R)
xbox = the horizontal position of where the smallest box covering the letter shape begins.
ybox = the vertical position of where the smallest box covering the letter shape begins.
width = the width of this smallest box.
height = the height of this smallest box.
onpix = the total number of “on” pixels in the character image
xbar = the mean horizontal position of all of the “on” pixels
ybar = the mean vertical position of all of the “on” pixels
x2bar = the mean squared horizontal position of all of the “on” pixels in the image
y2bar = the mean squared vertical position of all of the “on” pixels in the image
xybar = the mean of the product of the horizontal and vertical position of all of the “on” pixels in the image
x2ybar = the mean of the product of the squared horizontal position and the vertical position of all of the “on” pixels
xy2bar = the mean of the product of the horizontal position and the squared vertical position of all of the “on” pixels
xedge = the mean number of edges (the number of times an “off” pixel is followed by an “on” pixel, or the image boundary is hit) as the image is scanned from left to right, along the whole vertical length of the image
xedgeycor = the mean of the product of the number of horizontal edges at each vertical position and the vertical position
yedge = the mean number of edges as the images is scanned from top to bottom, along the whole horizontal length of the image
yedgexcor = the mean of the product of the number of vertical edges at each horizontal position and the horizontal position
Let’s warm up by attempting to predict just whether a letter is B or not. To begin, load the file letters_ABPR.csv into R, and call it letters. Then, create a new variable isB in the dataframe, which takes the value “TRUE” if the observation corresponds to the letter B, and “FALSE” if it does not.
Now split the data set into a training and testing set, putting 50% of the data in the training set. Set the seed to 1000 before making the split. The first argument to sample.split should be the dependent variable “letters$isB”. Remember that TRUE values from sample.split should go in the training set.
Before building models, let’s consider a baseline method that always predicts the most frequent outcome, which is “not B”. What is the accuracy of this baseline method on the test set?
# Load the csv file:
letters = read.csv("letters_ABPR.csv")
# Then create the "isB" variable:
letters$isB = as.factor(letters$letter == "B")
# Then set the seed and split your data into a training and testing set:
set.seed(1000)
spl = sample.split(letters$isB, SplitRatio = 0.5)
train = subset(letters, spl == TRUE)
test = subset(letters, spl == FALSE)
To compute the accuracy of the baseline method on the test set, we first need to see which outcome value is more frequent in the training set, by using the table function. The output of table(train$isB) tells us that “not B” is more common. So our baseline method is to predict “not B” for everything. How well would this do on the test set? We need to run the table command again, this time on the test set:
table(test$isB)
##
## FALSE  TRUE
##  1175   383
There are 1175 observations that are not B, and 383 observations that are B, so the baseline method accuracy on the test set is 1175/(1175+383) = 0.754172.
Now build a classification tree to predict whether a letter is a B or not, using the training set to build your model. Remember to leave the variable “letter” out of the model, as this is related to what we are trying to predict! To remove just one variable, you can either write out the other variables, or remember what we did in the Billboards problem in Week 3, and use the following notation:
CARTb = rpart(isB ~ . - letter, data=train, method="class")
We are just using the default parameters in our CART model, so we don’t need to add the minbucket or cp arguments at all. We also added the argument method=“class” since this is a classification problem.
What is the accuracy of the CART model on the test set? (Use type=“class” when making predictions on the test set.)
CARTb = rpart(isB ~ . - letter, data=train, method="class")
# Then, we can use the predict function to make predictions on the test set:
predictions = predict(CARTb, newdata=test, type="class")
# We can use the following command to build our confusion matrix:
table(test$isB, predictions)
##        predictions
##         FALSE TRUE
##   FALSE  1118   57
##   TRUE     43  340
# To compute the accuracy on the test set, we need to divide the sum of the true positives and true negatives by the total number of observations: (1118+340)/nrow(test) = 0.9358151
Now, build a random forest model to predict whether the letter is a B or not (the isB variable) using the training set. You should use all of the other variables as independent variables, except letter (since it helped us define what we are trying to predict!). Use the default settings for ntree and nodesize (don't include these arguments at all). Right before building the model, set the seed to 1000. (NOTE: You might get a slightly different answer on this problem, even if you set the random seed. This has to do with your operating system and the implementation of the random forest algorithm.)
What is the accuracy of the model on the test set?
set.seed(1000)
# RFb = randomForest(isB ~ xbox + ybox + width + height + onpix + xbar + ybar + x2bar + y2bar + xybar + x2ybar + xy2bar + xedge + xedgeycor + yedge + yedgexcor, data=train)
RFb = randomForest(isB ~ . - letter, data=train)
# We can make predictions with the predict function:
predictions = predict(RFb, newdata=test)
# And then generate our confusion matrix with the table function:
table(test$isB, predictions)
##        predictions
##         FALSE TRUE
##   FALSE  1165   10
##   TRUE      9  374
# The accuracy of the model on the test set is the sum of the true positives and true negatives, divided by the total number of observations in the test set:
(1165+374)/nrow(test)
## [1] 0.9878049
# In lecture, we noted that random forests tend to improve on CART in terms of predictive accuracy. Sometimes, this improvement can be quite significant, as it is here.
Let us now move on to the problem that we were originally interested in, which is to predict whether or not a letter is one of the four letters A, B, P or R.
As we saw in the D2Hawkeye lecture, building a multiclass classification CART model in R is no harder than building the models for binary classification problems. Fortunately, building a random forest model is just as easy.
The variable in our data frame which we will be trying to predict is “letter”. Start by converting letter in the original data set (letters) to a factor by running the following command in R:
letters$letter = as.factor( letters$letter )
Now, generate new training and testing sets of the letters data frame using letters$letter as the first input to the sample.split function. Before splitting, set your seed to 2000. Again put 50% of the data in the training set. (Why do we need to split the data again? Remember that sample.split balances the outcome variable in the training and testing sets. With a new outcome variable, we want to re-generate our split.)
In a multiclass classification problem, a simple baseline model is to predict the most frequent class of all of the options.
What is the baseline accuracy on the testing set?
letters$letter = as.factor( letters$letter )
set.seed(2000)
spl = sample.split(letters$letter, SplitRatio = 0.5)
train2 = subset(letters, spl == TRUE)
test2 = subset(letters, spl == FALSE)
Then, to compute the accuracy of the baseline method on the test set, we need to first figure out the most common outcome in the training set. The output of table(train2$letter) tells us that “P” has the most observations. So we will predict P for all letters. On the test set, we can run table(test2$letter) to see that it has 401 observations that are actually P. So the test set accuracy of the baseline method is 401/nrow(test2) = 0.2573813.
table(train2$letter)
##
##   A   B   P   R
## 394 383 402 379
table(test2$letter)
##
##   A   B   P   R
## 395 383 401 379
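Equivalently, a one-liner sketch that derives the baseline from these tables (mostFrequent is a name introduced here for illustration):
# Baseline: predict the most frequent training-set class for every test observation
mostFrequent = names(which.max(table(train2$letter)))  # "P"
mean(test2$letter == mostFrequent)  # 401/1558 = 0.2573813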
Now build a classification tree to predict “letter”, using the training set to build your model. You should use all of the other variables as independent variables, except “isB”, since it is related to what we are trying to predict! Just use the default parameters in your CART model. Add the argument method=“class” since this is a classification problem. Even though we have multiple classes here, nothing changes in how we build the model from the binary case.
What is the test set accuracy of your CART model? Use the argument type=“class” when making predictions.
(HINT: When you are computing the test set accuracy using the confusion matrix, you want to add everything on the main diagonal and divide by the total number of observations in the test set, which can be computed with nrow(test), where test is the name of your test set).
CARTletter = rpart(letter ~ . - isB, data=train2, method="class")
predictLetter = predict(CARTletter, newdata=test2, type="class")
table(test2$letter, predictLetter)
##    predictLetter
##       A   B   P   R
##   A 348   4   0  43
##   B   8 318  12  45
##   P   2  21 363  15
##   R  10  24   5 340
(348+318+363+340)/nrow(test2)
## [1] 0.8786906
Now build a random forest model on the training data, using the same independent variables as in the previous problem – again, don’t forget to remove the isB variable. Just use the default parameter values for ntree and nodesize (you don’t need to include these arguments at all). Set the seed to 1000 right before building your model. (Remember that you might get a slightly different result even if you set the random seed.)
What is the test set accuracy of your random forest model?
set.seed(1000)
RFletter = randomForest(letter ~ . - isB, data=train2)
predictLetter = predict(RFletter, newdata=test2)
table(test2$letter, predictLetter)
##    predictLetter
##       A   B   P   R
##   A 390   0   3   2
##   B   0 380   1   2
##   P   0   5 393   3
##   R   3  12   0 364
(390+380+393+364)/nrow(test2)
## [1] 0.9801027
You should find this value rather striking, for several reasons. The first is that it is significantly higher than the value for CART, highlighting the gain in accuracy that is possible from using random forest models. The second is that while the accuracy of CART decreased significantly as we transitioned from the problem of predicting B/not B (a relatively simple problem) to the problem of predicting the four letters (certainly a harder problem), the accuracy of the random forest model decreased by a tiny amount.
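If you are curious which image statistics the forest relies on most, the randomForest package exposes variable importance measures. A minimal sketch:
# Mean decrease in Gini impurity for each predictor, plus a dot chart of the same
importance(RFletter)
varImpPlot(RFletter)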