setwd("C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit_4_Trees")
stevens <- read.csv("stevens.csv")
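Before modeling, a quick look at the data never hurts (an optional check, not part of the lecture):
str(stevens)             # variable types and a preview of the values
table(stevens$Reverse)   # distribution of the outcome we want to predict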
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
Split the dataset into a training set (70% of observations) and a testing set (30%):
library(caTools)
set.seed(3000)
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
Train <- subset(stevens, spl == TRUE)
Test <- subset(stevens, spl == FALSE)
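sample.split stratifies on the outcome, so both sets should have roughly the same proportion of reversals. A quick check (optional):
prop.table(table(Train$Reverse))  # outcome balance in the training set
prop.table(table(Test$Reverse))   # should be very close to the training proportions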
If we use the following command to build the model:
stevensForest <- randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, nodesize = 25, ntree = 200)
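R responds with a warning along these lines (the exact wording may vary by randomForest version):
## Warning: The response has five or fewer unique values. Are you sure you want to do regression?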
This warning appears because, in CART, we added the argument method = "class" to make it clear that we were doing a classification problem. As mentioned earlier, trees can also be used for regression problems, which you'll see in the recitation. The randomForest function does not have a method argument, so when we want to do a classification problem, we need to make sure the outcome is a factor.
Let’s convert the variable Reverse to a factor variable in both our training and our testing sets.
Train$Reverse <- as.factor(Train$Reverse)
Test$Reverse <- as.factor(Test$Reverse)
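A quick check confirms the conversion:
class(Train$Reverse)  # "factor"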
stevensForest <- randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, nodesize = 25, ntree = 200)
predictForest <- predict(stevensForest, newdata = Test)
table(Test$Reverse, predictForest)
## predictForest
## 0 1
## 0 40 37
## 1 19 74
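Instead of computing the accuracy by hand, you can sum the diagonal of the confusion matrix (confMat is a variable name of our choosing):
confMat <- table(Test$Reverse, predictForest)
sum(diag(confMat)) / sum(confMat)  # correct predictions over all predictions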
Accuracy is (40 + 74)/nrow(Test) = 0.6705882, or about 67%. Because random forests are built from random samples of the observations and variables, a different seed can give a different model. Let's set the seed and re-build the model:
set.seed(200)
stevensForest <- randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, nodesize = 25, ntree = 200)
predictForest <- predict(stevensForest, newdata = Test)
table(Test$Reverse, predictForest)
## predictForest
## 0 1
## 0 44 33
## 1 17 76
Accuracy is now (44 + 76)/nrow(Test) = 0.7058824, or about 71%.
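Why did the accuracy change? Each forest is grown from random bootstrap samples, with a random subset of variables considered at each split, so the result depends on the seed. A rough way to see the variability (a sketch; the seeds here are arbitrary):
accs <- sapply(c(100, 200, 300, 400, 500), function(s) {
  set.seed(s)
  rf <- randomForest(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
                     data = Train, nodesize = 25, ntree = 200)
  mean(predict(rf, newdata = Test) == Test$Reverse)  # test-set accuracy for this seed
})
summary(accs)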
Now let's see how cross-validation can help us select the cp parameter for a CART model. We'll need the caret and e1071 packages.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
First, we define our cross-validation experiment: 10-fold cross-validation, specified with the trainControl function.
numFolds <- trainControl(method = "cv", number = 10)
Then we need to pick the possible values for our cp parameter, using the expand.grid function.
cpGrid <- expand.grid(.cp = seq(0.01, 0.5, 0.01))
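expand.grid returns a data frame with one row per candidate value, so here cpGrid holds 50 values of cp:
nrow(cpGrid)  # 50 candidate values
head(cpGrid)  # 0.01, 0.02, ...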
Now we are ready to perform the cross-validation, using the train function:
train(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
## Loading required package: rpart
## CART
##
## 396 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 357, 356, 357, 356, 356, 356, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.01 0.6365385 0.252522710 0.045831216 0.10138314
## 0.02 0.6337179 0.248281522 0.061267954 0.12752003
## 0.03 0.6314103 0.251796733 0.053552823 0.11468412
## 0.04 0.6314103 0.253786180 0.053552823 0.11438075
## 0.05 0.6440385 0.282995035 0.062472910 0.13104160
## 0.06 0.6440385 0.282995035 0.062472910 0.13104160
## 0.07 0.6440385 0.282995035 0.062472910 0.13104160
## 0.08 0.6440385 0.282995035 0.062472910 0.13104160
## 0.09 0.6440385 0.282995035 0.062472910 0.13104160
## 0.10 0.6440385 0.282995035 0.062472910 0.13104160
## 0.11 0.6440385 0.282995035 0.062472910 0.13104160
## 0.12 0.6440385 0.282995035 0.062472910 0.13104160
## 0.13 0.6440385 0.282995035 0.062472910 0.13104160
## 0.14 0.6440385 0.282995035 0.062472910 0.13104160
## 0.15 0.6440385 0.282995035 0.062472910 0.13104160
## 0.16 0.6440385 0.282995035 0.062472910 0.13104160
## 0.17 0.6440385 0.282995035 0.062472910 0.13104160
## 0.18 0.6440385 0.282995035 0.062472910 0.13104160
## 0.19 0.6440385 0.282995035 0.062472910 0.13104160
## 0.20 0.6085897 0.193703966 0.058244587 0.14192310
## 0.21 0.5807692 0.121202966 0.046444754 0.12714614
## 0.22 0.5605128 0.062732119 0.032700267 0.09526381
## 0.23 0.5428846 0.003553299 0.008506582 0.01123652
## 0.24 0.5428846 0.003553299 0.008506582 0.01123652
## 0.25 0.5453846 0.000000000 0.005958436 0.00000000
## 0.26 0.5453846 0.000000000 0.005958436 0.00000000
## 0.27 0.5453846 0.000000000 0.005958436 0.00000000
## 0.28 0.5453846 0.000000000 0.005958436 0.00000000
## 0.29 0.5453846 0.000000000 0.005958436 0.00000000
## 0.30 0.5453846 0.000000000 0.005958436 0.00000000
## 0.31 0.5453846 0.000000000 0.005958436 0.00000000
## 0.32 0.5453846 0.000000000 0.005958436 0.00000000
## 0.33 0.5453846 0.000000000 0.005958436 0.00000000
## 0.34 0.5453846 0.000000000 0.005958436 0.00000000
## 0.35 0.5453846 0.000000000 0.005958436 0.00000000
## 0.36 0.5453846 0.000000000 0.005958436 0.00000000
## 0.37 0.5453846 0.000000000 0.005958436 0.00000000
## 0.38 0.5453846 0.000000000 0.005958436 0.00000000
## 0.39 0.5453846 0.000000000 0.005958436 0.00000000
## 0.40 0.5453846 0.000000000 0.005958436 0.00000000
## 0.41 0.5453846 0.000000000 0.005958436 0.00000000
## 0.42 0.5453846 0.000000000 0.005958436 0.00000000
## 0.43 0.5453846 0.000000000 0.005958436 0.00000000
## 0.44 0.5453846 0.000000000 0.005958436 0.00000000
## 0.45 0.5453846 0.000000000 0.005958436 0.00000000
## 0.46 0.5453846 0.000000000 0.005958436 0.00000000
## 0.47 0.5453846 0.000000000 0.005958436 0.00000000
## 0.48 0.5453846 0.000000000 0.005958436 0.00000000
## 0.49 0.5453846 0.000000000 0.005958436 0.00000000
## 0.50 0.5453846 0.000000000 0.005958436 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.19.
The cross-validation output selects cp = 0.19 as optimal. Note that every cp value from 0.05 to 0.19 achieves the same cross-validated accuracy, so any value in that range selects the same tree; following the lecture, we use cp = 0.18 below.
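If you store the result of train() rather than just printing it (cvResult is a variable name of our choosing), the selected parameter can be read directly off the returned object:
cvResult <- train(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
cvResult$bestTune  # the cp value chosen by cross-validation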
Create a new CART model using the selected parameter:
stevensTreeCV <- rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "class", cp = 0.18)
predictionCV <- predict(stevensTreeCV, newdata = Test, type = "class")
table(Test$Reverse, predictionCV)
## predictionCV
## 0 1
## 0 59 18
## 1 29 64
Model accuracy is (59 + 64)/nrow(Test) = 0.7235294, or about 72%.
Remember that the accuracy of our previous CART model was 0.659. Cross validation helps us make sure we’re selecting a good parameter value, and often this will significantly increase the accuracy.
Finally, plot the tree selected by cross-validation:
library(rpart)
library(rpart.plot)
prp(stevensTreeCV)
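For comparison, the 0.659 baseline mentioned above comes from a single CART tree without cross-validated pruning. A minimal sketch of that baseline, assuming the lecture's minbucket = 25 setting:
stevensTree <- rpart(Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst, data = Train, method = "class", minbucket = 25)
predictCART <- predict(stevensTree, newdata = Test, type = "class")
mean(predictCART == Test$Reverse)  # should be roughly the 0.659 cited above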