knitr::opts_chunk$set(echo = TRUE)
library(caTools)
library(rpart)
library(rpart.plot) # provides plotting function 'prp'
library(ROCR)
library(randomForest)
library(tidyverse)
library(caret)
library(e1071)
# Load the Supreme Court case data.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() no
# longer converts strings to factors by default, and the categorical columns
# (Docket, Circuit, Issue, Petitioner, Respondent, LowerCourt) are expected to
# be factors, as the structure listing below shows.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)
## 'data.frame': 566 obs. of 9 variables:
## $ Docket : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
## $ Term : int 1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
## $ Circuit : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
## $ Issue : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
## $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
## $ Unconst : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Reverse : int 1 1 1 1 1 0 1 1 1 1 ...
# Reproducible 70/30 train/test partition. sample.split() is driven by the
# outcome column (Reverse) and returns a logical vector that is TRUE for the
# rows assigned to the training set.
set.seed(3000)
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
Train <- stevens[spl, ]
Test <- stevens[!spl, ]
# Fit a classification tree (CART) for Reverse on the six case attributes.
# method = "class" requests a classification tree; minbucket = 25 is the
# minimum number of observations allowed in a leaf, which limits over-fitting.
StevensTree <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 25
)
prp(StevensTree)

# type = "class" returns the majority class of each leaf, i.e. a 0.5 threshold.
PredictCart <- predict(StevensTree, newdata = Test, type = "class")

# Rows are the actual outcomes, columns the predicted ones; accuracy is the
# share of test cases that fall on the diagonal.
confusion_matrix <- table(Test$Reverse, PredictCart)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
confusion_matrix
## PredictCart
## 0 1
## 0 41 36
## 1 22 71
paste("Accuracy :", accuracy)
## [1] "Accuracy : 0.658823529411765"
# Per-class probabilities for the test set, used below for the ROC analysis.
# "prob" is what predict() returns for a classification tree when no type is
# given; it is spelled out here for clarity.
PredictROC <- predict(StevensTree, newdata = Test, type = "prob")
head(PredictROC)
## 0 1
## 1 0.3035714 0.6964286
## 3 0.3035714 0.6964286
## 4 0.4000000 0.6000000
## 6 0.4000000 0.6000000
## 8 0.4000000 0.6000000
## 21 0.3035714 0.6964286
The first column is the predicted probability of outcome ‘0’; the second column is the predicted probability of outcome ‘1’.
# Build the ROCR prediction object from the second probability column
# (outcome 1) and the true labels, plot the ROC curve (true-positive rate
# against false-positive rate), and report the area under the curve.
pred <- prediction(PredictROC[, 2], Test$Reverse)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
paste("AUC : ", unlist(performance(pred, measure = "auc")@y.values))
## [1] "AUC : 0.69271051529116"
# Same model with minbucket = 5: leaves may hold as few as five observations,
# so the tree can grow more splits (higher risk of over-fitting).
StevensTree5 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 5
)
prp(StevensTree5)
# Same model with minbucket = 100: every leaf must hold at least 100
# observations, which leaves room for only very few splits.
StevensTree100 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 100
)
prp(StevensTree100)
# Random forest on the same predictors. The outcome is wrapped in as.factor()
# so that this is treated as a classification problem, not a regression.
#   nodesize - plays the same role as 'minbucket' in CART
#   ntree    - number of trees to build
set.seed(200)
StevensForest <- randomForest(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  nodesize = 25,
  ntree = 200
)

# Predict on the test set and tabulate actual (rows) against predicted (columns).
PredictForest <- predict(
  StevensForest,
  newdata = mutate(Test, Reverse = as.factor(Reverse))
)
confusion_forest <- table(Test$Reverse, PredictForest)
confusion_forest
## PredictForest
## 0 1
## 0 43 34
## 1 18 75
paste("Accuracy :", sum(diag(confusion_forest)) / sum(confusion_forest))
## [1] "Accuracy : 0.694117647058824"
# 10-fold cross-validation over candidate complexity parameters (cp) for CART,
# from 0.01 to 0.50 in steps of 0.01. Printing the train() result lists the
# resampled accuracy for each cp and the value that was selected.
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))
train(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
## CART
##
## 396 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 356, 357, 356, 356, 356, 356, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01 0.6208333 0.220666218
## 0.02 0.6257692 0.232410416
## 0.03 0.6207692 0.228557541
## 0.04 0.6233333 0.236189388
## 0.05 0.6436538 0.282368078
## 0.06 0.6436538 0.282368078
## 0.07 0.6436538 0.282368078
## 0.08 0.6436538 0.282368078
## 0.09 0.6436538 0.282368078
## 0.10 0.6436538 0.282368078
## 0.11 0.6436538 0.282368078
## 0.12 0.6436538 0.282368078
## 0.13 0.6436538 0.282368078
## 0.14 0.6436538 0.282368078
## 0.15 0.6436538 0.282368078
## 0.16 0.6436538 0.282368078
## 0.17 0.6436538 0.282368078
## 0.18 0.5936538 0.161987805
## 0.19 0.5705769 0.108232864
## 0.20 0.5530769 0.063509246
## 0.21 0.5274359 -0.001288591
## 0.22 0.5274359 -0.001288591
## 0.23 0.5274359 -0.001288591
## 0.24 0.5224359 -0.029531989
## 0.25 0.5224359 -0.029531989
## 0.26 0.5274359 -0.030522088
## 0.27 0.5274359 -0.030522088
## 0.28 0.5274359 -0.030522088
## 0.29 0.5453846 0.000000000
## 0.30 0.5453846 0.000000000
## 0.31 0.5453846 0.000000000
## 0.32 0.5453846 0.000000000
## 0.33 0.5453846 0.000000000
## 0.34 0.5453846 0.000000000
## 0.35 0.5453846 0.000000000
## 0.36 0.5453846 0.000000000
## 0.37 0.5453846 0.000000000
## 0.38 0.5453846 0.000000000
## 0.39 0.5453846 0.000000000
## 0.40 0.5453846 0.000000000
## 0.41 0.5453846 0.000000000
## 0.42 0.5453846 0.000000000
## 0.43 0.5453846 0.000000000
## 0.44 0.5453846 0.000000000
## 0.45 0.5453846 0.000000000
## 0.46 0.5453846 0.000000000
## 0.47 0.5453846 0.000000000
## 0.48 0.5453846 0.000000000
## 0.49 0.5453846 0.000000000
## 0.50 0.5453846 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.17.
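Cross-validated accuracy is highest at cp = 0.17, the value caret selected above (cp values from 0.05 to 0.17 tie at about 0.644); accuracy drops from cp = 0.18 onward.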
# Refit CART on the full training set using the complexity parameter selected
# by the 10-fold cross-validation (cp = 0.17, the final value reported by
# train()), then evaluate it on the held-out test set.
StevensTreeCV <- rpart(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  cp = 0.17
)

# type = "class" returns the majority class of each leaf.
PredictCV <- predict(
  StevensTreeCV,
  newdata = mutate(Test, Reverse = as.factor(Reverse)),
  type = "class"
)
prp(StevensTreeCV)

# Rows are the actual outcomes, columns the predicted ones.
confusion_cv <- table(Test$Reverse, PredictCV)
confusion_cv
## PredictCV
## 0 1
## 0 59 18
## 1 29 64
paste("Accuracy :", sum(diag(confusion_cv)) / sum(confusion_cv))
## [1] "Accuracy : 0.723529411764706"
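The cross-validated tree, which has only a single split, achieves better test-set accuracy (about 72.4%) than both the earlier CART model with more splits (about 65.9%) and the random forest (about 69.4%)!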