# Echo each chunk's source code in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)

library(caTools)
library(rpart)
library(rpart.plot) # provides plotting function 'prp'
library(ROCR)
library(randomForest)
library(tidyverse)
library(caret)
library(e1071)

Prediction of Supreme Court decisions using Classification and Regression Trees (CART)

# Load the Supreme Court cases. stringsAsFactors = TRUE is set explicitly
# because read.csv() no longer converts strings to factors by default since
# R 4.0. The models below use Circuit, Issue, Petitioner, Respondent and
# LowerCourt as categorical predictors, and converting before the train/test
# split gives both subsets the same factor levels.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)

Show data summary

## 'data.frame':    566 obs. of  9 variables:
##  $ Docket    : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
##  $ Term      : int  1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
##  $ Circuit   : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
##  $ Issue     : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
##  $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
##  $ Unconst   : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Reverse   : int  1 1 1 1 1 0 1 1 1 1 ...

Generating training and test sets

# Fix the RNG state so the split is reproducible.
set.seed(3000)

# Logical vector built from the outcome variable Reverse:
# TRUE marks the 70% of cases that go to the training set.
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)

Train <- subset(stevens, spl)
Test <- subset(stevens, !spl)

Generate CART model

# Fit a classification tree for Reverse. minbucket = 25 requires at least
# 25 training cases in every leaf, which limits how finely the tree can
# split and so guards against over-fitting.
StevensTree <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 25
)

# Draw the fitted tree.
prp(StevensTree)

Prediction

# type = "class" returns the majority class of each leaf, i.e. a 0.5
# probability threshold.
PredictCart <- predict(StevensTree, newdata = Test, type = "class")

# Rows are the actual outcomes, columns the predicted ones.
confusion_matrix <- table(Test$Reverse, PredictCart)

# Correct predictions sit on the diagonal of the confusion matrix.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

confusion_matrix
##    PredictCart
##      0  1
##   0 41 36
##   1 22 71
paste("Accuracy :", accuracy)
## [1] "Accuracy : 0.658823529411765"

ROC curve. Model performance

# Class probabilities for each test case; type = "prob" spells out what
# predict() already returns by default for a classification tree.
PredictROC <- predict(StevensTree, newdata = Test, type = "prob")

head(PredictROC)
##            0         1
## 1  0.3035714 0.6964286
## 3  0.3035714 0.6964286
## 4  0.4000000 0.6000000
## 6  0.4000000 0.6000000
## 8  0.4000000 0.6000000
## 21 0.3035714 0.6964286

The first column is the predicted probability of outcome ‘0’; the second column is the predicted probability of outcome ‘1’.

# Score each test case by its predicted probability of outcome 1 (column 2).
pred <- prediction(predictions = PredictROC[, 2], labels = Test$Reverse)

# ROC curve: true positive rate against false positive rate.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

# Area under the ROC curve.
paste("AUC : ", unlist(performance(pred, measure = "auc")@y.values))
## [1] "AUC :  0.69271051529116"

Different minimum bucket size (=5)

# Same model as before, but leaves may now hold as few as 5 training cases.
StevensTree5 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 5
)

# Draw the fitted tree.
prp(StevensTree5)

Different minimum bucket size (=100)

# Same model again, but every leaf must hold at least 100 training cases.
StevensTree100 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 100
)

# Draw the fitted tree.
prp(StevensTree100)

Using ‘Random Forest’

# Fix the RNG state so the forest is reproducible.
set.seed(200)

# The outcome is wrapped in as.factor() so that randomForest() treats this
# as classification rather than regression. nodesize plays the role of
# minbucket in CART; ntree is the number of trees to build.
StevensForest <- randomForest(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  nodesize = 25,
  ntree = 200
)

# Predict on the test set, with Reverse converted to a factor as in the fit.
PredictForest <- predict(
  StevensForest,
  newdata = mutate(Test, Reverse = as.factor(Reverse))
)

# Rows are the actual outcomes, columns the predicted ones.
confusion_forest <- table(Test$Reverse, PredictForest)

confusion_forest
##    PredictForest
##      0  1
##   0 43 34
##   1 18 75
paste("Accuracy :", sum(diag(confusion_forest)) / sum(confusion_forest))
## [1] "Accuracy : 0.694117647058824"

Cross-validation

# 10-fold cross-validation over candidate complexity parameters (cp)
# from 0.01 to 0.5 in steps of 0.01.
# NOTE(review): no seed is set before train(), so the fold assignment, and
# possibly the selected cp, can vary between runs.
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))

train(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
## CART 
## 
## 396 samples
##   6 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 356, 357, 356, 356, 356, 356, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa       
##   0.01  0.6208333   0.220666218
##   0.02  0.6257692   0.232410416
##   0.03  0.6207692   0.228557541
##   0.04  0.6233333   0.236189388
##   0.05  0.6436538   0.282368078
##   0.06  0.6436538   0.282368078
##   0.07  0.6436538   0.282368078
##   0.08  0.6436538   0.282368078
##   0.09  0.6436538   0.282368078
##   0.10  0.6436538   0.282368078
##   0.11  0.6436538   0.282368078
##   0.12  0.6436538   0.282368078
##   0.13  0.6436538   0.282368078
##   0.14  0.6436538   0.282368078
##   0.15  0.6436538   0.282368078
##   0.16  0.6436538   0.282368078
##   0.17  0.6436538   0.282368078
##   0.18  0.5936538   0.161987805
##   0.19  0.5705769   0.108232864
##   0.20  0.5530769   0.063509246
##   0.21  0.5274359  -0.001288591
##   0.22  0.5274359  -0.001288591
##   0.23  0.5274359  -0.001288591
##   0.24  0.5224359  -0.029531989
##   0.25  0.5224359  -0.029531989
##   0.26  0.5274359  -0.030522088
##   0.27  0.5274359  -0.030522088
##   0.28  0.5274359  -0.030522088
##   0.29  0.5453846   0.000000000
##   0.30  0.5453846   0.000000000
##   0.31  0.5453846   0.000000000
##   0.32  0.5453846   0.000000000
##   0.33  0.5453846   0.000000000
##   0.34  0.5453846   0.000000000
##   0.35  0.5453846   0.000000000
##   0.36  0.5453846   0.000000000
##   0.37  0.5453846   0.000000000
##   0.38  0.5453846   0.000000000
##   0.39  0.5453846   0.000000000
##   0.40  0.5453846   0.000000000
##   0.41  0.5453846   0.000000000
##   0.42  0.5453846   0.000000000
##   0.43  0.5453846   0.000000000
##   0.44  0.5453846   0.000000000
##   0.45  0.5453846   0.000000000
##   0.46  0.5453846   0.000000000
##   0.47  0.5453846   0.000000000
##   0.48  0.5453846   0.000000000
##   0.49  0.5453846   0.000000000
##   0.50  0.5453846   0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.17.

Cross-validation selects cp = 0.17 (accuracy is tied at its maximum, 0.6437, for cp from 0.05 to 0.17)

# Refit on the full training set with the cp chosen by 10-fold
# cross-validation: caret reported cp = 0.17 as the final value, and the
# cross-validated accuracy falls from 0.6437 to 0.5937 at cp = 0.18.
StevensTreeCV <- rpart(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = Train,
  method = "class",
  cp = 0.17
)

# Majority-class predictions on the test set.
PredictCV <- predict(
  StevensTreeCV,
  newdata = (Test %>% mutate(Reverse = as.factor(Reverse))),
  type = "class"
)

# Draw the fitted tree.
prp(StevensTreeCV)

# Rows are the actual outcomes, columns the predicted ones.
confusion_cv <- table(Test$Reverse, PredictCV)

confusion_cv
##    PredictCV
##      0  1
##   0 59 18
##   1 29 64
paste("Accuracy :", sum(diag(confusion_cv)) / sum(confusion_cv))
## [1] "Accuracy : 0.723529411764706"

A single-split model gives better accuracy than a model with more splits!