Jury, Judge and Classifier

# Show the source of every code chunk alongside its output in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

library(caTools)
library(rpart)
library(rpart.plot) # provides plotting function 'prp'

library(ROCR)
library(randomForest)
library(tidyverse)
library(caret)
library(e1071)

Prediction of Supreme Court decisions using Classification and Regression Trees (CART)

# Load the Supreme Court case data.
# stringsAsFactors = TRUE is set explicitly: read.csv() stopped converting
# strings to factors by default in R 4.0.0, and the data summary in this report
# shows the categorical columns (Docket, Circuit, Issue, Petitioner,
# Respondent, LowerCourt) as factors. Reading them as factors also gives the
# training and test subsets the same set of levels.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)

Show data summary

## 'data.frame':    566 obs. of  9 variables:
##  $ Docket    : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
##  $ Term      : int  1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
##  $ Circuit   : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
##  $ Issue     : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
##  $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
##  $ Unconst   : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Reverse   : int  1 1 1 1 1 0 1 1 1 1 ...

Generating training and test sets

# Fix the RNG state so the train/test partition is reproducible.
set.seed(3000)

# Logical mask over the rows of `stevens`: TRUE marks the cases assigned to the
# training set (ratio 0.7). The split is made on the outcome variable `Reverse`.
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)

Train <- stevens[spl, , drop = FALSE]
Test <- stevens[!spl, , drop = FALSE]

Generate CART model

# Fit a classification tree for the outcome `Reverse` on the training set.
# `minbucket` is the minimum number of observations allowed in a leaf; it
# limits how far the tree can grow and so guards against over-fitting.
StevensTree <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 25
)

# Draw the fitted tree.
prp(StevensTree)

Prediction

# type = "class" returns majority-class predictions, i.e. a threshold of 0.5
PredictCart <- predict(StevensTree, newdata = Test, type = "class")

# Rows are the actual outcomes, columns the predicted outcomes.
confusion_matrix <- table(Test$Reverse, PredictCart)

# Accuracy = correctly classified cases (the diagonal) over all test cases.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

confusion_matrix

##    PredictCart
##      0  1
##   0 41 36
##   1 22 71

# Print the test-set accuracy computed from the CART confusion matrix.
paste("Accuracy :", accuracy)

## [1] "Accuracy : 0.658823529411765"

ROC curve. Model performance

# The ROC curve needs class probabilities rather than class labels; for a
# classification tree these are what predict() returns by default.
PredictROC <- predict(StevensTree, newdata = Test, type = "prob")

head(PredictROC)

##            0         1
## 1  0.3035714 0.6964286
## 3  0.3035714 0.6964286
## 4  0.4000000 0.6000000
## 6  0.4000000 0.6000000
## 8  0.4000000 0.6000000
## 21 0.3035714 0.6964286

The first column is the probability of outcome ‘0’; the second column is the probability of outcome ‘1’.

# Score each test case by its predicted probability of outcome '1' (column 2).
pred <- prediction(PredictROC[, 2], Test$Reverse)

# ROC curve: true positive rate plotted against false positive rate.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

# Area under the ROC curve.
paste("AUC : ", performance(pred, measure = "auc")@y.values[[1]])

## [1] "AUC :  0.69271051529116"

Different minimum bucket size (=5)

# Same model as before, but leaves may hold as few as 5 observations, so the
# tree is allowed to grow much larger.
StevensTree5 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 5
)

prp(StevensTree5)

Different minimum bucket size (=100)

# Same model again, but every leaf must hold at least 100 observations, which
# restricts the tree far more strongly.
StevensTree100 <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 100
)

prp(StevensTree100)

Using ‘Random Forest’

set.seed(200)

# The outcome is wrapped in as.factor() so that this is a classification
# problem (as opposed to a regression).
# nodesize plays the same role as 'minbucket' in CART; ntree is the number of
# trees to build.
StevensForest <- randomForest(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  nodesize = 25,
  ntree = 200
)

PredictForest <- predict(
  StevensForest,
  newdata = mutate(Test, Reverse = as.factor(Reverse))
)

# Rows are the actual outcomes, columns the predicted outcomes.
confusion_forest <- table(Test$Reverse, PredictForest)

confusion_forest

##    PredictForest
##      0  1
##   0 43 34
##   1 18 75

# Accuracy = diagonal of the confusion matrix over the total number of test cases.
paste("Accuracy :", sum(diag(confusion_forest)) / sum(confusion_forest))

## [1] "Accuracy : 0.694117647058824"

Cross-validation

# 10-fold cross-validation over candidate complexity parameters
# cp = 0.01, 0.02, ..., 0.50.
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))

train(
  as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)

## CART 
## 
## 396 samples
##   6 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 356, 357, 356, 356, 356, 356, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa       
##   0.01  0.6208333   0.220666218
##   0.02  0.6257692   0.232410416
##   0.03  0.6207692   0.228557541
##   0.04  0.6233333   0.236189388
##   0.05  0.6436538   0.282368078
##   0.06  0.6436538   0.282368078
##   0.07  0.6436538   0.282368078
##   0.08  0.6436538   0.282368078
##   0.09  0.6436538   0.282368078
##   0.10  0.6436538   0.282368078
##   0.11  0.6436538   0.282368078
##   0.12  0.6436538   0.282368078
##   0.13  0.6436538   0.282368078
##   0.14  0.6436538   0.282368078
##   0.15  0.6436538   0.282368078
##   0.16  0.6436538   0.282368078
##   0.17  0.6436538   0.282368078
##   0.18  0.5936538   0.161987805
##   0.19  0.5705769   0.108232864
##   0.20  0.5530769   0.063509246
##   0.21  0.5274359  -0.001288591
##   0.22  0.5274359  -0.001288591
##   0.23  0.5274359  -0.001288591
##   0.24  0.5224359  -0.029531989
##   0.25  0.5224359  -0.029531989
##   0.26  0.5274359  -0.030522088
##   0.27  0.5274359  -0.030522088
##   0.28  0.5274359  -0.030522088
##   0.29  0.5453846   0.000000000
##   0.30  0.5453846   0.000000000
##   0.31  0.5453846   0.000000000
##   0.32  0.5453846   0.000000000
##   0.33  0.5453846   0.000000000
##   0.34  0.5453846   0.000000000
##   0.35  0.5453846   0.000000000
##   0.36  0.5453846   0.000000000
##   0.37  0.5453846   0.000000000
##   0.38  0.5453846   0.000000000
##   0.39  0.5453846   0.000000000
##   0.40  0.5453846   0.000000000
##   0.41  0.5453846   0.000000000
##   0.42  0.5453846   0.000000000
##   0.43  0.5453846   0.000000000
##   0.44  0.5453846   0.000000000
##   0.45  0.5453846   0.000000000
##   0.46  0.5453846   0.000000000
##   0.47  0.5453846   0.000000000
##   0.48  0.5453846   0.000000000
##   0.49  0.5453846   0.000000000
##   0.50  0.5453846   0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.17.

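Cross-validated accuracy is highest (0.6437) and tied for cp = 0.05 to 0.17; caret selects cp = 0.17. The tree below is fitted with cp = 0.18.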

# Refit the tree with a fixed complexity parameter instead of a minbucket limit.
# NOTE(review): cp is hard-coded to 0.18, while the caret output in this report
# lists cp = 0.17 as the selected value; confirm which one is intended.
StevensTreeCV <- rpart(
  formula = as.factor(Reverse) ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  cp = 0.18
)

PredictCV <- predict(
  StevensTreeCV,
  newdata = mutate(Test, Reverse = as.factor(Reverse)),
  type = "class"
)

prp(StevensTreeCV)

# Rows are the actual outcomes, columns the predicted outcomes.
confusion_cv <- table(Test$Reverse, PredictCV)

confusion_cv

##    PredictCV
##      0  1
##   0 59 18
##   1 29 64

# Accuracy = diagonal of the confusion matrix over the total number of test cases.
paste("Accuracy :", sum(diag(confusion_cv)) / sum(confusion_cv))

## [1] "Accuracy : 0.723529411764706"

A single-split model gives better accuracy than a model with more splits!

Jury, Judge and Classifier

David Fong

3/14/2019

Prediction of Supreme Court decisions using Classification and Regression Trees (CART)

Show data summary

Generating training and test sets

Generate CART model

Prediction

ROC curve. Model performance

Different minimum bucket size (=5)

Different minimum bucket size (=100)

Using ‘Random Forest’

Cross-validation