# Unit 4 - “Judge, Jury, and Classifier” Lecture

# VIDEO 4

# Read in the data
# NOTE(review): the working directory below is an absolute, machine-specific
# path; it must be edited before the script can run on another machine.
setwd('C:/Users/daria.alekseeva/Documents/Edx/Trees/')
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
# read.csv default to FALSE. With it, the text columns are read as factors,
# which is the structure shown in the recorded str() output below.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)
# Show the column types and a preview of the values
str(stevens)
## 'data.frame':    566 obs. of  9 variables:
##  $ Docket    : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
##  $ Term      : int  1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
##  $ Circuit   : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
##  $ Issue     : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
##  $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
##  $ Unconst   : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Reverse   : int  1 1 1 1 1 0 1 1 1 1 ...
# Split the data into training and test sets
library(caTools)
# Fix the random seed so the split is repeatable
set.seed(200)
# spl is used as a row mask: TRUE rows go to Train, the rest to Test
# (SplitRatio = 0.7 requests roughly 70% of the rows for training)
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
Train <- stevens[spl, ]
Test <- stevens[!spl, ]

# Load the CART packages (uncomment the install lines on first use)
#install.packages("rpart")
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)

# CART model: classification tree for Reverse, fitted on the training set
# with minbucket = 25
StevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 25
)

# Plot the fitted tree
prp(StevensTree)

# Make predictions: predicted class for every test case
PredictCART <- predict(StevensTree, newdata = Test, type = "class")
# Confusion matrix: rows are the actual outcome, columns the predicted class
ConfusionCART <- table(Test$Reverse, PredictCART)
ConfusionCART
##    PredictCART
##      0  1
##   0 35 42
##   1 26 67
# Accuracy = correctly classified cases (the diagonal) / all test cases.
# It is computed from the table itself rather than from typed-in counts, so
# it cannot drift away from the confusion matrix printed above.
sum(diag(ConfusionCART)) / sum(ConfusionCART)
## [1] 0.6
# ROC curve
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# Per-class probabilities for the test cases (one column per class);
# "prob" is what predict() returns by default for a classification tree
PredictROC <- predict(StevensTree, newdata = Test, type = "prob")
# Display the full probability matrix
PredictROC
##             0         1
## 4   0.1444444 0.8555556
## 5   0.1444444 0.8555556
## 6   0.1444444 0.8555556
## 7   0.1444444 0.8555556
## 8   0.1444444 0.8555556
## 14  0.3888889 0.6111111
## 21  0.3888889 0.6111111
## 28  0.3888889 0.6111111
## 34  0.1444444 0.8555556
## 35  0.1444444 0.8555556
## 36  0.6603774 0.3396226
## 42  0.3888889 0.6111111
## 44  0.1444444 0.8555556
## 55  0.3888889 0.6111111
## 58  0.3888889 0.6111111
## 60  0.1444444 0.8555556
## 66  0.1444444 0.8555556
## 67  0.1444444 0.8555556
## 68  0.1444444 0.8555556
## 70  0.6603774 0.3396226
## 71  0.3888889 0.6111111
## 72  0.3888889 0.6111111
## 77  0.1444444 0.8555556
## 78  0.6603774 0.3396226
## 79  0.3888889 0.6111111
## 87  0.6486486 0.3513514
## 95  0.8260870 0.1739130
## 101 0.8260870 0.1739130
## 102 0.8260870 0.1739130
## 107 0.8260870 0.1739130
## 112 0.8260870 0.1739130
## 116 0.8260870 0.1739130
## 120 0.8260870 0.1739130
## 127 0.8260870 0.1739130
## 133 0.8260870 0.1739130
## 138 0.8260870 0.1739130
## 139 0.8260870 0.1739130
## 140 0.8260870 0.1739130
## 141 0.8260870 0.1739130
## 153 0.3888889 0.6111111
## 154 0.6603774 0.3396226
## 161 0.6603774 0.3396226
## 165 0.3888889 0.6111111
## 166 0.3888889 0.6111111
## 169 0.3888889 0.6111111
## 174 0.6603774 0.3396226
## 175 0.6603774 0.3396226
## 179 0.1444444 0.8555556
## 181 0.1444444 0.8555556
## 183 0.1444444 0.8555556
## 190 0.1444444 0.8555556
## 196 0.1444444 0.8555556
## 197 0.3888889 0.6111111
## 198 0.6603774 0.3396226
## 204 0.3888889 0.6111111
## 211 0.8260870 0.1739130
## 213 0.3066667 0.6933333
## 219 0.8260870 0.1739130
## 221 0.8260870 0.1739130
## 225 0.3888889 0.6111111
## 227 0.3066667 0.6933333
## 231 0.3066667 0.6933333
## 235 0.3888889 0.6111111
## 241 0.3066667 0.6933333
## 242 0.3066667 0.6933333
## 243 0.3066667 0.6933333
## 244 0.3066667 0.6933333
## 245 0.3066667 0.6933333
## 246 0.6486486 0.3513514
## 247 0.3066667 0.6933333
## 251 0.6603774 0.3396226
## 260 0.6603774 0.3396226
## 266 0.6603774 0.3396226
## 268 0.3888889 0.6111111
## 270 0.6603774 0.3396226
## 274 0.6603774 0.3396226
## 282 0.3066667 0.6933333
## 285 0.3888889 0.6111111
## 290 0.3066667 0.6933333
## 292 0.3888889 0.6111111
## 297 0.3066667 0.6933333
## 298 0.6603774 0.3396226
## 300 0.3888889 0.6111111
## 301 0.3888889 0.6111111
## 303 0.3888889 0.6111111
## 307 0.3066667 0.6933333
## 310 0.3066667 0.6933333
## 311 0.6603774 0.3396226
## 312 0.3888889 0.6111111
## 315 0.6603774 0.3396226
## 320 0.3888889 0.6111111
## 322 0.6603774 0.3396226
## 325 0.6486486 0.3513514
## 328 0.6603774 0.3396226
## 329 0.3888889 0.6111111
## 332 0.3066667 0.6933333
## 337 0.3888889 0.6111111
## 338 0.6486486 0.3513514
## 345 0.3888889 0.6111111
## 348 0.3066667 0.6933333
## 355 0.6603774 0.3396226
## 357 0.3066667 0.6933333
## 359 0.3888889 0.6111111
## 360 0.6486486 0.3513514
## 366 0.3888889 0.6111111
## 371 0.3066667 0.6933333
## 373 0.3888889 0.6111111
## 376 0.3066667 0.6933333
## 378 0.3888889 0.6111111
## 380 0.3066667 0.6933333
## 383 0.3888889 0.6111111
## 387 0.3066667 0.6933333
## 393 0.3066667 0.6933333
## 403 0.6603774 0.3396226
## 405 0.6603774 0.3396226
## 407 0.6603774 0.3396226
## 410 0.3888889 0.6111111
## 412 0.6603774 0.3396226
## 414 0.3066667 0.6933333
## 424 0.3888889 0.6111111
## 430 0.1444444 0.8555556
## 434 0.8260870 0.1739130
## 439 0.1444444 0.8555556
## 448 0.1444444 0.8555556
## 453 0.1444444 0.8555556
## 456 0.1444444 0.8555556
## 457 0.1444444 0.8555556
## 459 0.1444444 0.8555556
## 461 0.1444444 0.8555556
## 462 0.1444444 0.8555556
## 463 0.1444444 0.8555556
## 464 0.1444444 0.8555556
## 465 0.1444444 0.8555556
## 466 0.1444444 0.8555556
## 467 0.1444444 0.8555556
## 472 0.1444444 0.8555556
## 473 0.1444444 0.8555556
## 475 0.1444444 0.8555556
## 478 0.1444444 0.8555556
## 480 0.1444444 0.8555556
## 481 0.1444444 0.8555556
## 488 0.1444444 0.8555556
## 490 0.1444444 0.8555556
## 493 0.8260870 0.1739130
## 495 0.1444444 0.8555556
## 503 0.3066667 0.6933333
## 505 0.6486486 0.3513514
## 508 0.6486486 0.3513514
## 510 0.6486486 0.3513514
## 511 0.3066667 0.6933333
## 512 0.6486486 0.3513514
## 516 0.3066667 0.6933333
## 517 0.3066667 0.6933333
## 521 0.6486486 0.3513514
## 524 0.6486486 0.3513514
## 525 0.6486486 0.3513514
## 529 0.3066667 0.6933333
## 533 0.3066667 0.6933333
## 536 0.3066667 0.6933333
## 540 0.6486486 0.3513514
## 541 0.6486486 0.3513514
## 542 0.6486486 0.3513514
## 546 0.8260870 0.1739130
## 548 0.3066667 0.6933333
## 551 0.8260870 0.1739130
## 554 0.6486486 0.3513514
## 557 0.6486486 0.3513514
## 558 0.3066667 0.6933333
## 561 0.3066667 0.6933333
## 563 0.8260870 0.1739130
# Pair the second probability column (class 1) with the actual outcomes
pred <- prediction(PredictROC[, 2], Test$Reverse)
# True positive rate (y axis) against false positive rate (x axis)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

# AUC of the CART model from the previous video, returned as a plain number
as.numeric(performance(pred, measure = "auc")@y.values)
## [1] 0.6856584
# CART model with bucket size 5 (this replaces the earlier StevensTree)
StevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 5
)

prp(StevensTree)

# CART model with bucket size 100 (replaces StevensTree once more)
StevensTree <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  minbucket = 100
)

prp(StevensTree)


# VIDEO 5 - Random Forests

# Load the randomForest package (uncomment the install line on first use)
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# First attempt: Reverse is still an integer here, which produces the
# regression warning recorded below
StevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  ntree = 200,
  nodesize = 25
)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Turn the outcome into a factor in both data sets
Train[["Reverse"]] <- as.factor(Train[["Reverse"]])
Test[["Reverse"]] <- as.factor(Test[["Reverse"]])

# Second attempt, now with a factor outcome
StevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  ntree = 200,
  nodesize = 25
)

# Make predictions with the random forest on the test set
PredictForest <- predict(StevensForest, newdata = Test)
# Confusion matrix: rows are the actual outcome, columns the predicted class
ConfusionForest <- table(Test$Reverse, PredictForest)
ConfusionForest
##    PredictForest
##      0  1
##   0 43 34
##   1 22 71
# Accuracy = correctly classified cases (the diagonal) / all test cases.
# It is computed from the table itself rather than from typed-in counts,
# which did not match the confusion matrix printed above.
sum(diag(ConfusionForest)) / sum(ConfusionForest)
## [1] 0.6705882
# VIDEO 6

# Load the cross-validation packages (uncomment the install lines on first use)
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
#install.packages("e1071")
library(e1071)

# Cross-validation experiment: 10 folds, candidate cp values 0.01 to 0.5
# in steps of 0.01
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))

# Run the cross-validation over the cp grid; the result is printed directly
train(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
## CART 
## 
## 396 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 357, 357, 356, 356, 357, 357, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa       Accuracy SD  Kappa SD  
##   0.01  0.6414103  0.26524035  0.053154217  0.10809392
##   0.02  0.6339744  0.25549154  0.032310914  0.06169834
##   0.03  0.6389744  0.26992125  0.040226367  0.08084489
##   0.04  0.6441026  0.28267457  0.049889204  0.10360430
##   0.05  0.6616026  0.32203736  0.054799632  0.11075452
##   0.06  0.6616026  0.32203736  0.054799632  0.11075452
##   0.07  0.6616026  0.32203736  0.054799632  0.11075452
##   0.08  0.6616026  0.32203736  0.054799632  0.11075452
##   0.09  0.6616026  0.32203736  0.054799632  0.11075452
##   0.10  0.6616026  0.32203736  0.054799632  0.11075452
##   0.11  0.6616026  0.32203736  0.054799632  0.11075452
##   0.12  0.6616026  0.32203736  0.054799632  0.11075452
##   0.13  0.6616026  0.32203736  0.054799632  0.11075452
##   0.14  0.6616026  0.32203736  0.054799632  0.11075452
##   0.15  0.6616026  0.32203736  0.054799632  0.11075452
##   0.16  0.6616026  0.32203736  0.054799632  0.11075452
##   0.17  0.6616026  0.32203736  0.054799632  0.11075452
##   0.18  0.6616026  0.32203736  0.054799632  0.11075452
##   0.19  0.6616026  0.32203736  0.054799632  0.11075452
##   0.20  0.6616026  0.32203736  0.054799632  0.11075452
##   0.21  0.6616026  0.32203736  0.054799632  0.11075452
##   0.22  0.6616026  0.32203736  0.054799632  0.11075452
##   0.23  0.6616026  0.32203736  0.054799632  0.11075452
##   0.24  0.6410897  0.27321847  0.058935986  0.13444037
##   0.25  0.5935897  0.14790079  0.046677154  0.13564530
##   0.26  0.5605128  0.05759208  0.025546836  0.09326580
##   0.27  0.5605128  0.05759208  0.025546836  0.09326580
##   0.28  0.5453846  0.00000000  0.005958436  0.00000000
##   0.29  0.5453846  0.00000000  0.005958436  0.00000000
##   0.30  0.5453846  0.00000000  0.005958436  0.00000000
##   0.31  0.5453846  0.00000000  0.005958436  0.00000000
##   0.32  0.5453846  0.00000000  0.005958436  0.00000000
##   0.33  0.5453846  0.00000000  0.005958436  0.00000000
##   0.34  0.5453846  0.00000000  0.005958436  0.00000000
##   0.35  0.5453846  0.00000000  0.005958436  0.00000000
##   0.36  0.5453846  0.00000000  0.005958436  0.00000000
##   0.37  0.5453846  0.00000000  0.005958436  0.00000000
##   0.38  0.5453846  0.00000000  0.005958436  0.00000000
##   0.39  0.5453846  0.00000000  0.005958436  0.00000000
##   0.40  0.5453846  0.00000000  0.005958436  0.00000000
##   0.41  0.5453846  0.00000000  0.005958436  0.00000000
##   0.42  0.5453846  0.00000000  0.005958436  0.00000000
##   0.43  0.5453846  0.00000000  0.005958436  0.00000000
##   0.44  0.5453846  0.00000000  0.005958436  0.00000000
##   0.45  0.5453846  0.00000000  0.005958436  0.00000000
##   0.46  0.5453846  0.00000000  0.005958436  0.00000000
##   0.47  0.5453846  0.00000000  0.005958436  0.00000000
##   0.48  0.5453846  0.00000000  0.005958436  0.00000000
##   0.49  0.5453846  0.00000000  0.005958436  0.00000000
##   0.50  0.5453846  0.00000000  0.005958436  0.00000000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.23.
# Create a new CART model using a complexity parameter (cp) instead of minbucket
# NOTE(review): cp is hard-coded to 0.18, while the recorded cross-validation
# summary reports cp = 0.23 as the selected value; confirm which is intended.
StevensTreeCV <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt + Unconst,
  data = Train,
  method = "class",
  cp = 0.18
)

# Make predictions: predicted class for every test case
PredictCV <- predict(StevensTreeCV, newdata = Test, type = "class")
# Confusion matrix: rows are the actual outcome, columns the predicted class
ConfusionCV <- table(Test$Reverse, PredictCV)
ConfusionCV
##    PredictCV
##      0  1
##   0 51 26
##   1 28 65
# Accuracy = correctly classified cases (the diagonal) / all test cases.
# It is computed from the table itself rather than from typed-in counts,
# which did not match the confusion matrix printed above.
sum(diag(ConfusionCV)) / sum(ConfusionCV)
## [1] 0.6823529
# Plot the cross-validated tree
prp(StevensTreeCV)