Load data
# Read the Stevens case data. stringsAsFactors = TRUE keeps the text columns
# (Docket, Circuit, Issue, ...) as factors on R >= 4.0, where read.csv()
# would otherwise return them as character vectors; the models fitted on
# this data expect categorical predictors to be factors.
stevens <- read.csv("stevens.csv", stringsAsFactors = TRUE)
# Show the column types and the first few values of every column.
str(stevens)
## 'data.frame': 566 obs. of 9 variables:
## $ Docket : Factor w/ 566 levels "00-1011","00-1045",..: 63 69 70 145 97 181 242 289 334 436 ...
## $ Term : int 1994 1994 1994 1994 1995 1995 1996 1997 1997 1999 ...
## $ Circuit : Factor w/ 13 levels "10th","11th",..: 4 11 7 3 9 11 13 11 12 2 ...
## $ Issue : Factor w/ 11 levels "Attorneys","CivilRights",..: 5 5 5 5 9 5 5 5 5 3 ...
## $ Petitioner: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Respondent: Factor w/ 12 levels "AMERICAN.INDIAN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LowerCourt: Factor w/ 2 levels "conser","liberal": 2 2 2 1 1 1 1 1 1 1 ...
## $ Unconst : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Reverse : int 1 1 1 1 1 0 1 1 1 1 ...
Split data into training and test sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
# Fix the RNG state so the split below is the same on every run.
set.seed(3000)
# sample.split() is given the outcome column and a 0.7 ratio; rows flagged
# TRUE go to the training set, the remaining rows to the test set.
spl <- sample.split(stevens$Reverse, SplitRatio = 0.7)
train <- stevens[spl, ]
test <- stevens[!spl, ]
Build CART model
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# Classification tree for Reverse built from the six case descriptors,
# fitted on the training set with minbucket = 25.
stevensTree <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 25
)
# Draw the fitted tree.
prp(x = stevensTree)
Predict using test data
# Hard class predictions for the held-out cases.
predictCART <- predict(object = stevensTree, newdata = test, type = "class")
# Cross-tabulate the true outcome (rows) against the prediction (columns).
confmat <- table(test$Reverse, predictCART)
N <- sum(confmat)
# Accuracy: correctly classified cases (the diagonal) over all cases.
# The outer parentheses print the value as well as storing it.
(accuracy <- sum(diag(confmat)) / N)
## [1] 0.6588235
# Baseline: accuracy of always predicting Reverse = 1. Row 2 of confmat
# holds the test cases whose true Reverse value is 1.
baseline.accuracy <- sum(confmat[2, ]) / sum(confmat)
Generate ROC curve
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.3
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Without type = "class", predict() returns one probability column per class.
predictROC <- predict(object = stevensTree, newdata = test)
# Peek at the first rows of the probability matrix.
head(x = predictROC)
## 0 1
## 1 0.3035714 0.6964286
## 3 0.3035714 0.6964286
## 4 0.4000000 0.6000000
## 6 0.4000000 0.6000000
## 8 0.4000000 0.6000000
## 21 0.3035714 0.6964286
# Pair the predicted probability of class 1 with the true labels for ROCR.
pred <- prediction(predictions = predictROC[, "1"], labels = test$Reverse)
# True-positive rate on the y axis against false-positive rate on the x axis.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
Compute AUC
# Area under the ROC curve, unwrapped from the y.values list slot.
performance(pred, measure = "auc")@y.values[[1]]
## [1] 0.6927105
Change minbucket to see how the number of splits changes:
# Refit the same tree with minbucket = 5 and plot it.
small <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 5
)
prp(x = small)

# Refit once more with minbucket = 100 and plot it for comparison.
big <- rpart(
  formula = Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = train,
  method = "class",
  minbucket = 100
)
prp(x = big)
Build random forest. The outcome variable needs to be a factor so that randomForest performs classification rather than regression. Choose nodesize=25 (the counterpart of minbucket in CART)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# randomForest() treats a numeric response as a regression target, so the
# 0/1 outcome is converted to a factor in both sets to get classification.
train$Reverse <- as.factor(train$Reverse)
test$Reverse <- as.factor(test$Reverse)
# Growing a forest draws random samples; seed the RNG first so the fitted
# forest (and any accuracy computed from it) is the same on every run.
set.seed(100)
stevensForest <- randomForest(
  Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = train,
  nodesize = 25,
  ntree = 200
)
Make prediction using test data
# Predicted classes from the forest for the held-out cases.
predictForest <- predict(object = stevensForest, newdata = test)
# Rows: true Reverse value; columns: forest prediction.
confmat <- table(test$Reverse, predictForest)
N <- sum(confmat)
# Share of test cases on the diagonal, i.e. classified correctly.
# The outer parentheses print the value as well as storing it.
(accuracy <- sum(diag(confmat)) / N)
## [1] 0.6882353
Set the seed to 100 and then to 200, and compute the accuracy each time
# Grow a random forest after seeding the RNG and return its accuracy on the
# evaluation data.
#
# Args:
#   seed:      value passed to set.seed() before the forest is grown.
#   trainData: data frame used to fit the forest. Defaults to the global
#              `train`, which is what the function used before it took
#              this argument.
#   testData:  data frame the forest is evaluated on. Defaults to the
#              global `test`.
#
# Returns:
#   The fraction of rows in testData whose predicted Reverse value equals
#   the true one.
computeAccuracy <- function(seed, trainData = train, testData = test) {
  set.seed(seed)
  forest <- randomForest(
    Reverse ~ Circuit + Issue + Petitioner + Respondent +
      LowerCourt + Unconst,
    data = trainData,
    nodesize = 25,
    ntree = 200
  )
  predictForest <- predict(forest, newdata = testData)
  # Rows are the true outcome, columns the prediction; the diagonal holds
  # the correctly classified cases.
  confmat <- table(testData$Reverse, predictForest)
  sum(diag(confmat)) / sum(confmat)
}
# Accuracy of the forest grown with seed 100.
computeAccuracy(100)
## [1] 0.6882353
# Same forest settings, seed 200.
computeAccuracy(200)
## [1] 0.7058824
How to choose the tree's complexity instead of guessing minbucket? Use k-fold cross-validation to tune the cp parameter!
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 3.1.3
# Number of cross-validation folds.
kfolds <- 10
# Resampling scheme: k-fold cross-validation.
numFolds <- trainControl(method = "cv", number = kfolds)
# Candidate complexity parameters: 0.01, 0.02, ..., 0.50.
cpGrid <- expand.grid(.cp = seq(from = 0.01, to = 0.5, by = 0.01))
# Cross-validate an rpart model at every cp in the grid. The outer
# parentheses print the fitted object as well as storing it.
# NOTE(review): no seed is set before this call, so the fold assignment can
# differ between runs.
(cpRes <- train(
  Reverse ~ Circuit + Issue + Petitioner + Respondent + LowerCourt +
    Unconst,
  data = train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
))
## CART
##
## 396 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 357, 356, 357, 356, 356, 356, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.01 0.6365385 0.252522710 0.045831216 0.10138314
## 0.02 0.6337179 0.248281522 0.061267954 0.12752003
## 0.03 0.6314103 0.251796733 0.053552823 0.11468412
## 0.04 0.6314103 0.253786180 0.053552823 0.11438075
## 0.05 0.6440385 0.282995035 0.062472910 0.13104160
## 0.06 0.6440385 0.282995035 0.062472910 0.13104160
## 0.07 0.6440385 0.282995035 0.062472910 0.13104160
## 0.08 0.6440385 0.282995035 0.062472910 0.13104160
## 0.09 0.6440385 0.282995035 0.062472910 0.13104160
## 0.10 0.6440385 0.282995035 0.062472910 0.13104160
## 0.11 0.6440385 0.282995035 0.062472910 0.13104160
## 0.12 0.6440385 0.282995035 0.062472910 0.13104160
## 0.13 0.6440385 0.282995035 0.062472910 0.13104160
## 0.14 0.6440385 0.282995035 0.062472910 0.13104160
## 0.15 0.6440385 0.282995035 0.062472910 0.13104160
## 0.16 0.6440385 0.282995035 0.062472910 0.13104160
## 0.17 0.6440385 0.282995035 0.062472910 0.13104160
## 0.18 0.6440385 0.282995035 0.062472910 0.13104160
## 0.19 0.6440385 0.282995035 0.062472910 0.13104160
## 0.20 0.6085897 0.193703966 0.058244587 0.14192310
## 0.21 0.5807692 0.121202966 0.046444754 0.12714614
## 0.22 0.5605128 0.062732119 0.032700267 0.09526381
## 0.23 0.5428846 0.003553299 0.008506582 0.01123652
## 0.24 0.5428846 0.003553299 0.008506582 0.01123652
## 0.25 0.5453846 0.000000000 0.005958436 0.00000000
## 0.26 0.5453846 0.000000000 0.005958436 0.00000000
## 0.27 0.5453846 0.000000000 0.005958436 0.00000000
## 0.28 0.5453846 0.000000000 0.005958436 0.00000000
## 0.29 0.5453846 0.000000000 0.005958436 0.00000000
## 0.30 0.5453846 0.000000000 0.005958436 0.00000000
## 0.31 0.5453846 0.000000000 0.005958436 0.00000000
## 0.32 0.5453846 0.000000000 0.005958436 0.00000000
## 0.33 0.5453846 0.000000000 0.005958436 0.00000000
## 0.34 0.5453846 0.000000000 0.005958436 0.00000000
## 0.35 0.5453846 0.000000000 0.005958436 0.00000000
## 0.36 0.5453846 0.000000000 0.005958436 0.00000000
## 0.37 0.5453846 0.000000000 0.005958436 0.00000000
## 0.38 0.5453846 0.000000000 0.005958436 0.00000000
## 0.39 0.5453846 0.000000000 0.005958436 0.00000000
## 0.40 0.5453846 0.000000000 0.005958436 0.00000000
## 0.41 0.5453846 0.000000000 0.005958436 0.00000000
## 0.42 0.5453846 0.000000000 0.005958436 0.00000000
## 0.43 0.5453846 0.000000000 0.005958436 0.00000000
## 0.44 0.5453846 0.000000000 0.005958436 0.00000000
## 0.45 0.5453846 0.000000000 0.005958436 0.00000000
## 0.46 0.5453846 0.000000000 0.005958436 0.00000000
## 0.47 0.5453846 0.000000000 0.005958436 0.00000000
## 0.48 0.5453846 0.000000000 0.005958436 0.00000000
## 0.49 0.5453846 0.000000000 0.005958436 0.00000000
## 0.50 0.5453846 0.000000000 0.005958436 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.19.
# Line plot of cross-validated accuracy against the cp value tried.
with(
  cpRes$results,
  plot(cp, Accuracy, type = "l", xlab = "cp", ylab = "accuracy")
)
Now create a new model with the cp value selected by cross-validation
# Refit the tree on the training set with the cross-validated cp.
# cpRes$bestTune is a one-row data frame of tuning values, while rpart's cp
# is a single number, so the value is pulled out of its only column rather
# than passing the data frame itself.
stevensTreeCV <- rpart(
  Reverse ~ Circuit + Issue + Petitioner + Respondent +
    LowerCourt + Unconst,
  data = train,
  method = "class",
  cp = cpRes$bestTune[[1]]
)
# Hard class predictions for the held-out cases.
predictCV <- predict(stevensTreeCV, newdata = test, type = "class")
# Rows: true Reverse value; columns: prediction.
confmat <- table(test$Reverse, predictCV)
# Accuracy: the diagonal (correct cases) over all cases.
sum(diag(confmat)) / sum(confmat)
## [1] 0.7235294
Plot the tree
# Draw the tree fitted with the cross-validated cp.
prp(x = stevensTreeCV)