# Load the cleaned dataset. The code below expects this call to define a data
# frame named `dat` (TODO confirm against the contents of the .rda file).
load("SQF_clean.rda")
To begin with, we need to identify a task. Recall from previous sections that the cs_* columns record the reasons for the stop. Can we build a model that predicts whether a pedestrian will be arrested, based on the following features?
(You should work through your own reasoning for feature selection.)
Now let’s try out some classification packages in R.
Let’s simply divide the dataset into two sets, a training set and a testing set, where no row in the testing set also occurs in the training set. We then evaluate the performance of our predictive models on the testing set.
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(5)

# Treat the outcome and every stop-reason flag (cs_*) as categorical.
dat$arstmade <- as.factor(dat$arstmade)
for (cs_col in grep("cs_", colnames(dat), value = TRUE)) {
  dat[[cs_col]] <- as.factor(dat[[cs_col]])
}

# Keep only the cs_* flags, the four extra predictors and the outcome,
# then drop every row that has a missing value in any kept column.
select <- grep("cs_", colnames(dat), value = TRUE)
select <- c(select, "sex", "race", "city", "timestop", "arstmade")
dat <- dat[, select]
dat <- dat[complete.cases(dat), ]

# Draw 75% of the row numbers at random for training; the rest is for testing.
index <- sample(nrow(dat), nrow(dat) * 0.75)
train <- dat[index, ]
test <- dat[-index, ]
In this case, the training set has 34340 instances and the testing set has 11447.
#' Binary-classification metrics from predicted and true labels
#'
#' Cross-tabulates `pred` against `truth` and derives accuracy, recall,
#' precision and F-measure. The first class level is treated as the negative
#' class and the second as the positive class.
#'
#' Both inputs are aligned to one shared set of class levels before they are
#' tabulated, so the confusion matrix always has a row and a column for each
#' class, even when a class never appears in `pred` (for example a model that
#' never predicts the positive class) or when `pred` is not a factor.
#'
#' @param pred Predicted labels (factor or atomic vector), one per observation.
#' @param truth True labels, same length as `pred`. When `truth` is a factor
#'   its levels define the class order; otherwise the levels of `pred` are
#'   used, or the sorted unique labels if neither input is a factor.
#' @return A data frame with a single column `Performance` and the rows
#'   `Accuracy`, `Recall`, `Precision` and `F-measure`. Recall, precision and
#'   F-measure are `NaN` when their denominator is zero.
performance <- function(pred, truth) {
  stopifnot(length(pred) == length(truth))

  lvls <- if (is.factor(truth)) {
    levels(truth)
  } else if (is.factor(pred)) {
    levels(pred)
  } else {
    sort(unique(c(as.character(pred), as.character(truth))))
  }
  if (length(lvls) < 2L) {
    stop("performance() needs at least two class levels", call. = FALSE)
  }

  # Re-encode both vectors on the shared levels so that absent classes still
  # get a (zero) row/column and the positional indexing below cannot fail.
  cm <- table(
    pred = factor(as.character(pred), levels = lvls),
    truth = factor(as.character(truth), levels = lvls)
  )

  # Rows are predictions, columns are the truth; index 2 is the positive class.
  tp <- cm[2, 2]
  tn <- cm[1, 1]
  fp <- cm[2, 1]
  fn <- cm[1, 2]

  # table() drops pairs containing NA, but they still count in length(truth),
  # so such observations are scored as incorrect.
  accuracy <- (tp + tn) / length(truth)
  recall <- tp / (tp + fn)
  precision <- tp / (tp + fp)
  fmeasure <- 2 * recall * precision / (recall + precision)

  data.frame(
    Performance = c(accuracy, recall, precision, fmeasure),
    row.names = c("Accuracy", "Recall", "Precision", "F-measure")
  )
}
# Decision tree (rpart): fit on the training split, score the held-out split.
library(rpart)
m <- rpart(
  arstmade ~ sex + race + city + timestop +
    cs_objcs + cs_descr + cs_casng + cs_lkout + cs_cloth +
    cs_drgtr + cs_furtv + cs_vcrim + cs_bulge,
  data = train,
  # Both settings are below rpart's documented defaults (minsplit = 20,
  # cp = 0.01), which permits a larger tree than the default configuration.
  control = rpart.control(minsplit = 10, cp = 1e-04)
)

# Hard class predictions for the test rows, compared with the recorded outcome.
pred <- predict(m, newdata = test, type = "class")
truth <- test$arstmade
performance(pred, truth)
## Performance
## Accuracy 0.8458985
## Recall 0.1192661
## Precision 0.4770642
## F-measure 0.1908257
# Naive Bayes (e1071): same predictors and the same train/test split as above.
library(e1071)
m <- naiveBayes(
  arstmade ~ sex + race + city + timestop +
    cs_objcs + cs_descr + cs_casng + cs_lkout + cs_cloth +
    cs_drgtr + cs_furtv + cs_vcrim + cs_bulge,
  data = train
)

# Predicted class labels for the test rows, then the summary metrics.
pred <- predict(m, newdata = test, type = "class")
truth <- test$arstmade
performance(pred, truth)
## Performance
## Accuracy 0.84747095
## Recall 0.03268349
## Precision 0.49137931
## F-measure 0.06129032
# Attach doParallel, register a parallel backend with its default settings,
# and report how many workers were registered.
# NOTE(review): no model call visible in this section uses foreach/%dopar%
# explicitly; confirm the backend is actually needed here.
library(doParallel)
registerDoParallel()
getDoParWorkers()
## [1] 4
# Support vector machine (e1071): same predictors, with the cost set to 100.
library(e1071)
m <- svm(
  arstmade ~ sex + race + city + timestop +
    cs_objcs + cs_descr + cs_casng + cs_lkout + cs_cloth +
    cs_drgtr + cs_furtv + cs_vcrim + cs_bulge,
  data = train,
  cost = 100
)

# Predictions for the test rows, then the summary metrics.
pred <- predict(m, newdata = test)
truth <- test$arstmade
performance(pred, truth)
## Performance
## Accuracy 0.84886870
## Recall 0.03841743
## Precision 0.55833333
## F-measure 0.07188841
# Random forest: same predictors, with the package's default settings.
library(randomForest)
m <- randomForest(
  arstmade ~ sex + race + city + timestop +
    cs_objcs + cs_descr + cs_casng + cs_lkout + cs_cloth +
    cs_drgtr + cs_furtv + cs_vcrim + cs_bulge,
  data = train
)

# Predicted class labels for the test rows, then the summary metrics.
pred <- predict(m, newdata = test, type = "class")
truth <- test$arstmade
performance(pred, truth)
## Performance
## Accuracy 0.85044116
## Recall 0.04357798
## Precision 0.63333333
## F-measure 0.08154506