Instructions

In class we reviewed methods for creating a decision tree and performing k-fold cross-validation.

Please perform the following:

[X] Display the charts and associated accuracy for the provided data.

Tasks

Load your data, capture accuracy, and display the graph.

Compare baseline accuracy to 5-fold cross validation accuracy.

Steps

Code

Load file

# Read the raw CSV from the current working directory. A bare relative
# filename is used so the path resolves on Windows, macOS and Linux alike
# (a ".\\" prefix is only understood on Windows).
data_raw <- read.csv("Decision Tree Basics.csv")

# Build the modelling table:
#   1. drop every row that has an NA in any column,
#   2. remove the columns that are not used as predictors,
#   3. recode the remaining categorical columns as labelled factors.
# NOTE(review): na.omit() runs before the unused columns are removed, so a row
# is discarded even when its only NA sits in a column that is dropped in
# step 2 -- confirm this is intended before reordering the steps.
data <-
  data_raw %>%
  na.omit() %>%
  mutate(
    the_geom = NULL,
    the_geom_webmercator = NULL,
    objectid = NULL,
    dc_key = NULL,
    code = NULL,
    date_ = NULL,
    time = NULL,
    wound = NULL,
    offender_injured = NULL,
    offender_deceased = NULL,
    point_x = NULL,
    point_y = NULL,
    location = NULL,
    dist = NULL,
    lat = NULL,
    lng = NULL,
    outside = NULL,
    inside = NULL
  ) %>%
  mutate(
    sex = factor(sex, levels = c("M", "F"), labels = c("Male", "Female")),
    # `fatal` is the outcome modelled further down; "Yes" is the label for 1.
    fatal = factor(fatal, levels = 0:1, labels = c("No", "Yes")),
    latino = factor(latino, levels = 0:1, labels = c("No", "Yes")),
    officer_involved = factor(
      officer_involved,
      levels = c("N", "Y"),
      labels = c("No", "Yes")
    )
  )

Use set.seed(0)

# Fix the random-number seed so the random steps that follow are reproducible.
set.seed(seed = 0)

Perform a 5-fold cross-validation on the data

# k  : number of cross-validation folds.
# sr : share of rows placed in the training partition.
k <- 5
sr <- 0.8

# Split the rows into training and hold-out sets, partitioning on the outcome
# `fatal`; `list = FALSE` returns the row indices as a matrix usable for
# subsetting.
indx <- createDataPartition(data$fatal, p = sr, list = FALSE)
train <- data[indx, ]
test <- data[-indx, ]

# Sanity check: prints TRUE when the two partitions account for every row.
nrow(data) == nrow(train) + nrow(test)

# Outcome proportions in the full data and in each partition.
prop.table(xtabs(~ data$fatal))
prop.table(xtabs(~ train$fatal))
prop.table(xtabs(~ test$fatal))

# Fit the model with k-fold cross-validation, down-sampling within resampling.
# The call is namespace-qualified because the data frame named `train` above
# shadows caret's train() function. `form` stays the first argument so S3
# dispatch selects the formula method.
# NOTE(review): method = "rf" fits a random forest, whereas the instructions
# talk about a decision tree -- confirm this is the intended model.
fit <-
  caret::train(
    form = fatal ~ .,
    data = train,
    trControl =
      trainControl(
        method = "cv",
        number = k,
        verboseIter = TRUE,
        savePredictions = TRUE,
        sampling = "down"
      ),
    method = "rf"
  )

# Predict the outcome for the hold-out rows.
pre <- predict(fit, newdata = test)

# Pair each prediction with its true label and cross-tabulate them.
result <- data.frame(
  ground_truth = test$fatal,
  predict = pre
)

xtabs(~ ground_truth + predict, data = result)

Capture the overall accuracy

# Confusion matrix and summary statistics for the hold-out predictions,
# treating "Yes" as the positive class.
confusionMatrix(
  data = pre,
  reference = test$fatal,
  positive = "Yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1124  293
##        Yes  977  252
##                                           
##                Accuracy : 0.52            
##                  95% CI : (0.5008, 0.5392)
##     No Information Rate : 0.794           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0018         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.46239         
##             Specificity : 0.53498         
##          Pos Pred Value : 0.20504         
##          Neg Pred Value : 0.79323         
##              Prevalence : 0.20597         
##          Detection Rate : 0.09524         
##    Detection Prevalence : 0.46447         
##       Balanced Accuracy : 0.49868         
##                                           
##        'Positive' Class : Yes             
## 

Chart the above

# Draw the default plot for the fitted model object.
plot(x = fit)