In class we reviewed methods for creating a decision tree and using a k-fold analysis.
Please perform the following:
[X] Display the charts and associated accuracy for the provided data.
Load your data, capture accuracy, and display the graph.
Compare baseline accuracy to 5-fold cross validation accuracy.
# Load the raw CSV from the working directory. A bare relative file name is
# used because a Windows-style ".\\" prefix does not resolve on other OSes.
data_raw <- read.csv("Decision Tree Basics.csv")

# Build the modelling frame in three steps:
#   1. Drop every row that has an NA in *any* raw column. This runs before
#      columns are removed, so NAs in unused columns also discard rows.
#   2. Remove the columns that are not used as predictors. any_of() keeps
#      the step tolerant of names that are absent from the file. Columns
#      that are dropped are not recoded first, and `date_` is dropped
#      directly rather than being renamed to `date` beforehand.
#   3. Recode the remaining coded columns as labelled factors. factor()
#      maps any value outside `levels` to NA.
data <-
  data_raw %>%
  na.omit() %>%
  dplyr::select(
    -any_of(c(
      "the_geom", "the_geom_webmercator", "objectid", "dc_key", "code",
      "date_", "date", "time", "wound",
      "offender_injured", "offender_deceased",
      "point_x", "point_y", "location", "dist", "lat", "lng",
      "outside", "inside"
    ))
  ) %>%
  mutate(
    sex = factor(sex, levels = c("M", "F"), labels = c("Male", "Female")),
    fatal = factor(fatal, levels = 0:1, labels = c("No", "Yes")),
    latino = factor(latino, levels = 0:1, labels = c("No", "Yes")),
    officer_involved = factor(
      officer_involved,
      levels = c("N", "Y"),
      labels = c("No", "Yes")
    )
  )
# Seed the RNG once so the random steps that follow are reproducible.
set.seed(0)

k <- 5    # number of cross-validation folds (passed to trainControl below)
sr <- .8  # share of rows assigned to the training partition

# Split on the outcome column; createDataPartition samples within each
# class of `fatal`. `list = FALSE` returns row indices as a matrix rather
# than a list, and uses the literal FALSE because `F` can be reassigned.
indx <- createDataPartition(data$fatal, p = sr, list = FALSE)
train <- data[indx, ]
test <- data[-indx, ]

# Sanity check: every row landed in exactly one of the two partitions.
nrow(data) == nrow(train) + nrow(test)

# Class balance of `fatal` in the full data and in each partition.
prop.table(xtabs(~ data$fatal))
prop.table(xtabs(~ train$fatal))
prop.table(xtabs(~ test$fatal))
# Train a classifier for `fatal` on all remaining columns using k-fold
# cross-validation.
#
# - `caret::train` is namespaced because the training data frame is also
#   called `train`; the qualified call makes it unambiguous which one is
#   the function.
# - `sampling = "down"` down-samples the majority class during resampling.
# - The former `p = sr` argument was removed: trainControl's `p` is the
#   training percentage for leave-group-out CV and is not used when
#   method = "cv".
# - TRUE is spelled out because `T` can be reassigned.
#
# NOTE(review): method = "rf" fits a random forest, whereas the assignment
# text asks for a decision tree (e.g. method = "rpart") -- confirm which
# model is intended before changing it, since the reported results below
# were produced with "rf".
fit <-
  caret::train(
    form = fatal ~ .,
    data = train,
    method = "rf",
    trControl = trainControl(
      method = "cv",
      number = k,
      verboseIter = TRUE,
      savePredictions = TRUE,
      sampling = "down"
    )
  )
# Score the held-out partition with the fitted model.
pre <- predict(fit, test)

# Pair each actual outcome with its prediction for tabulation.
result <- data.frame(ground_truth = test$fatal, predict = pre)

# Raw cross-tabulation of actual vs. predicted classes.
xtabs(~ ground_truth + predict, data = result)

# Full set of classification metrics, treating "Yes" as the positive class.
confusionMatrix(data = pre, reference = test$fatal, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1124 293
## Yes 977 252
##
## Accuracy : 0.52
## 95% CI : (0.5008, 0.5392)
## No Information Rate : 0.794
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0018
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.46239
## Specificity : 0.53498
## Pos Pred Value : 0.20504
## Neg Pred Value : 0.79323
## Prevalence : 0.20597
## Detection Rate : 0.09524
## Detection Prevalence : 0.46447
## Balanced Accuracy : 0.49868
##
## 'Positive' Class : Yes
##
# Plot the fitted caret model; for a `train` object this shows the
# resampled performance across the candidate tuning-parameter values.
plot(fit)