#Load the dataset / Remove all rows after 1953
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
sapd <- fread("C:\\Users\\lsunb\\Downloads\\pubsafedash_cfs (1).csv")
sapd <- sapd[1:1953, ]
str(sapd)
## Classes 'data.table' and 'data.frame': 1953 obs. of 11 variables:
## $ Master_Incident_Number: chr "SAPD-2025-0331214" "SAPD-2025-0358398" "SAPD-2025-0354704" "SAPD-2025-0293910" ...
## $ Response_Date : POSIXct, format: "2025-03-10 01:49:41" "2025-03-15 05:05:06" ...
## $ Priority : int 6 3 6 5 2 3 6 4 2 4 ...
## $ Problem : chr "SAPD Call - NonEmerg" "Burglary Vehicle In Progress" "Recovered Stolen Property" "Disturbance Neighbor" ...
## $ Service_Area : chr "SOUTH" "CENTRAL" "WEST" "EAST" ...
## $ Type : chr "Non-Emergency" "Non-Emergency" "Non-Emergency" "Non-Emergency" ...
## $ Seconds : num NA 190 631 NA 345 ...
## $ Weekday : chr "Mon" "Sat" "Fri" "Sun" ...
## $ Disposition_Groups : chr "NOR-Outside Agency" "Cleared-Burg Business" "Cleared-Information" "Cleared-Information" ...
## $ Disposition_Type : chr "NOR" "Cleared" "Cleared" "Cleared" ...
## $ Postal_Code : chr "78112" "78201" "78201" "78202" ...
## - attr(*, ".internal.selfref")=<externalptr>
head(sapd)
## Master_Incident_Number Response_Date Priority
## <char> <POSc> <int>
## 1: SAPD-2025-0331214 2025-03-10 01:49:41 6
## 2: SAPD-2025-0358398 2025-03-15 05:05:06 3
## 3: SAPD-2025-0354704 2025-03-14 14:14:14 6
## 4: SAPD-2025-0293910 2025-03-02 21:55:28 5
## 5: SAPD-2025-0421747 2025-03-27 02:43:32 2
## 6: SAPD-2025-0373561 2025-03-18 01:50:44 3
## Problem Service_Area Type Seconds Weekday
## <char> <char> <char> <num> <char>
## 1: SAPD Call - NonEmerg SOUTH Non-Emergency NA Mon
## 2: Burglary Vehicle In Progress CENTRAL Non-Emergency 190 Sat
## 3: Recovered Stolen Property WEST Non-Emergency 631 Fri
## 4: Disturbance Neighbor EAST Non-Emergency NA Sun
## 5: Burglary (In Progress) EAST Non-Emergency 345 Thu
## 6: Wrong Way Driver SOUTH Non-Emergency NA Tue
## Disposition_Groups Disposition_Type Postal_Code
## <char> <char> <char>
## 1: NOR-Outside Agency NOR 78112
## 2: Cleared-Burg Business Cleared 78201
## 3: Cleared-Information Cleared 78201
## 4: Cleared-Information Cleared 78202
## 5: NOR-Disturbance NOR 78203
## 6: Cleared-Information Cleared 78204
summary(sapd)
## Master_Incident_Number Response_Date Priority
## Length:1953 Min. :2025-03-01 00:05:37.00 Min. :1.000
## Class :character 1st Qu.:2025-03-09 03:02:44.00 1st Qu.:3.000
## Mode :character Median :2025-03-16 23:20:49.00 Median :4.000
## Mean :2025-03-16 17:22:10.95 Mean :4.511
## 3rd Qu.:2025-03-23 23:47:30.00 3rd Qu.:6.000
## Max. :2025-03-31 23:16:47.00 Max. :8.000
##
## Problem Service_Area Type Seconds
## Length:1953 Length:1953 Length:1953 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 340.8
## Mode :character Mode :character Mode :character Median : 726.5
## Mean : 1265.3
## 3rd Qu.: 1527.0
## Max. :17254.0
## NA's :501
## Weekday Disposition_Groups Disposition_Type Postal_Code
## Length:1953 Length:1953 Length:1953 Length:1953
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
#Data Cleaning (Remove "Disposition_Groups", "Master_Incident_Number", "Response_Date", "Postal_Code", "Problem")
sapd <- sapd[, -c(1, 2, 4, 9, 11)]
#Data Cleaning (Remove rows in "Seconds" that are missing)
sapd <- sapd[!is.na(sapd$Seconds), ]
#Data Cleaning (Remove rows in "Disposition_Type" that are "Cancelled_Call)
sapd <- sapd[sapd$Disposition_Type != "Cancelled Call", ]
#Converting categories to factors
sapd$Priority <- as.factor(sapd$Priority)
sapd$Service_Area <- as.factor(sapd$Service_Area)
sapd$Type <- as.factor(sapd$Type)
sapd$Weekday <- as.factor(sapd$Weekday)
sapd$Disposition_Type <- as.factor(sapd$Disposition_Type)
#Question 1 Response Time vs Priority
library(ggplot2)
# Boxplot to visualize variation
ggplot(sapd, aes(x = Priority, y = Seconds)) +
geom_boxplot(fill = "steelblue") +
labs(title = "Response Time in Seconds by Priority Level", y = "Response Time (seconds)")

sapd %>%
group_by(Priority) %>%
summarise(
mean_response = mean(Seconds, na.rm = TRUE),
median_response = median(Seconds, na.rm = TRUE),
n = n()
)
## # A tibble: 8 × 4
## Priority mean_response median_response n
## <fct> <dbl> <dbl> <int>
## 1 1 310. 315 28
## 2 2 581. 442. 270
## 3 3 641. 439 111
## 4 4 1577. 965 462
## 5 5 1640. 1066 139
## 6 6 1521. 1041 191
## 7 7 1680. 1062 209
## 8 8 34 0 41
#The time it takes to respond in seconds is highest when the priority of the case is 4. Then it goes down to 7 and 6.
#Question 2 Proportion of unresolved calls (‘NOR’) different across days of the week
table_weekday <- table(sapd$Weekday, sapd$Disposition_Type)
chisq.test(table_weekday)
##
## Pearson's Chi-squared test
##
## data: table_weekday
## X-squared = 10.97, df = 6, p-value = 0.0893
#Null hypothesis: The proportion of NOR cases is the same across all days of the week.
#Alternative hypothesis: The proportion of NOR cases differs by weekday.
#The p-value is >0.05 therefore we fail reject the null. This means that there not is a significant relationship between weekday and whether a case is cleared or unresolved.
#Question 2 Barplot Weekday
sapd$Weekday <- factor(sapd$Weekday, levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
sapd$Disposition_Type <- factor(sapd$Disposition_Type, levels = c("Cleared", "NOR"))
# Create bar plot
ggplot(sapd, aes(x = Weekday, fill = Disposition_Type)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Disposition Type by Weekday",
x = "Day of the Week",
y = "Proportion",
fill = "Disposition Type") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()

#Question 2 Proportion of unresolved calls (‘NOR’) based on Service Area
table_area <- table(sapd$Service_Area, sapd$Disposition_Type)
chisq.test(table_area)
##
## Pearson's Chi-squared test
##
## data: table_area
## X-squared = 21.615, df = 6, p-value = 0.001422
#Null hypothesis: The proportion of NOR cases is the same across all days of the week.
#Alternative hypothesis: The proportion of NOR cases differs by weekday.
#The p-value is <0.05 therefore we reject the null. This means that there is a significant relationship between weekday and whether a case is cleared or unresolved.
#Question 2 Barplot Service Area
sapd$Disposition_Type <- factor(sapd$Disposition_Type, levels = c("Cleared", "NOR"))
ggplot(sapd, aes(x = Service_Area, fill = Disposition_Type)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Disposition Type by Service Area",
x = "Service Area",
y = "Proportion",
fill = "Disposition Type") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Question 3 Pattern in calls by day of week
ggplot(sapd, aes(x = Weekday)) +
geom_bar(fill = "red") +
labs(title = "Number of Calls by Day of the Week", x = "Day", y = "Number of Calls")

#Checking class distribution of Disposition Type
prop.table(table(sapd$Disposition_Type))
##
## Cleared NOR
## 0.3735355 0.6264645
#Check for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
sapd_predictors <- sapd %>%
select(-Disposition_Type)
print(names(sapd_predictors))
## [1] "Priority" "Service_Area" "Type" "Seconds" "Weekday"
sapd_predictors_reduced <- sapd %>%
select(Priority, Service_Area, Seconds, Weekday)
sapd_predictors_reduced_with_target <- cbind(sapd_predictors_reduced, Disposition_Type = sapd$Disposition_Type)
# Fit the logistic regression model
lm_model <- glm(Disposition_Type ~ Priority + Service_Area + Seconds + Weekday,
data = sapd_predictors_reduced_with_target, family = "binomial")
# Check for multicollinearity using VIF
library(car)
vif(lm_model)
## GVIF Df GVIF^(1/(2*Df))
## Priority 1.190016 7 1.012504
## Service_Area 1.103889 6 1.008271
## Seconds 1.093910 1 1.045901
## Weekday 1.063755 6 1.005164
#Looking at the VIF above, there seems to be no multicollinearity.
# One-hot encode the categorical variables
library(caret)
# Use dummyVars to create one-hot encoding
dummies <- dummyVars(Disposition_Type ~ Priority + Service_Area + Weekday,
data = sapd_predictors_reduced_with_target)
sapd_predictors_encoded <- predict(dummies, newdata = sapd_predictors_reduced_with_target)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'Disposition_Type' is not a factor
# Add 'Seconds' to the encoded data (assuming 'Seconds' is continuous)
sapd_predictors_encoded <- cbind(sapd_predictors_encoded, sapd_predictors_reduced_with_target$Seconds)
# View the encoded dataset
head(sapd_predictors_encoded)
## Priority.1 Priority.2 Priority.3 Priority.4 Priority.5 Priority.6 Priority.7
## 1 0 0 1 0 0 0 0
## 2 0 0 0 0 0 1 0
## 3 0 1 0 0 0 0 0
## 4 0 0 0 1 0 0 0
## 5 0 1 0 0 0 0 0
## 6 0 0 0 1 0 0 0
## Priority.8 Service_Area.CENTRAL Service_Area.DOWNTOWN Service_Area.EAST
## 1 0 1 0 0
## 2 0 0 0 0
## 3 0 0 0 1
## 4 0 1 0 0
## 5 0 0 1 0
## 6 0 0 0 0
## Service_Area.NORTH Service_Area.PRUE Service_Area.SOUTH Service_Area.WEST
## 1 0 0 0 0
## 2 0 0 0 1
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 1 0 0 0
## Weekday.Sun Weekday.Mon Weekday.Tue Weekday.Wed Weekday.Thu Weekday.Fri
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 1
## 3 0 0 0 0 1 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 1
## 6 0 0 0 0 1 0
## Weekday.Sat
## 1 1 190
## 2 0 631
## 3 0 345
## 4 1 0
## 5 0 399
## 6 0 1408
# Calculate correlation matrix for one-hot encoded variables
cor_matrix <- cor(sapd_predictors_encoded)
# Visualize the correlation matrix using the corrplot package
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Split the dataset into training and test sets (80% train, 20% test)
set.seed(123) # For reproducibility
train_index <- sample(1:nrow(sapd), size = 0.8 * nrow(sapd))
train_data <- sapd[train_index, ]
test_data <- sapd[-train_index, ]
table(train_data$Disposition_Type)
##
## Cleared NOR
## 430 730
library(caret)
library(dplyr)
# Ensure target is a factor
train_data$Disposition_Type <- as.factor(train_data$Disposition_Type)
# Separate predictors and target
x_train <- train_data %>% select(-Disposition_Type)
y_train <- train_data$Disposition_Type
# Perform upsampling
train_data_upsampled <- upSample(x = x_train, y = y_train)
# Check new class distribution
table(train_data_upsampled$Class)
##
## Cleared NOR
## 730 730
names(train_data_upsampled)[names(train_data_upsampled) == "Class"] <- "Disposition_Type"
set.seed(123) # For reproducibility
# Define 10-fold cross-validation
cv_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final")
# Convert target to factor with explicit levels
train_data_upsampled$Disposition_Type <- factor(train_data_upsampled$Disposition_Type, levels = c("Cleared", "NOR"))
# Fit logistic regression with cross-validation
logit_cv_model <- train(Disposition_Type ~ Priority + Service_Area + Seconds + Weekday,
data = train_data_upsampled,
method = "glm",
family = "binomial",
trControl = cv_control,
metric = "ROC")
# View model performance
print(logit_cv_model)
## Generalized Linear Model
##
## 1460 samples
## 4 predictor
## 2 classes: 'Cleared', 'NOR'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1314, 1314, 1314, 1314, 1314, 1314, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6517639 0.6287671 0.5849315
# Predict on the test set
test_preds <- predict(logit_cv_model, newdata = test_data)
# Ensure true labels are a factor with same levels
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))
# Confusion matrix and accuracy
conf_mat <- confusionMatrix(test_preds, test_true)
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 67 73
## NOR 45 106
##
## Accuracy : 0.5945
## 95% CI : (0.5356, 0.6514)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.78372
##
## Kappa : 0.1819
##
## Mcnemar's Test P-Value : 0.01294
##
## Sensitivity : 0.5982
## Specificity : 0.5922
## Pos Pred Value : 0.4786
## Neg Pred Value : 0.7020
## Prevalence : 0.3849
## Detection Rate : 0.2302
## Detection Prevalence : 0.4811
## Balanced Accuracy : 0.5952
##
## 'Positive' Class : Cleared
##
library(caret)
library(dplyr)
library(glmnet) # For regularized logistic regression
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Normalize or standardize continuous features
preProcValues <- preProcess(train_data, method = c("center", "scale"))
train_data_scaled <- predict(preProcValues, train_data)
# Perform upsampling
x_train <- train_data_scaled %>% select(-Disposition_Type)
y_train <- train_data_scaled$Disposition_Type
train_data_upsampled <- upSample(x = x_train, y = y_train)
# Define 10-fold cross-validation
cv_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final",
sampling = "up")
# Fit regularized logistic regression with cross-validation
logit_cv_model <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
data = train_data_upsampled,
method = "glmnet",
family = "binomial",
trControl = cv_control,
metric = "ROC")
# View model performance
print(logit_cv_model)
## glmnet
##
## 1460 samples
## 4 predictor
## 2 classes: 'Cleared', 'NOR'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1314, 1314, 1314, 1314, 1314, 1314, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.10 0.000152701 0.6613342 0.6356164 0.5808219
## 0.10 0.001527010 0.6611278 0.6410959 0.5726027
## 0.10 0.015270103 0.6534622 0.6465753 0.5452055
## 0.55 0.000152701 0.6612967 0.6328767 0.5808219
## 0.55 0.001527010 0.6612591 0.6452055 0.5657534
## 0.55 0.015270103 0.6506474 0.6657534 0.5178082
## 1.00 0.000152701 0.6613530 0.6328767 0.5808219
## 1.00 0.001527010 0.6601145 0.6438356 0.5575342
## 1.00 0.015270103 0.6430944 0.7342466 0.4589041
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.000152701.
# Predict on the test set (probabilities)
test_preds <- predict(logit_cv_model, newdata = test_data, type = "prob")
# Ensure true labels are a factor with same levels (Cleared = 0, NOR = 1)
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))
# Adjust the threshold (e.g., classify as 'NOR' if probability > 0.6)
test_preds_adjusted <- ifelse(test_preds$NOR > 0.6, "NOR", "Cleared")
# Convert to factor to match levels in test_true
test_preds_adjusted <- factor(test_preds_adjusted, levels = c("Cleared", "NOR"))
# Confusion matrix and accuracy
conf_mat <- confusionMatrix(test_preds_adjusted, test_true, positive = "NOR")
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 12 4
## NOR 100 175
##
## Accuracy : 0.6426
## 95% CI : (0.5846, 0.6977)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.1833
##
## Kappa : 0.101
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9777
## Specificity : 0.1071
## Pos Pred Value : 0.6364
## Neg Pred Value : 0.7500
## Prevalence : 0.6151
## Detection Rate : 0.6014
## Detection Prevalence : 0.9450
## Balanced Accuracy : 0.5424
##
## 'Positive' Class : NOR
##
library(dplyr)
train_data_upsampled <- train_data_upsampled %>%
select(-Type)
install.packages("rpart.plot")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'rpart.plot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
install.packages("kernlab")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'kernlab' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'kernlab'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\lsunb\AppData\Local\R\win-library\4.4\00LOCK\kernlab\libs\x64\kernlab.dll
## to C:\Users\lsunb\AppData\Local\R\win-library\4.4\kernlab\libs\x64\kernlab.dll:
## Permission denied
## Warning: restored 'kernlab'
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(caret)
# Fit the initial (large) decision tree
tree_model <- rpart(Class ~ ., data = train_data_upsampled, method = "class", cp = 0.001)
# Print the CP table to find optimal complexity parameter
printcp(tree_model)
##
## Classification tree:
## rpart(formula = Class ~ ., data = train_data_upsampled, method = "class",
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] Priority Seconds Service_Area Weekday
##
## Root node error: 730/1460 = 0.5
##
## n= 1460
##
## CP nsplit rel error xerror xstd
## 1 0.2315068 0 1.00000 1.05753 0.026128
## 2 0.0059361 1 0.76849 0.80274 0.025657
## 3 0.0054795 11 0.69315 0.79315 0.025605
## 4 0.0047945 13 0.68219 0.78630 0.025567
## 5 0.0041096 20 0.63151 0.78219 0.025543
## 6 0.0036530 24 0.61096 0.78219 0.025543
## 7 0.0034247 34 0.55479 0.76027 0.025408
## 8 0.0031963 39 0.53562 0.76027 0.025408
## 9 0.0030822 42 0.52603 0.76027 0.025408
## 10 0.0027397 53 0.48767 0.74658 0.025317
## 11 0.0022831 58 0.47397 0.74247 0.025288
## 12 0.0020548 62 0.46301 0.73699 0.025250
## 13 0.0013699 64 0.45890 0.73699 0.025250
## 14 0.0010000 73 0.44521 0.71918 0.025118
plotcp(tree_model) # Visualize cross-validated error vs. cp

# Choose optimal CP value that minimizes xerror (cross-validated error)
opt_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
# Prune the tree using optimal cp
pruned_tree <- prune(tree_model, cp = opt_cp)
# Visualize the pruned tree
rpart.plot(pruned_tree, type = 2, extra = 106)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predict on the test set
preds <- predict(pruned_tree, newdata = test_data, type = "class")
# Confusion matrix
confusionMatrix(preds, test_data$Disposition_Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 45 34
## NOR 67 145
##
## Accuracy : 0.6529
## 95% CI : (0.5952, 0.7075)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.102379
##
## Kappa : 0.2242
##
## Mcnemar's Test P-Value : 0.001452
##
## Sensitivity : 0.4018
## Specificity : 0.8101
## Pos Pred Value : 0.5696
## Neg Pred Value : 0.6840
## Prevalence : 0.3849
## Detection Rate : 0.1546
## Detection Prevalence : 0.2715
## Balanced Accuracy : 0.6059
##
## 'Positive' Class : Cleared
##
install.packages("smotefamily")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'smotefamily' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.4.3
library(caret)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
set.seed(123)
ctrl <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final",
sampling = "up" # <-- Balances the classes by upsampling minority class
)
svm_linear_grid <- expand.grid(C = c(0.01, 0.1, 1, 10, 100, 1000))
svm_linear <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
data = train_data_upsampled,
method = "svmLinear",
trControl = ctrl,
metric = "ROC",
preProcess = c("center", "scale"))
# Custom tuning grid for radial SVM
svm_radial_grid <- expand.grid(
sigma = c(0.001, 0.01, 0.05, 0.1, 0.5),
C = c(0.1, 1, 10, 100)
)
svm_radial <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
data = train_data_upsampled,
method = "svmRadial",
trControl = ctrl,
metric = "ROC",
preProcess = c("center", "scale"),
tuneGrid = svm_radial_grid)
# Predict (use type="prob" if you want to threshold)
pred_linear <- predict(svm_linear, newdata = test_data)
pred_radial <- predict(svm_radial, newdata = test_data)
# True values
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))
# Confusion Matrices
confusionMatrix(pred_linear, test_true)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 71 70
## NOR 41 109
##
## Accuracy : 0.6186
## 95% CI : (0.5601, 0.6746)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.477827
##
## Kappa : 0.2316
##
## Mcnemar's Test P-Value : 0.007869
##
## Sensitivity : 0.6339
## Specificity : 0.6089
## Pos Pred Value : 0.5035
## Neg Pred Value : 0.7267
## Prevalence : 0.3849
## Detection Rate : 0.2440
## Detection Prevalence : 0.4845
## Balanced Accuracy : 0.6214
##
## 'Positive' Class : Cleared
##
confusionMatrix(pred_radial, test_true)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 9 3
## NOR 103 176
##
## Accuracy : 0.6357
## 95% CI : (0.5776, 0.6911)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.2547
##
## Kappa : 0.0764
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.08036
## Specificity : 0.98324
## Pos Pred Value : 0.75000
## Neg Pred Value : 0.63082
## Prevalence : 0.38488
## Detection Rate : 0.03093
## Detection Prevalence : 0.04124
## Balanced Accuracy : 0.53180
##
## 'Positive' Class : Cleared
##
# Load necessary library
library(caret)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Define the tuning grid for Random Forest
rf_grid <- expand.grid(mtry = c(1, 2, 3, 4))
# Train the Random Forest model with custom tuning grid
rf_model <- train(
Class ~ Priority + Service_Area + Seconds + Weekday,
data = train_data_upsampled,
method = "rf",
metric = "ROC", # Optimize based on ROC AUC
trControl = ctrl, # Your trainControl object
tuneGrid = rf_grid,
preProcess = c("center", "scale") # Optional, depending on your needs
)
# Make predictions on the test set
rf_preds <- predict(rf_model, newdata = test_data)
# Evaluate performance using confusion matrix
confusionMatrix(rf_preds, test_data$Disposition_Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cleared NOR
## Cleared 62 52
## NOR 50 127
##
## Accuracy : 0.6495
## 95% CI : (0.5916, 0.7043)
## No Information Rate : 0.6151
## P-Value [Acc > NIR] : 0.1259
##
## Kappa : 0.2622
##
## Mcnemar's Test P-Value : 0.9211
##
## Sensitivity : 0.5536
## Specificity : 0.7095
## Pos Pred Value : 0.5439
## Neg Pred Value : 0.7175
## Prevalence : 0.3849
## Detection Rate : 0.2131
## Detection Prevalence : 0.3918
## Balanced Accuracy : 0.6315
##
## 'Positive' Class : Cleared
##