#Load the dataset / Remove all rows after 1953
library(dplyr) 
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
sapd <- fread("C:\\Users\\lsunb\\Downloads\\pubsafedash_cfs (1).csv")
sapd <- sapd[1:1953, ]
str(sapd)
## Classes 'data.table' and 'data.frame':   1953 obs. of  11 variables:
##  $ Master_Incident_Number: chr  "SAPD-2025-0331214" "SAPD-2025-0358398" "SAPD-2025-0354704" "SAPD-2025-0293910" ...
##  $ Response_Date         : POSIXct, format: "2025-03-10 01:49:41" "2025-03-15 05:05:06" ...
##  $ Priority              : int  6 3 6 5 2 3 6 4 2 4 ...
##  $ Problem               : chr  "SAPD Call - NonEmerg" "Burglary Vehicle In Progress" "Recovered Stolen Property" "Disturbance Neighbor" ...
##  $ Service_Area          : chr  "SOUTH" "CENTRAL" "WEST" "EAST" ...
##  $ Type                  : chr  "Non-Emergency" "Non-Emergency" "Non-Emergency" "Non-Emergency" ...
##  $ Seconds               : num  NA 190 631 NA 345 ...
##  $ Weekday               : chr  "Mon" "Sat" "Fri" "Sun" ...
##  $ Disposition_Groups    : chr  "NOR-Outside Agency" "Cleared-Burg Business" "Cleared-Information" "Cleared-Information" ...
##  $ Disposition_Type      : chr  "NOR" "Cleared" "Cleared" "Cleared" ...
##  $ Postal_Code           : chr  "78112" "78201" "78201" "78202" ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(sapd)
##    Master_Incident_Number       Response_Date Priority
##                    <char>              <POSc>    <int>
## 1:      SAPD-2025-0331214 2025-03-10 01:49:41        6
## 2:      SAPD-2025-0358398 2025-03-15 05:05:06        3
## 3:      SAPD-2025-0354704 2025-03-14 14:14:14        6
## 4:      SAPD-2025-0293910 2025-03-02 21:55:28        5
## 5:      SAPD-2025-0421747 2025-03-27 02:43:32        2
## 6:      SAPD-2025-0373561 2025-03-18 01:50:44        3
##                         Problem Service_Area          Type Seconds Weekday
##                          <char>       <char>        <char>   <num>  <char>
## 1:         SAPD Call - NonEmerg        SOUTH Non-Emergency      NA     Mon
## 2: Burglary Vehicle In Progress      CENTRAL Non-Emergency     190     Sat
## 3:    Recovered Stolen Property         WEST Non-Emergency     631     Fri
## 4:         Disturbance Neighbor         EAST Non-Emergency      NA     Sun
## 5:       Burglary (In Progress)         EAST Non-Emergency     345     Thu
## 6:             Wrong Way Driver        SOUTH Non-Emergency      NA     Tue
##       Disposition_Groups Disposition_Type Postal_Code
##                   <char>           <char>      <char>
## 1:    NOR-Outside Agency              NOR       78112
## 2: Cleared-Burg Business          Cleared       78201
## 3:   Cleared-Information          Cleared       78201
## 4:   Cleared-Information          Cleared       78202
## 5:       NOR-Disturbance              NOR       78203
## 6:   Cleared-Information          Cleared       78204
summary(sapd)
##  Master_Incident_Number Response_Date                       Priority    
##  Length:1953            Min.   :2025-03-01 00:05:37.00   Min.   :1.000  
##  Class :character       1st Qu.:2025-03-09 03:02:44.00   1st Qu.:3.000  
##  Mode  :character       Median :2025-03-16 23:20:49.00   Median :4.000  
##                         Mean   :2025-03-16 17:22:10.95   Mean   :4.511  
##                         3rd Qu.:2025-03-23 23:47:30.00   3rd Qu.:6.000  
##                         Max.   :2025-03-31 23:16:47.00   Max.   :8.000  
##                                                                         
##    Problem          Service_Area           Type              Seconds       
##  Length:1953        Length:1953        Length:1953        Min.   :    0.0  
##  Class :character   Class :character   Class :character   1st Qu.:  340.8  
##  Mode  :character   Mode  :character   Mode  :character   Median :  726.5  
##                                                           Mean   : 1265.3  
##                                                           3rd Qu.: 1527.0  
##                                                           Max.   :17254.0  
##                                                           NA's   :501      
##    Weekday          Disposition_Groups Disposition_Type   Postal_Code       
##  Length:1953        Length:1953        Length:1953        Length:1953       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 
#Data Cleaning (Remove "Disposition_Groups", "Master_Incident_Number", "Response_Date", "Postal_Code", "Problem")
sapd <- sapd[, -c(1, 2, 4, 9, 11)]
#Data Cleaning (Remove rows in "Seconds" that are missing)
sapd <- sapd[!is.na(sapd$Seconds), ]
#Data Cleaning (Remove rows in "Disposition_Type" that are "Cancelled_Call)
sapd <- sapd[sapd$Disposition_Type != "Cancelled Call", ]
#Converting categories to factors
sapd$Priority <- as.factor(sapd$Priority)
sapd$Service_Area <- as.factor(sapd$Service_Area)
sapd$Type <- as.factor(sapd$Type)
sapd$Weekday <- as.factor(sapd$Weekday)
sapd$Disposition_Type <- as.factor(sapd$Disposition_Type)
#Question 1 Response Time vs Priority
library(ggplot2)
# Boxplot to visualize variation
ggplot(sapd, aes(x = Priority, y = Seconds)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Response Time in Seconds by Priority Level", y = "Response Time (seconds)")

sapd %>%
  group_by(Priority) %>%
  summarise(
    mean_response = mean(Seconds, na.rm = TRUE),
    median_response = median(Seconds, na.rm = TRUE),
    n = n()
  )
## # A tibble: 8 × 4
##   Priority mean_response median_response     n
##   <fct>            <dbl>           <dbl> <int>
## 1 1                 310.            315     28
## 2 2                 581.            442.   270
## 3 3                 641.            439    111
## 4 4                1577.            965    462
## 5 5                1640.           1066    139
## 6 6                1521.           1041    191
## 7 7                1680.           1062    209
## 8 8                  34               0     41
#The time it takes to respond in seconds is highest when the priority of the case is 4. Then it goes down to 7 and 6. 
#Question 2 Proportion of unresolved calls (‘NOR’) different across days of the week
table_weekday <- table(sapd$Weekday, sapd$Disposition_Type)
chisq.test(table_weekday)
## 
##  Pearson's Chi-squared test
## 
## data:  table_weekday
## X-squared = 10.97, df = 6, p-value = 0.0893
#Null hypothesis: The proportion of NOR cases is the same across all days of the week.
#Alternative hypothesis: The proportion of NOR cases differs by weekday.
#The p-value is >0.05 therefore we fail reject the null. This means that there not is a significant relationship between weekday and whether a case is cleared or unresolved.
#Question 2 Barplot Weekday
sapd$Weekday <- factor(sapd$Weekday, levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
sapd$Disposition_Type <- factor(sapd$Disposition_Type, levels = c("Cleared", "NOR"))

# Create bar plot
ggplot(sapd, aes(x = Weekday, fill = Disposition_Type)) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Disposition Type by Weekday",
       x = "Day of the Week",
       y = "Proportion",
       fill = "Disposition Type") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

#Question 2 Proportion of unresolved calls (‘NOR’) based on Service Area
table_area <- table(sapd$Service_Area, sapd$Disposition_Type)
chisq.test(table_area)
## 
##  Pearson's Chi-squared test
## 
## data:  table_area
## X-squared = 21.615, df = 6, p-value = 0.001422
#Null hypothesis: The proportion of NOR cases is the same across all days of the week.
#Alternative hypothesis: The proportion of NOR cases differs by weekday.
#The p-value is <0.05 therefore we reject the null. This means that there is a significant relationship between weekday and whether a case is cleared or unresolved.
#Question 2 Barplot Service Area
sapd$Disposition_Type <- factor(sapd$Disposition_Type, levels = c("Cleared", "NOR"))

ggplot(sapd, aes(x = Service_Area, fill = Disposition_Type)) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Disposition Type by Service Area",
       x = "Service Area",
       y = "Proportion",
       fill = "Disposition Type") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

#Question 3 Pattern in calls by day of week
ggplot(sapd, aes(x = Weekday)) +
  geom_bar(fill = "red") +
  labs(title = "Number of Calls by Day of the Week", x = "Day", y = "Number of Calls")

#Checking class distribution of Disposition Type
prop.table(table(sapd$Disposition_Type))
## 
##   Cleared       NOR 
## 0.3735355 0.6264645
#Check for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
sapd_predictors <- sapd %>%
  select(-Disposition_Type)

print(names(sapd_predictors))
## [1] "Priority"     "Service_Area" "Type"         "Seconds"      "Weekday"
sapd_predictors_reduced <- sapd %>%
  select(Priority, Service_Area, Seconds, Weekday)
sapd_predictors_reduced_with_target <- cbind(sapd_predictors_reduced, Disposition_Type = sapd$Disposition_Type)
# Fit the logistic regression model
lm_model <- glm(Disposition_Type ~ Priority + Service_Area + Seconds + Weekday, 
                data = sapd_predictors_reduced_with_target, family = "binomial")

# Check for multicollinearity using VIF
library(car)
vif(lm_model)
##                  GVIF Df GVIF^(1/(2*Df))
## Priority     1.190016  7        1.012504
## Service_Area 1.103889  6        1.008271
## Seconds      1.093910  1        1.045901
## Weekday      1.063755  6        1.005164
#Looking at the VIF above, there seems to be no multicollinearity. 
# One-hot encode the categorical variables
library(caret)

# Use dummyVars to create one-hot encoding
dummies <- dummyVars(Disposition_Type ~ Priority + Service_Area + Weekday, 
                     data = sapd_predictors_reduced_with_target)
sapd_predictors_encoded <- predict(dummies, newdata = sapd_predictors_reduced_with_target)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'Disposition_Type' is not a factor
# Add 'Seconds' to the encoded data (assuming 'Seconds' is continuous)
sapd_predictors_encoded <- cbind(sapd_predictors_encoded, sapd_predictors_reduced_with_target$Seconds)

# View the encoded dataset
head(sapd_predictors_encoded)
##   Priority.1 Priority.2 Priority.3 Priority.4 Priority.5 Priority.6 Priority.7
## 1          0          0          1          0          0          0          0
## 2          0          0          0          0          0          1          0
## 3          0          1          0          0          0          0          0
## 4          0          0          0          1          0          0          0
## 5          0          1          0          0          0          0          0
## 6          0          0          0          1          0          0          0
##   Priority.8 Service_Area.CENTRAL Service_Area.DOWNTOWN Service_Area.EAST
## 1          0                    1                     0                 0
## 2          0                    0                     0                 0
## 3          0                    0                     0                 1
## 4          0                    1                     0                 0
## 5          0                    0                     1                 0
## 6          0                    0                     0                 0
##   Service_Area.NORTH Service_Area.PRUE Service_Area.SOUTH Service_Area.WEST
## 1                  0                 0                  0                 0
## 2                  0                 0                  0                 1
## 3                  0                 0                  0                 0
## 4                  0                 0                  0                 0
## 5                  0                 0                  0                 0
## 6                  1                 0                  0                 0
##   Weekday.Sun Weekday.Mon Weekday.Tue Weekday.Wed Weekday.Thu Weekday.Fri
## 1           0           0           0           0           0           0
## 2           0           0           0           0           0           1
## 3           0           0           0           0           1           0
## 4           0           0           0           0           0           0
## 5           0           0           0           0           0           1
## 6           0           0           0           0           1           0
##   Weekday.Sat     
## 1           1  190
## 2           0  631
## 3           0  345
## 4           1    0
## 5           0  399
## 6           0 1408
# Calculate correlation matrix for one-hot encoded variables
cor_matrix <- cor(sapd_predictors_encoded)

# Visualize the correlation matrix using the corrplot package
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Split the dataset into training and test sets (80% train, 20% test)
set.seed(123)  # For reproducibility
train_index <- sample(1:nrow(sapd), size = 0.8 * nrow(sapd))
train_data <- sapd[train_index, ]
test_data <- sapd[-train_index, ]
table(train_data$Disposition_Type)
## 
## Cleared     NOR 
##     430     730
library(caret)
library(dplyr)

# Ensure target is a factor
train_data$Disposition_Type <- as.factor(train_data$Disposition_Type)

# Separate predictors and target
x_train <- train_data %>% select(-Disposition_Type)
y_train <- train_data$Disposition_Type

# Perform upsampling
train_data_upsampled <- upSample(x = x_train, y = y_train)

# Check new class distribution
table(train_data_upsampled$Class)
## 
## Cleared     NOR 
##     730     730
names(train_data_upsampled)[names(train_data_upsampled) == "Class"] <- "Disposition_Type"
set.seed(123)  # For reproducibility

# Define 10-fold cross-validation
cv_control <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE, 
                           summaryFunction = twoClassSummary,
                           savePredictions = "final")

# Convert target to factor with explicit levels
train_data_upsampled$Disposition_Type <- factor(train_data_upsampled$Disposition_Type, levels = c("Cleared", "NOR"))

# Fit logistic regression with cross-validation
logit_cv_model <- train(Disposition_Type ~ Priority + Service_Area + Seconds + Weekday,
                        data = train_data_upsampled,
                        method = "glm",
                        family = "binomial",
                        trControl = cv_control,
                        metric = "ROC")

# View model performance
print(logit_cv_model)
## Generalized Linear Model 
## 
## 1460 samples
##    4 predictor
##    2 classes: 'Cleared', 'NOR' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1314, 1314, 1314, 1314, 1314, 1314, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6517639  0.6287671  0.5849315
# Predict on the test set
test_preds <- predict(logit_cv_model, newdata = test_data)

# Ensure true labels are a factor with same levels
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))

# Confusion matrix and accuracy
conf_mat <- confusionMatrix(test_preds, test_true)
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared      67  73
##    NOR          45 106
##                                           
##                Accuracy : 0.5945          
##                  95% CI : (0.5356, 0.6514)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.78372         
##                                           
##                   Kappa : 0.1819          
##                                           
##  Mcnemar's Test P-Value : 0.01294         
##                                           
##             Sensitivity : 0.5982          
##             Specificity : 0.5922          
##          Pos Pred Value : 0.4786          
##          Neg Pred Value : 0.7020          
##              Prevalence : 0.3849          
##          Detection Rate : 0.2302          
##    Detection Prevalence : 0.4811          
##       Balanced Accuracy : 0.5952          
##                                           
##        'Positive' Class : Cleared         
## 
library(caret)
library(dplyr)
library(glmnet)  # For regularized logistic regression
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Normalize or standardize continuous features
preProcValues <- preProcess(train_data, method = c("center", "scale"))
train_data_scaled <- predict(preProcValues, train_data)

# Perform upsampling
x_train <- train_data_scaled %>% select(-Disposition_Type)
y_train <- train_data_scaled$Disposition_Type
train_data_upsampled <- upSample(x = x_train, y = y_train)

# Define 10-fold cross-validation
cv_control <- trainControl(method = "cv", number = 10, 
                           classProbs = TRUE, 
                           summaryFunction = twoClassSummary,
                           savePredictions = "final", 
                           sampling = "up")

# Fit regularized logistic regression with cross-validation
logit_cv_model <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
                        data = train_data_upsampled,
                        method = "glmnet",
                        family = "binomial",
                        trControl = cv_control,
                        metric = "ROC")

# View model performance
print(logit_cv_model)
## glmnet 
## 
## 1460 samples
##    4 predictor
##    2 classes: 'Cleared', 'NOR' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1314, 1314, 1314, 1314, 1314, 1314, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       ROC        Sens       Spec     
##   0.10   0.000152701  0.6613342  0.6356164  0.5808219
##   0.10   0.001527010  0.6611278  0.6410959  0.5726027
##   0.10   0.015270103  0.6534622  0.6465753  0.5452055
##   0.55   0.000152701  0.6612967  0.6328767  0.5808219
##   0.55   0.001527010  0.6612591  0.6452055  0.5657534
##   0.55   0.015270103  0.6506474  0.6657534  0.5178082
##   1.00   0.000152701  0.6613530  0.6328767  0.5808219
##   1.00   0.001527010  0.6601145  0.6438356  0.5575342
##   1.00   0.015270103  0.6430944  0.7342466  0.4589041
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.000152701.
# Predict on the test set (probabilities)
test_preds <- predict(logit_cv_model, newdata = test_data, type = "prob")

# Ensure true labels are a factor with same levels (Cleared = 0, NOR = 1)
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))

# Adjust the threshold (e.g., classify as 'NOR' if probability > 0.6)
test_preds_adjusted <- ifelse(test_preds$NOR > 0.6, "NOR", "Cleared")

# Convert to factor to match levels in test_true
test_preds_adjusted <- factor(test_preds_adjusted, levels = c("Cleared", "NOR"))

# Confusion matrix and accuracy
conf_mat <- confusionMatrix(test_preds_adjusted, test_true, positive = "NOR")
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared      12   4
##    NOR         100 175
##                                           
##                Accuracy : 0.6426          
##                  95% CI : (0.5846, 0.6977)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.1833          
##                                           
##                   Kappa : 0.101           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9777          
##             Specificity : 0.1071          
##          Pos Pred Value : 0.6364          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.6151          
##          Detection Rate : 0.6014          
##    Detection Prevalence : 0.9450          
##       Balanced Accuracy : 0.5424          
##                                           
##        'Positive' Class : NOR             
## 
library(dplyr)

train_data_upsampled <- train_data_upsampled %>%
  select(-Type)
install.packages("rpart.plot")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'rpart.plot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
install.packages("kernlab")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'kernlab' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'kernlab'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\lsunb\AppData\Local\R\win-library\4.4\00LOCK\kernlab\libs\x64\kernlab.dll
## to C:\Users\lsunb\AppData\Local\R\win-library\4.4\kernlab\libs\x64\kernlab.dll:
## Permission denied
## Warning: restored 'kernlab'
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(caret)
# Fit the initial (large) decision tree
tree_model <- rpart(Class ~ ., data = train_data_upsampled, method = "class", cp = 0.001)

# Print the CP table to find optimal complexity parameter
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = Class ~ ., data = train_data_upsampled, method = "class", 
##     cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] Priority     Seconds      Service_Area Weekday     
## 
## Root node error: 730/1460 = 0.5
## 
## n= 1460 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.2315068      0   1.00000 1.05753 0.026128
## 2  0.0059361      1   0.76849 0.80274 0.025657
## 3  0.0054795     11   0.69315 0.79315 0.025605
## 4  0.0047945     13   0.68219 0.78630 0.025567
## 5  0.0041096     20   0.63151 0.78219 0.025543
## 6  0.0036530     24   0.61096 0.78219 0.025543
## 7  0.0034247     34   0.55479 0.76027 0.025408
## 8  0.0031963     39   0.53562 0.76027 0.025408
## 9  0.0030822     42   0.52603 0.76027 0.025408
## 10 0.0027397     53   0.48767 0.74658 0.025317
## 11 0.0022831     58   0.47397 0.74247 0.025288
## 12 0.0020548     62   0.46301 0.73699 0.025250
## 13 0.0013699     64   0.45890 0.73699 0.025250
## 14 0.0010000     73   0.44521 0.71918 0.025118
plotcp(tree_model)  # Visualize cross-validated error vs. cp

# Choose optimal CP value that minimizes xerror (cross-validated error)
opt_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]

# Prune the tree using optimal cp
pruned_tree <- prune(tree_model, cp = opt_cp)

# Visualize the pruned tree
rpart.plot(pruned_tree, type = 2, extra = 106)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predict on the test set
preds <- predict(pruned_tree, newdata = test_data, type = "class")

# Confusion matrix
confusionMatrix(preds, test_data$Disposition_Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared      45  34
##    NOR          67 145
##                                           
##                Accuracy : 0.6529          
##                  95% CI : (0.5952, 0.7075)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.102379        
##                                           
##                   Kappa : 0.2242          
##                                           
##  Mcnemar's Test P-Value : 0.001452        
##                                           
##             Sensitivity : 0.4018          
##             Specificity : 0.8101          
##          Pos Pred Value : 0.5696          
##          Neg Pred Value : 0.6840          
##              Prevalence : 0.3849          
##          Detection Rate : 0.1546          
##    Detection Prevalence : 0.2715          
##       Balanced Accuracy : 0.6059          
##                                           
##        'Positive' Class : Cleared         
## 
install.packages("smotefamily")
## Installing package into 'C:/Users/lsunb/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'smotefamily' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lsunb\AppData\Local\Temp\RtmpgLHMLX\downloaded_packages
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.4.3
library(caret)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
set.seed(123)

ctrl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = "final",
  sampling = "up"  # <-- Balances the classes by upsampling minority class
)
svm_linear_grid <- expand.grid(C = c(0.01, 0.1, 1, 10, 100, 1000))

svm_linear <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
                    data = train_data_upsampled,
                    method = "svmLinear",
                    trControl = ctrl,
                    metric = "ROC",
                    preProcess = c("center", "scale"))
# Custom tuning grid for radial SVM
svm_radial_grid <- expand.grid(
  sigma = c(0.001, 0.01, 0.05, 0.1, 0.5),
  C = c(0.1, 1, 10, 100)
)

svm_radial <- train(Class ~ Priority + Service_Area + Seconds + Weekday,
                    data = train_data_upsampled,
                    method = "svmRadial",
                    trControl = ctrl,
                    metric = "ROC",
                    preProcess = c("center", "scale"),
                    tuneGrid = svm_radial_grid)
# Predict (use type="prob" if you want to threshold)
pred_linear <- predict(svm_linear, newdata = test_data)
pred_radial <- predict(svm_radial, newdata = test_data)

# True values
test_true <- factor(test_data$Disposition_Type, levels = c("Cleared", "NOR"))

# Confusion Matrices
confusionMatrix(pred_linear, test_true)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared      71  70
##    NOR          41 109
##                                           
##                Accuracy : 0.6186          
##                  95% CI : (0.5601, 0.6746)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.477827        
##                                           
##                   Kappa : 0.2316          
##                                           
##  Mcnemar's Test P-Value : 0.007869        
##                                           
##             Sensitivity : 0.6339          
##             Specificity : 0.6089          
##          Pos Pred Value : 0.5035          
##          Neg Pred Value : 0.7267          
##              Prevalence : 0.3849          
##          Detection Rate : 0.2440          
##    Detection Prevalence : 0.4845          
##       Balanced Accuracy : 0.6214          
##                                           
##        'Positive' Class : Cleared         
## 
confusionMatrix(pred_radial, test_true)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared       9   3
##    NOR         103 176
##                                           
##                Accuracy : 0.6357          
##                  95% CI : (0.5776, 0.6911)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.2547          
##                                           
##                   Kappa : 0.0764          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.08036         
##             Specificity : 0.98324         
##          Pos Pred Value : 0.75000         
##          Neg Pred Value : 0.63082         
##              Prevalence : 0.38488         
##          Detection Rate : 0.03093         
##    Detection Prevalence : 0.04124         
##       Balanced Accuracy : 0.53180         
##                                           
##        'Positive' Class : Cleared         
## 
# Load necessary library
library(caret)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Define the tuning grid for Random Forest
rf_grid <- expand.grid(mtry = c(1, 2, 3, 4))

# Train the Random Forest model with custom tuning grid
rf_model <- train(
  Class ~ Priority + Service_Area + Seconds + Weekday,
  data = train_data_upsampled,
  method = "rf",
  metric = "ROC",              # Optimize based on ROC AUC
  trControl = ctrl,            # Your trainControl object
  tuneGrid = rf_grid,
  preProcess = c("center", "scale")  # Optional, depending on your needs
)

# Make predictions on the test set
rf_preds <- predict(rf_model, newdata = test_data)

# Evaluate performance using confusion matrix
confusionMatrix(rf_preds, test_data$Disposition_Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cleared NOR
##    Cleared      62  52
##    NOR          50 127
##                                           
##                Accuracy : 0.6495          
##                  95% CI : (0.5916, 0.7043)
##     No Information Rate : 0.6151          
##     P-Value [Acc > NIR] : 0.1259          
##                                           
##                   Kappa : 0.2622          
##                                           
##  Mcnemar's Test P-Value : 0.9211          
##                                           
##             Sensitivity : 0.5536          
##             Specificity : 0.7095          
##          Pos Pred Value : 0.5439          
##          Neg Pred Value : 0.7175          
##              Prevalence : 0.3849          
##          Detection Rate : 0.2131          
##    Detection Prevalence : 0.3918          
##       Balanced Accuracy : 0.6315          
##                                           
##        'Positive' Class : Cleared         
##