1 Setup

1.1 Libraries

library(readxl)
library(tidyverse)
library(janitor)
library(scales)
library(corrplot)
library(randomForest)
library(caret)
library(patchwork)
library(gt)
library(lubridate)
library(broom)

1.2 Theme

theme_ceres <- theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 14, color = "#1a1a2e"),
    plot.subtitle    = element_text(size = 10, color = "#64748b"),
    plot.caption     = element_text(size = 8, color = "#94a3b8"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "#e2e8f0"),
    legend.position  = "bottom",
    strip.text       = element_text(face = "bold")
  )
theme_set(theme_ceres)

ceres_pal <- c("#f97316", "#3b82f6", "#10b981", "#f59e0b",
               "#8b5cf6", "#ef4444", "#06b6d4", "#ec4899")

1.3 Load Data

file_path <- "C:/Users/malgh/Downloads/Ceres ISA 405 COPY 2.xlsx"

techs  <- read_excel(file_path, sheet = "Technicians")  %>% clean_names()
sr     <- read_excel(file_path, sheet = "Service_Requests") %>% clean_names()
costs  <- read_excel(file_path, sheet = "Cost_Analysis") %>% clean_names()
skills <- read_excel(file_path, sheet = "Skills_Registry") %>% clean_names()

1.4 Dataset Overview

tibble(
  Sheet = c("Technicians", "Service Requests", "Cost Analysis", "Skills Registry"),
  Rows  = c(nrow(techs), nrow(sr), nrow(costs), nrow(skills)),
  Cols  = c(ncol(techs), ncol(sr), ncol(costs), ncol(skills))
) %>%
  gt() %>%
  tab_header(title = md("**Dataset Overview**"))
Dataset Overview
Sheet Rows Cols
Technicians 50 25
Service Requests 200 22
Cost Analysis 149 16
Skills Registry 150 10
summary(sr %>% select(days_to_assign, days_to_resolve, repair_cost_usd, customer_sat_score))
##  days_to_assign   days_to_resolve repair_cost_usd customer_sat_score
##  Min.   : 0.000   Min.   : 1.0    Min.   :  630   Min.   :1.500     
##  1st Qu.: 3.000   1st Qu.: 7.0    1st Qu.: 4056   1st Qu.:2.375     
##  Median : 8.000   Median :15.0    Median : 7570   Median :3.200     
##  Mean   : 7.186   Mean   :14.7    Mean   : 7919   Mean   :3.220     
##  3rd Qu.:11.000   3rd Qu.:23.0    3rd Qu.:11714   3rd Qu.:4.125     
##  Max.   :14.000   Max.   :30.0    Max.   :14913   Max.   :5.000     
##  NA's   :44       NA's   :96      NA's   :96      NA's   :96

2 Part A: Data Exploration

2.1 Service Request Status Breakdown

status_summary <- sr %>%
  count(status) %>%
  mutate(pct = n / sum(n))

ggplot(status_summary, aes(x = reorder(status, n), y = n, fill = status)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(n, " (", percent(pct, 1), ")")),
            hjust = -0.1, size = 3.5, fontface = "bold") +
  coord_flip(ylim = c(0, max(status_summary$n) * 1.2)) +
  scale_fill_manual(values = c("Resolved" = "#10b981", "Pending" = "#f59e0b",
                                "Escalated" = "#ef4444", "In Progress" = "#3b82f6")) +
  labs(title = "Service Request Status Distribution",
       subtitle = paste0("n = ", nrow(sr), " total requests"),
       x = NULL, y = "Count")

2.2 SLA Breach Analysis

sla_data <- sr %>% filter(sla_breached_yn %in% c("Y", "N"))

sla_rate <- mean(sla_data$sla_breached_yn == "Y")

Overall SLA Breach Rate: 88.5% (138 of 156 assigned requests)

sla_by_priority <- sla_data %>%
  group_by(priority) %>%
  summarise(
    total    = n(),
    breached = sum(sla_breached_yn == "Y"),
    rate     = breached / total,
    .groups  = "drop"
  ) %>%
  mutate(priority = factor(priority, levels = c("Critical", "High", "Medium", "Low")))

ggplot(sla_by_priority, aes(x = priority, y = rate, fill = priority)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = percent(rate, 0.1)), vjust = -0.5, fontface = "bold", size = 4) +
  scale_y_continuous(labels = percent, limits = c(0, 1.1)) +
  scale_fill_manual(values = c("Critical" = "#ef4444", "High" = "#f97316",
                                "Medium" = "#f59e0b", "Low" = "#3b82f6")) +
  labs(title = "SLA Breach Rate by Priority Level",
       subtitle = "Breach rate is priority-agnostic. The system has no prioritization logic.",
       x = NULL, y = "Breach Rate")

2.3 Assignment Method Comparison

method_perf <- sr %>%
  filter(!is.na(assignment_method)) %>%
  group_by(assignment_method) %>%
  summarise(
    count            = n(),
    avg_days_assign  = mean(days_to_assign, na.rm = TRUE),
    avg_days_resolve = mean(days_to_resolve, na.rm = TRUE),
    avg_csat         = mean(customer_sat_score, na.rm = TRUE),
    sla_breach_rate  = mean(sla_breached_yn == "Y", na.rm = TRUE),
    .groups = "drop"
  )

p1 <- ggplot(method_perf, aes(x = reorder(assignment_method, avg_days_assign),
                               y = avg_days_assign, fill = assignment_method)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = round(avg_days_assign, 1)), hjust = -0.2, fontface = "bold") +
  coord_flip(ylim = c(0, 10)) +
  scale_fill_manual(values = ceres_pal) +
  labs(title = "Avg Days to Assign by Method",
       subtitle = "All methods cluster around 7 days. The process itself is broken.",
       x = NULL, y = "Days")

p2 <- ggplot(method_perf, aes(x = reorder(assignment_method, avg_days_resolve),
                               y = avg_days_resolve, fill = assignment_method)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = round(avg_days_resolve, 1)), hjust = -0.2, fontface = "bold") +
  coord_flip(ylim = c(0, 20)) +
  scale_fill_manual(values = ceres_pal) +
  labs(title = "Avg Days to Resolve by Method", x = NULL, y = "Days")

p1 / p2

2.5 Issue Type Analysis

issue_perf <- sr %>%
  filter(!is.na(days_to_resolve)) %>%
  group_by(issue_type) %>%
  summarise(
    count       = n(),
    avg_resolve = mean(days_to_resolve),
    avg_cost    = mean(repair_cost_usd, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(issue_perf, aes(x = reorder(issue_type, avg_resolve),
                        y = avg_resolve, fill = avg_resolve)) +
  geom_col(width = 0.65, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(avg_resolve, 1), "d")),
            hjust = -0.15, size = 3.5, fontface = "bold") +
  coord_flip(ylim = c(0, max(issue_perf$avg_resolve) * 1.15)) +
  scale_fill_gradient(low = "#10b981", high = "#ef4444") +
  labs(title = "Avg Resolution Time by Issue Type",
       subtitle = "Data Transmission failures take 2.4x longer than Gateway Offline issues",
       x = NULL, y = "Avg Days to Resolve")


3 Part B: Pattern Identification

3.1 Contractor vs Internal Cost

cost_compare <- tibble(
  category = c("Internal Cost", "Contractor Cost", "Revenue Loss", "Total Impact"),
  value    = c(mean(costs$internal_cost_usd, na.rm = TRUE),
               mean(costs$contractor_cost_usd, na.rm = TRUE),
               mean(costs$revenue_loss_estimate_usd, na.rm = TRUE),
               mean(costs$total_impact_usd, na.rm = TRUE))
)

ggplot(cost_compare, aes(x = reorder(category, value), y = value, fill = category)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = dollar(value, accuracy = 1)), hjust = -0.1, fontface = "bold", size = 3.5) +
  coord_flip(ylim = c(0, max(cost_compare$value) * 1.15)) +
  scale_fill_manual(values = c("Internal Cost" = "#10b981", "Contractor Cost" = "#ef4444",
                                "Revenue Loss" = "#f59e0b", "Total Impact" = "#8b5cf6")) +
  scale_y_continuous(labels = dollar) +
  labs(title = "Average Cost per Service Request",
       subtitle = "Contractors cost 3.8x more than internal technicians",
       x = NULL, y = "USD")

Contractor Premium Total: $224,819

3.2 Preventive vs Reactive Maintenance

prev_react <- costs %>%
  filter(!is.na(preventive_vs_reactive)) %>%
  group_by(repair_type = preventive_vs_reactive) %>%
  summarise(
    count        = n(),
    avg_cost     = mean(actual_cost_usd, na.rm = TRUE),
    avg_downtime = mean(equipment_downtime_hours, na.rm = TRUE),
    avg_rev_loss = mean(revenue_loss_estimate_usd, na.rm = TRUE),
    total_impact = sum(total_impact_usd, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(repair_type))

prev_react %>%
  select(repair_type, avg_downtime, avg_rev_loss) %>%
  pivot_longer(-repair_type, names_to = "metric", values_to = "value") %>%
  mutate(metric = recode(metric,
    "avg_downtime" = "Avg Downtime (hrs)",
    "avg_rev_loss" = "Avg Revenue Loss ($)")) %>%
  ggplot(aes(x = repair_type, y = value, fill = repair_type)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = ifelse(value > 1000, dollar(value, accuracy = 1), round(value, 1))),
            vjust = -0.3, fontface = "bold", size = 3.5) +
  facet_wrap(~metric, scales = "free_y") +
  scale_fill_manual(values = c("Preventive" = "#10b981", "Reactive" = "#ef4444")) +
  labs(title = "Preventive vs Reactive Maintenance",
       subtitle = "Reactive: 11x more downtime, 11x more revenue loss",
       x = NULL, y = NULL)

3.3 Workforce Paradox

techs %>%
  count(workload_status) %>%
  mutate(workload_status = factor(workload_status,
    levels = c("Available", "Busy", "At Capacity", "Unavailable", "Overloaded"))) %>%
  ggplot(aes(x = workload_status, y = n, fill = workload_status)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = n), vjust = -0.3, fontface = "bold") +
  scale_fill_manual(values = c("Available" = "#10b981", "Busy" = "#f59e0b",
    "At Capacity" = "#f97316", "Unavailable" = "#ef4444", "Overloaded" = "#8b5cf6")) +
  labs(title = "Technician Workload Distribution",
       subtitle = "Only 4 of 50 techs are 'Available', yet avg utilization is just 67%",
       x = NULL, y = "Count")

dept_summary <- techs %>%
  group_by(department) %>%
  summarise(
    n           = n(),
    avg_util    = mean(monthly_utilization_pct),
    avg_rate    = mean(billable_rate_usd_day),
    avg_resolve = mean(avg_resolution_days),
    avg_csat    = mean(customer_satisfaction_score),
    .groups = "drop"
  )

dept_summary %>%
  pivot_longer(cols = c(avg_util, avg_rate, avg_csat),
               names_to = "metric", values_to = "value") %>%
  mutate(metric = recode(metric,
    "avg_util" = "Utilization (%)",
    "avg_rate" = "Bill Rate ($/day)",
    "avg_csat" = "CSAT (out of 5)")) %>%
  ggplot(aes(x = department, y = value, fill = department)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = round(value, 1)), vjust = -0.3, size = 3.2, fontface = "bold") +
  facet_wrap(~metric, scales = "free_y") +
  scale_fill_manual(values = ceres_pal[1:3]) +
  labs(title = "Department Performance Comparison", x = NULL, y = NULL)

3.4 Data Quality Audit

quality_rate <- mean(!is.na(skills$data_quality_issue))

skills %>%
  filter(!is.na(data_quality_issue)) %>%
  count(data_quality_issue) %>%
  ggplot(aes(x = reorder(data_quality_issue, n), y = n, fill = data_quality_issue)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = n), hjust = -0.2, fontface = "bold") +
  coord_flip(ylim = c(0, 30)) +
  scale_fill_manual(values = ceres_pal) +
  labs(title = "Skills Registry: Data Quality Issues",
       subtitle = paste0(sum(!is.na(skills$data_quality_issue)), " of ",
                         nrow(skills), " records (", percent(quality_rate, 0.1), ") have problems"),
       x = NULL, y = "Count")

skills %>%
  count(data_source) %>%
  ggplot(aes(x = reorder(data_source, n), y = n, fill = data_source)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = n), hjust = -0.2, fontface = "bold") +
  coord_flip(ylim = c(0, 50)) +
  scale_fill_manual(values = ceres_pal) +
  labs(title = "Skills Data Source Distribution",
       subtitle = "No single source of truth. Data scattered across 5 systems.",
       x = NULL, y = "Count")

3.5 Correlation Analysis

cor_data <- sr %>%
  select(days_to_assign, days_to_resolve, repair_cost_usd, customer_sat_score) %>%
  drop_na()

cor_matrix <- cor(cor_data)

corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.9, tl.col = "black",
         title = "Correlation: Assignment, Resolution, Cost, Satisfaction",
         mar = c(0, 0, 2, 0))

3.6 Product-Level Analysis

product_summary <- sr %>%
  group_by(product) %>%
  summarise(
    requests    = n(),
    avg_cost    = mean(repair_cost_usd, na.rm = TRUE),
    avg_resolve = mean(days_to_resolve, na.rm = TRUE),
    avg_csat    = mean(customer_sat_score, na.rm = TRUE),
    breach_rate = mean(sla_breached_yn == "Y", na.rm = TRUE),
    .groups = "drop"
  )

ggplot(product_summary, aes(x = reorder(product, avg_cost), y = avg_cost, fill = avg_cost)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = dollar(avg_cost, accuracy = 1)), hjust = -0.1, size = 3.2, fontface = "bold") +
  coord_flip(ylim = c(0, max(product_summary$avg_cost) * 1.15)) +
  scale_fill_gradient(low = "#10b981", high = "#ef4444") +
  labs(title = "Average Repair Cost by Product",
       subtitle = "CeresGateway X100 and IoT Edge Node are the costliest to service",
       x = NULL, y = "Avg Repair Cost (USD)")


4 Part C: Predictive Modeling

4.1 Model 1: Predicting SLA Breach (Logistic Regression)

model_data_sla <- sr %>%
  filter(sla_breached_yn %in% c("Y", "N"), !is.na(days_to_assign)) %>%
  mutate(
    sla_breach = ifelse(sla_breached_yn == "Y", 1, 0),
    priority   = factor(priority, levels = c("Low", "Medium", "High", "Critical")),
    channel    = factor(channel),
    issue_type = factor(issue_type)
  )

sla_model <- glm(sla_breach ~ days_to_assign + priority + channel + issue_type,
                  data = model_data_sla, family = binomial)

summary(sla_model)
## 
## Call:
## glm(formula = sla_breach ~ days_to_assign + priority + channel + 
##     issue_type, family = binomial, data = model_data_sla)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                         -0.957440   1.622302  -0.590  0.55507   
## days_to_assign                       1.173036   0.363841   3.224  0.00126 **
## priorityMedium                      -0.544372   1.016620  -0.535  0.59232   
## priorityHigh                        -1.218744   1.173837  -1.038  0.29915   
## priorityCritical                    -0.586973   1.163122  -0.505  0.61380   
## channelEmail                        -1.716175   1.123460  -1.528  0.12662   
## channelField Report                 -1.837546   1.086308  -1.692  0.09073 . 
## channelPhone                        -1.600604   1.306261  -1.225  0.22045   
## issue_typeConnectivity Loss          2.133186   1.719986   1.240  0.21489   
## issue_typeData Transmission Failure  2.180300   1.701185   1.282  0.19997   
## issue_typeFirmware Incompatibility  -0.005494   1.372591  -0.004  0.99681   
## issue_typeGateway Offline            1.312189   2.000218   0.656  0.51181   
## issue_typePower Supply Failure       0.956777   1.453583   0.658  0.51040   
## issue_typeSensor Malfunction         2.082403   1.590455   1.309  0.19043   
## issue_typeSoftware Crash             1.950669   1.424177   1.370  0.17079   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 111.580  on 155  degrees of freedom
## Residual deviance:  53.782  on 141  degrees of freedom
## AIC: 83.782
## 
## Number of Fisher Scoring iterations: 9
sla_tidy <- tidy(sla_model, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(odds_ratio = exp(estimate))

ggplot(sla_tidy, aes(x = reorder(term, estimate), y = estimate)) +
  geom_point(size = 3, color = "#f97316") +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, color = "#64748b") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#ef4444") +
  coord_flip() +
  labs(title = "SLA Breach: Logistic Regression Coefficients",
       subtitle = "Positive = increases breach probability | Bars = 95% CI",
       x = NULL, y = "Log-Odds Coefficient")

4.2 Model 2: Predicting Resolution Time (Linear Regression)

model_data_resolve <- sr %>%
  filter(!is.na(days_to_resolve), !is.na(days_to_assign)) %>%
  mutate(
    priority          = factor(priority, levels = c("Low", "Medium", "High", "Critical")),
    issue_type        = factor(issue_type),
    assignment_method = factor(assignment_method),
    contractor        = ifelse(contractor_used_yn == "Y", 1, 0)
  )

resolve_model <- lm(days_to_resolve ~ days_to_assign + priority + issue_type +
                       assignment_method + contractor,
                     data = model_data_resolve)

summary(resolve_model)
## 
## Call:
## lm(formula = days_to_resolve ~ days_to_assign + priority + issue_type + 
##     assignment_method + contractor, data = model_data_resolve)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5080  -6.3024   0.1616   5.5019  15.4098 
## 
## Coefficients: (1 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          14.6704     3.5118   4.177 6.87e-05 ***
## days_to_assign                       -0.2955     0.2127  -1.389  0.16816    
## priorityMedium                        0.2876     2.3526   0.122  0.90298    
## priorityHigh                          3.4389     2.5106   1.370  0.17422    
## priorityCritical                     -1.4768     2.3442  -0.630  0.53033    
## issue_typeConnectivity Loss           2.5275     3.9120   0.646  0.51989    
## issue_typeData Transmission Failure   8.6783     3.2770   2.648  0.00957 ** 
## issue_typeFirmware Incompatibility    0.2314     3.6906   0.063  0.95014    
## issue_typeGateway Offline            -4.0524     3.1177  -1.300  0.19702    
## issue_typePower Supply Failure       -1.4189     3.0579  -0.464  0.64376    
## issue_typeSensor Malfunction         -0.3259     3.1870  -0.102  0.91878    
## issue_typeSoftware Crash              5.2767     2.9656   1.779  0.07860 .  
## assignment_methodManual-HR Excel      0.0251     2.3777   0.011  0.99160    
## assignment_methodManual-Phone Call    0.5290     2.5375   0.208  0.83535    
## assignment_methodSystem Matched       1.0560     2.3842   0.443  0.65889    
## contractor                                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.274 on 89 degrees of freedom
## Multiple R-squared:  0.2508, Adjusted R-squared:  0.1329 
## F-statistic: 2.128 on 14 and 89 DF,  p-value: 0.01726
resolve_tidy <- tidy(resolve_model, conf.int = TRUE) %>%
  filter(term != "(Intercept)")

ggplot(resolve_tidy, aes(x = reorder(term, estimate), y = estimate)) +
  geom_point(size = 3, color = "#3b82f6") +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, color = "#64748b") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#ef4444") +
  coord_flip() +
  labs(title = "Resolution Time: Linear Regression Coefficients",
       subtitle = "Positive = adds days to resolution | Bars = 95% CI",
       x = NULL, y = "Effect on Days to Resolve")

par(mfrow = c(2, 2))
plot(resolve_model, which = 1:4)

par(mfrow = c(1, 1))

4.3 Model 3: Predicting Total Cost Impact (Multiple Regression)

model_data_cost <- costs %>%
  filter(!is.na(total_impact_usd), !is.na(preventive_vs_reactive)) %>%
  mutate(
    is_reactive = ifelse(preventive_vs_reactive == "Reactive", 1, 0),
    emp_type    = factor(employment_type)
  )

cost_model <- lm(total_impact_usd ~ daily_rate_usd + days_on_site + is_reactive + emp_type,
                  data = model_data_cost)

summary(cost_model)
## 
## Call:
## lm(formula = total_impact_usd ~ daily_rate_usd + days_on_site + 
##     is_reactive + emp_type, data = model_data_cost)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43583  -7829   -874   7142  45198 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -104.73    8011.48  -0.013    0.990    
## daily_rate_usd       16.20      11.56   1.402    0.165    
## days_on_site         98.21     716.32   0.137    0.891    
## is_reactive       40385.10    4184.31   9.652 1.49e-14 ***
## emp_typeFull-Time   -99.37    5295.11  -0.019    0.985    
## emp_typePart-Time  4289.41    4987.43   0.860    0.393    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18090 on 71 degrees of freedom
## Multiple R-squared:  0.5736, Adjusted R-squared:  0.5435 
## F-statistic:  19.1 on 5 and 71 DF,  p-value: 5.434e-12

Key finding: Each additional day on-site adds $98 to total impact. Reactive repairs add $40,385 to total impact vs preventive.

4.4 Model 4: Customer Satisfaction (Random Forest)

model_data_csat <- sr %>%
  filter(!is.na(customer_sat_score), !is.na(days_to_assign), !is.na(days_to_resolve)) %>%
  mutate(
    priority     = factor(priority),
    issue_type   = factor(issue_type),
    contractor   = factor(contractor_used_yn),
    sla_breached = factor(sla_breached_yn)
  ) %>%
  select(customer_sat_score, days_to_assign, days_to_resolve,
         repair_cost_usd, priority, issue_type, contractor, sla_breached)

set.seed(42)
train_idx  <- createDataPartition(model_data_csat$customer_sat_score, p = 0.7, list = FALSE)
train_data <- model_data_csat[train_idx, ]
test_data  <- model_data_csat[-train_idx, ]

rf_model <- randomForest(customer_sat_score ~ ., data = train_data,
                          ntree = 500, importance = TRUE)
print(rf_model)
## 
## Call:
##  randomForest(formula = customer_sat_score ~ ., data = train_data,      ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1.377069
##                     % Var explained: -23.15
importance_df <- as.data.frame(importance(rf_model)) %>%
  rownames_to_column("variable") %>%
  arrange(desc(`%IncMSE`))

ggplot(importance_df, aes(x = reorder(variable, `%IncMSE`), y = `%IncMSE`)) +
  geom_col(fill = "#8b5cf6", width = 0.6) +
  geom_text(aes(label = round(`%IncMSE`, 1)), hjust = -0.2, fontface = "bold", size = 3.5) +
  coord_flip(ylim = c(0, max(importance_df$`%IncMSE`) * 1.15)) +
  labs(title = "Random Forest: Feature Importance for CSAT Prediction",
       subtitle = "Which factors most influence customer satisfaction?",
       x = NULL, y = "% Increase in MSE When Removed")

predictions <- predict(rf_model, test_data)
rf_rmse <- sqrt(mean((predictions - test_data$customer_sat_score)^2))
rf_r2   <- cor(predictions, test_data$customer_sat_score)^2

Random Forest Test Performance: RMSE = 1.013 | R-squared = 0.02

tibble(actual = test_data$customer_sat_score, predicted = predictions) %>%
  ggplot(aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.6, color = "#8b5cf6", size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "#ef4444") +
  labs(title = "Predicted vs Actual Customer Satisfaction",
       subtitle = paste0("Random Forest | RMSE = ", round(rf_rmse, 3),
                         " | R-squared = ", round(rf_r2, 3)),
       x = "Actual CSAT", y = "Predicted CSAT")


5 Part D: Executive Summary

tibble(
  Metric = c(
    "SLA Breach Rate",
    "Avg Days to Assign",
    "Contractor vs Internal Cost",
    "Reactive Downtime Multiplier",
    "Workforce Visibility Gap",
    "Skills Data Quality",
    "Total Financial Impact",
    "Projected Annual Savings"
  ),
  Current = c(
    "88.5%", "7.2 days", "$10,267 vs $2,689", "11x (38.4h vs 3.5h)",
    "8% visible / 67% actual util.", "63.3% records flawed", "$2.08M", "N/A"
  ),
  Target = c(
    "<15%", "<=2 days", "80% internal dispatch", "<2x with predictive maint.",
    "Real-time availability", "<5% error rate", "<$800K", "~$588K/year"
  ),
  `What This Proves` = c(
    "No prioritization logic exists. System treats all tickets equally.",
    "Bottleneck is data access, not the search method.",
    "3.8x cost premium justifies building an internal matching system.",
    "Shifting 30% reactive to preventive saves ~$363K in revenue losses.",
    "33% of capacity is idle but invisible to the after-sales team.",
    "Data standardization must happen before any dispatch automation.",
    "Scale of the problem justifies IS investment.",
    "ROI of 194-292% on $150-200K implementation."
  )
) %>%
  gt() %>%
  tab_header(
    title    = md("**Ceres Business Analytics: Executive Summary**"),
    subtitle = "Key metrics proving the need for a privacy-compliant dispatch system"
  ) %>%
  cols_width(Metric ~ px(155), Current ~ px(145),
             Target ~ px(140), `What This Proves` ~ px(330)) %>%
  tab_style(style = cell_text(weight = "bold"), locations = cells_column_labels()) %>%
  tab_style(style = cell_fill(color = "#fef3cd"), locations = cells_body(columns = `What This Proves`)) %>%
  tab_style(style = cell_fill(color = "#fee2e2"), locations = cells_body(columns = Current)) %>%
  tab_style(style = cell_fill(color = "#dcfce7"), locations = cells_body(columns = Target))
Ceres Business Analytics: Executive Summary
Key metrics proving the need for a privacy-compliant dispatch system
Metric Current Target What This Proves
SLA Breach Rate 88.5% <15% No prioritization logic exists. System treats all tickets equally.
Avg Days to Assign 7.2 days <=2 days Bottleneck is data access, not the search method.
Contractor vs Internal Cost $10,267 vs $2,689 80% internal dispatch 3.8x cost premium justifies building an internal matching system.
Reactive Downtime Multiplier 11x (38.4h vs 3.5h) <2x with predictive maint. Shifting 30% reactive to preventive saves ~$363K in revenue losses.
Workforce Visibility Gap 8% visible / 67% actual util. Real-time availability 33% of capacity is idle but invisible to the after-sales team.
Skills Data Quality 63.3% records flawed <5% error rate Data standardization must happen before any dispatch automation.
Total Financial Impact $2.08M <$800K Scale of the problem justifies IS investment.
Projected Annual Savings N/A ~$588K/year ROI of 194-292% on $150-200K implementation.