Rural_undivided_motorcycle
Bigram Topic Modelling
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(tidytext)
library(stringr)
library(tidyr)
library(topicmodels)
library(ggplot2)
library(stopwords)
library(RColorBrewer)
# Set working directory
setwd("C:/Users/qwx11/OneDrive - Texas State University/Hackathon_Swastika/0_Thesis/2_Thesis Defence/Code")
# Load and clean data
df <- read_excel("motorcyclist_narrative.xlsx") %>%
filter(!is.na(Cleaned_Narrative))
# Define stopwords
custom_stopwords <- c("unit", "travel", "travelling", "traveled", "motorcycle")
all_stopwords <- c(stopwords("en"), custom_stopwords)
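# stopwords("en") uses the package's default Snowball list; the custom terms above
# drop crash-report boilerplate ("unit", "travel*", "motorcycle") that would
# otherwise dominate the bigram counts.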
# Clean and lowercase text
df_clean <- df %>%
mutate(id = row_number(),
text = tolower(Cleaned_Narrative),
text = str_replace_all(text, "[^a-z\\s]", " ")) %>%
select(id, text)
# Tokenize into bigrams
bigrams <- df_clean %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
# Filter bigrams to remove stopwords and unwanted directional/verb phrases
bigrams_separated <- bigrams %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% all_stopwords,
!word2 %in% all_stopwords,
nchar(word1) > 3,
nchar(word2) > 3) %>%
mutate(bigram = paste(word1, word2)) %>%
filter(
!str_detect(word1, "^travel(l)?ing|^facing|north|south|east|west|bound") &
!str_detect(word2, "^travel(l)?ing|^facing|north|south|east|west|bound")
)
# Count bigrams per document
bigram_counts <- bigrams_separated %>%
count(id, bigram, sort = TRUE)
# Create Document-Term Matrix
dtm <- bigram_counts %>%
cast_dtm(document = id, term = bigram, value = n)
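# Optional check (a sketch, not part of the original workflow): candidate topic
# counts can be compared by training-set perplexity before fixing k; lower values
# indicate a better statistical fit, though interpretability of the resulting
# topics should drive the final choice.
candidate_k <- c(4, 5, 6, 8)
perp <- sapply(candidate_k, function(k) {
  perplexity(LDA(dtm, k = k, control = list(seed = 42)))
})
data.frame(k = candidate_k, perplexity = perp)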
# Fit LDA model (6 topics)
lda_model <- LDA(dtm, k = 6, control = list(seed = 42))
top_terms <- tidy(lda_model, matrix = "beta")
# Get top 10 bigrams per topic
top_terms_df <- top_terms %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta) %>%
group_by(topic) %>%
mutate(rank = row_number()) %>%
ungroup()
# Define color palette by rank
custom_palette <- c("#5E4B56", "#A9939C", "#AA6F73", "#EDA29B", "#F9D8A4")
top_terms_df$fill_color <- cut(top_terms_df$rank,
breaks = c(0, 2, 4, 6, 8, 10),
labels = custom_palette,
include.lowest = TRUE)
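# Note: cut() bins ranks 1:10 into five groups (1-2, 3-4, ..., 9-10) and labels each
# group with a hex code; scale_fill_manual() then maps those factor levels, in order,
# onto the same palette, so bar colors fade as rank increases.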
# Generate and print plots
plots <- list()
for (i in 1:6) {
plot_data <- top_terms_df %>% filter(topic == i)
p <- ggplot(plot_data, aes(x = reorder(term, beta), y = beta, fill = fill_color)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = custom_palette) +
coord_flip() +
labs(title = paste("Top 10 Bigrams for Topic", i),
x = "Bigrams",
y = "Importance (LDA Weight)") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p)
ggsave(paste0("Topic_", i, "_bigrams.png"), plot = p, height = 8, width = 4)
}
Descriptive Statistics
# Load necessary libraries
library(readxl)
library(dplyr)
library(forcats)
library(compareGroups)
library(openxlsx)
library(lares)
library(ggplot2)
# Set working directory
setwd("C:/Users/qwx11/OneDrive - Texas State University/Hackathon_Swastika/0_Thesis/2_Thesis Defence/Code")
# Read Excel file
file_path <- "Rural_undivided.xlsx"
data <- read_excel(file_path)
# Convert character columns to factors
data1 <- data %>% mutate_if(is.character, as.factor)
# Lump factor levels (keep the 6 most frequent levels; group the rest as "Other")
data1a <- data1 %>% mutate(across(where(is.factor), ~ fct_lump_n(.x, n = 6, other_level = "Other")))
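# For reference (a toy illustration, not run on the crash data):
# fct_lump_n(factor(c("a", "a", "b", "b", "c", "d")), n = 2, other_level = "Other")
# #> [1] a     a     b     b     Other Other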
# Remove ID column if present
data1a <- data1a %>% select(-any_of("ID"))
# Work under the name used by the rest of the script
data3 <- data1a
# Display the first few rows to confirm preprocessing
head(data3)
## # A tibble: 6 × 20
## ConFac `Crash Hour` Season CrSpeed Weather LightCon RdAlgn Surface TrafficCon
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 Animal… Morning Winter 50 to … Clear Dark, n… Curve… Dry Marked la…
## 2 Failed… Morning Winter 30 to … Clear Daylight Strai… Dry None
## 3 Unsafe… Afternoon Winter 50 to … Clear Daylight Curve… Dry No passin…
## 4 Unsafe… Afternoon Winter Greate… Clear Daylight Strai… Dry No passin…
## 5 N/A Afternoon Winter 50 to … Clear Dark, n… Strai… Dry Center st…
## 6 Unsafe… Afternoon Winter 30 to … Clear Daylight Curve… Dry Stop/yiel…
## # ℹ 11 more variables: HarmEvent <fct>, Intersection <fct>, FHE <fct>,
## # ObjStrk <fct>, Severity <fct>, Day <fct>, Year <dbl>, Age <fct>,
## # Ethnicity <fct>, Gender <fct>, Helmet <fct>
# Generate descriptive statistics grouped by Severity
descr_stats <- compareGroups(Severity ~ ., data = data3)
# Create summary table
summary_stats <- createTable(descr_stats)
# Print the summary table in the console
print(summary_stats)
##
## --------Summary descriptives table by 'Severity'---------
##
## ____________________________________________________________________________________________________________________________________
## Fatal Incapacitating injury No injury Non-incapacitating injury Possible injury p.overall
## N=1143 N=4080 N=1802 N=4032 N=1696
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## ConFac: <0.001
## Animal on the road 55 (4.81%) 407 (9.98%) 101 (5.60%) 429 (10.6%) 124 (7.31%)
## Failed to yield/signal 275 (24.1%) 588 (14.4%) 190 (10.5%) 413 (10.2%) 174 (10.3%)
## Inattention/fatigued 13 (1.14%) 157 (3.85%) 76 (4.22%) 196 (4.86%) 78 (4.60%)
## N/A 290 (25.4%) 929 (22.8%) 604 (33.5%) 922 (22.9%) 476 (28.1%)
## Unsafe speed 337 (29.5%) 1337 (32.8%) 476 (26.4%) 1273 (31.6%) 562 (33.1%)
## Other 173 (15.1%) 662 (16.2%) 355 (19.7%) 799 (19.8%) 282 (16.6%)
## Crash Hour: 0.889
## Afternoon 291 (25.5%) 1021 (25.0%) 467 (25.9%) 992 (24.6%) 421 (24.8%)
## Evening 131 (11.5%) 513 (12.6%) 223 (12.4%) 527 (13.1%) 200 (11.8%)
## Midnight 300 (26.2%) 1008 (24.7%) 460 (25.5%) 1012 (25.1%) 406 (23.9%)
## Morning 289 (25.3%) 1017 (24.9%) 435 (24.1%) 1006 (25.0%) 441 (26.0%)
## Night 132 (11.5%) 521 (12.8%) 217 (12.0%) 495 (12.3%) 228 (13.4%)
## Season: <0.001
## Fall 336 (29.4%) 1123 (27.5%) 494 (27.4%) 1073 (26.6%) 484 (28.5%)
## Spring 310 (27.1%) 1209 (29.6%) 491 (27.2%) 1198 (29.7%) 479 (28.2%)
## Summer 319 (27.9%) 1146 (28.1%) 465 (25.8%) 1176 (29.2%) 458 (27.0%)
## Winter 178 (15.6%) 602 (14.8%) 352 (19.5%) 585 (14.5%) 275 (16.2%)
## CrSpeed: <0.001
## 30 to 45 260 (22.7%) 1249 (30.6%) 887 (49.2%) 1438 (35.7%) 720 (42.5%)
## 50 to 65 644 (56.3%) 2033 (49.8%) 651 (36.1%) 1882 (46.7%) 707 (41.7%)
## Greater than 65 233 (20.4%) 689 (16.9%) 176 (9.77%) 564 (14.0%) 195 (11.5%)
## Less than 25 6 (0.52%) 109 (2.67%) 88 (4.88%) 148 (3.67%) 74 (4.36%)
## Weather: .
## Clear 942 (82.4%) 3388 (83.0%) 1495 (83.0%) 3346 (83.0%) 1402 (82.7%)
## Cloudy 163 (14.3%) 594 (14.6%) 214 (11.9%) 537 (13.3%) 207 (12.2%)
## Fog 10 (0.87%) 14 (0.34%) 9 (0.50%) 15 (0.37%) 8 (0.47%)
## Other 5 (0.44%) 3 (0.07%) 3 (0.17%) 6 (0.15%) 5 (0.29%)
## Rain 20 (1.75%) 65 (1.59%) 78 (4.33%) 117 (2.90%) 69 (4.07%)
## Severe crosswinds 3 (0.26%) 16 (0.39%) 3 (0.17%) 11 (0.27%) 5 (0.29%)
## LightCon: .
## Dark, lighted 74 (6.47%) 237 (5.81%) 191 (10.6%) 243 (6.03%) 153 (9.02%)
## Dark, not lighted 368 (32.2%) 1051 (25.8%) 332 (18.4%) 838 (20.8%) 299 (17.6%)
## Dawn 8 (0.70%) 35 (0.86%) 19 (1.05%) 36 (0.89%) 21 (1.24%)
## Daylight 651 (57.0%) 2658 (65.1%) 1233 (68.4%) 2834 (70.3%) 1201 (70.8%)
## Dusk 40 (3.50%) 97 (2.38%) 23 (1.28%) 77 (1.91%) 19 (1.12%)
## Unknown 2 (0.17%) 2 (0.05%) 4 (0.22%) 4 (0.10%) 3 (0.18%)
## RdAlgn: .
## Curve, grade 130 (11.4%) 465 (11.4%) 121 (6.71%) 422 (10.5%) 144 (8.49%)
## Curve, hillcrest 23 (2.01%) 79 (1.94%) 17 (0.94%) 67 (1.66%) 33 (1.95%)
## Curve, level 285 (24.9%) 975 (23.9%) 288 (16.0%) 864 (21.4%) 347 (20.5%)
## Straight, grade 124 (10.8%) 351 (8.60%) 122 (6.77%) 351 (8.71%) 126 (7.43%)
## Straight, hillcrest 38 (3.32%) 105 (2.57%) 35 (1.94%) 103 (2.55%) 35 (2.06%)
## Straight, level 543 (47.5%) 2096 (51.4%) 1215 (67.4%) 2221 (55.1%) 1009 (59.5%)
## Other 0 (0.00%) 9 (0.22%) 4 (0.22%) 4 (0.10%) 2 (0.12%)
## Surface: .
## Dry 1098 (96.1%) 3878 (95.0%) 1649 (91.5%) 3741 (92.8%) 1535 (90.5%)
## Ice 1 (0.09%) 1 (0.02%) 3 (0.17%) 1 (0.02%) 0 (0.00%)
## Sand, mud, dirt 8 (0.70%) 44 (1.08%) 17 (0.94%) 50 (1.24%) 29 (1.71%)
## Standing water 1 (0.09%) 2 (0.05%) 6 (0.33%) 17 (0.42%) 4 (0.24%)
## Wet 27 (2.36%) 131 (3.21%) 113 (6.27%) 186 (4.61%) 110 (6.49%)
## Other 8 (0.70%) 24 (0.59%) 14 (0.78%) 37 (0.92%) 18 (1.06%)
## TrafficCon: <0.001
## Center stripe/divider 222 (19.4%) 828 (20.3%) 258 (14.3%) 770 (19.1%) 295 (17.4%)
## Marked lanes 365 (31.9%) 1186 (29.1%) 595 (33.0%) 1201 (29.8%) 579 (34.1%)
## No passing zone 246 (21.5%) 708 (17.4%) 154 (8.55%) 598 (14.8%) 191 (11.3%)
## None 120 (10.5%) 711 (17.4%) 368 (20.4%) 853 (21.2%) 299 (17.6%)
## Signal light 32 (2.80%) 115 (2.82%) 142 (7.88%) 127 (3.15%) 86 (5.07%)
## Stop/yield/warning 144 (12.6%) 461 (11.3%) 251 (13.9%) 416 (10.3%) 203 (12.0%)
## Other 14 (1.22%) 71 (1.74%) 34 (1.89%) 67 (1.66%) 43 (2.54%)
## HarmEvent: <0.001
## Animal 48 (4.20%) 346 (8.48%) 80 (4.44%) 338 (8.38%) 96 (5.66%)
## Fixed object 290 (25.4%) 741 (18.2%) 222 (12.3%) 562 (13.9%) 218 (12.9%)
## MV in transport 530 (46.4%) 1372 (33.6%) 963 (53.4%) 1167 (28.9%) 618 (36.4%)
## Overturned 259 (22.7%) 1533 (37.6%) 471 (26.1%) 1851 (45.9%) 708 (41.7%)
## Parked car 2 (0.17%) 27 (0.66%) 40 (2.22%) 30 (0.74%) 16 (0.94%)
## Other 14 (1.22%) 61 (1.50%) 26 (1.44%) 84 (2.08%) 40 (2.36%)
## Intersection: <0.001
## Driveway access 113 (9.89%) 351 (8.60%) 204 (11.3%) 301 (7.47%) 161 (9.49%)
## Intersection 182 (15.9%) 498 (12.2%) 252 (14.0%) 402 (9.97%) 227 (13.4%)
## Intersection related 74 (6.47%) 306 (7.50%) 275 (15.3%) 425 (10.5%) 214 (12.6%)
## Non intersection 774 (67.7%) 2925 (71.7%) 1071 (59.4%) 2904 (72.0%) 1094 (64.5%)
## FHE: <0.001
## Angle collisions 116 (10.1%) 348 (8.53%) 213 (11.8%) 291 (7.22%) 178 (10.5%)
## Head-on collision 233 (20.4%) 390 (9.56%) 142 (7.88%) 246 (6.10%) 106 (6.25%)
## Rear-end collision 109 (9.54%) 350 (8.58%) 341 (18.9%) 358 (8.88%) 193 (11.4%)
## Run-off-road 605 (52.9%) 2613 (64.0%) 776 (43.1%) 2711 (67.2%) 1018 (60.0%)
## Sideswipe collision 18 (1.57%) 85 (2.08%) 140 (7.77%) 108 (2.68%) 52 (3.07%)
## Turning collisions 60 (5.25%) 281 (6.89%) 165 (9.16%) 303 (7.51%) 139 (8.20%)
## Other 2 (0.17%) 13 (0.32%) 25 (1.39%) 15 (0.37%) 10 (0.59%)
## ObjStrk: <0.001
## Ditch 22 (1.92%) 81 (1.99%) 36 (2.00%) 97 (2.41%) 42 (2.48%)
## Hit fixed object 240 (21.0%) 622 (15.2%) 172 (9.54%) 391 (9.70%) 136 (8.02%)
## Hit median barrier 32 (2.80%) 57 (1.40%) 18 (1.00%) 60 (1.49%) 28 (1.65%)
## Hit tree, shrub, landscaping 64 (5.60%) 104 (2.55%) 22 (1.22%) 79 (1.96%) 25 (1.47%)
## Not applicable 428 (37.4%) 1379 (33.8%) 1014 (56.3%) 1288 (31.9%) 662 (39.0%)
## Overturned 337 (29.5%) 1766 (43.3%) 506 (28.1%) 2015 (50.0%) 747 (44.0%)
## Other 20 (1.75%) 71 (1.74%) 34 (1.89%) 102 (2.53%) 56 (3.30%)
## Day: <0.001
## Weekday 660 (57.7%) 2229 (54.6%) 1122 (62.3%) 2304 (57.1%) 1025 (60.4%)
## Weekend 483 (42.3%) 1851 (45.4%) 680 (37.7%) 1728 (42.9%) 671 (39.6%)
## Year 2020 (2.01) 2020 (2.02) 2020 (1.99) 2020 (2.05) 2020 (1.96) <0.001
## Age: <0.001
## 16 to 25 145 (12.7%) 615 (15.1%) 337 (18.7%) 827 (20.5%) 364 (21.5%)
## 26 to 45 427 (37.4%) 1706 (41.8%) 683 (37.9%) 1633 (40.5%) 705 (41.6%)
## 46 to 65 458 (40.1%) 1407 (34.5%) 506 (28.1%) 1250 (31.0%) 492 (29.0%)
## Greater than 65 111 (9.71%) 314 (7.70%) 264 (14.7%) 289 (7.17%) 126 (7.43%)
## Less than 15 2 (0.17%) 38 (0.93%) 12 (0.67%) 33 (0.82%) 9 (0.53%)
## Ethnicity: <0.001
## Asian 8 (0.70%) 40 (0.98%) 29 (1.61%) 52 (1.29%) 20 (1.18%)
## Black 63 (5.51%) 251 (6.15%) 164 (9.10%) 297 (7.37%) 164 (9.67%)
## Hispanic 150 (13.1%) 568 (13.9%) 285 (15.8%) 623 (15.5%) 299 (17.6%)
## Other 9 (0.79%) 44 (1.08%) 178 (9.88%) 56 (1.39%) 48 (2.83%)
## White 913 (79.9%) 3177 (77.9%) 1146 (63.6%) 3004 (74.5%) 1165 (68.7%)
## Gender: <0.001
## Female 44 (3.85%) 245 (6.00%) 72 (4.00%) 283 (7.02%) 120 (7.08%)
## Male 1099 (96.2%) 3835 (94.0%) 1596 (88.6%) 3744 (92.9%) 1573 (92.7%)
## Unknown 0 (0.00%) 0 (0.00%) 134 (7.44%) 5 (0.12%) 3 (0.18%)
## Helmet: <0.001
## Not worn 566 (49.5%) 1890 (46.3%) 580 (32.2%) 1535 (38.1%) 600 (35.4%)
## Unknown 58 (5.07%) 430 (10.5%) 358 (19.9%) 554 (13.7%) 227 (13.4%)
## Worn, damaged 445 (38.9%) 1286 (31.5%) 228 (12.7%) 1110 (27.5%) 458 (27.0%)
## Worn, not damaged 74 (6.47%) 474 (11.6%) 636 (35.3%) 833 (20.7%) 411 (24.2%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
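# Note: a "." in the p.overall column (Weather, LightCon, RdAlgn, Surface) appears
# when compareGroups could not compute the group test, typically because sparse
# cells break the chi-squared approximation.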
# Export the summary table to an Excel file
export2xls(summary_stats, file = "Descriptive_Statistics_Grouped_by_Severity_sample.xlsx")
Variable Importance Plot (XGBoost)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
##     slice
library(ggplot2)
# Check if target variable 'Severity' exists
if (!"Severity" %in% colnames(data3)) stop("Target variable 'Severity' not found in the dataset")
# Fit XGBoost model
set.seed(102)
bst <- xgboost(
data = data.matrix(subset(data3, select = -c(Severity, Year))), # Exclude target and Year
label = as.numeric(data3$Severity), # Ordinal target coded as numeric scores
objective = "reg:squarederror",
nrounds = 100,
max_depth = 5,
eta = 0.3,
verbose = 0
)
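# Alternative (a sketch, not the model fitted above): Severity could be treated as
# a nominal outcome with a multiclass objective instead of the regression surrogate;
# gain-based importance is extracted the same way. xgboost expects 0-based classes.
bst_multi <- xgboost(
  data = data.matrix(subset(data3, select = -c(Severity, Year))),
  label = as.numeric(data3$Severity) - 1, # 0-based class codes
  objective = "multi:softprob",
  num_class = nlevels(data3$Severity),
  nrounds = 100, max_depth = 5, eta = 0.3, verbose = 0
)
head(xgb.importance(model = bst_multi))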
# Extract and plot variable importance
vi_bst <- xgb.importance(model = bst)
plot <- ggplot(vi_bst, aes(x = reorder(Feature, Gain), y = Gain, fill = Gain)) +
geom_bar(stat = "identity", color = "black", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = round(Gain, 3)), hjust = -0.1, color = "black", size = 3) +
coord_flip(clip = "off") +
scale_fill_gradient(low = "#FFB6C1", high = "#8B0000") +
labs(x = "Variables", y = "Importance") +
theme_minimal(base_size = 15) +
theme(
axis.title.x = element_text(size = 11, face = "bold"),
axis.title.y = element_text(size = 11, face = "bold"),
axis.text = element_text(size = 10),
axis.text.y = element_text(margin = ggplot2::margin(r = -10)),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(color = "grey80", linetype = "dashed"),
plot.margin = unit(c(1, 1, 1, 2), "cm")
)
# Save the plot
ggsave(
filename = "var_im.jpg",
plot = plot,
path = getwd(),
width = 8,
height = 4,
dpi = 300
)
# Display the plot
print(plot)
Correlation Matrix (Cramér's V)
library(vcd)       # provides assocstats() for Cramér's V
## Loading required package: grid
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
##     smiths
library(ggplot2)
# Function to calculate Cramér's V correlation matrix
categorical_correlation <- function(df) {
vars <- colnames(df)
n <- length(vars)
mat <- matrix(NA, nrow = n, ncol = n, dimnames = list(vars, vars))
for (i in 1:n) {
for (j in 1:n) {
if (i == j) {
mat[i, j] <- NA # Exclude self-correlations from the plot
} else {
tbl <- table(df[[vars[i]]], df[[vars[j]]])
mat[i, j] <- assocstats(tbl)$cramer # Cramér's V calculation
}
}
}
return(as.data.frame(mat))
}
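# For reference, Cramér's V for an r x c contingency table is
# sqrt(chisq / (n * (min(r, c) - 1))), which assocstats() computes internally.
# A quick manual check on one pair (a sketch):
# tbl <- table(data3$Weather, data3$Surface)
# sqrt(unname(chisq.test(tbl)$statistic) / (sum(tbl) * (min(dim(tbl)) - 1)))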
# Calculate correlation matrix for categorical variables in data3
corr_matrix <- categorical_correlation(data3)
# Convert to long format for visualization
corr_matrix_melted <- melt(as.matrix(corr_matrix), na.rm = TRUE)
# Filter only highly correlated variables (Cramér's V > 0.5)
high_corr <- subset(corr_matrix_melted, value > 0.5)
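# Note: the melted matrix lists every pair twice, as (i, j) and (j, i). For a
# de-duplicated ranking one could keep only the upper triangle (a sketch):
m_upper <- as.matrix(corr_matrix)
m_upper[lower.tri(m_upper, diag = TRUE)] <- NA
high_pairs <- subset(melt(m_upper, na.rm = TRUE), value > 0.5)
high_pairs[order(-high_pairs$value), ]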
# Plot the heatmap with improved clarity
ggplot(data = high_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 5, color = "black") + # Display correlation values inside boxes
scale_fill_gradient(low = "#d4e157", high = "#1b5e20",
limits = c(0.5, 1), name = "Cramér's V") +
theme_minimal(base_size = 14) + # Improved font size
labs(title = "Highly Correlated Categorical Variables (Cramér's V > 0.5)",
x = "Variables", y = "Variables") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
Variable Pair                                 Reason
HarmEvent & ObjStrk_Overturned                Highly correlated (0.84) → pick only one
FHE_Run.off.road & HarmEvent_MV.in.transport  Strong negative correlation (–0.93) → choose one
Weather_Rain & Surface_Wet                    Strong correlation (0.71) → consider merging into one variable
Intersection & FHE                            0.57 correlation → fine to keep both unless the model is sensitive to multicollinearity
library(clustrd)   # provides clusmca(); loading it triggers the GGally message below
## Registered S3 method overwritten by 'GGally':
##   method   from
##   +.gg     ggplot2
library(ggplot2)
# Step 1: Select specified variables and clean the data
data4 <- data3 %>%
select("Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Convert all numeric variables to factor (clustrd expects categorical)
data4_clean <- data4 %>%
mutate_if(is.numeric, as.factor)
# Step 3: Initialize empty data frame to store elbow info
elbow_data <- data.frame(Clusters = integer(), Criterion = numeric())
# Step 4: Run clusmca for k = 2 to 10
for (k in 2:10) {
cat("Trying k =", k, "\n")
model <- tryCatch({
clusmca(data4_clean, nclus = k, ndim = 2, method = "clusCA", nstart = 10)
}, error = function(e) NULL)
if (!is.null(model) && !is.null(model$criterion)) {
elbow_data <- rbind(elbow_data, data.frame(Clusters = k, Criterion = model$criterion))
cat("Criterion for k =", k, ":", model$criterion, "\n")
}
}
## Trying k = 2
## Criterion for k = 2 : 2.083076
## Trying k = 3
## Criterion for k = 3 : 3.23477
## Trying k = 4
## Criterion for k = 4 : 3.421034
## Trying k = 5
## Criterion for k = 5 : 3.610805
## Trying k = 6
## Criterion for k = 6 : 3.624342
## Trying k = 7
## Criterion for k = 7 : 3.706446
## Trying k = 8
## Criterion for k = 8 : 3.798688
## Trying k = 9
## Criterion for k = 9 : 3.849937
## Trying k = 10
## Criterion for k = 10 : 3.89122
# Step 5: Plot the elbow graph
ggplot(elbow_data, aes(x = Clusters, y = Criterion)) +
geom_line(color = "#2c3e50", linewidth = 1.2) +
geom_point(color = "#e74c3c", size = 2) +
theme_minimal(base_size = 14) +
labs(
title = "",
x = "Number of Clusters (k)",
y = "Model Criterion"
)
# Note: the criterion reported by clusmca() increases with k here, so look for the
# point where the gains level off (around k = 5 or 6) rather than for a minimum.
5-Clusters
# Load required libraries
library(dplyr)
library(clustrd)
library(xgboost)
library(compareGroups)
# Step 1: Select the required columns
data4 <- data3 %>%
select( "Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Preprocess the data for CCA
dat <- data4 %>%
mutate_if(is.numeric, as.factor) %>%
na.omit()
# Step 3: Perform Cluster Correspondence Analysis (CCA)
set.seed(1234)
outclusCA <- clusmca(dat, nclus = 5, ndim = 2, method = "clusCA", nstart = 10)
print(outclusCA)
## Solution with 5 clusters of sizes 3382 (26.5%), 3354 (26.3%), 2577 (20.2%), 2324 (18.2%), 1116 (8.8%) in 2 dimensions.
##
## Cluster centroids:
## Dim.1 Dim.2
## Cluster 1 0.0025 -0.0005
## Cluster 2 0.0078 -0.0031
## Cluster 3 -0.0046 -0.0010
## Cluster 4 -0.0112 -0.0009
## Cluster 5 0.0030 0.0149
##
## Within cluster sum of squares by cluster:
## [1] 0.0201 0.0148 0.0168 0.0152 0.0147
## (between_SS / total_SS = 91.41 %)
##
## Objective criterion value: 3.6108
##
## Available output:
##
## [1] "obscoord" "attcoord" "centroid" "cluster" "criterion" "size"
## [7] "odata" "nstart"
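# If the five-cluster assignment were needed downstream, it could be saved the same
# way as in the six-cluster run below (a sketch; the filename is illustrative):
# dat$Cluster5 <- outclusCA$cluster
# write.csv(dat, "DatawithCluster_5clus.csv")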
6-Clusters
# Load required libraries
library(dplyr)
library(clustrd)
library(xgboost)
library(compareGroups)
# Step 1: Select the required columns
data4 <- data3 %>%
select( "Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Preprocess the data for CCA
dat <- data4 %>%
mutate_if(is.numeric, as.factor) %>%
na.omit()
# Step 3: Perform Cluster Correspondence Analysis (CCA)
set.seed(1234)
outclusCA <- clusmca(dat, nclus = 6, ndim = 2, method = "clusCA", nstart = 10)
# Step 4: Save clusters to the dataset and export
dat$Cluster <- outclusCA$cluster
write.csv(dat, "DatawithCluster_v2.csv")
# Print clustering output
print(outclusCA)
## Solution with 6 clusters of sizes 3232 (25.3%), 2753 (21.6%), 2069 (16.2%), 1968 (15.4%), 1615 (12.7%), 1116 (8.8%) in 2 dimensions.
##
## Cluster centroids:
## Dim.1 Dim.2
## Cluster 1 -0.0080 0.0033
## Cluster 2 -0.0034 0.0004
## Cluster 3 0.0071 0.0005
## Cluster 4 0.0018 0.0012
## Cluster 5 0.0125 0.0012
## Cluster 6 -0.0031 -0.0150
##
## Within cluster sum of squares by cluster:
## [1] 0.0143 0.0133 0.0103 0.0106 0.0082 0.0152
## (between_SS / total_SS = 92.62 %)
##
## Objective criterion value: 3.6694
##
## Available output:
##
## [1] "obscoord" "attcoord" "centroid" "cluster" "criterion" "size"
## [7] "odata" "nstart"
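# A possible follow-up (a sketch, not part of the original script): profile the six
# clusters with the same compareGroups workflow used for Severity above.
dat$Cluster <- factor(dat$Cluster)
cluster_stats <- createTable(compareGroups(Cluster ~ ., data = dat))
print(cluster_stats)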