Rural_undivided_motorcycle
Bigram Topic Modelling
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(tidytext)
library(stringr)
library(tidyr)
library(topicmodels)
library(ggplot2)
library(stopwords)
library(RColorBrewer)
# Set working directory
setwd("C:/Users/qwx11/OneDrive - Texas State University/Hackathon_Swastika/0_Thesis/2_Thesis Defence/Code")
# Load and clean data
df <- read_excel("motorcyclist_narrative.xlsx") %>%
filter(!is.na(Cleaned_Narrative))
# Define stopwords
custom_stopwords <- c("unit", "travel", "travelling", "traveled", "motorcycle")
all_stopwords <- c(stopwords("en"), custom_stopwords)
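# stopwords("en") uses the package's default Snowball list; the custom terms above
# drop crash-report boilerplate ("unit", "travel*", "motorcycle") that would
# otherwise dominate the bigram counts.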
# Clean and lowercase text
df_clean <- df %>%
mutate(id = row_number(),
text = tolower(Cleaned_Narrative),
text = str_replace_all(text, "[^a-z\\s]", " ")) %>%
select(id, text)
# Tokenize into bigrams
bigrams <- df_clean %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
# Filter bigrams to remove stopwords and unwanted directional/verb phrases
bigrams_separated <- bigrams %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% all_stopwords,
!word2 %in% all_stopwords,
nchar(word1) > 3,
nchar(word2) > 3) %>%
mutate(bigram = paste(word1, word2)) %>%
filter(
!str_detect(word1, "^travel(l)?ing|^facing|north|south|east|west|bound") &
!str_detect(word2, "^travel(l)?ing|^facing|north|south|east|west|bound")
)
# Count bigrams per document
bigram_counts <- bigrams_separated %>%
count(id, bigram, sort = TRUE)
# Create Document-Term Matrix
dtm <- bigram_counts %>%
cast_dtm(document = id, term = bigram, value = n)
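# Optional check (a sketch, not part of the original workflow): candidate topic
# counts can be compared by training-set perplexity before fixing k; lower values
# indicate a better statistical fit, though interpretability of the resulting
# topics should drive the final choice.
candidate_k <- c(4, 5, 6, 8)
perp <- sapply(candidate_k, function(k) {
  perplexity(LDA(dtm, k = k, control = list(seed = 42)))
})
data.frame(k = candidate_k, perplexity = perp)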
# Fit LDA model (6 topics)
lda_model <- LDA(dtm, k = 6, control = list(seed = 42))
top_terms <- tidy(lda_model, matrix = "beta")
# Get top 10 bigrams per topic
top_terms_df <- top_terms %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta) %>%
group_by(topic) %>%
mutate(rank = row_number()) %>%
ungroup()
# Define color palette by rank
custom_palette <- c("#5E4B56", "#A9939C", "#AA6F73", "#EDA29B", "#F9D8A4")
top_terms_df$fill_color <- cut(top_terms_df$rank,
breaks = c(0, 2, 4, 6, 8, 10),
labels = custom_palette,
include.lowest = TRUE)
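# Note: cut() bins ranks 1:10 into five groups (1-2, 3-4, ..., 9-10) and labels each
# group with a hex code; scale_fill_manual() then maps those factor levels, in order,
# onto the same palette, so bar colors fade as rank increases.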
# Generate and print plots
plots <- list()
for (i in 1:6) {
plot_data <- top_terms_df %>% filter(topic == i)
p <- ggplot(plot_data, aes(x = reorder(term, beta), y = beta, fill = fill_color)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = custom_palette) +
coord_flip() +
labs(title = paste("Top 10 Bigrams for Topic", i),
x = "Bigrams",
y = "Importance (LDA Weight)") +
theme_minimal() +
theme(plot.title = element_text(size = 14, face = "bold"))
print(p)
ggsave(paste0("Topic_", i, "_bigrams.png"), plot = p, height = 8, width = 4)
}
Descriptive Statistics
# Load necessary libraries
library(readxl)
library(dplyr)
library(forcats)
library(compareGroups)
library(openxlsx)
library(lares)
library(ggplot2)
# Set working directory
setwd("C:/Users/qwx11/OneDrive - Texas State University/Hackathon_Swastika/0_Thesis/2_Thesis Defence/Code")
# Read Excel file
file_path <- "Rural_undivided.xlsx"
data <- read_excel(file_path)
# Convert character columns to factors
data1 <- data %>% mutate_if(is.character, as.factor)
# Lump factor levels (keep the 6 most frequent levels; group the rest as "Other")
data1a <- data1 %>% mutate(across(where(is.factor), ~ fct_lump_n(.x, n = 6, other_level = "Other")))
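# For reference (a toy illustration, not run on the crash data):
# fct_lump_n(factor(c("a", "a", "b", "b", "c", "d")), n = 2, other_level = "Other")
# #> [1] a     a     b     b     Other Other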
# Remove ID column if present
data1a <- data1a %>% select(-any_of("ID"))
# Work under the name used by the rest of the script
data3 <- data1a
# Display the first few rows to confirm preprocessing
head(data3)
## # A tibble: 6 × 20
## ConFac `Crash Hour` Season CrSpeed Weather LightCon RdAlgn Surface TrafficCon
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 Animal… Morning Winter 50 to … Clear Dark, n… Curve… Dry Marked la…
## 2 Failed… Morning Winter 30 to … Clear Daylight Strai… Dry None
## 3 Unsafe… Afternoon Winter 50 to … Clear Daylight Curve… Dry No passin…
## 4 Unsafe… Afternoon Winter Greate… Clear Daylight Strai… Dry No passin…
## 5 N/A Afternoon Winter 50 to … Clear Dark, n… Strai… Dry Center st…
## 6 Unsafe… Afternoon Winter 30 to … Clear Daylight Curve… Dry Stop/yiel…
## # ℹ 11 more variables: HarmEvent <fct>, Intersection <fct>, FHE <fct>,
## # ObjStrk <fct>, Severity <fct>, Day <fct>, Year <dbl>, Age <fct>,
## # Ethnicity <fct>, Gender <fct>, Helmet <fct>
# Generate descriptive statistics grouped by Severity
descr_stats <- compareGroups(Severity ~ ., data = data3)
# Create summary table
summary_stats <- createTable(descr_stats)
# Print the summary table in the console
print(summary_stats)
##
## --------Summary descriptives table by 'Severity'---------
##
## ____________________________________________________________________________________________________________________________________
## Fatal Incapacitating injury No injury Non-incapacitating injury Possible injury p.overall
## N=1143 N=4080 N=1802 N=4032 N=1696
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## ConFac: <0.001
## Animal on the road 55 (4.81%) 407 (9.98%) 101 (5.60%) 429 (10.6%) 124 (7.31%)
## Failed to yield/signal 275 (24.1%) 588 (14.4%) 190 (10.5%) 413 (10.2%) 174 (10.3%)
## Inattention/fatigued 13 (1.14%) 157 (3.85%) 76 (4.22%) 196 (4.86%) 78 (4.60%)
## N/A 290 (25.4%) 929 (22.8%) 604 (33.5%) 922 (22.9%) 476 (28.1%)
## Unsafe speed 337 (29.5%) 1337 (32.8%) 476 (26.4%) 1273 (31.6%) 562 (33.1%)
## Other 173 (15.1%) 662 (16.2%) 355 (19.7%) 799 (19.8%) 282 (16.6%)
## Crash Hour: 0.889
## Afternoon 291 (25.5%) 1021 (25.0%) 467 (25.9%) 992 (24.6%) 421 (24.8%)
## Evening 131 (11.5%) 513 (12.6%) 223 (12.4%) 527 (13.1%) 200 (11.8%)
## Midnight 300 (26.2%) 1008 (24.7%) 460 (25.5%) 1012 (25.1%) 406 (23.9%)
## Morning 289 (25.3%) 1017 (24.9%) 435 (24.1%) 1006 (25.0%) 441 (26.0%)
## Night 132 (11.5%) 521 (12.8%) 217 (12.0%) 495 (12.3%) 228 (13.4%)
## Season: <0.001
## Fall 336 (29.4%) 1123 (27.5%) 494 (27.4%) 1073 (26.6%) 484 (28.5%)
## Spring 310 (27.1%) 1209 (29.6%) 491 (27.2%) 1198 (29.7%) 479 (28.2%)
## Summer 319 (27.9%) 1146 (28.1%) 465 (25.8%) 1176 (29.2%) 458 (27.0%)
## Winter 178 (15.6%) 602 (14.8%) 352 (19.5%) 585 (14.5%) 275 (16.2%)
## CrSpeed: <0.001
## 30 to 45 260 (22.7%) 1249 (30.6%) 887 (49.2%) 1438 (35.7%) 720 (42.5%)
## 50 to 65 644 (56.3%) 2033 (49.8%) 651 (36.1%) 1882 (46.7%) 707 (41.7%)
## Greater than 65 233 (20.4%) 689 (16.9%) 176 (9.77%) 564 (14.0%) 195 (11.5%)
## Less than 25 6 (0.52%) 109 (2.67%) 88 (4.88%) 148 (3.67%) 74 (4.36%)
## Weather: .
## Clear 942 (82.4%) 3388 (83.0%) 1495 (83.0%) 3346 (83.0%) 1402 (82.7%)
## Cloudy 163 (14.3%) 594 (14.6%) 214 (11.9%) 537 (13.3%) 207 (12.2%)
## Fog 10 (0.87%) 14 (0.34%) 9 (0.50%) 15 (0.37%) 8 (0.47%)
## Other 5 (0.44%) 3 (0.07%) 3 (0.17%) 6 (0.15%) 5 (0.29%)
## Rain 20 (1.75%) 65 (1.59%) 78 (4.33%) 117 (2.90%) 69 (4.07%)
## Severe crosswinds 3 (0.26%) 16 (0.39%) 3 (0.17%) 11 (0.27%) 5 (0.29%)
## LightCon: .
## Dark, lighted 74 (6.47%) 237 (5.81%) 191 (10.6%) 243 (6.03%) 153 (9.02%)
## Dark, not lighted 368 (32.2%) 1051 (25.8%) 332 (18.4%) 838 (20.8%) 299 (17.6%)
## Dawn 8 (0.70%) 35 (0.86%) 19 (1.05%) 36 (0.89%) 21 (1.24%)
## Daylight 651 (57.0%) 2658 (65.1%) 1233 (68.4%) 2834 (70.3%) 1201 (70.8%)
## Dusk 40 (3.50%) 97 (2.38%) 23 (1.28%) 77 (1.91%) 19 (1.12%)
## Unknown 2 (0.17%) 2 (0.05%) 4 (0.22%) 4 (0.10%) 3 (0.18%)
## RdAlgn: .
## Curve, grade 130 (11.4%) 465 (11.4%) 121 (6.71%) 422 (10.5%) 144 (8.49%)
## Curve, hillcrest 23 (2.01%) 79 (1.94%) 17 (0.94%) 67 (1.66%) 33 (1.95%)
## Curve, level 285 (24.9%) 975 (23.9%) 288 (16.0%) 864 (21.4%) 347 (20.5%)
## Straight, grade 124 (10.8%) 351 (8.60%) 122 (6.77%) 351 (8.71%) 126 (7.43%)
## Straight, hillcrest 38 (3.32%) 105 (2.57%) 35 (1.94%) 103 (2.55%) 35 (2.06%)
## Straight, level 543 (47.5%) 2096 (51.4%) 1215 (67.4%) 2221 (55.1%) 1009 (59.5%)
## Other 0 (0.00%) 9 (0.22%) 4 (0.22%) 4 (0.10%) 2 (0.12%)
## Surface: .
## Dry 1098 (96.1%) 3878 (95.0%) 1649 (91.5%) 3741 (92.8%) 1535 (90.5%)
## Ice 1 (0.09%) 1 (0.02%) 3 (0.17%) 1 (0.02%) 0 (0.00%)
## Sand, mud, dirt 8 (0.70%) 44 (1.08%) 17 (0.94%) 50 (1.24%) 29 (1.71%)
## Standing water 1 (0.09%) 2 (0.05%) 6 (0.33%) 17 (0.42%) 4 (0.24%)
## Wet 27 (2.36%) 131 (3.21%) 113 (6.27%) 186 (4.61%) 110 (6.49%)
## Other 8 (0.70%) 24 (0.59%) 14 (0.78%) 37 (0.92%) 18 (1.06%)
## TrafficCon: <0.001
## Center stripe/divider 222 (19.4%) 828 (20.3%) 258 (14.3%) 770 (19.1%) 295 (17.4%)
## Marked lanes 365 (31.9%) 1186 (29.1%) 595 (33.0%) 1201 (29.8%) 579 (34.1%)
## No passing zone 246 (21.5%) 708 (17.4%) 154 (8.55%) 598 (14.8%) 191 (11.3%)
## None 120 (10.5%) 711 (17.4%) 368 (20.4%) 853 (21.2%) 299 (17.6%)
## Signal light 32 (2.80%) 115 (2.82%) 142 (7.88%) 127 (3.15%) 86 (5.07%)
## Stop/yield/warning 144 (12.6%) 461 (11.3%) 251 (13.9%) 416 (10.3%) 203 (12.0%)
## Other 14 (1.22%) 71 (1.74%) 34 (1.89%) 67 (1.66%) 43 (2.54%)
## HarmEvent: <0.001
## Animal 48 (4.20%) 346 (8.48%) 80 (4.44%) 338 (8.38%) 96 (5.66%)
## Fixed object 290 (25.4%) 741 (18.2%) 222 (12.3%) 562 (13.9%) 218 (12.9%)
## MV in transport 530 (46.4%) 1372 (33.6%) 963 (53.4%) 1167 (28.9%) 618 (36.4%)
## Overturned 259 (22.7%) 1533 (37.6%) 471 (26.1%) 1851 (45.9%) 708 (41.7%)
## Parked car 2 (0.17%) 27 (0.66%) 40 (2.22%) 30 (0.74%) 16 (0.94%)
## Other 14 (1.22%) 61 (1.50%) 26 (1.44%) 84 (2.08%) 40 (2.36%)
## Intersection: <0.001
## Driveway access 113 (9.89%) 351 (8.60%) 204 (11.3%) 301 (7.47%) 161 (9.49%)
## Intersection 182 (15.9%) 498 (12.2%) 252 (14.0%) 402 (9.97%) 227 (13.4%)
## Intersection related 74 (6.47%) 306 (7.50%) 275 (15.3%) 425 (10.5%) 214 (12.6%)
## Non intersection 774 (67.7%) 2925 (71.7%) 1071 (59.4%) 2904 (72.0%) 1094 (64.5%)
## FHE: <0.001
## Angle collisions 116 (10.1%) 348 (8.53%) 213 (11.8%) 291 (7.22%) 178 (10.5%)
## Head-on collision 233 (20.4%) 390 (9.56%) 142 (7.88%) 246 (6.10%) 106 (6.25%)
## Rear-end collision 109 (9.54%) 350 (8.58%) 341 (18.9%) 358 (8.88%) 193 (11.4%)
## Run-off-road 605 (52.9%) 2613 (64.0%) 776 (43.1%) 2711 (67.2%) 1018 (60.0%)
## Sideswipe collision 18 (1.57%) 85 (2.08%) 140 (7.77%) 108 (2.68%) 52 (3.07%)
## Turning collisions 60 (5.25%) 281 (6.89%) 165 (9.16%) 303 (7.51%) 139 (8.20%)
## Other 2 (0.17%) 13 (0.32%) 25 (1.39%) 15 (0.37%) 10 (0.59%)
## ObjStrk: <0.001
## Ditch 22 (1.92%) 81 (1.99%) 36 (2.00%) 97 (2.41%) 42 (2.48%)
## Hit fixed object 240 (21.0%) 622 (15.2%) 172 (9.54%) 391 (9.70%) 136 (8.02%)
## Hit median barrier 32 (2.80%) 57 (1.40%) 18 (1.00%) 60 (1.49%) 28 (1.65%)
## Hit tree, shrub, landscaping 64 (5.60%) 104 (2.55%) 22 (1.22%) 79 (1.96%) 25 (1.47%)
## Not applicable 428 (37.4%) 1379 (33.8%) 1014 (56.3%) 1288 (31.9%) 662 (39.0%)
## Overturned 337 (29.5%) 1766 (43.3%) 506 (28.1%) 2015 (50.0%) 747 (44.0%)
## Other 20 (1.75%) 71 (1.74%) 34 (1.89%) 102 (2.53%) 56 (3.30%)
## Day: <0.001
## Weekday 660 (57.7%) 2229 (54.6%) 1122 (62.3%) 2304 (57.1%) 1025 (60.4%)
## Weekend 483 (42.3%) 1851 (45.4%) 680 (37.7%) 1728 (42.9%) 671 (39.6%)
## Year 2020 (2.01) 2020 (2.02) 2020 (1.99) 2020 (2.05) 2020 (1.96) <0.001
## Age: <0.001
## 16 to 25 145 (12.7%) 615 (15.1%) 337 (18.7%) 827 (20.5%) 364 (21.5%)
## 26 to 45 427 (37.4%) 1706 (41.8%) 683 (37.9%) 1633 (40.5%) 705 (41.6%)
## 46 to 65 458 (40.1%) 1407 (34.5%) 506 (28.1%) 1250 (31.0%) 492 (29.0%)
## Greater than 65 111 (9.71%) 314 (7.70%) 264 (14.7%) 289 (7.17%) 126 (7.43%)
## Less than 15 2 (0.17%) 38 (0.93%) 12 (0.67%) 33 (0.82%) 9 (0.53%)
## Ethnicity: <0.001
## Asian 8 (0.70%) 40 (0.98%) 29 (1.61%) 52 (1.29%) 20 (1.18%)
## Black 63 (5.51%) 251 (6.15%) 164 (9.10%) 297 (7.37%) 164 (9.67%)
## Hispanic 150 (13.1%) 568 (13.9%) 285 (15.8%) 623 (15.5%) 299 (17.6%)
## Other 9 (0.79%) 44 (1.08%) 178 (9.88%) 56 (1.39%) 48 (2.83%)
## White 913 (79.9%) 3177 (77.9%) 1146 (63.6%) 3004 (74.5%) 1165 (68.7%)
## Gender: <0.001
## Female 44 (3.85%) 245 (6.00%) 72 (4.00%) 283 (7.02%) 120 (7.08%)
## Male 1099 (96.2%) 3835 (94.0%) 1596 (88.6%) 3744 (92.9%) 1573 (92.7%)
## Unknown 0 (0.00%) 0 (0.00%) 134 (7.44%) 5 (0.12%) 3 (0.18%)
## Helmet: <0.001
## Not worn 566 (49.5%) 1890 (46.3%) 580 (32.2%) 1535 (38.1%) 600 (35.4%)
## Unknown 58 (5.07%) 430 (10.5%) 358 (19.9%) 554 (13.7%) 227 (13.4%)
## Worn, damaged 445 (38.9%) 1286 (31.5%) 228 (12.7%) 1110 (27.5%) 458 (27.0%)
## Worn, not damaged 74 (6.47%) 474 (11.6%) 636 (35.3%) 833 (20.7%) 411 (24.2%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
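# Note: a "." in the p.overall column (Weather, LightCon, RdAlgn, Surface) appears
# when compareGroups could not compute the group test, typically because sparse
# cells break the chi-squared approximation.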
# Export the summary table to an Excel file
export2xls(summary_stats, file = "Descriptive_Statistics_Grouped_by_Severity_sample.xlsx")
Variable Importance Plot (XGBoost)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
##     slice
library(ggplot2)
# Check if target variable 'Severity' exists
if (!"Severity" %in% colnames(data3)) stop("Target variable 'Severity' not found in the dataset")
# Fit XGBoost model
set.seed(102)
bst <- xgboost(
data = data.matrix(subset(data3, select = -c(Severity, Year))), # Exclude target and Year
label = as.numeric(data3$Severity), # Ordinal target coded as numeric scores
objective = "reg:squarederror",
nrounds = 100,
max_depth = 5,
eta = 0.3,
verbose = 0
)
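# Alternative (a sketch, not the model fitted above): Severity could be treated as
# a nominal outcome with a multiclass objective instead of the regression surrogate;
# gain-based importance is extracted the same way. xgboost expects 0-based classes.
bst_multi <- xgboost(
  data = data.matrix(subset(data3, select = -c(Severity, Year))),
  label = as.numeric(data3$Severity) - 1, # 0-based class codes
  objective = "multi:softprob",
  num_class = nlevels(data3$Severity),
  nrounds = 100, max_depth = 5, eta = 0.3, verbose = 0
)
head(xgb.importance(model = bst_multi))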
# Extract and plot variable importance
vi_bst <- xgb.importance(model = bst)
plot <- ggplot(vi_bst, aes(x = reorder(Feature, Gain), y = Gain, fill = Gain)) +
geom_bar(stat = "identity", color = "black", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = round(Gain, 3)), hjust = -0.1, color = "black", size = 3) +
coord_flip(clip = "off") +
scale_fill_gradient(low = "#FFB6C1", high = "#8B0000") +
labs(x = "Variables", y = "Importance") +
theme_minimal(base_size = 15) +
theme(
axis.title.x = element_text(size = 11, face = "bold"),
axis.title.y = element_text(size = 11, face = "bold"),
axis.text = element_text(size = 10),
axis.text.y = element_text(margin = ggplot2::margin(r = -10)),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(color = "grey80", linetype = "dashed"),
plot.margin = unit(c(1, 1, 1, 2), "cm")
)
# Save the plot
ggsave(
filename = "var_im.jpg",
plot = plot,
path = getwd(),
width = 8,
height = 4,
dpi = 300
)
# Display the plot
print(plot)
Correlation Matrix (Cramér's V)
library(vcd)       # provides assocstats() for Cramér's V
## Loading required package: grid
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
##     smiths
library(ggplot2)
# Function to calculate Cramér's V correlation matrix
categorical_correlation <- function(df) {
vars <- colnames(df)
n <- length(vars)
mat <- matrix(NA, nrow = n, ncol = n, dimnames = list(vars, vars))
for (i in 1:n) {
for (j in 1:n) {
if (i == j) {
mat[i, j] <- NA # Exclude self-correlations from the plot
} else {
tbl <- table(df[[vars[i]]], df[[vars[j]]])
mat[i, j] <- assocstats(tbl)$cramer # Cramér's V calculation
}
}
}
return(as.data.frame(mat))
}
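# For reference, Cramér's V for an r x c contingency table is
# sqrt(chisq / (n * (min(r, c) - 1))), which assocstats() computes internally.
# A quick manual check on one pair (a sketch):
# tbl <- table(data3$Weather, data3$Surface)
# sqrt(unname(chisq.test(tbl)$statistic) / (sum(tbl) * (min(dim(tbl)) - 1)))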
# Calculate correlation matrix for categorical variables in data3
corr_matrix <- categorical_correlation(data3)
# Convert to long format for visualization
corr_matrix_melted <- melt(as.matrix(corr_matrix), na.rm = TRUE)
# Filter only highly correlated variables (Cramér's V > 0.5)
high_corr <- subset(corr_matrix_melted, value > 0.5)
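# Note: the melted matrix lists every pair twice, as (i, j) and (j, i). For a
# de-duplicated ranking one could keep only the upper triangle (a sketch):
m_upper <- as.matrix(corr_matrix)
m_upper[lower.tri(m_upper, diag = TRUE)] <- NA
high_pairs <- subset(melt(m_upper, na.rm = TRUE), value > 0.5)
high_pairs[order(-high_pairs$value), ]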
# Plot the heatmap with improved clarity
ggplot(data = high_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 5, color = "black") + # Display correlation values inside boxes
scale_fill_gradient(low = "#d4e157", high = "#1b5e20",
limits = c(0.5, 1), name = "Cramér's V") +
theme_minimal(base_size = 14) + # Improved font size
labs(title = "Highly Correlated Categorical Variables (Cramér's V > 0.5)",
x = "Variables", y = "Variables") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
Variable Pair                                 Reason
HarmEvent & ObjStrk_Overturned                Highly correlated (0.84) → pick only one
FHE_Run.off.road & HarmEvent_MV.in.transport  Strong negative correlation (–0.93) → choose one
Weather_Rain & Surface_Wet                    Strong correlation (0.71) → consider merging into one variable
Intersection & FHE                            0.57 correlation → fine to keep both unless the model is sensitive to multicollinearity
library(clustrd)   # provides clusmca(); loading it triggers the GGally message below
## Registered S3 method overwritten by 'GGally':
##   method   from
##   +.gg     ggplot2
library(ggplot2)
# Step 1: Select specified variables and clean the data
data4 <- data3 %>%
select("Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Convert all numeric variables to factor (clustrd expects categorical)
data4_clean <- data4 %>%
mutate_if(is.numeric, as.factor)
# Step 3: Initialize empty data frame to store elbow info
elbow_data <- data.frame(Clusters = integer(), Criterion = numeric())
# Step 4: Run clusmca for k = 2 to 10
for (k in 2:10) {
cat("Trying k =", k, "\n")
model <- tryCatch({
clusmca(data4_clean, nclus = k, ndim = 2, method = "clusCA", nstart = 10)
}, error = function(e) NULL)
if (!is.null(model) && !is.null(model$criterion)) {
elbow_data <- rbind(elbow_data, data.frame(Clusters = k, Criterion = model$criterion))
cat("Criterion for k =", k, ":", model$criterion, "\n")
}
}
## Trying k = 2
## Criterion for k = 2 : 2.083076
## Trying k = 3
## Criterion for k = 3 : 3.23477
## Trying k = 4
## Criterion for k = 4 : 3.421034
## Trying k = 5
## Criterion for k = 5 : 3.610805
## Trying k = 6
## Criterion for k = 6 : 3.624342
## Trying k = 7
## Criterion for k = 7 : 3.706446
## Trying k = 8
## Criterion for k = 8 : 3.798688
## Trying k = 9
## Criterion for k = 9 : 3.849937
## Trying k = 10
## Criterion for k = 10 : 3.89122
# Step 5: Plot the elbow graph
ggplot(elbow_data, aes(x = Clusters, y = Criterion)) +
geom_line(color = "#2c3e50", linewidth = 1.2) +
geom_point(color = "#e74c3c", size = 2) +
theme_minimal(base_size = 14) +
labs(
title = "",
x = "Number of Clusters (k)",
y = "Model Criterion"
)
# Note: the criterion reported by clusmca() increases with k here, so look for the
# point where the gains level off (around k = 5 or 6) rather than for a minimum.
5-Clusters
# Load required libraries
library(dplyr)
library(clustrd)
library(xgboost)
library(compareGroups)
# Step 1: Select the required columns
data4 <- data3 %>%
select( "Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Preprocess the data for CCA
dat <- data4 %>%
mutate_if(is.numeric, as.factor) %>%
na.omit()
# Step 3: Perform Cluster Correspondence Analysis (CCA)
set.seed(1234)
outclusCA <- clusmca(dat, nclus = 5, ndim = 2, method = "clusCA", nstart = 10)
print(outclusCA)
## Solution with 5 clusters of sizes 3382 (26.5%), 3354 (26.3%), 2577 (20.2%), 2324 (18.2%), 1116 (8.8%) in 2 dimensions.
##
## Cluster centroids:
## Dim.1 Dim.2
## Cluster 1 0.0025 -0.0005
## Cluster 2 0.0078 -0.0031
## Cluster 3 -0.0046 -0.0010
## Cluster 4 -0.0112 -0.0009
## Cluster 5 0.0030 0.0149
##
## Within cluster sum of squares by cluster:
## [1] 0.0201 0.0148 0.0168 0.0152 0.0147
## (between_SS / total_SS = 91.41 %)
##
## Objective criterion value: 3.6108
##
## Available output:
##
## [1] "obscoord" "attcoord" "centroid" "cluster" "criterion" "size"
## [7] "odata" "nstart"
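# If the five-cluster assignment were needed downstream, it could be saved the same
# way as in the six-cluster run below (a sketch; the filename is illustrative):
# dat$Cluster5 <- outclusCA$cluster
# write.csv(dat, "DatawithCluster_5clus.csv")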
6-Clusters
# Load required libraries
library(dplyr)
library(clustrd)
library(xgboost)
library(compareGroups)
# Step 1: Select the required columns
data4 <- data3 %>%
select( "Helmet", "CrSpeed", "ConFac", "TrafficCon",
"Age", "RdAlgn", "Crash Hour",
"LightCon", "Season", "FHE", "Severity", "ObjStrk") %>%
na.omit() # Remove rows with missing values
# Step 2: Preprocess the data for CCA
dat <- data4 %>%
mutate_if(is.numeric, as.factor) %>%
na.omit()
# Step 3: Perform Cluster Correspondence Analysis (CCA)
set.seed(1234)
outclusCA <- clusmca(dat, nclus = 6, ndim = 2, method = "clusCA", nstart = 10)
# Step 4: Save clusters to the dataset and export
dat$Cluster <- outclusCA$cluster
write.csv(dat, "DatawithCluster_v2.csv")
# Print clustering output
print(outclusCA)
## Solution with 6 clusters of sizes 3232 (25.3%), 2753 (21.6%), 2069 (16.2%), 1968 (15.4%), 1615 (12.7%), 1116 (8.8%) in 2 dimensions.
##
## Cluster centroids:
## Dim.1 Dim.2
## Cluster 1 -0.0080 0.0033
## Cluster 2 -0.0034 0.0004
## Cluster 3 0.0071 0.0005
## Cluster 4 0.0018 0.0012
## Cluster 5 0.0125 0.0012
## Cluster 6 -0.0031 -0.0150
##
## Within cluster sum of squares by cluster:
## [1] 0.0143 0.0133 0.0103 0.0106 0.0082 0.0152
## (between_SS / total_SS = 92.62 %)
##
## Objective criterion value: 3.6694
##
## Available output:
##
## [1] "obscoord" "attcoord" "centroid" "cluster" "criterion" "size"
## [7] "odata" "nstart"
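# A possible follow-up (a sketch, not part of the original script): profile the six
# clusters with the same compareGroups workflow used for Severity above.
dat$Cluster <- factor(dat$Cluster)
cluster_stats <- createTable(compareGroups(Cluster ~ ., data = dat))
print(cluster_stats)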