Data Structure
library(readxl)
library(ggplot2)
library(dplyr)
library(randomForest)
library(caret)
data <- read_excel("C:/CE7393_HW/HW1_Data.xlsx", sheet = "UnitL4")
str(data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 7654 13770 11470 16972 413 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID
## 0 0 0
## Road_Algn_ID SurfDry Traffic_Cntl_ID
## 0 0 0
## Harm_Evnt_ID Intrsct_Relat_ID FHE_Collsn_ID
## 0 0 0
## Road_Part_Adj_ID Road_Cls_ID Pop_Group_ID
## 0 0 0
## Crash_Speed_LimitCat Veh_Body_Styl_ID Prsn_Ethnicity_ID
## 0 0 0
## GenMale TrafVol Prsn_Age
## 0 0 0
## Prsn_Injry_Sev_ID
## 0
Data Conversion
data <- data %>%
mutate(across(c(Light_Cond_ID, Wthr_Cond_ID, Crash_Speed_LimitCat, Road_Algn_ID, Prsn_Age, Prsn_Injry_Sev_ID), as.factor))
Data Ditribution
Data Ditribution of Injury Severity
ggplot(data, aes(x = Prsn_Injry_Sev_ID)) +
geom_bar(fill = '#AEC6CF') +
theme_minimal() +
theme_bw(base_size = 20) +
labs(title = '',
x = 'Injury Severity ID',
y = 'Count') +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)

Data Ditribution of Age
ggplot(data, aes(x = Prsn_Age)) +
geom_bar(fill = 'lightblue') +
theme_minimal() +
theme_bw(base_size = 20) +
labs(title = '',
x = 'Age Category',
y = 'Count') +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)

Data Ditribution of Speed Limit
ggplot(data, aes(x = Crash_Speed_LimitCat)) +
geom_bar(fill = 'antiquewhite') +
theme_minimal() +
theme_bw(base_size = 20) +
labs(title = '',
x = 'Speed Limit Category',
y = 'Count') +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16, angle = 0, hjust = 1),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)

Data Ditribution of Light Condition
ggplot(data, aes(x = Light_Cond_ID)) +
geom_bar(fill = 'navajowhite3') +
theme_minimal() +
theme_bw(base_size = 20) +
labs(title = '',
x = 'Light Condition',
y = 'Count') +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16, angle = 0, hjust = 1),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)

Analysis
Traffic Volume vs Injury Severity
ggplot(data, aes(x = Prsn_Injry_Sev_ID, y = TrafVol, fill = Prsn_Injry_Sev_ID)) +
geom_violin(trim = FALSE, position = position_dodge(width = 0.8)) +
labs(title = "",
x = "Injury Severity",
y = "Traffic Volume") +
scale_fill_manual(values = c("KA" = "#FFB6C1",
"BC" = "#B0E0E6",
"O" = "#E6E6FA")) +
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
legend.title = element_text(size = 18),
legend.text = element_text(size = 16),
panel.grid = element_blank()
)

Heatmap of Injury Severity vs Ethnicity
heatmap_data <- data %>%
group_by(Prsn_Injry_Sev_ID, Prsn_Ethnicity_ID) %>%
summarise(Count = n()) %>%
ungroup()
# Create the heatmap
ggplot(heatmap_data, aes(x = Prsn_Ethnicity_ID, y = Prsn_Injry_Sev_ID, fill = Count)) +
geom_tile() + # Creates the heatmap tiles
scale_fill_gradient(low = "lightblue", high = "darkblue") + # Color scale from light to dark blue
labs(title = "",
x = "Ethnicity",
y = "Injury Severity",
fill = "Count") + # Labels and title
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16, angle = 45, hjust = 1), # Rotate X-axis labels
axis.text.y = element_text(size = 16),
legend.title = element_text(size = 18),
legend.text = element_text(size = 16),
panel.grid = element_blank()
)

Density Plot for Person Age vs Severity
ggplot(data, aes(x = Prsn_Age, fill = Prsn_Injry_Sev_ID)) +
geom_density(alpha = 0.6) + # Density plot with transparency
labs(title = "",
x = "Person Age",
y = "Density",
fill = "Injury Severity") + # Custom labels for axes and title
scale_fill_manual(values = c("KA" = "dodgerblue", # Contrasting green
"BC" = "darkblue", # Contrasting purple
"O" = "lightblue")) +# Contrasting orange
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)

Light Condition vs Injury Severity
ggplot(data, aes(x = Light_Cond_ID, fill = Prsn_Injry_Sev_ID)) +
geom_bar(position = "dodge") +
labs(title = "",
x = "Light Condition",
y = "Count",
fill = "Injury Severity") +
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
legend.title = element_text(size = 18),
legend.text = element_text(size = 16),
panel.grid = element_blank()
)

Weather Condition vs Injury Severity
ggplot(data, aes(x = Wthr_Cond_ID, fill = Prsn_Injry_Sev_ID)) +
geom_bar(position = "dodge") +
labs(title = "",
x = "Weather Condition",
y = "Count",
fill = "Injury Severity") +
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16, angle = 0, hjust = 1),
axis.text.y = element_text(size = 16),
legend.title = element_text(size = 18),
legend.text = element_text(size = 16),
panel.grid = element_blank()
)

Road Alignment, Person Age, and Speed Limit vs Injury Severity
feature_plots <- function(feature) {
ggplot(data, aes_string(x = feature, fill = "Prsn_Injry_Sev_ID")) +
geom_bar(position = "dodge") +
labs(x = feature,
y = "Count",
fill = "Injury Severity") +
theme_minimal() +
theme_bw(base_size = 20) +
theme(
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16, angle = 0, hjust = 1),
axis.text.y = element_text(size = 16),
legend.title = element_text(size = 18),
legend.text = element_text(size = 16),
panel.grid = element_blank()
)
}
feature_plots("Road_Algn_ID")

feature_plots("Prsn_Age")

feature_plots("Crash_Speed_LimitCat")

Random Forest for Feature Importance (Top 10)
data <- na.omit(data)
# Model Training
set.seed(123)
rf_model <- randomForest(Prsn_Injry_Sev_ID ~ ., data = data, importance = TRUE)
importance_df <- data.frame(Feature = rownames(importance(rf_model)), Importance = importance(rf_model)[, "MeanDecreaseGini"])
top_features <- importance_df %>% arrange(desc(Importance)) %>% head(10)
print(top_features)
## Feature Importance
## TrafVol TrafVol 65.51447
## Prsn_Age Prsn_Age 25.67076
## FHE_Collsn_ID FHE_Collsn_ID 19.24259
## Traffic_Cntl_ID Traffic_Cntl_ID 18.37463
## Crash_Speed_LimitCat Crash_Speed_LimitCat 17.83830
## Road_Algn_ID Road_Algn_ID 14.56765
## Light_Cond_ID Light_Cond_ID 14.52474
## Road_Cls_ID Road_Cls_ID 14.05026
## Prsn_Ethnicity_ID Prsn_Ethnicity_ID 13.87331
## Road_Type_ID Road_Type_ID 13.22125
ggplot(top_features, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "#B0E0E6") +
coord_flip() +
labs(title = "", x = "Features", y = "Importance (Mean Decrease Gini)") +
theme_minimal() +
theme_bw(base_size = 20) +
theme(
plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),
axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16),
panel.grid = element_blank()
)
