HW1 by ShriyankSomvanshi

Data Structure

library(readxl)
library(ggplot2)
library(dplyr)
library(randomForest)
library(caret)

data <- read_excel("C:/CE7393_HW/HW1_Data.xlsx", sheet = "UnitL4")

str(data)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 7654 13770 11470 16972 413 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
##         Wthr_Cond_ID        Light_Cond_ID         Road_Type_ID 
##                    0                    0                    0 
##         Road_Algn_ID              SurfDry      Traffic_Cntl_ID 
##                    0                    0                    0 
##         Harm_Evnt_ID     Intrsct_Relat_ID        FHE_Collsn_ID 
##                    0                    0                    0 
##     Road_Part_Adj_ID          Road_Cls_ID         Pop_Group_ID 
##                    0                    0                    0 
## Crash_Speed_LimitCat     Veh_Body_Styl_ID    Prsn_Ethnicity_ID 
##                    0                    0                    0 
##              GenMale              TrafVol             Prsn_Age 
##                    0                    0                    0 
##    Prsn_Injry_Sev_ID 
##                    0

Data Conversion

data <- data %>%
  mutate(across(c(Light_Cond_ID, Wthr_Cond_ID, Crash_Speed_LimitCat, Road_Algn_ID, Prsn_Age, Prsn_Injry_Sev_ID), as.factor))

Data Ditribution

Data Ditribution of Injury Severity

ggplot(data, aes(x = Prsn_Injry_Sev_ID)) +
  geom_bar(fill = '#AEC6CF') +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  labs(title = '', 
       x = 'Injury Severity ID', 
       y = 'Count') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Data Ditribution of Age

ggplot(data, aes(x = Prsn_Age)) +
  geom_bar(fill = 'lightblue') +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  labs(title = '', 
       x = 'Age Category', 
       y = 'Count') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Data Ditribution of Speed Limit

ggplot(data, aes(x = Crash_Speed_LimitCat)) +
  geom_bar(fill = 'antiquewhite') +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  labs(title = '', 
       x = 'Speed Limit Category', 
       y = 'Count') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16, angle = 0, hjust = 1),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Data Ditribution of Light Condition

ggplot(data, aes(x = Light_Cond_ID)) +
  geom_bar(fill = 'navajowhite3') +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  labs(title = '', 
       x = 'Light Condition', 
       y = 'Count') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16, angle = 0, hjust = 1),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Analysis

Traffic Volume vs Injury Severity

ggplot(data, aes(x = Prsn_Injry_Sev_ID, y = TrafVol, fill = Prsn_Injry_Sev_ID)) +
  geom_violin(trim = FALSE, position = position_dodge(width = 0.8)) +
  labs(title = "", 
       x = "Injury Severity", 
       y = "Traffic Volume") +
  scale_fill_manual(values = c("KA" = "#FFB6C1",   
                               "BC" = "#B0E0E6",   
                               "O"  = "#E6E6FA")) +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    legend.title = element_text(size = 18),  
    legend.text = element_text(size = 16),   
    panel.grid = element_blank()
  )

Heatmap of Injury Severity vs Ethnicity

heatmap_data <- data %>%
  group_by(Prsn_Injry_Sev_ID, Prsn_Ethnicity_ID) %>%
  summarise(Count = n()) %>%
  ungroup()

# Create the heatmap
ggplot(heatmap_data, aes(x = Prsn_Ethnicity_ID, y = Prsn_Injry_Sev_ID, fill = Count)) +
  geom_tile() +  # Creates the heatmap tiles
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  # Color scale from light to dark blue
  labs(title = "", 
       x = "Ethnicity", 
       y = "Injury Severity", 
       fill = "Count") +  # Labels and title
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),  # Rotate X-axis labels
    axis.text.y = element_text(size = 16),   
    legend.title = element_text(size = 18),  
    legend.text = element_text(size = 16),   
    panel.grid = element_blank()
  )

Density Plot for Person Age vs Severity

ggplot(data, aes(x = Prsn_Age, fill = Prsn_Injry_Sev_ID)) +
  geom_density(alpha = 0.6) +  # Density plot with transparency
  labs(title = "", 
       x = "Person Age", 
       y = "Density", 
       fill = "Injury Severity") +  # Custom labels for axes and title
  scale_fill_manual(values = c("KA" = "dodgerblue",   # Contrasting green
                               "BC" = "darkblue",   # Contrasting purple
                               "O"  = "lightblue")) +# Contrasting orange
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()
  )

Light Condition vs Injury Severity

ggplot(data, aes(x = Light_Cond_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "dodge") +
  labs(title = "", 
       x = "Light Condition", 
       y = "Count", 
       fill = "Injury Severity") +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    legend.title = element_text(size = 18),  
    legend.text = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Weather Condition vs Injury Severity

ggplot(data, aes(x = Wthr_Cond_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "dodge") +
  labs(title = "", 
       x = "Weather Condition", 
       y = "Count", 
       fill = "Injury Severity") +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16, angle = 0, hjust = 1),   
    axis.text.y = element_text(size = 16),   
    legend.title = element_text(size = 18),  
    legend.text = element_text(size = 16),   
    panel.grid = element_blank()             
  )

Road Alignment, Person Age, and Speed Limit vs Injury Severity

feature_plots <- function(feature) {
  ggplot(data, aes_string(x = feature, fill = "Prsn_Injry_Sev_ID")) +
    geom_bar(position = "dodge") +
    labs(x = feature, 
         y = "Count", 
         fill = "Injury Severity") +  
    theme_minimal() +  
    theme_bw(base_size = 20) +  
    theme(
      axis.title.x = element_text(size = 18),  
      axis.title.y = element_text(size = 18),  
      axis.text.x = element_text(size = 16, angle = 0, hjust = 1),   
      axis.text.y = element_text(size = 16),   
      legend.title = element_text(size = 18),  
      legend.text = element_text(size = 16),   
      panel.grid = element_blank()             
    )
}

feature_plots("Road_Algn_ID")

feature_plots("Prsn_Age")

feature_plots("Crash_Speed_LimitCat")

Random Forest for Feature Importance (Top 10)

data <- na.omit(data)

# Model Training
set.seed(123)  
rf_model <- randomForest(Prsn_Injry_Sev_ID ~ ., data = data, importance = TRUE)

importance_df <- data.frame(Feature = rownames(importance(rf_model)), Importance = importance(rf_model)[, "MeanDecreaseGini"])

top_features <- importance_df %>% arrange(desc(Importance)) %>% head(10)
print(top_features)
##                                   Feature Importance
## TrafVol                           TrafVol   65.51447
## Prsn_Age                         Prsn_Age   25.67076
## FHE_Collsn_ID               FHE_Collsn_ID   19.24259
## Traffic_Cntl_ID           Traffic_Cntl_ID   18.37463
## Crash_Speed_LimitCat Crash_Speed_LimitCat   17.83830
## Road_Algn_ID                 Road_Algn_ID   14.56765
## Light_Cond_ID               Light_Cond_ID   14.52474
## Road_Cls_ID                   Road_Cls_ID   14.05026
## Prsn_Ethnicity_ID       Prsn_Ethnicity_ID   13.87331
## Road_Type_ID                 Road_Type_ID   13.22125
ggplot(top_features, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "#B0E0E6") +
  coord_flip() +
  labs(title = "", x = "Features", y = "Importance (Mean Decrease Gini)") +
  theme_minimal() +  
  theme_bw(base_size = 20) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 22, face = "bold"),  
    axis.title.x = element_text(size = 18),  
    axis.title.y = element_text(size = 18),  
    axis.text.x = element_text(size = 16),   
    axis.text.y = element_text(size = 16),   
    panel.grid = element_blank()             
  )