Exploratory Data Analysis by Rohit Chakraborty

Loading the Data and Libraries

library(DataExplorer)
library(writexl)
library(readr)
library(readxl)

setwd("C:/Users/xuw12/OneDrive - Texas State University/Fall 2024/CE 7393/HW_1")
dat1 <- read_excel("HW1_Data.xlsx")

Priliminary Data Exploration

The dataset include crashes related to farm equipment vehicles. Some initial exploration of the dataset are showed below:

dim(dat1)
## [1] 1295   19
head(dat1)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
names(dat1)
##  [1] "Wthr_Cond_ID"         "Light_Cond_ID"        "Road_Type_ID"        
##  [4] "Road_Algn_ID"         "SurfDry"              "Traffic_Cntl_ID"     
##  [7] "Harm_Evnt_ID"         "Intrsct_Relat_ID"     "FHE_Collsn_ID"       
## [10] "Road_Part_Adj_ID"     "Road_Cls_ID"          "Pop_Group_ID"        
## [13] "Crash_Speed_LimitCat" "Veh_Body_Styl_ID"     "Prsn_Ethnicity_ID"   
## [16] "GenMale"              "TrafVol"              "Prsn_Age"            
## [19] "Prsn_Injry_Sev_ID"
summary(dat1)
##  Wthr_Cond_ID       Light_Cond_ID      Road_Type_ID       Road_Algn_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     SurfDry       Traffic_Cntl_ID    Harm_Evnt_ID       Intrsct_Relat_ID  
##  Min.   :0.0000   Length:1295        Length:1295        Length:1295       
##  1st Qu.:1.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.9143                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##  FHE_Collsn_ID      Road_Part_Adj_ID   Road_Cls_ID        Pop_Group_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Crash_Speed_LimitCat Veh_Body_Styl_ID   Prsn_Ethnicity_ID     GenMale      
##  Length:1295          Length:1295        Length:1295        Min.   :0.0000  
##  Class :character     Class :character   Class :character   1st Qu.:1.0000  
##  Mode  :character     Mode  :character   Mode  :character   Median :1.0000  
##                                                             Mean   :0.8842  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000  
##     TrafVol        Prsn_Age         Prsn_Injry_Sev_ID 
##  Min.   :  248   Length:1295        Length:1295       
##  1st Qu.: 7652   Class :character   Class :character  
##  Median :14897   Mode  :character   Mode  :character  
##  Mean   :14780                                        
##  3rd Qu.:21930                                        
##  Max.   :28983

Compare Group

library(compareGroups)
dat2 <- dat1[, !(names(dat1) %in% c("SurfDry", "GenMale", "TrafVol", "Veh_Body_Styl_ID"))] #removing variables with numerical data and vehicle body style
res1 <- compareGroups(Prsn_Injry_Sev_ID ~ ., data = dat2, ref = 1)
res2= createTable(res1, show.ratio = TRUE)
res2
## 
## --------Summary descriptives table by 'Prsn_Injry_Sev_ID'---------
## 
## __________________________________________________________________________________ 
##                                          BC          KA          O       p.overall 
##                                         N=120       N=50       N=1125              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Wthr_Cond_ID:                                                                .     
##     Clear                            98 (81.7%)  42 (84.0%) 944 (83.9%)            
##     Cloudy                           19 (15.8%)  6 (12.0%)  121 (10.8%)            
##     Fog                               1 (0.83%)  1 (2.00%)   8 (0.71%)             
##     Other                             0 (0.00%)  0 (0.00%)   10 (0.89%)            
##     Rain                              2 (1.67%)  1 (2.00%)   42 (3.73%)            
## Light_Cond_ID:                                                               .     
##     Dark, lighted                     5 (4.17%)  0 (0.00%)   37 (3.29%)            
##     Dark, not lighted                16 (13.3%)  12 (24.0%) 126 (11.2%)            
##     Daylight                         90 (75.0%)  36 (72.0%) 914 (81.2%)            
##     Dusk                              6 (5.00%)  2 (4.00%)   31 (2.76%)            
##     Other                             3 (2.50%)  0 (0.00%)   17 (1.51%)            
## Road_Type_ID:                                                                .     
##     2 lane, 2 way                    50 (41.7%)  26 (52.0%) 475 (42.2%)            
##     4 or more lanes, divided         27 (22.5%)  9 (18.0%)  149 (13.2%)            
##     4 or more lanes, undivided       11 (9.17%)  6 (12.0%)   79 (7.02%)            
##     Other                             0 (0.00%)  0 (0.00%)   2 (0.18%)             
##     Unknown                          32 (26.7%)  9 (18.0%)  420 (37.3%)            
## Road_Algn_ID:                                                                .     
##     Curve, level                      6 (5.00%)  2 (4.00%)   54 (4.80%)            
##     Other                             2 (1.67%)  2 (4.00%)   40 (3.56%)            
##     Straight, grade                  24 (20.0%)  5 (10.0%)  106 (9.42%)            
##     Straight, hillcrest               4 (3.33%)  4 (8.00%)   39 (3.47%)            
##     Straight, level                  84 (70.0%)  37 (74.0%) 886 (78.8%)            
## Traffic_Cntl_ID:                                                             .     
##     Center stripe/divider            32 (26.7%)  15 (30.0%) 259 (23.0%)            
##     Marked lanes                     39 (32.5%)  19 (38.0%) 331 (29.4%)            
##     No passing zone                  12 (10.0%)  6 (12.0%)   76 (6.76%)            
##     None                             21 (17.5%)  5 (10.0%)  270 (24.0%)            
##     Other                            16 (13.3%)  5 (10.0%)  189 (16.8%)            
## Harm_Evnt_ID:                                                                .     
##     Fixed object                     10 (8.33%)  4 (8.00%)  130 (11.6%)            
##     Motor vehicle in transport       99 (82.5%)  40 (80.0%) 867 (77.1%)            
##     Other                             4 (3.33%)  4 (8.00%)   24 (2.13%)            
##     Overturned                        6 (5.00%)  1 (2.00%)   19 (1.69%)            
##     Parked car                        1 (0.83%)  1 (2.00%)   85 (7.56%)            
## Intrsct_Relat_ID:                                                            .     
##     Driveway access                   8 (6.67%)  3 (6.00%)  134 (11.9%)            
##     Intersection                     15 (12.5%)  5 (10.0%)  150 (13.3%)            
##     Intersection related              7 (5.83%)  1 (2.00%)  104 (9.24%)            
##     Non intersection                 90 (75.0%)  41 (82.0%) 737 (65.5%)            
## FHE_Collsn_ID:                                                            <0.001   
##     Omv vehicle going straight       18 (15.0%)  9 (18.0%)  232 (20.6%)            
##     Other                            24 (20.0%)  6 (12.0%)  367 (32.6%)            
##     Sd both going straight-rear end  60 (50.0%)  27 (54.0%) 225 (20.0%)            
##     Sd both going straight-sideswipe  9 (7.50%)  3 (6.00%)  147 (13.1%)            
##     Sd one straight-one left turn     9 (7.50%)  5 (10.0%)  154 (13.7%)            
## Road_Part_Adj_ID:                                                          0.151   
##     Exit/off ramp                     1 (0.83%)  1 (2.00%)   2 (0.18%)             
##     Main/proper lane                 110 (91.7%) 49 (98.0%) 1066 (94.8%)           
##     Other                             1 (0.83%)  0 (0.00%)   3 (0.27%)             
##     Other (explain in narrative)      4 (3.33%)  0 (0.00%)   26 (2.31%)            
##     Unknown                           4 (3.33%)  0 (0.00%)   28 (2.49%)            
## Road_Cls_ID:                                                                 .     
##     City street                      10 (8.33%)  4 (8.00%)  207 (18.4%)            
##     County road                      15 (12.5%)  4 (8.00%)  172 (15.3%)            
##     Farm to market                   43 (35.8%)  14 (28.0%) 317 (28.2%)            
##     Other                            13 (10.8%)  1 (2.00%)   75 (6.67%)            
##     Us & state highways              39 (32.5%)  27 (54.0%) 354 (31.5%)            
## Pop_Group_ID:                                                                .     
##     10,000 - 24,999 pop               4 (3.33%)  3 (6.00%)   56 (4.98%)            
##     250,000 pop. And over             9 (7.50%)  2 (4.00%)  106 (9.42%)            
##     Other                            16 (13.3%)  4 (8.00%)  173 (15.4%)            
##     Rural                            87 (72.5%)  40 (80.0%) 739 (65.7%)            
##     Town under 2,499 pop.             4 (3.33%)  1 (2.00%)   51 (4.53%)            
## Crash_Speed_LimitCat:                                                        .     
##     > 70 mph                         19 (15.8%)  10 (20.0%) 174 (15.5%)            
##     30-40 mph                        20 (16.7%)  6 (12.0%)  310 (27.6%)            
##     45-60 mph                        45 (37.5%)  14 (28.0%) 412 (36.6%)            
##     65-70 mph                        28 (23.3%)  20 (40.0%) 175 (15.6%)            
##     Other                             8 (6.67%)  0 (0.00%)   54 (4.80%)            
## Prsn_Ethnicity_ID:                                                           .     
##     Black                             9 (7.50%)  2 (4.00%)   52 (4.62%)            
##     Hispanic                         36 (30.0%)  12 (24.0%) 374 (33.2%)            
##     Other                             1 (0.83%)  0 (0.00%)   38 (3.38%)            
##     Unknown                           0 (0.00%)  0 (0.00%)   83 (7.38%)            
##     White                            74 (61.7%)  36 (72.0%) 578 (51.4%)            
## Prsn_Age:                                                                    .     
##     15-24 years                      18 (15.0%)  5 (10.0%)  165 (14.7%)            
##     25-54 years                      56 (46.7%)  19 (38.0%) 497 (44.2%)            
##     55-64 years                      23 (19.2%)  8 (16.0%)  201 (17.9%)            
##     65-74 years                      14 (11.7%)  10 (20.0%)  94 (8.36%)            
##     Other                             9 (7.50%)  8 (16.0%)  168 (14.9%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Box Plot

#Box plot for traffic volume across injury severity
library(ggplot2)
ggplot(dat1, aes(x = Prsn_Injry_Sev_ID, y = TrafVol)) +
  geom_boxplot(fill = "antiquewhite") +
  theme_minimal() + theme_bw(base_size = 20) +
  labs(title = "Traffic Volume Across Injury Severity Type", 
       x = "Injury Severity", 
       y = "Traffic Volume")

The boxplot shows traffic volume across three injury severity types: KA (fatal/severe injury), BC (moderate/minor injury), and O (no injury). Traffic volume is highest for no injury (O) crashes, with a wider spread, while fatal/severe injury (KA) shows lower traffic volume and less variability.

Grouped Bar Plot

Injury Severity Vs Intersection/Road Segment

ggplot(dat1, aes(x = Intrsct_Relat_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "dodge") +  # 'dodge' will create side-by-side bars
  theme_minimal() + theme_bw(base_size = 20)+
  labs(title = "Injury Severity across Intersection/Road Segments", 
       x = "Intersection/Road Segment Type", 
       y = "Count", 
       fill = "Injury Severity")+
  scale_fill_manual(values = c("O" = "cornflowerblue",  # Set custom colors for each category
                               "KA" = "orange2", 
                               "BC" = "moccasin"))

The plot shows that non-intersection locations experienced the highest number of crashes for all injury severity types (KA, BC, and O).

Injury Severity Vs Road Alignment

ggplot(dat1, aes(x = Road_Algn_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "dodge") +  # 'dodge' will create side-by-side bars
  theme_minimal() + theme_bw(base_size = 20)+
  labs(title = "Injury Severity across Different Road Alignemnt", 
       x = "Road Alignment", 
       y = "Count", 
       fill = "Injury Severity")+
  scale_fill_manual(values = c("O" = "cornflowerblue",  # Set custom colors for each category
                               "KA" = "orange2", 
                               "BC" = "moccasin"))

The plot shows that straight and at level roadways experienced the highest number of crashes for all injury severity types (KA, BC, and O).

Heatmap

Injury Severity Vs Ethnicity

# Create a contingency table (Injury Severity Vs Ethnicity)
contingency_table <- table(dat1$Prsn_Ethnicity_ID, dat1$Prsn_Injry_Sev_ID)

# Convert to dataframe for ggplot
contingency_df <- as.data.frame(contingency_table)

# Heatmap
ggplot(contingency_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(title = "Heatmap of Injury Severity Vs Ethnicity", 
       x = "Ethnicity", 
       y = "Injury Severity", 
       fill = "Frequency") +
  theme_minimal() +theme_bw(base_size=20)

The heatmap shows the relationship between injury severity and ethnicity, with darker shades indicating higher frequencies. White individuals had the highest frequency of no injury (O), followed by Hispanic individuals. Fatal or severe injuries (KA) occured less frequently across all ethnic groups.

Injury Severity Vs Speed Limit

# Create a contingency table (Injury Severity Vs Speed Limit)
contingency_table <- table(dat1$Crash_Speed_LimitCat, dat1$Prsn_Injry_Sev_ID)

# Convert to data frame for ggplot
contingency_df <- as.data.frame(contingency_table)

# Heatmap
ggplot(contingency_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(title = "Heatmap of Injury Severity Vs Speed Limit", 
       x = "Speed Limit", 
       y = "Injury Severity", 
       fill = "Frequency") +
  theme_minimal() +theme_bw(base_size=20)

The plot shows that no injury crashes are more frequent at moderate speed limits (30-60 mph).

Injury Severity Vs Road Class

# Create a contingency table (Injury Severity Vs Road Class)
contingency_table <- table(dat1$Road_Cls_ID, dat1$Prsn_Injry_Sev_ID)

# Convert to data frame for ggplot
contingency_df <- as.data.frame(contingency_table)

# Heatmap
ggplot(contingency_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(title = "Heatmap of Injury Severity Vs Road Class", 
       x = "Road Class", 
       y = "Injury Severity", 
       fill = "Frequency") +
  theme_minimal() +theme_bw(base_size=20)

The heatmap illustrates the relationship between injury severity and road class.U.S. and state highways have the highest frequency of no injury crashes, while fatal and severe injuries show more even distribution across different road classes but at lower frequencies.

Violin Plot

library(ggpubr)
library(ggstatsplot)
library(dplyr)
library(ggplot2)
ggbetweenstats(data  = subset(dat1),
  x     = Prsn_Injry_Sev_ID,
  y     = TrafVol)+theme_bw(base_size=18)+
labs(title = "Injury Severity Vs Traffic Volume",
     x = "Injury Severity",
     y = "Traffic Volume") #custom label of the axes

The plot shows traffic volume across injury severities. Moderate/minor injuries (BC) have the highest mean traffic volume around 15,426, while fatal/severe injuries (KA) have the lowest mean at 12,192. No injury (O) cases have a mean of 14,774 with the widest spread due to more observations. Overall, traffic volume is lower for severe injuries.

Density Plot

Distribution of Crashes Across Traffic Volume and Speed

library(ggpubr)
ggdensity(dat1, x = "TrafVol",
          add = "mean", rug = TRUE,
          color = "Crash_Speed_LimitCat", fill = "Crash_Speed_LimitCat",
          palette = c("#00AFBB", "#E7B800", "antiquewhite", "azure1", "darkkhaki"))+
  facet_grid(cols = vars(Prsn_Injry_Sev_ID))+theme_bw(base_size=22)

The density plot illustrates traffic volume distributions across speed limits for different injury severities (BC, KA, O). For both no injury (O) and moderate/minor injury (BC), traffic volumes peak between 10,000 and 20,000 across various speed limits.

Correlation Matrix of Numerical Variables

library(corrplot)

# Select only numerical columns for correlation analysis
numerical_data <- dat1[, c("SurfDry", "GenMale", "TrafVol")]

# Calculate the correlation matrix
correlation_matrix <- cor(numerical_data, use = "complete.obs")

# Print the correlation matrix
print(correlation_matrix)
##              SurfDry    GenMale      TrafVol
## SurfDry  1.000000000 0.04432906 -0.002043913
## GenMale  0.044329059 1.00000000  0.046530436
## TrafVol -0.002043913 0.04653044  1.000000000
# Visualize the correlation matrix using a plot
corrplot(correlation_matrix, method = "circle")

This plot cannot present insightful outcomes as the variables “SurDry” and “GenMale” had binary values (1,0).

Cramér’s V Correlation Matrix for Categorical Varaibles

library(vcd)
library(lsr)
library(DT)

# Select categorical columns
categorical_data <- dat1[, c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", "Traffic_Cntl_ID", "Harm_Evnt_ID")]

# Create a function to compute Cramér's V for each pair of categorical variables
cramers_v_matrix <- function(df) {
  var_names <- colnames(df)
  num_vars <- length(var_names)
  result <- matrix(NA, nrow = num_vars, ncol = num_vars, dimnames = list(var_names, var_names))
  
  for (i in 1:num_vars) {
    for (j in 1:num_vars) {
      if (i == j) {
        result[i, j] <- 1  # Perfect association with itself
      } else {
        result[i, j] <- cramersV(table(df[[i]], df[[j]]))
      }
    }
  }
  return(result)
}

# Calculate the Cramér's V correlation matrix
cramers_v_corr_matrix <- cramers_v_matrix(categorical_data)

# Convert the Cramér's V matrix into a data frame
cramers_v_corr_df <- as.data.frame(cramers_v_corr_matrix)

# Display the data frame as an interactive table using DT
datatable(cramers_v_corr_df, 
          options = list(pageLength = 6, autoWidth = TRUE),
          caption = "Cramér's V Correlation Matrix")

Histograms of Categorical Variables

# Select categorical columns
categorical_vars <- dat1[, c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", "Traffic_Cntl_ID", "Harm_Evnt_ID")]

# Create a function to generate bar plots for each categorical variable
plot_categorical <- function(df) {
  for (var in colnames(df)) {
    # Save the ggplot object to a variable
    p <- ggplot(df, aes_string(x = var)) +
      geom_bar(fill = "antiquewhite3", color = "black") +
      theme_minimal() + theme_bw(base_size = 24)
      labs(title = paste("Distribution of", var), x = var, y = "Crash Count") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
    # Print the ggplot object
    print(p)
  }
}

# Generate bar plots for all categorical variables
plot_categorical(categorical_vars)

Summary of the Histograms

# Create a data frame with variable names and explanations
explanations <- data.frame(
  Variable = c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", "Traffic_Cntl_ID", "Harm_Evnt_ID"),
  Explanation = c(
    "Most of the crashes were occured in clear weather.",
    "Daylight experienced the majority of the crashes.",
    "Two-lane, two-way highways experienced a high number of crashes",
    "Straight road alignment at level experienced most of the crashes.",
    "The number of crashes are distributed along all traffic control devices.",
    "Involvement of motor vehicles have the highest count among all crashes."
  )
)

# Print the table
knitr::kable(explanations, col.names = c("Variable", "Explanation"), caption = "Explanations for the Categorical Variables")
Explanations for the Categorical Variables
Variable Explanation
Wthr_Cond_ID Most of the crashes were occured in clear weather.
Light_Cond_ID Daylight experienced the majority of the crashes.
Road_Type_ID Two-lane, two-way highways experienced a high number of crashes
Road_Algn_ID Straight road alignment at level experienced most of the crashes.
Traffic_Cntl_ID The number of crashes are distributed along all traffic control devices.
Harm_Evnt_ID Involvement of motor vehicles have the highest count among all crashes.

Stacked Bar Plot

Injury Severity Vs Road Class

  ggplot(dat1, aes(x = Road_Cls_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "fill") +  # 'fill' will stack the bars by proportion
  theme_minimal() + 
  theme_bw(base_size = 20) +
  labs(title = "Proportion of Injury Severity across Road Class", 
       x = "Road Class", 
       y = "Proportion", 
       fill = "Injury Severity") +
  scale_fill_manual(values = c("O" = "honeydew3",  # Set custom colors for each category
                               "KA" = "salmon4", 
                               "BC" = "skyblue4"))

Injury Severity Vs Ethnicity

  ggplot(dat1, aes(x = Prsn_Ethnicity_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "fill") +  # 'fill' will stack the bars by proportion
  theme_minimal() + 
  theme_bw(base_size = 20) +
  labs(title = "Proportion of Injury Severity across Ethnicity", 
       x = "Ethnicity", 
       y = "Proportion", 
       fill = "Injury Severity") +
  scale_fill_manual(values = c("O" = "honeydew3",  # Set custom colors for each category
                               "KA" = "salmon4", 
                               "BC" = "skyblue4"))

Injury Severity Vs Weather Conditions

  ggplot(dat1, aes(x = Wthr_Cond_ID, fill = Prsn_Injry_Sev_ID)) +
  geom_bar(position = "fill") +  # 'fill' will stack the bars by proportion
  theme_minimal() + 
  theme_bw(base_size = 20) +
  labs(title = "Proportion of Injury Severity across Weather Conditions", 
       x = "Weather Conditions", 
       y = "Proportion", 
       fill = "Injury Severity") +
  scale_fill_manual(values = c("O" = "honeydew3",  # Set custom colors for each category
                               "KA" = "salmon4", 
                               "BC" = "skyblue4"))