HW01: Exploratory Data Analysis by Mahmuda Mimi

Read and explore data

Overview of the data

library(readxl)
setwd("C:/Users/mahmu/OneDrive/Desktop/AI")
df <- read_excel("HW1_Data.xlsx")
head(df, n=10)
## # A tibble: 10 × 19
##    Wthr_Cond_ID Light_Cond_ID  Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##    <chr>        <chr>          <chr>        <chr>          <dbl> <chr>          
##  1 Clear        Dark, not lig… 2 lane, 2 w… Straight, l…       1 Marked lanes   
##  2 Clear        Dark, not lig… 2 lane, 2 w… Straight, l…       1 Center stripe/…
##  3 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Marked lanes   
##  4 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Center stripe/…
##  5 Clear        Dark, not lig… 2 lane, 2 w… Straight, g…       1 None           
##  6 Clear        Daylight       Unknown      Straight, l…       1 None           
##  7 Clear        Daylight       4 or more l… Curve, level       1 Marked lanes   
##  8 Clear        Daylight       4 or more l… Curve, level       1 Marked lanes   
##  9 Clear        Daylight       Unknown      Straight, l…       1 None           
## 10 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Center stripe/…
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
dim(df)
## [1] 1295   19
names(df)
##  [1] "Wthr_Cond_ID"         "Light_Cond_ID"        "Road_Type_ID"        
##  [4] "Road_Algn_ID"         "SurfDry"              "Traffic_Cntl_ID"     
##  [7] "Harm_Evnt_ID"         "Intrsct_Relat_ID"     "FHE_Collsn_ID"       
## [10] "Road_Part_Adj_ID"     "Road_Cls_ID"          "Pop_Group_ID"        
## [13] "Crash_Speed_LimitCat" "Veh_Body_Styl_ID"     "Prsn_Ethnicity_ID"   
## [16] "GenMale"              "TrafVol"              "Prsn_Age"            
## [19] "Prsn_Injry_Sev_ID"
str(df)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 11419 5267 5247 13009 13229 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...
df1 <- df
library(DT)
datatable(
  df1, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('excel')
  )
)
library(skimr)
skim(df1)
Data summary
Name df1
Number of rows 1295
Number of columns 19
_______________________
Column type frequency:
character 16
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Wthr_Cond_ID 0 1 3 6 0 5 0
Light_Cond_ID 0 1 4 17 0 5 0
Road_Type_ID 0 1 5 26 0 5 0
Road_Algn_ID 0 1 5 19 0 5 0
Traffic_Cntl_ID 0 1 4 21 0 5 0
Harm_Evnt_ID 0 1 5 26 0 5 0
Intrsct_Relat_ID 0 1 12 20 0 4 0
FHE_Collsn_ID 0 1 5 32 0 5 0
Road_Part_Adj_ID 0 1 5 28 0 5 0
Road_Cls_ID 0 1 5 19 0 5 0
Pop_Group_ID 0 1 5 21 0 5 0
Crash_Speed_LimitCat 0 1 5 9 0 5 0
Veh_Body_Styl_ID 0 1 14 14 0 1 0
Prsn_Ethnicity_ID 0 1 5 8 0 5 0
Prsn_Age 0 1 5 11 0 5 0
Prsn_Injry_Sev_ID 0 1 1 2 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SurfDry 0 1 0.91 0.28 0 1.0 1 1.0 1 ▁▁▁▁▇
GenMale 0 1 0.88 0.32 0 1.0 1 1.0 1 ▁▁▁▁▇
TrafVol 0 1 14203.65 8358.99 218 6706.5 14019 21416.5 28988 ▇▇▇▇▆

Descriptive Statistics

df1$Prsn_Injry_Sev_ID <- factor(df1$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))

library(compareGroups)
des1 <- compareGroups(`Prsn_Injry_Sev_ID` ~ ., data = df1, max.ylev = 50, max.xlev = 50, ref = 1)
des2 <- createTable(des1, show.ratio = TRUE)
des2
## 
## --------Summary descriptives table by 'Prsn_Injry_Sev_ID'---------
## 
## _____________________________________________________________________________________ 
##                                           KA           BC           O       p.overall 
##                                          N=50        N=120        N=1125              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Wthr_Cond_ID:                                                                   .     
##     Clear                             42 (84.0%)   98 (81.7%)  944 (83.9%)            
##     Cloudy                            6 (12.0%)    19 (15.8%)  121 (10.8%)            
##     Fog                               1 (2.00%)    1 (0.83%)    8 (0.71%)             
##     Other                             0 (0.00%)    0 (0.00%)    10 (0.89%)            
##     Rain                              1 (2.00%)    2 (1.67%)    42 (3.73%)            
## Light_Cond_ID:                                                                  .     
##     Dark, lighted                     0 (0.00%)    5 (4.17%)    37 (3.29%)            
##     Dark, not lighted                 12 (24.0%)   16 (13.3%)  126 (11.2%)            
##     Daylight                          36 (72.0%)   90 (75.0%)  914 (81.2%)            
##     Dusk                              2 (4.00%)    6 (5.00%)    31 (2.76%)            
##     Other                             0 (0.00%)    3 (2.50%)    17 (1.51%)            
## Road_Type_ID:                                                                   .     
##     2 lane, 2 way                     26 (52.0%)   50 (41.7%)  475 (42.2%)            
##     4 or more lanes, divided          9 (18.0%)    27 (22.5%)  149 (13.2%)            
##     4 or more lanes, undivided        6 (12.0%)    11 (9.17%)   79 (7.02%)            
##     Other                             0 (0.00%)    0 (0.00%)    2 (0.18%)             
##     Unknown                           9 (18.0%)    32 (26.7%)  420 (37.3%)            
## Road_Algn_ID:                                                                   .     
##     Curve, level                      2 (4.00%)    6 (5.00%)    54 (4.80%)            
##     Other                             2 (4.00%)    2 (1.67%)    40 (3.56%)            
##     Straight, grade                   5 (10.0%)    24 (20.0%)  106 (9.42%)            
##     Straight, hillcrest               4 (8.00%)    4 (3.33%)    39 (3.47%)            
##     Straight, level                   37 (74.0%)   84 (70.0%)  886 (78.8%)            
## SurfDry                              0.94 (0.24)  0.93 (0.25)  0.91 (0.28)    0.571   
## Traffic_Cntl_ID:                                                                .     
##     Center stripe/divider             15 (30.0%)   32 (26.7%)  259 (23.0%)            
##     Marked lanes                      19 (38.0%)   39 (32.5%)  331 (29.4%)            
##     No passing zone                   6 (12.0%)    12 (10.0%)   76 (6.76%)            
##     None                              5 (10.0%)    21 (17.5%)  270 (24.0%)            
##     Other                             5 (10.0%)    16 (13.3%)  189 (16.8%)            
## Harm_Evnt_ID:                                                                   .     
##     Fixed object                      4 (8.00%)    10 (8.33%)  130 (11.6%)            
##     Motor vehicle in transport        40 (80.0%)   99 (82.5%)  867 (77.1%)            
##     Other                             4 (8.00%)    4 (3.33%)    24 (2.13%)            
##     Overturned                        1 (2.00%)    6 (5.00%)    19 (1.69%)            
##     Parked car                        1 (2.00%)    1 (0.83%)    85 (7.56%)            
## Intrsct_Relat_ID:                                                               .     
##     Driveway access                   3 (6.00%)    8 (6.67%)   134 (11.9%)            
##     Intersection                      5 (10.0%)    15 (12.5%)  150 (13.3%)            
##     Intersection related              1 (2.00%)    7 (5.83%)   104 (9.24%)            
##     Non intersection                  41 (82.0%)   90 (75.0%)  737 (65.5%)            
## FHE_Collsn_ID:                                                               <0.001   
##     Omv vehicle going straight        9 (18.0%)    18 (15.0%)  232 (20.6%)            
##     Other                             6 (12.0%)    24 (20.0%)  367 (32.6%)            
##     Sd both going straight-rear end   27 (54.0%)   60 (50.0%)  225 (20.0%)            
##     Sd both going straight-sideswipe  3 (6.00%)    9 (7.50%)   147 (13.1%)            
##     Sd one straight-one left turn     5 (10.0%)    9 (7.50%)   154 (13.7%)            
## Road_Part_Adj_ID:                                                             0.151   
##     Exit/off ramp                     1 (2.00%)    1 (0.83%)    2 (0.18%)             
##     Main/proper lane                  49 (98.0%)  110 (91.7%)  1066 (94.8%)           
##     Other                             0 (0.00%)    1 (0.83%)    3 (0.27%)             
##     Other (explain in narrative)      0 (0.00%)    4 (3.33%)    26 (2.31%)            
##     Unknown                           0 (0.00%)    4 (3.33%)    28 (2.49%)            
## Road_Cls_ID:                                                                    .     
##     City street                       4 (8.00%)    10 (8.33%)  207 (18.4%)            
##     County road                       4 (8.00%)    15 (12.5%)  172 (15.3%)            
##     Farm to market                    14 (28.0%)   43 (35.8%)  317 (28.2%)            
##     Other                             1 (2.00%)    13 (10.8%)   75 (6.67%)            
##     Us & state highways               27 (54.0%)   39 (32.5%)  354 (31.5%)            
## Pop_Group_ID:                                                                   .     
##     10,000 - 24,999 pop               3 (6.00%)    4 (3.33%)    56 (4.98%)            
##     250,000 pop. And over             2 (4.00%)    9 (7.50%)   106 (9.42%)            
##     Other                             4 (8.00%)    16 (13.3%)  173 (15.4%)            
##     Rural                             40 (80.0%)   87 (72.5%)  739 (65.7%)            
##     Town under 2,499 pop.             1 (2.00%)    4 (3.33%)    51 (4.53%)            
## Crash_Speed_LimitCat:                                                           .     
##     > 70 mph                          10 (20.0%)   19 (15.8%)  174 (15.5%)            
##     30-40 mph                         6 (12.0%)    20 (16.7%)  310 (27.6%)            
##     45-60 mph                         14 (28.0%)   45 (37.5%)  412 (36.6%)            
##     65-70 mph                         20 (40.0%)   28 (23.3%)  175 (15.6%)            
##     Other                             0 (0.00%)    8 (6.67%)    54 (4.80%)            
## Veh_Body_Styl_ID: Farm equipment      50 (100%)    120 (100%)  1125 (100%)      .     
## Prsn_Ethnicity_ID:                                                              .     
##     Black                             2 (4.00%)    9 (7.50%)    52 (4.62%)            
##     Hispanic                          12 (24.0%)   36 (30.0%)  374 (33.2%)            
##     Other                             0 (0.00%)    1 (0.83%)    38 (3.38%)            
##     Unknown                           0 (0.00%)    0 (0.00%)    83 (7.38%)            
##     White                             36 (72.0%)   74 (61.7%)  578 (51.4%)            
## GenMale                              0.98 (0.14)  0.94 (0.24)  0.87 (0.33)    0.008   
## TrafVol                              14259 (8714) 14122 (8472) 14210 (8338)   0.993   
## Prsn_Age:                                                                       .     
##     15-24 years                       5 (10.0%)    18 (15.0%)  165 (14.7%)            
##     25-54 years                       19 (38.0%)   56 (46.7%)  497 (44.2%)            
##     55-64 years                       8 (16.0%)    23 (19.2%)  201 (17.9%)            
##     65-74 years                       10 (20.0%)   14 (11.7%)   94 (8.36%)            
##     Other                             8 (16.0%)    9 (7.50%)   168 (14.9%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Correlation Analysis

Correlation Analysis of Categorical Variables

library(vcd)
library(dplyr)
library(ggcorrplot)

# Function to calculate Cramér's V for categorical variables
cramers_v_matrix <- function(df) {
  cat_vars <- df %>% select(where(is.character))
  cat_names <- colnames(cat_vars)
  
  #Creating empty matrix
  cramers_v <- matrix(NA, length(cat_names), length(cat_names), 
                      dimnames = list(cat_names, cat_names))
  
  # Filling matrix with Cramér's V values
  for (i in 1:length(cat_names)) {
    for (j in 1:length(cat_names)) {
      if (i == j) {
        cramers_v[i, j] <- 1 
      } else {
        # Contingency table for the two variables
        tbl <- table(cat_vars[[i]], cat_vars[[j]])
        # Compute Chi-Square test
        chisq_test <- chisq.test(tbl)
        # Compute Cramér's V
        cramers_v[i, j] <- sqrt(chisq_test$statistic / (sum(tbl) * (min(nrow(tbl), ncol(tbl)) - 1)))
      }
    }
  }
  
  return(cramers_v)
}

cramers_v <- cramers_v_matrix(df)

# Plotting the Cramér's V correlation matrix
ggcorrplot(cramers_v, 
           lab = TRUE,               
           lab_size = 8,             
           title = "Cramer's V Correlation Matrix", 
           colors = c("#6D9EC1", "white", "#E46726"),  
           outline.color = "white",  
           hc.order = FALSE) +      
  theme_minimal() +                   
  labs(x = "Categorical Variable 1",  
       y = "Categorical Variable 2") + 
  theme(
    plot.title = element_text(size = 40, face = "bold", hjust = 0.5), 
    axis.text.x = element_text(angle = 45, hjust = 1, size = 22),      
    axis.text.y = element_text(size = 22),                
    axis.title.x = element_text(size = 28),               
    axis.title.y = element_text(size = 28),               
    legend.position = "right",                            
    legend.text = element_text(size = 20),                
    legend.title = element_text(size = 20, face = "bold") 
  )

Interpretation of Results: The correlation analysis of categorical variables is performed using Cramér’s V, a measure of association for nominal (categorical) data. It is computed based on the Chi-Square statistic, testing whether the variables are independent of each other. A matrix of Cramér’s V values is generated, with values closer to 1 indicating a stronger association between the variables.The resulting heatmap visualizes the strength of the relationships between categorical variables, allowing easy identification of the strongest correlations. Since all variables are categorical, this method is appropriate for detecting non-parametric associations without relying on distribution assumptions. The Veh_Body_Styl_ID variable has only one category for all observations and this is the reason of getting “Inf” value in the Cramér’s V matrix. This indicates that the results are in a perfect association (or undefined association) because the Chi-square statistic would be either undefined or extremely large relative to the sample size, leading to an infinite Cramér’s V value. This is because there is no variability in this variable, and it does not provide any meaningful information to measure an association with other variables.

Correlation Analysis of Other Variables

Correlation Analysis of Traffic Volume with Categorical Variables

library(ggplot2)
library(dplyr)

# List of categorical variables to analyze
categorical_vars <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
                      "SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
                      "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", 
                      "Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", 
                      "GenMale", "Prsn_Age", "Prsn_Injry_Sev_ID")

correlation_results <- data.frame(Variable = character(), PValue = numeric(), stringsAsFactors = FALSE)

# Performing ANOVA or Kruskal-Wallis for each categorical variable
for (var in categorical_vars) {
  if (length(unique(df[[var]])) > 1) {
    # Performing Kruskal-Wallis test (non-parametric)
    test_result <- kruskal.test(TrafVol ~ get(var), data = df)
    correlation_results <- rbind(correlation_results, 
                                 data.frame(Variable = var, PValue = round(test_result$p.value, 3)))
  }
}

print(correlation_results)
##                Variable PValue
## 1          Wthr_Cond_ID  0.353
## 2         Light_Cond_ID  0.277
## 3          Road_Type_ID  0.303
## 4          Road_Algn_ID  0.681
## 5               SurfDry  0.813
## 6       Traffic_Cntl_ID  0.950
## 7          Harm_Evnt_ID  0.208
## 8      Intrsct_Relat_ID  0.550
## 9         FHE_Collsn_ID  0.710
## 10     Road_Part_Adj_ID  0.498
## 11          Road_Cls_ID  0.236
## 12         Pop_Group_ID  0.069
## 13 Crash_Speed_LimitCat  0.796
## 14    Prsn_Ethnicity_ID  0.272
## 15              GenMale  0.057
## 16             Prsn_Age  0.592
## 17    Prsn_Injry_Sev_ID  0.992

Interpretation of Results: For each categorical variable (e.g., Wthr_Cond_ID, Light_Cond_ID, Road_Type_ID, etc.), the Kruskal-Wallis test was performed to determine whether the medians of traffic volume differ significantly across the categories. The p-values represent the likelihood that the differences in medians could have occurred by chance. All p-values are greater than 0.05, indicating no statistically significant associations between traffic volume and the analyzed categorical variables.

Single Variable Analysis

plot_colors <- c("skyblue", "lightcoral", "#084081", "#a8ddb5", "#7bccc4", "lightgray", "lightblue", "#8c6bb1", "#cc4c02", "#810f7c", "#993404")

custom_labels <- c(
  "Wthr_Cond_ID" = "Weather Condition",
  "Light_Cond_ID" = "Light Condition",
  "Road_Type_ID" = "Road Type",
  "Road_Algn_ID" = "Road Alignment",
  "SurfDry" = "Surface Condition: Dry",
  "Traffic_Cntl_ID" = "Traffic Control",
  "Harm_Evnt_ID" = "Harm Event",
  "Intrsct_Relat_ID" = "Intersection Relation",
  "FHE_Collsn_ID" = "First Harmful Event",
  "Road_Part_Adj_ID" = "Road Part Adjacent",
  "Road_Cls_ID" = "Road Classification",
  "Pop_Group_ID" = "Population Group",
  "Crash_Speed_LimitCat" = "Speed Limit",
  "Prsn_Ethnicity_ID" = "Person Ethnicity",
  "GenMale" = "Gender: Male",
  "Prsn_Age" = "Person Age",
  "Prsn_Injry_Sev_ID" = "Injury Severity"
)

df$Prsn_Injry_Sev_ID <- factor(df$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))

# List of all the variables to plot
variables_to_plot <- c(
  "Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", "SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
  "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID", 
  "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "GenMale", "Prsn_Age", "Prsn_Injry_Sev_ID"
)

# Loop through each variable and create a bar plot
for (i in seq_along(variables_to_plot)) {
  var <- variables_to_plot[i]  # Get the variable name
  
  label <- custom_labels[var]
  p <- ggplot(df, aes_string(x = var)) + 
    geom_bar(fill = plot_colors[i %% length(plot_colors) + 1]) +  # Assign color for the plot
    theme_minimal() +
    labs(
      title = paste("Distribution of", label), 
      x = label, 
      y = "Count"
    ) +
    theme(
      plot.background = element_rect(fill = "#f0f0f0", color = NA), 
      panel.background = element_rect(fill = "#f0f0f0", color = NA),
      plot.title = element_text(family = "Roboto", size = 16, face = "bold", hjust = 0.5),  
      axis.title = element_text(size = 12),
      axis.text.x = element_text(angle = 45, hjust = 1, size = 9),  
      plot.margin = margin(10, 10, 10, 10)  
    )
  
  # Print each plot
  print(p)
}

Distribution of Traffic Volume

library(hrbrthemes)

p2 <- ggplot(data = df, aes(x = TrafVol)) + 
  geom_density(adjust = 1.5, alpha = 0.4, fill = "lightblue") + 
  theme_ipsum() +  # Use the hrbrthemes for a clean theme
  labs(
    title = "Density Plot of Traffic Volume",
    x = "Traffic Volume",
    y = "Density"
  )
p2

Bivariate Exploratory Data Analysis

custom_labels <- c(
  "Wthr_Cond_ID" = "Weather Condition",
  "Light_Cond_ID" = "Light Condition",
  "Road_Type_ID" = "Road Type",
  "Road_Algn_ID" = "Road Alignment",
  "SurfDry" = "Surface Condition",
  "Traffic_Cntl_ID" = "Traffic Control",
  "Harm_Evnt_ID" = "Harm Event",
  "Intrsct_Relat_ID" = "Intersection Relation",
  "FHE_Collsn_ID" = "First Harmful Event",
  "Road_Part_Adj_ID" = "Road Part Adjacent",
  "Road_Cls_ID" = "Road Classification",
  "Pop_Group_ID" = "Population Group",
  "Crash_Speed_LimitCat" = "Speed Limit",
  "Prsn_Ethnicity_ID" = "Person Ethnicity",
  "GenMale" = "Gender",
  "Prsn_Age" = "Person Age",
  "Prsn_Injry_Sev_ID" = "Injury Severity"
)

variables_to_plot <- c(
  "Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
  "SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
  "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID", 
  "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "GenMale", "Prsn_Age"
)

# Loop through each variable and create a stacked bar plot
for (var in variables_to_plot) {
  # Plotting for each variable with Injury Severity
  df %>%
    count(.data[[var]], Prsn_Injry_Sev_ID) %>%  
    group_by(.data[[var]]) %>%                   
    mutate(pct = prop.table(n) * 100) %>%       
    ggplot(aes_string(x = var, y = "pct", fill = "Prsn_Injry_Sev_ID")) + 
    geom_bar(stat = "identity", color = "black") +   
    ylab("Percentage") +            
    geom_text(aes(label = paste0(sprintf("%1.1f", pct), "%")),  
              position = position_stack(vjust = 0.5)) +  
    ggtitle(paste("Injury Severity by", custom_labels[var])) +   
    xlab(custom_labels[var]) +                     
    theme(plot.title = element_text(face = "bold", hjust = 0.5)) +                                    
    scale_fill_manual(
      values = c("#DADAEB", "#9E9AC8", "#6A51A3"),
      name = "Person Injury Severity"   
    ) +  
    theme(axis.text.x = element_text(angle = 45, hjust = 1))  
  print(last_plot())
}

library(ggstatsplot)

df$Prsn_Injry_Sev_ID <- factor(df$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))

plt <- ggbetweenstats(
  data = df,
  x = Prsn_Injry_Sev_ID,  
  y = TrafVol,            
  type = "nonparametric",
  pairwise.comparisons = FALSE, 
  pairwise.display = "none",    
  plot.type = "boxviolin",     
  ggtheme = theme_minimal(),   
  messages = FALSE              
)

plt <- plt + 
  labs(
    x = "Injury Severity",
    y = "Traffic Volume",
    title = "Distribution of Traffic Volume Across Different Injury Severities",
  ) + 
  theme(
    text = element_text(family = "Roboto", size = 11, color = "black"),
    
    plot.title = element_text(
      family = "Lobster Two", 
      size = 18,
      face = "bold",
      color = "#2a475e",
      hjust = 0.5 
    ),

    plot.subtitle = element_text(
      family = "Roboto", 
      size = 12, 
      face = "bold",
      color = "#1b2838",
      hjust = 0.5 
    ),
    
    plot.title.position = "plot",
    
    # Customize the axis text
    axis.text = element_text(size = 10, color = "black"),
    axis.title = element_text(size = 12)
  )

plt <- plt  +
  theme(
    axis.ticks = element_blank(),  # Remove axis ticks
    axis.line = element_line(colour = "grey50"),  
    panel.grid = element_line(color = "#b4aea9"), 
    panel.grid.minor = element_blank(), 
    panel.grid.major.x = element_blank(),  
    panel.grid.major.y = element_line(linetype = "dashed"),  
    panel.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4"), 
    plot.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4")   
  )

ggsave(
  filename = "traffic_volume_violin_boxplot.png",
  plot = plt,
  width = 8,
  height = 8,
  device = "png"
)

print(plt)

## Heat Maps

custom_labels <- c(
  "Wthr_Cond_ID" = "Weather Condition",
  "Light_Cond_ID" = "Light Condition",
  "Crash_Speed_LimitCat" = "Speed Limit",
  "Prsn_Age" = "Person Age",
  "Prsn_Injry_Sev_ID" = "Injury Severity"
)

# List of variables for which we are creating heatmaps
variables_to_plot <- c("Wthr_Cond_ID", "Light_Cond_ID", "Crash_Speed_LimitCat", "Prsn_Age")

for (var in variables_to_plot) {
  
  table_data <- table(df$Prsn_Injry_Sev_ID, df[[var]])
  
  df_table <- as.data.frame(table_data)

  p <- ggplot(df_table, aes(Var1, Var2, fill = Freq)) +
    geom_tile() +
    labs(
      title = paste(custom_labels[var], "by Injury Severity"),
      x = custom_labels["Prsn_Injry_Sev_ID"], 
      y = custom_labels[var], 
      fill = "Count"
    ) +
    theme_minimal() +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5, size = 14)
    )
  print(p)
}