Exploratory Data Analysis (HW1)

Introduction

This report analyzes the correlations between variables in the dataset and also goes through the exploratory data analysis.

Name: Md Monzurul Islam

netID: ene44

Load Libraries

knitr::opts_chunk$set(
    echo = TRUE,
    message = FALSE,
    warning = FALSE
)
# Install DT package if not already installed
if (!require(DT)) install.packages("DT", dependencies = TRUE)
if (!require(htmltools)) install.packages("htmltools", dependencies = TRUE)
# Load necessary libraries
library(readxl)
library(ggplot2)
library(corrplot)
library(dplyr)
library(DT)
library(htmltools)
library(GGally)

Load Dataset

# set working directory
setwd("D:\\TXST\\OneDrive - Texas State University\\Fall 24\\CE 5373 AI in CE\\HW 1")
# load the data
data <- read_excel("./HW1_Data.xlsx", sheet = "UnitL4")

Basic Info

head(data)

## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>

Get Total Rows and Columns

# Get the number of rows and columns
total_rows <- nrow(data)
total_columns <- ncol(data)

# Display the results
cat("Total Rows: ", total_rows, "\n")

## Total Rows:  1295

cat("Total Columns: ", total_columns, "\n")

## Total Columns:  19

Show the structure of the dataset

str(data)

## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 7654 13770 11470 16972 413 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...

Get the summary of the dataset

summary(data)

##  Wthr_Cond_ID       Light_Cond_ID      Road_Type_ID       Road_Algn_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     SurfDry       Traffic_Cntl_ID    Harm_Evnt_ID       Intrsct_Relat_ID  
##  Min.   :0.0000   Length:1295        Length:1295        Length:1295       
##  1st Qu.:1.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.9143                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##  FHE_Collsn_ID      Road_Part_Adj_ID   Road_Cls_ID        Pop_Group_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Crash_Speed_LimitCat Veh_Body_Styl_ID   Prsn_Ethnicity_ID     GenMale      
##  Length:1295          Length:1295        Length:1295        Min.   :0.0000  
##  Class :character     Class :character   Class :character   1st Qu.:1.0000  
##  Mode  :character     Mode  :character   Mode  :character   Median :1.0000  
##                                                             Mean   :0.8842  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000  
##     TrafVol        Prsn_Age         Prsn_Injry_Sev_ID 
##  Min.   :  215   Length:1295        Length:1295       
##  1st Qu.: 6765   Class :character   Class :character  
##  Median :14170   Mode  :character   Mode  :character  
##  Mean   :14417                                        
##  3rd Qu.:22010                                        
##  Max.   :28970

Check for missing values

sum(is.na(data))

## [1] 0

This data set has no missing values.

Correlation matrix:

Calculate correlations between numeric variables.

numeric_cols <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)

##             SurfDry     GenMale      TrafVol
## SurfDry 1.000000000  0.04432906  0.009977795
## GenMale 0.044329059  1.00000000 -0.041513425
## TrafVol 0.009977795 -0.04151342  1.000000000

Plot the correlation matrix

library(corrplot)
corrplot(cor_matrix, method = "circle", type = "lower")

Description on Prsn_Injry_Sev_ID

The Prsn_Injry_Sev_ID variable represents the injury severity of individuals involved in traffic accidents, classified according to the KABCO scale. This scale categorizes injury severity from fatal outcomes to cases where no injury occurs. The following classifications are used within this dataset:

K (Killed): This category represents fatal injuries resulting from the crash. Individuals in this category are those who succumbed to injuries at the scene or shortly after the crash.
A (Incapacitating Injury): These injuries are serious and prevent the injured person from continuing their normal activities without medical assistance. Examples include fractures, severe lacerations, or injuries that require hospitalization.
B (Non-incapacitating Injury): This category includes visible injuries such as bruises or minor lacerations that, while painful, do not prevent the individual from continuing daily activities.
C (Possible Injury): Injuries in this category are those that are not visible but are claimed by the person involved in the crash, such as complaints of pain or discomfort. These injuries may or may not require medical attention.
O (No Injury/Property Damage Only): This category is used when no injuries are sustained in the crash, and the only result is damage to the property, such as vehicles.

The Prsn_Injry_Sev_ID variable provides critical insight into the severity of injuries in traffic crashes. By analyzing the distribution of these categories, researchers can identify potential correlations between injury severity and factors such as road conditions, vehicle types, and demographic attributes. This understanding can aid in identifying risk factors associated with more severe injuries and guide interventions to enhance road safety.

Univariate Analysis

Number of Accidents by Weather Condition

# Create a bar plot showing the number of accidents by weather condition
ggplot(data, aes(x = as.factor(Wthr_Cond_ID))) + 
  geom_bar(fill = "skyblue") +
  labs(title = "Number of Accidents by Weather Condition", 
       x = "Weather Condition", 
       y = "Number of Accidents") +
  theme_minimal()

The majority of crashes occurred in clear weather.

Number of Accidents by Light Condition

# Create a bar plot showing the number of accidents by light condition
ggplot(data, aes(x = as.factor(Light_Cond_ID))) + 
  geom_bar(fill = "orange") +
  labs(title = "Number of Accidents by Light Condition", 
       x = "Light Condition", 
       y = "Number of Accidents") +
  theme_minimal()

The majority of crashes occurred in daylight condition.

Number of Accidents by Speed Limit Category

# Create a bar plot showing the number of accidents by speed limit category
ggplot(data, aes(x = as.factor(Crash_Speed_LimitCat))) + 
  geom_bar(fill = "steelblue") +
  labs(title = "Number of Accidents by Speed Limit Category", 
       x = "Speed Limit Category", 
       y = "Number of Accidents") +
  theme_minimal()

Crash distribution follows normal distribution

Number of Accidents by Age Group (Existing Categories)

# Create a bar plot using the existing age group values
ggplot(data, aes(x = as.factor(Prsn_Age))) + 
  geom_bar(fill = "skyblue") +
  labs(title = "Number of Accidents by Age Group", 
       x = "Age Group", 
       y = "Number of Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Middle age group (22-54 years) is more prone to crashes

Number of Accidents by Injury Severity

# Create a bar plot for the number of accidents by injury severity (Prsn_Injry_Sev_ID)
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID))) + 
  geom_bar(fill = "coral") +
  labs(title = "Number of Accidents by Injury Severity", 
       x = "Injury Severity", 
       y = "Number of Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Most of the crashes resulted in no injuries, only property damage

Number of Accidents by Gender (GenMale)

# Bar plot for number of accidents by gender (GenMale)
ggplot(data, aes(x = as.factor(GenMale))) + 
  geom_bar(fill = "lightblue") +
  labs(title = "Number of Accidents by Gender", 
       x = "Gender (1 = Male, 0 = Female)", 
       y = "Number of Accidents") +
  theme_minimal()

The male population is more prone to crashes

Number of Accidents by Ethnicity (Prsn_Ethnicity_ID)

# Bar plot for number of accidents by ethnicity (Prsn_Ethnicity_ID)
ggplot(data, aes(x = as.factor(Prsn_Ethnicity_ID))) + 
  geom_bar(fill = "lightgreen") +
  labs(title = "Number of Accidents by Ethnicity", 
       x = "Ethnicity", 
       y = "Number of Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

White and Hispanic people are more vulnerable to crashes.

Number of Accidents by Intersection Relation (Intrsct_Relat_ID)

# Bar plot for number of accidents by intersection relation (Intrsct_Relat_ID)
ggplot(data, aes(x = as.factor(Intrsct_Relat_ID))) + 
  geom_bar(fill = "orange") +
  labs(title = "Number of Accidents by Intersection Relation", 
       x = "Intersection Relation", 
       y = "Number of Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Most of the crashes happen in non-intersection

Number of Accidents by Surface Dryness (SurfDry)

# Bar plot for number of accidents by surface dryness (SurfDry)
ggplot(data, aes(x = as.factor(SurfDry))) + 
  geom_bar(fill = "purple") +
  labs(title = "Number of Accidents by Surface Dryness", 
       x = "Surface Dryness (1 = Dry, 0 = Not Dry)", 
       y = "Number of Accidents") +
  theme_minimal()

Most of the crashes happen on dry sufrface

Number of Accidents by Traffic Control (Traffic_Cntl_ID)

# Bar plot for number of accidents by traffic control (Traffic_Cntl_ID)
ggplot(data, aes(x = as.factor(Traffic_Cntl_ID))) + 
  geom_bar(fill = "tomato") +
  labs(title = "Number of Accidents by Traffic Control", 
       x = "Traffic Control", 
       y = "Number of Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Center stripe/divider and marked lanes are the hot-spots for crashes to occur

Univariate Analysis Summary

The table below summarizes key factors from the univariate analysis of the dataset.

# Create a summary table with factors and observations
univariate_summary <- data.frame(
  Factor = c("Weather Condition (Wthr_Cond_ID)", 
             "Light Condition (Light_Cond_ID)", 
             "Speed Limit Category (Crash_Speed_LimitCat)", 
             "Age Group (Prsn_Age)", 
             "Injury Severity (Prsn_Injry_Sev_ID)", 
             "Gender (GenMale)", 
             "Ethnicity (Prsn_Ethnicity_ID)", 
             "Intersection Relation (Intrsct_Relat_ID)", 
             "Surface Dryness (SurfDry)", 
             "Traffic Control (Traffic_Cntl_ID)"),
  Observation = c("Majority of crashes occurred in clear weather",
                  "Most crashes occurred in daylight condition",
                  "Crashes follow a normal distribution by speed limit category",
                  "The middle-age group (22-54 years) is more prone to crashes",
                  "Most crashes resulted in no injuries, only property damage",
                  "Males are more prone to crashes",
                  "White and Hispanic individuals are more represented in crashes",
                  "Majority of crashes occurred at non-intersections",
                  "Most crashes happened on dry surfaces",
                  "Crashes are more frequent where traffic control is present")
)

# Display the summary table
knitr::kable(univariate_summary, col.names = c("Factor", "Observation"), caption = "Summary of Univariate Analysis")

Summary of Univariate Analysis
Factor	Observation
Weather Condition (Wthr_Cond_ID)	Majority of crashes occurred in clear weather
Light Condition (Light_Cond_ID)	Most crashes occurred in daylight condition
Speed Limit Category (Crash_Speed_LimitCat)	Crashes follow a normal distribution by speed limit category
Age Group (Prsn_Age)	The middle-age group (22-54 years) is more prone to crashes
Injury Severity (Prsn_Injry_Sev_ID)	Most crashes resulted in no injuries, only property damage
Gender (GenMale)	Males are more prone to crashes
Ethnicity (Prsn_Ethnicity_ID)	White and Hispanic individuals are more represented in crashes
Intersection Relation (Intrsct_Relat_ID)	Majority of crashes occurred at non-intersections
Surface Dryness (SurfDry)	Most crashes happened on dry surfaces
Traffic Control (Traffic_Cntl_ID)	Crashes are more frequent where traffic control is present

Bivariate Analysis

Bar plots for categorical columns by injury severity

# Load necessary libraries
library(ggplot2)
library(dplyr)

# List of categorical columns to be plotted against Prsn_Injry_Sev_ID
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
                         "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
                         "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", 
                         "Pop_Group_ID", "Crash_Speed_LimitCat", "Veh_Body_Styl_ID", 
                         "Prsn_Ethnicity_ID", "Prsn_Age")

# Loop through each categorical column and create a bar plot with respect to Prsn_Injry_Sev_ID
for (col in categorical_columns) {
  plot <- ggplot(data, aes_string(x = col, fill = "Prsn_Injry_Sev_ID")) +
    geom_bar(position = "dodge") +
    labs(title = paste("Distribution of", col, "by Injury Severity"), 
         x = col, 
         y = "Count", 
         fill = "Injury Severity (KABCO)") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(plot)  # Use print() to display the plot inside the loop
}

Injury Severity by Gender

ggplot(data, aes(x = as.factor(GenMale), fill = as.factor(Prsn_Injry_Sev_ID))) + 
  geom_bar(position = "dodge") +
  labs(title = "Injury Severity by Gender", x = "Gender (1 = Male, 0 = Female)", y = "Count", fill = "Injury Severity") +
  theme_minimal()

Bar plot summary

The table below summarizes key factors from the bar plots w.r.t Prsn_Injry_Sev_ID of the dataset.

# Load necessary libraries
library(DT)
library(htmltools)

# Create a summary table with factors and observations
barplot_summary <- data.frame(
  Factor = c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
             "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
             "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", 
             "Pop_Group_ID", "Crash_Speed_LimitCat",
             "Prsn_Ethnicity_ID", "Prsn_Age"),
  Observation = c("Most severe (KA) injuries occur in clear weather", 
                  "Most severe (KA) injuries happen during daylight", 
                  "Most severe (KA) injuries take place on 2-lane, 2-way roads", 
                  "Most severe (KA) injuries occur on straight roads", 
                  "Most severe (KA) injuries occur around dividers or marked lanes", 
                  "Motor vehicles are most prone to severe (KA) injuries", 
                  "Most severe (KA) injuries happen on non-intersection roads", 
                  "The major cause of KA injuries is rear-end collisions", 
                  "Most severe (KA) injuries occur on proper roads", 
                  "Most severe (KA) injuries happen on state highways", 
                  "Rural populations are the main victims of KA injuries", 
                  "Most severe (KA) injuries occur due to high speeds (65-70 mph)", 
                  "White people are the major victims of crashes", 
                  "Middle-aged individuals are the main victims of KA crashes")
)

# Title for the interactive table
htmltools::tagList(
  tags$h4("Summary Table: Injury Severity by categories"),
  
  # Create an interactive DataTable
  datatable(barplot_summary, 
            options = list(pageLength = 10, autoWidth = TRUE),
            colnames = c("Factor", "Observation"))
)

Summary Table: Injury Severity by categories

Density plot of numeric variables

# Load necessary library
library(ggplot2)

# List of numeric columns
numeric_columns <- c("TrafVol", "Prsn_Age")  # Add all numeric columns you want to plot

# Loop through each numeric column and create density plots
for (col in numeric_columns) {
  plot <- ggplot(data, aes_string(x = col)) + 
    geom_density(fill = "blue", alpha = 0.5) +
    labs(title = paste("Density Plot of", col), x = col, y = "Density") +
    theme_minimal()
  
  # Print each plot
  print(plot)
}

Contingency Coefficient based on Prsn_Injry_Sev_ID

Contingency Coefficient bar plot

# Load necessary libraries
library(ggplot2)
library(MASS)  # For calculating contingency tables

# Function to calculate contingency coefficient
contingency_coefficient <- function(table) {
  chi2 <- chisq.test(table)$statistic
  n <- sum(table)  # Total observations
  return(sqrt(chi2 / (chi2 + n)))
}

# List of categorical columns
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
                         "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
                         "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", 
                         "Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "Prsn_Age")

# Calculate the contingency coefficient for each categorical column
contingency_results <- data.frame(Categorical_Variable = character(), Contingency_Coefficient = numeric())

for (col in categorical_columns) {
  # Create a contingency table for each variable with respect to Prsn_Injry_Sev_ID
  contingency_table <- table(data$Prsn_Injry_Sev_ID, data[[col]])
  
  # Calculate the contingency coefficient
  coeff <- contingency_coefficient(contingency_table)
  
  # Append the result to the results dataframe
  contingency_results <- rbind(contingency_results, 
                               data.frame(Categorical_Variable = col, Contingency_Coefficient = coeff))
}

# Plot the results using ggplot2
ggplot(contingency_results, aes(x = reorder(Categorical_Variable, Contingency_Coefficient), y = Contingency_Coefficient)) + 
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Flip coordinates for better readability
  labs(title = "Contingency Coefficients for Categorical Variables", x = "Categorical Variable", y = "Contingency Coefficient") +
  theme_minimal()

Explanation on Contingency Coefficient

The contingency coefficients provide a clearer picture of which factors have a more significant relationship with injury severity and which do not.

Wthr_Cond_ID (Weather Condition) = 0.07: This low value suggests that weather conditions (clear, rainy, foggy, etc.) have little influence on the severity of injuries in the dataset. It indicates that the weather at the time of the crash doesn’t strongly affect how severe the injuries are.
Light_Cond_ID (Light Condition) = 0.10: This indicates a low association between light conditions (daylight, dark, twilight) and the severity of injuries. In other words, the lighting conditions at the time of the crash don’t have a significant impact on the severity of the injuries.
Road_Type_ID (Road Type) = 0.12: This value suggests a weak association between the type of road (e.g., highway, local road, farm road) and injury severity. While there is some influence, it’s not a strong determining factor for the severity of injuries.
Road_Algn_ID (Road Alignment) = 0.11: This suggests a weak association between road alignment (straight, curved, uphill, etc.) and injury severity. The road alignment has a minor influence on how severe the injuries might be in a crash.
Traffic_Cntl_ID (Traffic Control) = 0.10: This value indicates a low association between the presence or type of traffic control (e.g., stop signs, traffic lights, none) and injury severity. Traffic control measures don’t appear to have a strong influence on the outcome in terms of injury severity.
Harm_Evnt_ID (Harmful Event) = 0.09: This value shows a weak association between the type of harmful event (e.g., collisions, fixed objects, overturns) and injury severity. It suggests that the specific event causing the crash has a minor impact on the severity of injuries.
Intrsct_Relat_ID (Intersection Relation) = 0.08: This indicates a low association between whether the crash occurred at or near an intersection and the severity of injuries. Intersection-related crashes don’t seem to significantly affect injury outcomes.
FHE_Collsn_ID (First Harmful Event Collision) = 0.11: This suggests a weak association between the type of first harmful event (e.g., rear-end collisions, side impacts) and injury severity. The first harmful event in the crash doesn’t strongly determine how severe the injuries are.
Road_Part_Adj_ID (Road Part Adjacent) = 0.09: This low value indicates a weak association between the part of the road adjacent to the crash (e.g., shoulder, main lane) and injury severity. The location on the road does not appear to heavily influence injury outcomes.
Road_Cls_ID (Road Classification) = 0.10: This indicates a weak association between the classification of the road (e.g., highway, local, farm road) and injury severity. The type of road on which the crash occurs has little impact on the injury severity.
Pop_Group_ID (Population Group) = 0.08: This value shows a low association between the population group (e.g., rural, urban) where the crash occurred and the severity of injuries. Whether the crash happened in a rural or urban area doesn’t seem to strongly affect injury outcomes.
Crash_Speed_LimitCat (Speed Limit Category) = 0.50: This higher value suggests a moderate to strong association between the speed limit where the crash occurred and injury severity. Higher speed limits are likely related to more severe injuries, which makes sense as crashes at higher speeds generally result in more severe outcomes.
Prsn_Ethnicity_ID (Person Ethnicity) = 0.07: This low value indicates that there is little association between the ethnicity of individuals involved in the crash and injury severity. Ethnicity does not seem to influence the severity of injuries.
Prsn_Age (Person Age) = 0.09: This weak association suggests that the age of individuals involved in the crash has little influence on the severity of injuries. The age group of those involved does not significantly affect how severe the injuries are.
Prsn_Injry_Sev_ID (Person Injury Severity ID): As this is the reference variable used to measure injury severity, it’s not applicable for calculating the contingency coefficient with itself.

Summary:

Low values (closer to 0), such as for weather conditions, light conditions, and road alignment, indicate that these factors have little to no influence on the severity of injuries.
Moderate values, such as for the speed limit category, suggest that certain factors, like speed limits, have a stronger association with injury severity, meaning crashes in high-speed zones are more likely to result in severe injuries.

p-value analysis

Plot p-value analysis

# Function to calculate p-value using Chi-square test
calculate_p_value <- function(x, y) {
  # Create a contingency table
  tbl <- table(x, y)
  
  # Perform chi-square test
  chi2_test <- chisq.test(tbl)
  
  # Return the p-value
  return(chi2_test$p.value)
}

# List of categorical columns
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", 
                         "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID", 
                         "FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", 
                         "Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "Prsn_Age")

# Data frame to store p-values
p_values_results <- data.frame(Categorical_Variable = character(), P_Value = numeric())

# Loop through each categorical variable and calculate the p-value
for (col in categorical_columns) {
  # Calculate the p-value for each variable with respect to Prsn_Injry_Sev_ID
  p_value <- calculate_p_value(data$Prsn_Injry_Sev_ID, data[[col]])
  
  # Append the result to the results dataframe
  p_values_results <- rbind(p_values_results, 
                            data.frame(Categorical_Variable = col, P_Value = p_value))
}

# Show the results
print(p_values_results)

##    Categorical_Variable      P_Value
## 1          Wthr_Cond_ID 5.610124e-01
## 2         Light_Cond_ID 1.046013e-01
## 3          Road_Type_ID 1.748738e-02
## 4          Road_Algn_ID 3.199259e-02
## 5       Traffic_Cntl_ID 8.350679e-02
## 6          Harm_Evnt_ID 2.364021e-03
## 7      Intrsct_Relat_ID 6.959669e-02
## 8         FHE_Collsn_ID 2.889230e-14
## 9      Road_Part_Adj_ID 1.973712e-01
## 10          Road_Cls_ID 1.346929e-03
## 11         Pop_Group_ID 4.619300e-01
## 12 Crash_Speed_LimitCat 5.507364e-05
## 13    Prsn_Ethnicity_ID 1.258593e-03
## 14             Prsn_Age 8.386196e-02

# Optionally, filter the results for variables with p-values less than 0.05 (significant associations)
significant_results <- p_values_results %>% filter(P_Value < 0.05)
print("Significant Results (p < 0.05):")

## [1] "Significant Results (p < 0.05):"

print(significant_results)

##   Categorical_Variable      P_Value
## 1         Road_Type_ID 1.748738e-02
## 2         Road_Algn_ID 3.199259e-02
## 3         Harm_Evnt_ID 2.364021e-03
## 4        FHE_Collsn_ID 2.889230e-14
## 5          Road_Cls_ID 1.346929e-03
## 6 Crash_Speed_LimitCat 5.507364e-05
## 7    Prsn_Ethnicity_ID 1.258593e-03

# Visualize the p-values using ggplot2
ggplot(p_values_results, aes(x = reorder(Categorical_Variable, P_Value), y = P_Value)) + 
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Flip coordinates for better readability
  labs(title = "P-Value Analysis for Categorical Variables", x = "Categorical Variable", y = "P-Value") +
  theme_minimal() +
  geom_hline(yintercept = 0.05, color = "red", linetype = "dashed")  # Add a threshold line at p = 0.05

Identify Significant Variables

significant_results <- p_values_results %>% filter(P_Value < 0.05)
print("Significant Results (p < 0.05):")

## [1] "Significant Results (p < 0.05):"

print(significant_results)

##   Categorical_Variable      P_Value
## 1         Road_Type_ID 1.748738e-02
## 2         Road_Algn_ID 3.199259e-02
## 3         Harm_Evnt_ID 2.364021e-03
## 4        FHE_Collsn_ID 2.889230e-14
## 5          Road_Cls_ID 1.346929e-03
## 6 Crash_Speed_LimitCat 5.507364e-05
## 7    Prsn_Ethnicity_ID 1.258593e-03

Other plots

Traffic Volume Distribution by Injury Severity

# Enhanced Violin Plot
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID), y = TrafVol)) + 
  geom_violin(fill = "lightblue", color = "black", alpha = 0.7, trim = FALSE) +  # Violin plot with some transparency and no trimming
  geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.5) +  # Boxplot inside the violin for better clarity
  geom_jitter(width = 0.1, size = 1, alpha = 0.3, color = "#34495e") +  # Add jittered points to show individual data points
  labs(title = "Traffic Volume Distribution by Injury Severity", 
       x = "Injury Severity (KABCO scale)", 
       y = "Traffic Volume") +
  theme_minimal() +  # Use minimal theme for cleaner look
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for better readability
    plot.title = element_text(hjust = 0.5)  # Center the plot title
  ) +
  scale_y_continuous(labels = scales::comma)  # Use comma format for y-axis to make large numbers easier to read

Boxplot for Traffic Volume (TrafVol) by Injury Severity (Prsn_Injry_Sev_ID)

# Boxplot for Traffic Volume (TrafVol) by Injury Severity (Prsn_Injry_Sev_ID)
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID), y = TrafVol)) + 
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Traffic Volume by Injury Severity", 
       x = "Injury Severity (KABCO scale)", 
       y = "Traffic Volume") +
  theme_minimal()

Correlation plot of numerical values

numeric_cols <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)

##             SurfDry     GenMale      TrafVol
## SurfDry 1.000000000  0.04432906  0.009977795
## GenMale 0.044329059  1.00000000 -0.041513425
## TrafVol 0.009977795 -0.04151342  1.000000000

library(corrplot)
corrplot(cor_matrix, method = "circle", type = "lower")

Heatmap for Injury Severity vs Speed Limit Category

# Create a contingency table (Injury Severity Vs Speed Limit)
contingency_table_speed <- table(data$Crash_Speed_LimitCat, data$Prsn_Injry_Sev_ID)

# Convert to dataframe for ggplot
contingency_df_speed <- as.data.frame(contingency_table_speed)

# Heatmap for Injury Severity vs Speed Limit Category
ggplot(contingency_df_speed, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(title = "Heatmap of Injury Severity Vs Speed Limit Category", 
       x = "Speed Limit Category", 
       y = "Injury Severity (KABCO scale)", 
       fill = "Frequency") +
  theme_minimal(base_size = 15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Outlier detection using boxplot

# Function to detect outliers using IQR
detect_outliers_iqr <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- which(x < lower_bound | x > upper_bound)
  return(outliers)
}

# Apply the function to TrafVol (Traffic Volume)
outliers_trafvol <- detect_outliers_iqr(data$TrafVol)

# Create a new column to identify outliers
data$Outlier <- "No"  # Default value
data$Outlier[outliers_trafvol] <- "Yes"  # Mark outliers as "Yes"

# Visualize Outliers with Boxplot and highlight the outliers
ggplot(data, aes(x = "", y = TrafVol)) +
  geom_boxplot(fill = "lightblue") +
  geom_jitter(aes(color = Outlier), width = 0.1, size = 2) +  # Add jittered points and color by outlier status
  scale_color_manual(values = c("No" = "black", "Yes" = "red")) +  # Red for outliers, black for normal points
  labs(title = "Outlier Detection for Traffic Volume", 
       x = "", 
       y = "Traffic Volume") +
  theme_minimal()