Exploratory Data Analysis (HW1)
Introduction
This report analyzes the correlations between variables in the dataset and also goes through the exploratory data analysis.
Name: Md Monzurul Islam
netID: ene44
Load Libraries
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
# Install DT package if not already installed
if (!require(DT)) install.packages("DT", dependencies = TRUE)
if (!require(htmltools)) install.packages("htmltools", dependencies = TRUE)
# Load necessary libraries
library(readxl)
library(ggplot2)
library(corrplot)
library(dplyr)
library(DT)
library(htmltools)
library(GGally)Load Dataset
Basic Info
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
Get Total Rows and Columns
# Get the number of rows and columns
total_rows <- nrow(data)
total_columns <- ncol(data)
# Display the results
cat("Total Rows: ", total_rows, "\n")## Total Rows: 1295
## Total Columns: 19
Show the structure of the dataset
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 7654 13770 11470 16972 413 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
Get the summary of the dataset
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## SurfDry Traffic_Cntl_ID Harm_Evnt_ID Intrsct_Relat_ID
## Min. :0.0000 Length:1295 Length:1295 Length:1295
## 1st Qu.:1.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.9143
## 3rd Qu.:1.0000
## Max. :1.0000
## FHE_Collsn_ID Road_Part_Adj_ID Road_Cls_ID Pop_Group_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Crash_Speed_LimitCat Veh_Body_Styl_ID Prsn_Ethnicity_ID GenMale
## Length:1295 Length:1295 Length:1295 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:1.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.8842
## 3rd Qu.:1.0000
## Max. :1.0000
## TrafVol Prsn_Age Prsn_Injry_Sev_ID
## Min. : 215 Length:1295 Length:1295
## 1st Qu.: 6765 Class :character Class :character
## Median :14170 Mode :character Mode :character
## Mean :14417
## 3rd Qu.:22010
## Max. :28970
Correlation matrix:
Calculate correlations between numeric variables.
numeric_cols <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)## SurfDry GenMale TrafVol
## SurfDry 1.000000000 0.04432906 0.009977795
## GenMale 0.044329059 1.00000000 -0.041513425
## TrafVol 0.009977795 -0.04151342 1.000000000
Description on Prsn_Injry_Sev_ID
The Prsn_Injry_Sev_ID variable represents the injury
severity of individuals involved in traffic accidents, classified
according to the KABCO scale. This scale categorizes injury
severity from fatal outcomes to cases where no injury occurs. The
following classifications are used within this dataset:
K (Killed): This category represents fatal injuries resulting from the crash. Individuals in this category are those who succumbed to injuries at the scene or shortly after the crash.
A (Incapacitating Injury): These injuries are serious and prevent the injured person from continuing their normal activities without medical assistance. Examples include fractures, severe lacerations, or injuries that require hospitalization.
B (Non-incapacitating Injury): This category includes visible injuries such as bruises or minor lacerations that, while painful, do not prevent the individual from continuing daily activities.
C (Possible Injury): Injuries in this category are those that are not visible but are claimed by the person involved in the crash, such as complaints of pain or discomfort. These injuries may or may not require medical attention.
O (No Injury/Property Damage Only): This category is used when no injuries are sustained in the crash, and the only result is damage to the property, such as vehicles.
The Prsn_Injry_Sev_ID variable provides critical insight
into the severity of injuries in traffic crashes. By analyzing the
distribution of these categories, researchers can identify potential
correlations between injury severity and factors such as road
conditions, vehicle types, and demographic attributes. This
understanding can aid in identifying risk factors associated with more
severe injuries and guide interventions to enhance road safety.
Univariate Analysis
Number of Accidents by Weather Condition
# Create a bar plot showing the number of accidents by weather condition
ggplot(data, aes(x = as.factor(Wthr_Cond_ID))) +
geom_bar(fill = "skyblue") +
labs(title = "Number of Accidents by Weather Condition",
x = "Weather Condition",
y = "Number of Accidents") +
theme_minimal()
The majority of crashes occurred in clear weather.
Number of Accidents by Light Condition
# Create a bar plot showing the number of accidents by light condition
ggplot(data, aes(x = as.factor(Light_Cond_ID))) +
geom_bar(fill = "orange") +
labs(title = "Number of Accidents by Light Condition",
x = "Light Condition",
y = "Number of Accidents") +
theme_minimal()
The majority of crashes occurred in daylight
condition.
Number of Accidents by Speed Limit Category
# Create a bar plot showing the number of accidents by speed limit category
ggplot(data, aes(x = as.factor(Crash_Speed_LimitCat))) +
geom_bar(fill = "steelblue") +
labs(title = "Number of Accidents by Speed Limit Category",
x = "Speed Limit Category",
y = "Number of Accidents") +
theme_minimal()
Crash distribution follows normal distribution
Number of Accidents by Age Group (Existing Categories)
# Create a bar plot using the existing age group values
ggplot(data, aes(x = as.factor(Prsn_Age))) +
geom_bar(fill = "skyblue") +
labs(title = "Number of Accidents by Age Group",
x = "Age Group",
y = "Number of Accidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Middle age group (22-54 years) is more prone to
crashes
Number of Accidents by Injury Severity
# Create a bar plot for the number of accidents by injury severity (Prsn_Injry_Sev_ID)
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID))) +
geom_bar(fill = "coral") +
labs(title = "Number of Accidents by Injury Severity",
x = "Injury Severity",
y = "Number of Accidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Most of the crashes resulted in no injuries, only property damage
Number of Accidents by Gender (GenMale)
# Bar plot for number of accidents by gender (GenMale)
ggplot(data, aes(x = as.factor(GenMale))) +
geom_bar(fill = "lightblue") +
labs(title = "Number of Accidents by Gender",
x = "Gender (1 = Male, 0 = Female)",
y = "Number of Accidents") +
theme_minimal()
The male population is more prone to crashes
Number of Accidents by Ethnicity (Prsn_Ethnicity_ID)
# Bar plot for number of accidents by ethnicity (Prsn_Ethnicity_ID)
ggplot(data, aes(x = as.factor(Prsn_Ethnicity_ID))) +
geom_bar(fill = "lightgreen") +
labs(title = "Number of Accidents by Ethnicity",
x = "Ethnicity",
y = "Number of Accidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
White and Hispanic people are more vulnerable to
crashes.
Number of Accidents by Intersection Relation (Intrsct_Relat_ID)
# Bar plot for number of accidents by intersection relation (Intrsct_Relat_ID)
ggplot(data, aes(x = as.factor(Intrsct_Relat_ID))) +
geom_bar(fill = "orange") +
labs(title = "Number of Accidents by Intersection Relation",
x = "Intersection Relation",
y = "Number of Accidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Most of the crashes happen in non-intersection
Number of Accidents by Surface Dryness (SurfDry)
# Bar plot for number of accidents by surface dryness (SurfDry)
ggplot(data, aes(x = as.factor(SurfDry))) +
geom_bar(fill = "purple") +
labs(title = "Number of Accidents by Surface Dryness",
x = "Surface Dryness (1 = Dry, 0 = Not Dry)",
y = "Number of Accidents") +
theme_minimal()
Most of the crashes happen on dry sufrface
Number of Accidents by Traffic Control (Traffic_Cntl_ID)
# Bar plot for number of accidents by traffic control (Traffic_Cntl_ID)
ggplot(data, aes(x = as.factor(Traffic_Cntl_ID))) +
geom_bar(fill = "tomato") +
labs(title = "Number of Accidents by Traffic Control",
x = "Traffic Control",
y = "Number of Accidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Center stripe/divider and marked lanes are the hot-spots for
crashes to occur
Univariate Analysis Summary
The table below summarizes key factors from the univariate analysis of the dataset.
# Create a summary table with factors and observations
univariate_summary <- data.frame(
Factor = c("Weather Condition (Wthr_Cond_ID)",
"Light Condition (Light_Cond_ID)",
"Speed Limit Category (Crash_Speed_LimitCat)",
"Age Group (Prsn_Age)",
"Injury Severity (Prsn_Injry_Sev_ID)",
"Gender (GenMale)",
"Ethnicity (Prsn_Ethnicity_ID)",
"Intersection Relation (Intrsct_Relat_ID)",
"Surface Dryness (SurfDry)",
"Traffic Control (Traffic_Cntl_ID)"),
Observation = c("Majority of crashes occurred in clear weather",
"Most crashes occurred in daylight condition",
"Crashes follow a normal distribution by speed limit category",
"The middle-age group (22-54 years) is more prone to crashes",
"Most crashes resulted in no injuries, only property damage",
"Males are more prone to crashes",
"White and Hispanic individuals are more represented in crashes",
"Majority of crashes occurred at non-intersections",
"Most crashes happened on dry surfaces",
"Crashes are more frequent where traffic control is present")
)
# Display the summary table
knitr::kable(univariate_summary, col.names = c("Factor", "Observation"), caption = "Summary of Univariate Analysis")| Factor | Observation |
|---|---|
| Weather Condition (Wthr_Cond_ID) | Majority of crashes occurred in clear weather |
| Light Condition (Light_Cond_ID) | Most crashes occurred in daylight condition |
| Speed Limit Category (Crash_Speed_LimitCat) | Crashes follow a normal distribution by speed limit category |
| Age Group (Prsn_Age) | The middle-age group (22-54 years) is more prone to crashes |
| Injury Severity (Prsn_Injry_Sev_ID) | Most crashes resulted in no injuries, only property damage |
| Gender (GenMale) | Males are more prone to crashes |
| Ethnicity (Prsn_Ethnicity_ID) | White and Hispanic individuals are more represented in crashes |
| Intersection Relation (Intrsct_Relat_ID) | Majority of crashes occurred at non-intersections |
| Surface Dryness (SurfDry) | Most crashes happened on dry surfaces |
| Traffic Control (Traffic_Cntl_ID) | Crashes are more frequent where traffic control is present |
Bivariate Analysis
Bar plots for categorical columns by injury severity
# Load necessary libraries
library(ggplot2)
library(dplyr)
# List of categorical columns to be plotted against Prsn_Injry_Sev_ID
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID",
"Pop_Group_ID", "Crash_Speed_LimitCat", "Veh_Body_Styl_ID",
"Prsn_Ethnicity_ID", "Prsn_Age")
# Loop through each categorical column and create a bar plot with respect to Prsn_Injry_Sev_ID
for (col in categorical_columns) {
plot <- ggplot(data, aes_string(x = col, fill = "Prsn_Injry_Sev_ID")) +
geom_bar(position = "dodge") +
labs(title = paste("Distribution of", col, "by Injury Severity"),
x = col,
y = "Count",
fill = "Injury Severity (KABCO)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot) # Use print() to display the plot inside the loop
}Injury Severity by Gender
ggplot(data, aes(x = as.factor(GenMale), fill = as.factor(Prsn_Injry_Sev_ID))) +
geom_bar(position = "dodge") +
labs(title = "Injury Severity by Gender", x = "Gender (1 = Male, 0 = Female)", y = "Count", fill = "Injury Severity") +
theme_minimal()Bar plot summary
The table below summarizes key factors from the bar plots w.r.t Prsn_Injry_Sev_ID of the dataset.
# Load necessary libraries
library(DT)
library(htmltools)
# Create a summary table with factors and observations
barplot_summary <- data.frame(
Factor = c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID",
"Pop_Group_ID", "Crash_Speed_LimitCat",
"Prsn_Ethnicity_ID", "Prsn_Age"),
Observation = c("Most severe (KA) injuries occur in clear weather",
"Most severe (KA) injuries happen during daylight",
"Most severe (KA) injuries take place on 2-lane, 2-way roads",
"Most severe (KA) injuries occur on straight roads",
"Most severe (KA) injuries occur around dividers or marked lanes",
"Motor vehicles are most prone to severe (KA) injuries",
"Most severe (KA) injuries happen on non-intersection roads",
"The major cause of KA injuries is rear-end collisions",
"Most severe (KA) injuries occur on proper roads",
"Most severe (KA) injuries happen on state highways",
"Rural populations are the main victims of KA injuries",
"Most severe (KA) injuries occur due to high speeds (65-70 mph)",
"White people are the major victims of crashes",
"Middle-aged individuals are the main victims of KA crashes")
)
# Title for the interactive table
htmltools::tagList(
tags$h4("Summary Table: Injury Severity by categories"),
# Create an interactive DataTable
datatable(barplot_summary,
options = list(pageLength = 10, autoWidth = TRUE),
colnames = c("Factor", "Observation"))
)Summary Table: Injury Severity by categories
Density plot of numeric variables
# Load necessary library
library(ggplot2)
# List of numeric columns
numeric_columns <- c("TrafVol", "Prsn_Age") # Add all numeric columns you want to plot
# Loop through each numeric column and create density plots
for (col in numeric_columns) {
plot <- ggplot(data, aes_string(x = col)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = paste("Density Plot of", col), x = col, y = "Density") +
theme_minimal()
# Print each plot
print(plot)
}Contingency Coefficient based on Prsn_Injry_Sev_ID
Contingency Coefficient bar plot
# Load necessary libraries
library(ggplot2)
library(MASS) # For calculating contingency tables
# Function to calculate contingency coefficient
contingency_coefficient <- function(table) {
chi2 <- chisq.test(table)$statistic
n <- sum(table) # Total observations
return(sqrt(chi2 / (chi2 + n)))
}
# List of categorical columns
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID",
"Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "Prsn_Age")
# Calculate the contingency coefficient for each categorical column
contingency_results <- data.frame(Categorical_Variable = character(), Contingency_Coefficient = numeric())
for (col in categorical_columns) {
# Create a contingency table for each variable with respect to Prsn_Injry_Sev_ID
contingency_table <- table(data$Prsn_Injry_Sev_ID, data[[col]])
# Calculate the contingency coefficient
coeff <- contingency_coefficient(contingency_table)
# Append the result to the results dataframe
contingency_results <- rbind(contingency_results,
data.frame(Categorical_Variable = col, Contingency_Coefficient = coeff))
}
# Plot the results using ggplot2
ggplot(contingency_results, aes(x = reorder(Categorical_Variable, Contingency_Coefficient), y = Contingency_Coefficient)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() + # Flip coordinates for better readability
labs(title = "Contingency Coefficients for Categorical Variables", x = "Categorical Variable", y = "Contingency Coefficient") +
theme_minimal()Explanation on Contingency Coefficient
The contingency coefficients provide a clearer picture of which factors have a more significant relationship with injury severity and which do not.
Wthr_Cond_ID (Weather Condition) = 0.07: This low value suggests that weather conditions (clear, rainy, foggy, etc.) have little influence on the severity of injuries in the dataset. It indicates that the weather at the time of the crash doesn’t strongly affect how severe the injuries are.
Light_Cond_ID (Light Condition) = 0.10: This indicates a low association between light conditions (daylight, dark, twilight) and the severity of injuries. In other words, the lighting conditions at the time of the crash don’t have a significant impact on the severity of the injuries.
Road_Type_ID (Road Type) = 0.12: This value suggests a weak association between the type of road (e.g., highway, local road, farm road) and injury severity. While there is some influence, it’s not a strong determining factor for the severity of injuries.
Road_Algn_ID (Road Alignment) = 0.11: This suggests a weak association between road alignment (straight, curved, uphill, etc.) and injury severity. The road alignment has a minor influence on how severe the injuries might be in a crash.
Traffic_Cntl_ID (Traffic Control) = 0.10: This value indicates a low association between the presence or type of traffic control (e.g., stop signs, traffic lights, none) and injury severity. Traffic control measures don’t appear to have a strong influence on the outcome in terms of injury severity.
Harm_Evnt_ID (Harmful Event) = 0.09: This value shows a weak association between the type of harmful event (e.g., collisions, fixed objects, overturns) and injury severity. It suggests that the specific event causing the crash has a minor impact on the severity of injuries.
Intrsct_Relat_ID (Intersection Relation) = 0.08: This indicates a low association between whether the crash occurred at or near an intersection and the severity of injuries. Intersection-related crashes don’t seem to significantly affect injury outcomes.
FHE_Collsn_ID (First Harmful Event Collision) = 0.11: This suggests a weak association between the type of first harmful event (e.g., rear-end collisions, side impacts) and injury severity. The first harmful event in the crash doesn’t strongly determine how severe the injuries are.
Road_Part_Adj_ID (Road Part Adjacent) = 0.09: This low value indicates a weak association between the part of the road adjacent to the crash (e.g., shoulder, main lane) and injury severity. The location on the road does not appear to heavily influence injury outcomes.
Road_Cls_ID (Road Classification) = 0.10: This indicates a weak association between the classification of the road (e.g., highway, local, farm road) and injury severity. The type of road on which the crash occurs has little impact on the injury severity.
Pop_Group_ID (Population Group) = 0.08: This value shows a low association between the population group (e.g., rural, urban) where the crash occurred and the severity of injuries. Whether the crash happened in a rural or urban area doesn’t seem to strongly affect injury outcomes.
Crash_Speed_LimitCat (Speed Limit Category) = 0.50: This higher value suggests a moderate to strong association between the speed limit where the crash occurred and injury severity. Higher speed limits are likely related to more severe injuries, which makes sense as crashes at higher speeds generally result in more severe outcomes.
Prsn_Ethnicity_ID (Person Ethnicity) = 0.07: This low value indicates that there is little association between the ethnicity of individuals involved in the crash and injury severity. Ethnicity does not seem to influence the severity of injuries.
Prsn_Age (Person Age) = 0.09: This weak association suggests that the age of individuals involved in the crash has little influence on the severity of injuries. The age group of those involved does not significantly affect how severe the injuries are.
Prsn_Injry_Sev_ID (Person Injury Severity ID): As this is the reference variable used to measure injury severity, it’s not applicable for calculating the contingency coefficient with itself.
Summary:
- Low values (closer to 0), such as for weather conditions, light conditions, and road alignment, indicate that these factors have little to no influence on the severity of injuries.
- Moderate values, such as for the speed limit category, suggest that certain factors, like speed limits, have a stronger association with injury severity, meaning crashes in high-speed zones are more likely to result in severe injuries.
p-value analysis
Plot p-value analysis
# Function to calculate p-value using Chi-square test
calculate_p_value <- function(x, y) {
# Create a contingency table
tbl <- table(x, y)
# Perform chi-square test
chi2_test <- chisq.test(tbl)
# Return the p-value
return(chi2_test$p.value)
}
# List of categorical columns
categorical_columns <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID",
"Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "Prsn_Age")
# Data frame to store p-values
p_values_results <- data.frame(Categorical_Variable = character(), P_Value = numeric())
# Loop through each categorical variable and calculate the p-value
for (col in categorical_columns) {
# Calculate the p-value for each variable with respect to Prsn_Injry_Sev_ID
p_value <- calculate_p_value(data$Prsn_Injry_Sev_ID, data[[col]])
# Append the result to the results dataframe
p_values_results <- rbind(p_values_results,
data.frame(Categorical_Variable = col, P_Value = p_value))
}
# Show the results
print(p_values_results)## Categorical_Variable P_Value
## 1 Wthr_Cond_ID 5.610124e-01
## 2 Light_Cond_ID 1.046013e-01
## 3 Road_Type_ID 1.748738e-02
## 4 Road_Algn_ID 3.199259e-02
## 5 Traffic_Cntl_ID 8.350679e-02
## 6 Harm_Evnt_ID 2.364021e-03
## 7 Intrsct_Relat_ID 6.959669e-02
## 8 FHE_Collsn_ID 2.889230e-14
## 9 Road_Part_Adj_ID 1.973712e-01
## 10 Road_Cls_ID 1.346929e-03
## 11 Pop_Group_ID 4.619300e-01
## 12 Crash_Speed_LimitCat 5.507364e-05
## 13 Prsn_Ethnicity_ID 1.258593e-03
## 14 Prsn_Age 8.386196e-02
# Optionally, filter the results for variables with p-values less than 0.05 (significant associations)
significant_results <- p_values_results %>% filter(P_Value < 0.05)
print("Significant Results (p < 0.05):")## [1] "Significant Results (p < 0.05):"
## Categorical_Variable P_Value
## 1 Road_Type_ID 1.748738e-02
## 2 Road_Algn_ID 3.199259e-02
## 3 Harm_Evnt_ID 2.364021e-03
## 4 FHE_Collsn_ID 2.889230e-14
## 5 Road_Cls_ID 1.346929e-03
## 6 Crash_Speed_LimitCat 5.507364e-05
## 7 Prsn_Ethnicity_ID 1.258593e-03
# Visualize the p-values using ggplot2
ggplot(p_values_results, aes(x = reorder(Categorical_Variable, P_Value), y = P_Value)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() + # Flip coordinates for better readability
labs(title = "P-Value Analysis for Categorical Variables", x = "Categorical Variable", y = "P-Value") +
theme_minimal() +
geom_hline(yintercept = 0.05, color = "red", linetype = "dashed") # Add a threshold line at p = 0.05Identify Significant Variables
significant_results <- p_values_results %>% filter(P_Value < 0.05)
print("Significant Results (p < 0.05):")## [1] "Significant Results (p < 0.05):"
## Categorical_Variable P_Value
## 1 Road_Type_ID 1.748738e-02
## 2 Road_Algn_ID 3.199259e-02
## 3 Harm_Evnt_ID 2.364021e-03
## 4 FHE_Collsn_ID 2.889230e-14
## 5 Road_Cls_ID 1.346929e-03
## 6 Crash_Speed_LimitCat 5.507364e-05
## 7 Prsn_Ethnicity_ID 1.258593e-03
Other plots
Traffic Volume Distribution by Injury Severity
# Enhanced Violin Plot
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID), y = TrafVol)) +
geom_violin(fill = "lightblue", color = "black", alpha = 0.7, trim = FALSE) + # Violin plot with some transparency and no trimming
geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.5) + # Boxplot inside the violin for better clarity
geom_jitter(width = 0.1, size = 1, alpha = 0.3, color = "#34495e") + # Add jittered points to show individual data points
labs(title = "Traffic Volume Distribution by Injury Severity",
x = "Injury Severity (KABCO scale)",
y = "Traffic Volume") +
theme_minimal() + # Use minimal theme for cleaner look
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for better readability
plot.title = element_text(hjust = 0.5) # Center the plot title
) +
scale_y_continuous(labels = scales::comma) # Use comma format for y-axis to make large numbers easier to readBoxplot for Traffic Volume (TrafVol) by Injury Severity (Prsn_Injry_Sev_ID)
# Boxplot for Traffic Volume (TrafVol) by Injury Severity (Prsn_Injry_Sev_ID)
ggplot(data, aes(x = as.factor(Prsn_Injry_Sev_ID), y = TrafVol)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Boxplot of Traffic Volume by Injury Severity",
x = "Injury Severity (KABCO scale)",
y = "Traffic Volume") +
theme_minimal()Correlation plot of numerical values
numeric_cols <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)## SurfDry GenMale TrafVol
## SurfDry 1.000000000 0.04432906 0.009977795
## GenMale 0.044329059 1.00000000 -0.041513425
## TrafVol 0.009977795 -0.04151342 1.000000000
Heatmap for Injury Severity vs Speed Limit Category
# Create a contingency table (Injury Severity Vs Speed Limit)
contingency_table_speed <- table(data$Crash_Speed_LimitCat, data$Prsn_Injry_Sev_ID)
# Convert to dataframe for ggplot
contingency_df_speed <- as.data.frame(contingency_table_speed)
# Heatmap for Injury Severity vs Speed Limit Category
ggplot(contingency_df_speed, aes(Var1, Var2, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "red") +
labs(title = "Heatmap of Injury Severity Vs Speed Limit Category",
x = "Speed Limit Category",
y = "Injury Severity (KABCO scale)",
fill = "Frequency") +
theme_minimal(base_size = 15) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Outlier detection using boxplot
# Function to detect outliers using IQR
detect_outliers_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- which(x < lower_bound | x > upper_bound)
return(outliers)
}
# Apply the function to TrafVol (Traffic Volume)
outliers_trafvol <- detect_outliers_iqr(data$TrafVol)
# Create a new column to identify outliers
data$Outlier <- "No" # Default value
data$Outlier[outliers_trafvol] <- "Yes" # Mark outliers as "Yes"
# Visualize Outliers with Boxplot and highlight the outliers
ggplot(data, aes(x = "", y = TrafVol)) +
geom_boxplot(fill = "lightblue") +
geom_jitter(aes(color = Outlier), width = 0.1, size = 2) + # Add jittered points and color by outlier status
scale_color_manual(values = c("No" = "black", "Yes" = "red")) + # Red for outliers, black for normal points
labs(title = "Outlier Detection for Traffic Volume",
x = "",
y = "Traffic Volume") +
theme_minimal()