HW01: Exploratory Data Analysis by Mahmuda Mimi
Read and explore data
Overview of the data
library(readxl)
setwd("C:/Users/mahmu/OneDrive/Desktop/AI")
df <- read_excel("HW1_Data.xlsx")
head(df, n=10)## # A tibble: 10 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not lig… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not lig… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not lig… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## 7 Clear Daylight 4 or more l… Curve, level 1 Marked lanes
## 8 Clear Daylight 4 or more l… Curve, level 1 Marked lanes
## 9 Clear Daylight Unknown Straight, l… 1 None
## 10 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
## [1] 1295 19
## [1] "Wthr_Cond_ID" "Light_Cond_ID" "Road_Type_ID"
## [4] "Road_Algn_ID" "SurfDry" "Traffic_Cntl_ID"
## [7] "Harm_Evnt_ID" "Intrsct_Relat_ID" "FHE_Collsn_ID"
## [10] "Road_Part_Adj_ID" "Road_Cls_ID" "Pop_Group_ID"
## [13] "Crash_Speed_LimitCat" "Veh_Body_Styl_ID" "Prsn_Ethnicity_ID"
## [16] "GenMale" "TrafVol" "Prsn_Age"
## [19] "Prsn_Injry_Sev_ID"
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 11419 5267 5247 13009 13229 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
df1 <- df
library(DT)
datatable(
df1, extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('excel')
)
)| Name | df1 |
| Number of rows | 1295 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 16 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Wthr_Cond_ID | 0 | 1 | 3 | 6 | 0 | 5 | 0 |
| Light_Cond_ID | 0 | 1 | 4 | 17 | 0 | 5 | 0 |
| Road_Type_ID | 0 | 1 | 5 | 26 | 0 | 5 | 0 |
| Road_Algn_ID | 0 | 1 | 5 | 19 | 0 | 5 | 0 |
| Traffic_Cntl_ID | 0 | 1 | 4 | 21 | 0 | 5 | 0 |
| Harm_Evnt_ID | 0 | 1 | 5 | 26 | 0 | 5 | 0 |
| Intrsct_Relat_ID | 0 | 1 | 12 | 20 | 0 | 4 | 0 |
| FHE_Collsn_ID | 0 | 1 | 5 | 32 | 0 | 5 | 0 |
| Road_Part_Adj_ID | 0 | 1 | 5 | 28 | 0 | 5 | 0 |
| Road_Cls_ID | 0 | 1 | 5 | 19 | 0 | 5 | 0 |
| Pop_Group_ID | 0 | 1 | 5 | 21 | 0 | 5 | 0 |
| Crash_Speed_LimitCat | 0 | 1 | 5 | 9 | 0 | 5 | 0 |
| Veh_Body_Styl_ID | 0 | 1 | 14 | 14 | 0 | 1 | 0 |
| Prsn_Ethnicity_ID | 0 | 1 | 5 | 8 | 0 | 5 | 0 |
| Prsn_Age | 0 | 1 | 5 | 11 | 0 | 5 | 0 |
| Prsn_Injry_Sev_ID | 0 | 1 | 1 | 2 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| SurfDry | 0 | 1 | 0.91 | 0.28 | 0 | 1.0 | 1 | 1.0 | 1 | ▁▁▁▁▇ |
| GenMale | 0 | 1 | 0.88 | 0.32 | 0 | 1.0 | 1 | 1.0 | 1 | ▁▁▁▁▇ |
| TrafVol | 0 | 1 | 14203.65 | 8358.99 | 218 | 6706.5 | 14019 | 21416.5 | 28988 | ▇▇▇▇▆ |
Descriptive Statistics
df1$Prsn_Injry_Sev_ID <- factor(df1$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))
library(compareGroups)
des1 <- compareGroups(`Prsn_Injry_Sev_ID` ~ ., data = df1, max.ylev = 50, max.xlev = 50, ref = 1)
des2 <- createTable(des1, show.ratio = TRUE)
des2##
## --------Summary descriptives table by 'Prsn_Injry_Sev_ID'---------
##
## _____________________________________________________________________________________
## KA BC O p.overall
## N=50 N=120 N=1125
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Wthr_Cond_ID: .
## Clear 42 (84.0%) 98 (81.7%) 944 (83.9%)
## Cloudy 6 (12.0%) 19 (15.8%) 121 (10.8%)
## Fog 1 (2.00%) 1 (0.83%) 8 (0.71%)
## Other 0 (0.00%) 0 (0.00%) 10 (0.89%)
## Rain 1 (2.00%) 2 (1.67%) 42 (3.73%)
## Light_Cond_ID: .
## Dark, lighted 0 (0.00%) 5 (4.17%) 37 (3.29%)
## Dark, not lighted 12 (24.0%) 16 (13.3%) 126 (11.2%)
## Daylight 36 (72.0%) 90 (75.0%) 914 (81.2%)
## Dusk 2 (4.00%) 6 (5.00%) 31 (2.76%)
## Other 0 (0.00%) 3 (2.50%) 17 (1.51%)
## Road_Type_ID: .
## 2 lane, 2 way 26 (52.0%) 50 (41.7%) 475 (42.2%)
## 4 or more lanes, divided 9 (18.0%) 27 (22.5%) 149 (13.2%)
## 4 or more lanes, undivided 6 (12.0%) 11 (9.17%) 79 (7.02%)
## Other 0 (0.00%) 0 (0.00%) 2 (0.18%)
## Unknown 9 (18.0%) 32 (26.7%) 420 (37.3%)
## Road_Algn_ID: .
## Curve, level 2 (4.00%) 6 (5.00%) 54 (4.80%)
## Other 2 (4.00%) 2 (1.67%) 40 (3.56%)
## Straight, grade 5 (10.0%) 24 (20.0%) 106 (9.42%)
## Straight, hillcrest 4 (8.00%) 4 (3.33%) 39 (3.47%)
## Straight, level 37 (74.0%) 84 (70.0%) 886 (78.8%)
## SurfDry 0.94 (0.24) 0.93 (0.25) 0.91 (0.28) 0.571
## Traffic_Cntl_ID: .
## Center stripe/divider 15 (30.0%) 32 (26.7%) 259 (23.0%)
## Marked lanes 19 (38.0%) 39 (32.5%) 331 (29.4%)
## No passing zone 6 (12.0%) 12 (10.0%) 76 (6.76%)
## None 5 (10.0%) 21 (17.5%) 270 (24.0%)
## Other 5 (10.0%) 16 (13.3%) 189 (16.8%)
## Harm_Evnt_ID: .
## Fixed object 4 (8.00%) 10 (8.33%) 130 (11.6%)
## Motor vehicle in transport 40 (80.0%) 99 (82.5%) 867 (77.1%)
## Other 4 (8.00%) 4 (3.33%) 24 (2.13%)
## Overturned 1 (2.00%) 6 (5.00%) 19 (1.69%)
## Parked car 1 (2.00%) 1 (0.83%) 85 (7.56%)
## Intrsct_Relat_ID: .
## Driveway access 3 (6.00%) 8 (6.67%) 134 (11.9%)
## Intersection 5 (10.0%) 15 (12.5%) 150 (13.3%)
## Intersection related 1 (2.00%) 7 (5.83%) 104 (9.24%)
## Non intersection 41 (82.0%) 90 (75.0%) 737 (65.5%)
## FHE_Collsn_ID: <0.001
## Omv vehicle going straight 9 (18.0%) 18 (15.0%) 232 (20.6%)
## Other 6 (12.0%) 24 (20.0%) 367 (32.6%)
## Sd both going straight-rear end 27 (54.0%) 60 (50.0%) 225 (20.0%)
## Sd both going straight-sideswipe 3 (6.00%) 9 (7.50%) 147 (13.1%)
## Sd one straight-one left turn 5 (10.0%) 9 (7.50%) 154 (13.7%)
## Road_Part_Adj_ID: 0.151
## Exit/off ramp 1 (2.00%) 1 (0.83%) 2 (0.18%)
## Main/proper lane 49 (98.0%) 110 (91.7%) 1066 (94.8%)
## Other 0 (0.00%) 1 (0.83%) 3 (0.27%)
## Other (explain in narrative) 0 (0.00%) 4 (3.33%) 26 (2.31%)
## Unknown 0 (0.00%) 4 (3.33%) 28 (2.49%)
## Road_Cls_ID: .
## City street 4 (8.00%) 10 (8.33%) 207 (18.4%)
## County road 4 (8.00%) 15 (12.5%) 172 (15.3%)
## Farm to market 14 (28.0%) 43 (35.8%) 317 (28.2%)
## Other 1 (2.00%) 13 (10.8%) 75 (6.67%)
## Us & state highways 27 (54.0%) 39 (32.5%) 354 (31.5%)
## Pop_Group_ID: .
## 10,000 - 24,999 pop 3 (6.00%) 4 (3.33%) 56 (4.98%)
## 250,000 pop. And over 2 (4.00%) 9 (7.50%) 106 (9.42%)
## Other 4 (8.00%) 16 (13.3%) 173 (15.4%)
## Rural 40 (80.0%) 87 (72.5%) 739 (65.7%)
## Town under 2,499 pop. 1 (2.00%) 4 (3.33%) 51 (4.53%)
## Crash_Speed_LimitCat: .
## > 70 mph 10 (20.0%) 19 (15.8%) 174 (15.5%)
## 30-40 mph 6 (12.0%) 20 (16.7%) 310 (27.6%)
## 45-60 mph 14 (28.0%) 45 (37.5%) 412 (36.6%)
## 65-70 mph 20 (40.0%) 28 (23.3%) 175 (15.6%)
## Other 0 (0.00%) 8 (6.67%) 54 (4.80%)
## Veh_Body_Styl_ID: Farm equipment 50 (100%) 120 (100%) 1125 (100%) .
## Prsn_Ethnicity_ID: .
## Black 2 (4.00%) 9 (7.50%) 52 (4.62%)
## Hispanic 12 (24.0%) 36 (30.0%) 374 (33.2%)
## Other 0 (0.00%) 1 (0.83%) 38 (3.38%)
## Unknown 0 (0.00%) 0 (0.00%) 83 (7.38%)
## White 36 (72.0%) 74 (61.7%) 578 (51.4%)
## GenMale 0.98 (0.14) 0.94 (0.24) 0.87 (0.33) 0.008
## TrafVol 14259 (8714) 14122 (8472) 14210 (8338) 0.993
## Prsn_Age: .
## 15-24 years 5 (10.0%) 18 (15.0%) 165 (14.7%)
## 25-54 years 19 (38.0%) 56 (46.7%) 497 (44.2%)
## 55-64 years 8 (16.0%) 23 (19.2%) 201 (17.9%)
## 65-74 years 10 (20.0%) 14 (11.7%) 94 (8.36%)
## Other 8 (16.0%) 9 (7.50%) 168 (14.9%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
Correlation Analysis
Correlation Analysis of Categorical Variables
library(vcd)
library(dplyr)
library(ggcorrplot)
# Function to calculate Cramér's V for categorical variables
cramers_v_matrix <- function(df) {
cat_vars <- df %>% select(where(is.character))
cat_names <- colnames(cat_vars)
#Creating empty matrix
cramers_v <- matrix(NA, length(cat_names), length(cat_names),
dimnames = list(cat_names, cat_names))
# Filling matrix with Cramér's V values
for (i in 1:length(cat_names)) {
for (j in 1:length(cat_names)) {
if (i == j) {
cramers_v[i, j] <- 1
} else {
# Contingency table for the two variables
tbl <- table(cat_vars[[i]], cat_vars[[j]])
# Compute Chi-Square test
chisq_test <- chisq.test(tbl)
# Compute Cramér's V
cramers_v[i, j] <- sqrt(chisq_test$statistic / (sum(tbl) * (min(nrow(tbl), ncol(tbl)) - 1)))
}
}
}
return(cramers_v)
}
cramers_v <- cramers_v_matrix(df)
# Plotting the Cramér's V correlation matrix
ggcorrplot(cramers_v,
lab = TRUE,
lab_size = 8,
title = "Cramer's V Correlation Matrix",
colors = c("#6D9EC1", "white", "#E46726"),
outline.color = "white",
hc.order = FALSE) +
theme_minimal() +
labs(x = "Categorical Variable 1",
y = "Categorical Variable 2") +
theme(
plot.title = element_text(size = 40, face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size = 22),
axis.text.y = element_text(size = 22),
axis.title.x = element_text(size = 28),
axis.title.y = element_text(size = 28),
legend.position = "right",
legend.text = element_text(size = 20),
legend.title = element_text(size = 20, face = "bold")
)
Interpretation of Results: The correlation analysis of
categorical variables is performed using Cramér’s V, a measure of
association for nominal (categorical) data. It is computed based on the
Chi-Square statistic, testing whether the variables are independent of
each other. A matrix of Cramér’s V values is generated, with values
closer to 1 indicating a stronger association between the variables.The
resulting heatmap visualizes the strength of the relationships between
categorical variables, allowing easy identification of the strongest
correlations. Since all variables are categorical, this method is
appropriate for detecting non-parametric associations without relying on
distribution assumptions. The Veh_Body_Styl_ID variable has only one
category for all observations and this is the reason of getting “Inf”
value in the Cramér’s V matrix. This indicates that the results are in a
perfect association (or undefined association) because the Chi-square
statistic would be either undefined or extremely large relative to the
sample size, leading to an infinite Cramér’s V value. This is because
there is no variability in this variable, and it does not provide any
meaningful information to measure an association with other
variables.
Correlation Analysis of Other Variables
Correlation Analysis of Traffic Volume with Categorical Variables
library(ggplot2)
library(dplyr)
# List of categorical variables to analyze
categorical_vars <- c("Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID",
"Pop_Group_ID", "Crash_Speed_LimitCat", "Prsn_Ethnicity_ID",
"GenMale", "Prsn_Age", "Prsn_Injry_Sev_ID")
correlation_results <- data.frame(Variable = character(), PValue = numeric(), stringsAsFactors = FALSE)
# Performing ANOVA or Kruskal-Wallis for each categorical variable
for (var in categorical_vars) {
if (length(unique(df[[var]])) > 1) {
# Performing Kruskal-Wallis test (non-parametric)
test_result <- kruskal.test(TrafVol ~ get(var), data = df)
correlation_results <- rbind(correlation_results,
data.frame(Variable = var, PValue = round(test_result$p.value, 3)))
}
}
print(correlation_results)## Variable PValue
## 1 Wthr_Cond_ID 0.353
## 2 Light_Cond_ID 0.277
## 3 Road_Type_ID 0.303
## 4 Road_Algn_ID 0.681
## 5 SurfDry 0.813
## 6 Traffic_Cntl_ID 0.950
## 7 Harm_Evnt_ID 0.208
## 8 Intrsct_Relat_ID 0.550
## 9 FHE_Collsn_ID 0.710
## 10 Road_Part_Adj_ID 0.498
## 11 Road_Cls_ID 0.236
## 12 Pop_Group_ID 0.069
## 13 Crash_Speed_LimitCat 0.796
## 14 Prsn_Ethnicity_ID 0.272
## 15 GenMale 0.057
## 16 Prsn_Age 0.592
## 17 Prsn_Injry_Sev_ID 0.992
Interpretation of Results: For each categorical variable (e.g., Wthr_Cond_ID, Light_Cond_ID, Road_Type_ID, etc.), the Kruskal-Wallis test was performed to determine whether the medians of traffic volume differ significantly across the categories. The p-values represent the likelihood that the differences in medians could have occurred by chance. All p-values are greater than 0.05, indicating no statistically significant associations between traffic volume and the analyzed categorical variables.
Single Variable Analysis
plot_colors <- c("skyblue", "lightcoral", "#084081", "#a8ddb5", "#7bccc4", "lightgray", "lightblue", "#8c6bb1", "#cc4c02", "#810f7c", "#993404")
custom_labels <- c(
"Wthr_Cond_ID" = "Weather Condition",
"Light_Cond_ID" = "Light Condition",
"Road_Type_ID" = "Road Type",
"Road_Algn_ID" = "Road Alignment",
"SurfDry" = "Surface Condition: Dry",
"Traffic_Cntl_ID" = "Traffic Control",
"Harm_Evnt_ID" = "Harm Event",
"Intrsct_Relat_ID" = "Intersection Relation",
"FHE_Collsn_ID" = "First Harmful Event",
"Road_Part_Adj_ID" = "Road Part Adjacent",
"Road_Cls_ID" = "Road Classification",
"Pop_Group_ID" = "Population Group",
"Crash_Speed_LimitCat" = "Speed Limit",
"Prsn_Ethnicity_ID" = "Person Ethnicity",
"GenMale" = "Gender: Male",
"Prsn_Age" = "Person Age",
"Prsn_Injry_Sev_ID" = "Injury Severity"
)
df$Prsn_Injry_Sev_ID <- factor(df$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))
# List of all the variables to plot
variables_to_plot <- c(
"Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID", "SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID",
"Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "GenMale", "Prsn_Age", "Prsn_Injry_Sev_ID"
)
# Loop through each variable and create a bar plot
for (i in seq_along(variables_to_plot)) {
var <- variables_to_plot[i] # Get the variable name
label <- custom_labels[var]
p <- ggplot(df, aes_string(x = var)) +
geom_bar(fill = plot_colors[i %% length(plot_colors) + 1]) + # Assign color for the plot
theme_minimal() +
labs(
title = paste("Distribution of", label),
x = label,
y = "Count"
) +
theme(
plot.background = element_rect(fill = "#f0f0f0", color = NA),
panel.background = element_rect(fill = "#f0f0f0", color = NA),
plot.title = element_text(family = "Roboto", size = 16, face = "bold", hjust = 0.5),
axis.title = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
plot.margin = margin(10, 10, 10, 10)
)
# Print each plot
print(p)
}Distribution of Traffic Volume
library(hrbrthemes)
p2 <- ggplot(data = df, aes(x = TrafVol)) +
geom_density(adjust = 1.5, alpha = 0.4, fill = "lightblue") +
theme_ipsum() + # Use the hrbrthemes for a clean theme
labs(
title = "Density Plot of Traffic Volume",
x = "Traffic Volume",
y = "Density"
)
p2Bivariate Exploratory Data Analysis
custom_labels <- c(
"Wthr_Cond_ID" = "Weather Condition",
"Light_Cond_ID" = "Light Condition",
"Road_Type_ID" = "Road Type",
"Road_Algn_ID" = "Road Alignment",
"SurfDry" = "Surface Condition",
"Traffic_Cntl_ID" = "Traffic Control",
"Harm_Evnt_ID" = "Harm Event",
"Intrsct_Relat_ID" = "Intersection Relation",
"FHE_Collsn_ID" = "First Harmful Event",
"Road_Part_Adj_ID" = "Road Part Adjacent",
"Road_Cls_ID" = "Road Classification",
"Pop_Group_ID" = "Population Group",
"Crash_Speed_LimitCat" = "Speed Limit",
"Prsn_Ethnicity_ID" = "Person Ethnicity",
"GenMale" = "Gender",
"Prsn_Age" = "Person Age",
"Prsn_Injry_Sev_ID" = "Injury Severity"
)
variables_to_plot <- c(
"Wthr_Cond_ID", "Light_Cond_ID", "Road_Type_ID", "Road_Algn_ID",
"SurfDry", "Traffic_Cntl_ID", "Harm_Evnt_ID", "Intrsct_Relat_ID",
"FHE_Collsn_ID", "Road_Part_Adj_ID", "Road_Cls_ID", "Pop_Group_ID",
"Crash_Speed_LimitCat", "Prsn_Ethnicity_ID", "GenMale", "Prsn_Age"
)
# Loop through each variable and create a stacked bar plot
for (var in variables_to_plot) {
# Plotting for each variable with Injury Severity
df %>%
count(.data[[var]], Prsn_Injry_Sev_ID) %>%
group_by(.data[[var]]) %>%
mutate(pct = prop.table(n) * 100) %>%
ggplot(aes_string(x = var, y = "pct", fill = "Prsn_Injry_Sev_ID")) +
geom_bar(stat = "identity", color = "black") +
ylab("Percentage") +
geom_text(aes(label = paste0(sprintf("%1.1f", pct), "%")),
position = position_stack(vjust = 0.5)) +
ggtitle(paste("Injury Severity by", custom_labels[var])) +
xlab(custom_labels[var]) +
theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
scale_fill_manual(
values = c("#DADAEB", "#9E9AC8", "#6A51A3"),
name = "Person Injury Severity"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(last_plot())
}library(ggstatsplot)
df$Prsn_Injry_Sev_ID <- factor(df$Prsn_Injry_Sev_ID, levels = c("KA", "BC", "O"))
plt <- ggbetweenstats(
data = df,
x = Prsn_Injry_Sev_ID,
y = TrafVol,
type = "nonparametric",
pairwise.comparisons = FALSE,
pairwise.display = "none",
plot.type = "boxviolin",
ggtheme = theme_minimal(),
messages = FALSE
)
plt <- plt +
labs(
x = "Injury Severity",
y = "Traffic Volume",
title = "Distribution of Traffic Volume Across Different Injury Severities",
) +
theme(
text = element_text(family = "Roboto", size = 11, color = "black"),
plot.title = element_text(
family = "Lobster Two",
size = 18,
face = "bold",
color = "#2a475e",
hjust = 0.5
),
plot.subtitle = element_text(
family = "Roboto",
size = 12,
face = "bold",
color = "#1b2838",
hjust = 0.5
),
plot.title.position = "plot",
# Customize the axis text
axis.text = element_text(size = 10, color = "black"),
axis.title = element_text(size = 12)
)
plt <- plt +
theme(
axis.ticks = element_blank(), # Remove axis ticks
axis.line = element_line(colour = "grey50"),
panel.grid = element_line(color = "#b4aea9"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed"),
panel.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4"),
plot.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4")
)
ggsave(
filename = "traffic_volume_violin_boxplot.png",
plot = plt,
width = 8,
height = 8,
device = "png"
)
print(plt)
## Heat Maps
custom_labels <- c(
"Wthr_Cond_ID" = "Weather Condition",
"Light_Cond_ID" = "Light Condition",
"Crash_Speed_LimitCat" = "Speed Limit",
"Prsn_Age" = "Person Age",
"Prsn_Injry_Sev_ID" = "Injury Severity"
)
# List of variables for which we are creating heatmaps
variables_to_plot <- c("Wthr_Cond_ID", "Light_Cond_ID", "Crash_Speed_LimitCat", "Prsn_Age")
for (var in variables_to_plot) {
table_data <- table(df$Prsn_Injry_Sev_ID, df[[var]])
df_table <- as.data.frame(table_data)
p <- ggplot(df_table, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
labs(
title = paste(custom_labels[var], "by Injury Severity"),
x = custom_labels["Prsn_Injry_Sev_ID"],
y = custom_labels[var],
fill = "Count"
) +
theme_minimal() +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14)
)
print(p)
}