Final Project

Data Cleaning/Preparation

Importing the Dataset & Libraries

library(tidyverse)
library(ggplot2)
library(ggalt)
library(dplyr)
library(lubridate)
library(writexl)
library(ggmap)
library(sf)
library(RColorBrewer)
data <- read.csv('Traffic_Crash_Reports__CPD__20250223.csv')

Looking at the column data types

#looking at columns in dataset
for (i in 1:ncol(data)){print(paste(i,colnames(data[i]),":",class(data[[i]])))}

## [1] "1 ADDRESS_X : character"
## [1] "2 LATITUDE_X : numeric"
## [1] "3 LONGITUDE_X : numeric"
## [1] "4 AGE : character"
## [1] "5 COMMUNITY_COUNCIL_NEIGHBORHOOD : character"
## [1] "6 CPD_NEIGHBORHOOD : character"
## [1] "7 SNA_NEIGHBORHOOD : character"
## [1] "8 CRASHDATE : character"
## [1] "9 CRASHLOCATION : character"
## [1] "10 CRASHSEVERITY : character"
## [1] "11 CRASHSEVERITYID : integer"
## [1] "12 DATECRASHREPORTED : character"
## [1] "13 DAYOFWEEK : character"
## [1] "14 GENDER : character"
## [1] "15 INJURIES : character"
## [1] "16 INSTANCEID : character"
## [1] "17 LIGHTCONDITIONSPRIMARY : character"
## [1] "18 LOCALREPORTNO : numeric"
## [1] "19 MANNEROFCRASH : character"
## [1] "20 ROADCONDITIONSPRIMARY : character"
## [1] "21 ROADCONTOUR : character"
## [1] "22 ROADSURFACE : character"
## [1] "23 ROADCLASS : integer"
## [1] "24 ROADCLASSDESC : character"
## [1] "25 UNITTYPE : character"
## [1] "26 TYPEOFPERSON : character"
## [1] "27 WEATHER : character"
## [1] "28 ZIP : character"

Identifying NA’s in the data set for cleaning.

#identifying NAs
for (i in 1:ncol(data)){print(paste(i,colnames(data[i]),":",colSums(is.na(data[i]))))}

## [1] "1 ADDRESS_X : 0"
## [1] "2 LATITUDE_X : 63"
## [1] "3 LONGITUDE_X : 65"
## [1] "4 AGE : 0"
## [1] "5 COMMUNITY_COUNCIL_NEIGHBORHOOD : 0"
## [1] "6 CPD_NEIGHBORHOOD : 0"
## [1] "7 SNA_NEIGHBORHOOD : 0"
## [1] "8 CRASHDATE : 0"
## [1] "9 CRASHLOCATION : 0"
## [1] "10 CRASHSEVERITY : 0"
## [1] "11 CRASHSEVERITYID : 19"
## [1] "12 DATECRASHREPORTED : 0"
## [1] "13 DAYOFWEEK : 0"
## [1] "14 GENDER : 0"
## [1] "15 INJURIES : 0"
## [1] "16 INSTANCEID : 0"
## [1] "17 LIGHTCONDITIONSPRIMARY : 0"
## [1] "18 LOCALREPORTNO : 0"
## [1] "19 MANNEROFCRASH : 0"
## [1] "20 ROADCONDITIONSPRIMARY : 0"
## [1] "21 ROADCONTOUR : 0"
## [1] "22 ROADSURFACE : 0"
## [1] "23 ROADCLASS : 151069"
## [1] "24 ROADCLASSDESC : 0"
## [1] "25 UNITTYPE : 0"
## [1] "26 TYPEOFPERSON : 0"
## [1] "27 WEATHER : 0"
## [1] "28 ZIP : 0"

Identifying Blanks in the dataset for cleaning.

#identifying BLANKS
for (i in 1:ncol(data)){print(paste(i,colnames(data[i]),":",colSums(data[i] == "")))}

## [1] "1 ADDRESS_X : 45"
## [1] "2 LATITUDE_X : NA"
## [1] "3 LONGITUDE_X : NA"
## [1] "4 AGE : 48768"
## [1] "5 COMMUNITY_COUNCIL_NEIGHBORHOOD : 0"
## [1] "6 CPD_NEIGHBORHOOD : 0"
## [1] "7 SNA_NEIGHBORHOOD : 0"
## [1] "8 CRASHDATE : 35"
## [1] "9 CRASHLOCATION : 194675"
## [1] "10 CRASHSEVERITY : 19"
## [1] "11 CRASHSEVERITYID : NA"
## [1] "12 DATECRASHREPORTED : 49"
## [1] "13 DAYOFWEEK : 33"
## [1] "14 GENDER : 44522"
## [1] "15 INJURIES : 383"
## [1] "16 INSTANCEID : 0"
## [1] "17 LIGHTCONDITIONSPRIMARY : 45"
## [1] "18 LOCALREPORTNO : 0"
## [1] "19 MANNEROFCRASH : 45"
## [1] "20 ROADCONDITIONSPRIMARY : 45"
## [1] "21 ROADCONTOUR : 45"
## [1] "22 ROADSURFACE : 45"
## [1] "23 ROADCLASS : NA"
## [1] "24 ROADCLASSDESC : 151163"
## [1] "25 UNITTYPE : 383"
## [1] "26 TYPEOFPERSON : 371"
## [1] "27 WEATHER : 45"
## [1] "28 ZIP : 7181"

#cleaning the data & removing unwanted values due to NAs
df <- data %>%
  filter(LATITUDE_X != 'NA', LONGITUDE_X != 'NA') %>%
  select(-c('ROADCLASS','CRASHLOCATION', 'DATECRASHREPORTED', 'LOCALREPORTNO', 'ROADCLASSDESC'))

Finalizing the data frame for visualization use.

#cleaning up data types
df <- df %>%
  mutate(AGE = as.integer(AGE),
         CRASHDATE = mdy_hms(CRASHDATE),
         ZIP = as.character(ZIP),
         DAYOFWEEKNUM = match(DAYOFWEEK, c("SUN","MON", "TUE", "WED", "THU", "FRI", "SAT")),
         )
#cleaning up blanks to be "unknown"
df <- df %>%
  mutate(across(where(is.character), ~ ifelse(is.na(.) | trimws(.) == "", "UNKNOWN", .)))

#cleaning up categorical columns to remove the category keys
df <- df %>%
  mutate(across(where(is.character), ~ sub("^\\S+ - ", "", .)))

#exporting the data to use in tableau
#write_xlsx(df, "crash_data.xlsx")

Geographical Analysis

The first question I’d like to address is where are most of Cincinnati’s car crashes occuring? To do this, I took a look at the overal heatmap of crashes across the dataset. Then I refined the search by zipcode. For the top 5 zipcodes, I will look at the date/time that the crash occurred on.

Density Heatmap

#density visualization
ggmap::register_google(key = "AIzaSyBVC4hbOlw9oAD3xi1rtIs6R4jnoF4YPNA")
Cincimap = get_map(location = 'Cincinnati',
                   zoom = 11, source = 'google', maptype = 'roadmap')

ggmap(Cincimap) +
  geom_bin2d(data = df,
             aes(x = LONGITUDE_X,
                 y = LATITUDE_X),
             alpha = .5, bins = 25
             ) +
  scale_fill_viridis_c(labels = scales :: comma) +
  labs(title = 'Heatmap of Crashes Across Cincinnati',
       x = 'Longitude',
       y = 'Latitude',
       fill = 'Number of Crashes') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10))

A Closer Look

After looking at the general heatmap of the datatset, I wanted to zoom in on the high crash area which happens to be Downtown Cincinnati. Here we see that many of the crashes occur in central downtown, near the bridges and many offices.

#heatmap with zipcode boundaries
zips <- st_read("CensusData/cb_2020_us_zcta520_500k.shp")

## Reading layer `cb_2020_us_zcta520_500k' from data source 
##   `C:\Users\leahd\OneDrive - University of Cincinnati\2024-2025\BANA  4137 - Data Visualizations\BANA4137\CensusData\cb_2020_us_zcta520_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 33791 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -176.6967 ymin: -14.37374 xmax: 145.8304 ymax: 71.34122
## Geodetic CRS:  NAD83

# Keep only ZIPs that start with 452 (Cincinnati region)
zips_cincy <- zips %>% filter(grepl("^452", ZCTA5CE20))

zoomed_map = get_map(location = 'Cincinnati',
                   zoom = 14, source = 'google', maptype = 'terrain')
ggmap(zoomed_map) +
  geom_bin2d(data = df,
             aes(x = LONGITUDE_X,
                 y = LATITUDE_X),
             alpha = .5, bins = 10) +
  scale_fill_gradient(low = "white", high = "red", labels = scales :: comma) +
  geom_sf(data = zips_cincy, fill = NA, color = "black", size = 1, inherit.aes = FALSE) +
  labs(title = "Downtown Heatmap",
       y = 'Latitude',
       x = 'Longitude',
       fill = 'Number of Crashes',
       caption = 'The black outlines are the ZIP code regions.') +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10))

Heatmap by ZIP

//code to embed tableau visualization

var divElement = document.getElementById('viz1744242421921');                   var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';
var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';    vizElement.parentNode.insertBefore(scriptElement, vizElement);

Day of Week Analysis

For the top 5 zip codes, which tend to be in the Downtown Cincinnati Area, I wanted to visualize how many crashes occur as we cycle through the day. I found that the majority of crashes occur on Fridays. This makes a lot of sense as Downtown is a busy area, many people go out at night on Friday to celebrate or just go and have drinks. Many college students do the same for the weekends, and many people may be traveling in and out of the city for the weekend which makes Friday a high-traffic day.

top_zips <- df %>%
  group_by(ZIP) %>%
  summarize(NumCrashes = n()) %>%
  arrange(desc(NumCrashes)) %>%
  slice_head(n = 5) %>%
  pull(ZIP)

df %>%
  filter(ZIP %in% top_zips) %>%
  filter(DAYOFWEEK != 'Unknown' & DAYOFWEEK != 'UNKNOWN') %>%
  ggplot() +
  geom_bar(aes(x = reorder(DAYOFWEEK, DAYOFWEEKNUM), fill = reorder(DAYOFWEEK, DAYOFWEEKNUM))) +
  labs(x = 'Day of Week',
       y = 'Number of Crashes',
       title = 'Crashes by Weekday for the Top 5 Zipcodes',
       caption = 'Here we find that most crashes in downtown occur on Fridays. 
       Note: 5 observations were omitted for being unknown.') +
  scale_y_continuous(labels = scales :: comma) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10),
    legend.title = element_blank(),
    legend.position = 'none',
    panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black')) +
  scale_fill_brewer(palette = "Spectral") +
  geom_text(aes(x = reorder(DAYOFWEEK, DAYOFWEEKNUM), label = scales:: comma(..count..)), stat = "count", vjust = 1.5, color = 'black')

Hour of the Day Analysis

df %>%
  filter(ZIP %in% top_zips) %>%
  mutate(hour = hour(CRASHDATE)) %>%
  ggplot() +
  geom_bar(aes(x = hour, fill = ifelse(hour %in% c(16, 17), "Highlight", "Normal"))) +
  geom_text(data = . %>% filter(hour == 16), stat = "count",
    aes(x = hour, y = ..count.., label = scales :: comma(..count..)),
    vjust = -.3, hjust = .75, color = 'black') +
  geom_text(data = . %>% filter(hour == 17), stat = "count",
    aes(x = hour, y = ..count.., label = scales :: comma(..count..)),
    vjust = -.275, hjust = .2, color = 'black') +
  labs(x = 'Hour of Day',
       y = 'Number of Crashes',
       title = 'Hourly Crashes for the Top 5 Zipcodes',
       caption = 'Most crashes occur at 4:00 PM and 5:00 PM EST.') +
  scale_fill_manual(values = c("Highlight" = "maroon", "Normal" = "lightblue")) +
  scale_y_continuous(labels = scales :: comma) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10),
    legend.title = element_blank(),
    legend.position = 'none',
    panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'))

Demographic Analysis

The next question I wanted to analyze was the demographic trends behind the crashes in Cincinnati. Specifically looking at the injuries, age, and sex of the person involved in the crash.

Crashes by Gender

The first obvious feature to look at is gender. We find that there are more crashes in Cincinnati by males than there are females. By speculation, some could say that males are more reckless drivers and that is another reason their car insurance costs more. However, without other data to understand Cincinnati’s population, it is hard to state any reason there might be this trend; additionally, the unknowns play a large factor in creating this to be an unreliable visualization to create assumptions off of.

df %>%
  group_by(GENDER) %>%
  ggplot(aes(x = factor(GENDER), fill = factor(GENDER))) +
  geom_bar() +
  labs(y = 'Number of Crashes',
       x = 'Gender',
       title = 'Crashes by Gender') +
  scale_y_continuous(labels = scales:: comma) +
  theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
scale_fill_manual(values = c("#F48FC3", "#90CAF9", "#81C784")) +
  geom_text(aes(label = scales:: comma(..count..)), stat = "count", vjust = 1.5, color = 'black')

Crash Injuries

After analyzing the crashes by gender, I wanted to look at the kind of injuries reported for each gender. This way, we can see if there is any trend and correlation between gender and the crash severity based on injuries reported.

df %>%
  mutate(INJURIES = case_when(
    INJURIES == "POSSIBLE INJURY" | INJURIES == "POSSIBLE" ~ 'Possible injury',
    INJURIES == "SUSPECTED MINOR INJURY" ~ 'Suspected minor injury',
    INJURIES == "FATAL" ~ 'Fatal',
    INJURIES == "SUSPECTED SERIOUS INJURY" ~ 'Suspected major injury',
    INJURIES == "NO INJURY / NONE REPORTED" | INJURIES == "NO APPARENTY INJURY" ~ 'No injury / None reported',
    INJURIES == "NON-INCAPACITATING" ~ 'Non-incapacitating injury',
    INJURIES == "INCAPACITATING" ~ 'Incapacitating injury',
    is.na(INJURIES) == TRUE ~ 'Unknown',
    INJURIES == 'UNKNOWN' ~ 'Unknown')) %>%
  group_by(GENDER, INJURIES) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(x = count, y = INJURIES, fill = INJURIES)) +
  facet_wrap(~ GENDER, ncol = 1) +
  theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.x = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
  scale_x_continuous(labels = scales :: comma) +
  labs(y = '',
       x = 'Number of Injuries',
       title = 'Types of Injuries Reported') +
  scale_fill_manual(values = c('No injury / None reported' = 'pale green'))

Crashes by Age Group

Next I wanted to view the demographics by creating age groups for the data. I decided on 7 standard age groupings and left a unknown column for the crashes that did not get that data recorded.

df <- df %>%
  mutate(age_group = case_when(
    AGE <= 17 ~ "Under 18",
    AGE > 17 & AGE <= 24 ~ "18-24",
    AGE > 24 & AGE <= 34 ~ "25-34",
    AGE > 34 & AGE <= 44 ~ "35-44",
    AGE > 44 & AGE <= 54 ~ "45-54",
    AGE > 54 & AGE <= 64 ~ "55-64",
    AGE >= 65             ~ "65+",
    is.na(AGE) == TRUE ~ 'Unknown')
    ) %>%
  mutate(age_group = factor(age_group, levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+', 'Unknown')))

df %>%
  group_by(age_group) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(x = age_group, y = count, fill = age_group)) +
  labs(y = 'Number of Crashes',
       x = 'Age Group',
       title = 'Crashes by Age Group') +
  scale_y_continuous(labels = scales:: comma) +
  theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
  scale_fill_brewer(palette = 'Spectral') +
  geom_text(aes(x = age_group, y = count, label = scales::comma(count)), vjust = 1.0, color = 'black')

Crash Injuries by Age Group

After analyzing the age groups that were in more crashes than others, I wanted to look at the severity of crashes for the 24-35 yr old age group since they were the age group involved in the most crashes. Here we find that, along with the overall data between males and females, this age group has the same trend: no injury/none reported as the most common crash severity. This most-likely means that many of the crashes that occur in Cincinnati happen to be traffic violations and or minor crashes.

df %>%
  mutate(INJURIES = case_when(
    INJURIES == "POSSIBLE INJURY" | INJURIES == "POSSIBLE" ~ 'Possible injury',
    INJURIES == "SUSPECTED MINOR INJURY" ~ 'Suspected minor injury',
    INJURIES == "FATAL" ~ 'Fatal',
    INJURIES == "SUSPECTED SERIOUS INJURY" ~ 'Suspected major injury',
    INJURIES == "NO INJURY / NONE REPORTED" | INJURIES == "NO APPARENTY INJURY" ~ 'No injury / None reported',
    INJURIES == "NON-INCAPACITATING" ~ 'Non-incapacitating injury',
    INJURIES == "INCAPACITATING" ~ 'Incapacitating injury',
    is.na(INJURIES) == TRUE ~ 'Unknown',
    INJURIES == 'UNKNOWN' ~ 'Unknown')) %>%
  filter(age_group == '25-34') %>%
  group_by(age_group, INJURIES) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(x = count, y = INJURIES, fill = INJURIES)) +
  theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.x = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
  scale_x_continuous(labels = scales :: comma) +
  labs(y = '',
       x = 'Number of Injuries',
       title = 'Types of Injuries Reported for 25-34 yr olds') +
  scale_fill_manual(values = c('No injury / None reported' = 'pale green'))

Weather Trend Analysis

Lastly, I wanted to analyze the weather conditions that may or may not influence the number of crashes occurring in Cincinnati. I decided to look into the time of year most crashes occur, look at the road and weather conditions for the overall dataset and for the highest crash count month.

Time of Year

Here we find that most crashes in Cincinnati occur during October. This could be due to various reasons such as homecoming, football season, college outings, and work events.

df %>%
  filter(!is.na(CRASHDATE)) %>%
  ggplot() +
  geom_bar(aes(x = month(CRASHDATE, label = TRUE), fill = ifelse(month(CRASHDATE) == 10, "Highlight", "Normal"))) +
  labs(x = "Month", y = "Number of Crashes", title = "Crashes by Time of Year", 
       caption = '7 crashes were omitted for not having a recorded date.') +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
    scale_y_continuous(labels = scales:: comma) +
    scale_fill_manual(values = c("Highlight" = "coral", "Normal" = "steelblue2"))

Another way to visualize the number of crashes occurring throughout the year. Note, this scale starts at 24,000 to better show the fluctuations month by month. Additionally, I have included a fitted linear regression line to show the overall trend of car crashes throughout the year.

df %>%
  filter(!is.na(CRASHDATE)) %>%
  group_by(month = month(CRASHDATE, label = TRUE, abbr = TRUE)) %>% 
  summarise(crash_count = n(), .groups = "drop") %>%
  ggplot(aes(x = month, y = crash_count, group = 1)) +
  geom_line(color = "steelblue3", size = 2) +
  geom_point(aes(shape = (month == "Oct"), color = (month == "Oct")), size = 4) +
  geom_smooth(aes(x = month, y = crash_count), method = 'lm', color = 'red')+
  scale_shape_manual(values = c("TRUE"  = 16, "FALSE" = 26)) +
  scale_color_manual(values = c("TRUE"  = "coral",
               "FALSE" = "steelblue2")) +
  labs(x = "Month", y = "Number of Crashes", title = "Crashes by Time of Year", 
       caption = '7 crashes were omitted for not having a recorded date.') +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
  scale_y_continuous(limits = c(24000,38000))

Weather Conditions

Approximately two thirds of the crashes in Cincinnati were under Dry/Clear conditions. It makes sense that snow/hail is a low leading cause in Cincinnati since we don’t get many snowstorms here and the number of months available for a snowstorm are low. I would say Cincinnati as a general area has pretty mild weather which aligns with the data that many of these car crashes occur in normal day to day clear and cloudy weather.

df %>%
  filter(!is.na(WEATHER)) %>%
  mutate(WEATHER = case_when(
    WEATHER == 'CLEAR' ~ 'CLEAR',
    WEATHER == 'CLOUDY' ~ 'CLOUDY',
    WEATHER == 'FREEZING RAIN OR FREEZING DRIZZLE' | WEATHER == 'SLEET, HAIL' | WEATHER == 'SLEET,HAIL' ~ 'SLEET/HAIL',
    WEATHER == 'SNOW' ~ 'SNOW',
    WEATHER == 'RAIN' ~ 'RAIN',
    WEATHER == 'FOG, SMOG, SMOKE' ~ 'FOG/SMOG/SMOKE',
    WEATHER == 'BLOWING SAND, SOIL, DIRT, SNOW' ~ 'SAND/SOIL/DIRT',
    WEATHER == 'SEVERE CROSSWINDS' ~ 'SEVERE WINDS',
    WEATHER == 'UNKNOWN' | WEATHER == 'OTHER/UNKNOWN' ~ 'UNKNOWN'
  )) %>%
  group_by(WEATHER) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(y = reorder(WEATHER, -count), x = count, fill = WEATHER)) +
  labs(y = "Weather Condition", x = "Number of Crashes", title = "Crashes by Weather") +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
    scale_x_continuous(labels = scales:: comma) +
  scale_fill_brewer(palette = 'Spectral')

Road Conditions

Aligning with the weather conditions, for a majority of crashes the road conditions were also clear/dry. The second leading factor was the presence of water, having either a wet road or water in the road.

df %>%
  filter(!is.na(ROADCONDITIONSPRIMARY)) %>%
  mutate(ROADCONDITION = case_when(
    ROADCONDITIONSPRIMARY == 'DRY' ~ 'DRY',
    ROADCONDITIONSPRIMARY == 'SNOW' ~ 'SNOW',
    ROADCONDITIONSPRIMARY == 'WET' | ROADCONDITIONSPRIMARY == 'WATER (STANDING, MOVING)' ~ 'WET/WATER',
    ROADCONDITIONSPRIMARY == 'SLUSH' | ROADCONDITIONSPRIMARY == 'ICE' ~ 'SLUSH/ICE',
    ROADCONDITIONSPRIMARY == 'SAND, MUD, DIRT, OIL, GRAVEL' ~ 'SAND/MUD/DEBRIS',
    ROADCONDITIONSPRIMARY == 'OTHER' | ROADCONDITIONSPRIMARY == 'UNKNOWN' ~ 'UNKNOWN'
  )) %>%
  group_by(ROADCONDITION) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(y = reorder(ROADCONDITION, -count), x = count, fill = ROADCONDITION)) +
  labs(y = "Road Condition", x = "Number of Crashes", title = "Crashes by Road Condition") +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.x = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
    scale_x_continuous(labels = scales:: comma)+
  scale_fill_brewer(palette = 'Spectral')

A deeper look into October

Next I wanted to look into what could possibly be driving the month of October to be the highest crash month for Cincinnati. Most of the crashes had clear weather conditions, the second most cause is rain. Aligning with this weather condition, we see that water/wet roads were also the second leading cause for car crashes in October.

df %>%
  filter(!is.na(WEATHER)) %>%
  filter(month(CRASHDATE) == 10) %>%
  mutate(WEATHER = case_when(
    WEATHER == 'CLEAR' ~ 'CLEAR',
    WEATHER == 'CLOUDY' ~ 'CLOUDY',
    WEATHER == 'FREEZING RAIN OR FREEZING DRIZZLE' | WEATHER == 'SLEET, HAIL' | WEATHER == 'SLEET,HAIL' ~ 'SLEET/HAIL',
    WEATHER == 'SNOW' ~ 'SNOW',
    WEATHER == 'RAIN' ~ 'RAIN',
    WEATHER == 'FOG, SMOG, SMOKE' ~ 'FOG/SMOG/SMOKE',
    WEATHER == 'BLOWING SAND, SOIL, DIRT, SNOW' ~ 'SAND/SOIL/DIRT',
    WEATHER == 'SEVERE CROSSWINDS' ~ 'SEVERE WINDS',
    WEATHER == 'UNKNOWN' | WEATHER == 'OTHER/UNKNOWN' ~ 'UNKNOWN'
  )) %>%
  group_by(WEATHER) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(y = reorder(WEATHER, -count), x = count, fill = WEATHER)) +
  labs(y = "Weather Condition", x = "Number of Crashes", title = "Crashes by Weather for October") +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.x = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
    scale_x_continuous(labels = scales:: comma) +
  scale_fill_brewer(palette = 'Pastel1')

df %>%
  filter(!is.na(ROADCONDITIONSPRIMARY)) %>%
  filter(month(CRASHDATE) == 10) %>%
  mutate(ROADCONDITION = case_when(
    ROADCONDITIONSPRIMARY == 'DRY' ~ 'DRY',
    ROADCONDITIONSPRIMARY == 'SNOW' ~ 'SNOW',
    ROADCONDITIONSPRIMARY == 'WET' | ROADCONDITIONSPRIMARY == 'WATER (STANDING, MOVING)' ~ 'WET/WATER',
    ROADCONDITIONSPRIMARY == 'SLUSH' | ROADCONDITIONSPRIMARY == 'ICE' ~ 'SLUSH/ICE',
    ROADCONDITIONSPRIMARY == 'SAND, MUD, DIRT, OIL, GRAVEL' ~ 'SAND/MUD/DEBRIS',
    ROADCONDITIONSPRIMARY == 'OTHER' | ROADCONDITIONSPRIMARY == 'UNKNOWN' ~ 'UNKNOWN'
  )) %>%
  group_by(ROADCONDITION) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_col(aes(y = reorder(ROADCONDITION, -count), x = count, fill = ROADCONDITION)) +
  labs(y = "Road Condition", x = "Number of Crashes", title = "Crashes by Road Condition for October") +
   theme(panel.background = element_rect(fill = "white"),
    panel.grid.major.x = element_line(color = 'black'),
    legend.title = element_blank(),
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10)) +
    scale_x_continuous(labels = scales:: comma)+
  scale_fill_brewer(palette = 'Pastel1')