library(ggplot2)
library(tidyverse)
library(lubridate)
set.seed(123)
PackagesToCheck <- list( "ggplot2", "tidyverse", "lubridate")
for (i in PackagesToCheck) {
if (i %in% rownames(installed.packages()) == FALSE) {
install.packages(i)
}
}
Accidents_Original <- readRDS("farsp.RDS")
alcohol <- read.csv("alcohol.csv")
alcohol <- alcohol %>%
mutate(description = if_else(description %in% c("Unknown", "Not reported"), NA, description))
body_typ <- read.csv("body_typ.csv")
body_typ <- body_typ %>%
mutate(description = if_else(description %in% c("Unknown body type", "Not Reported"), NA, description))
col_names <- read.csv("column_name_description.csv")
inj_sev <- read.csv("inj_sev.csv")
inj_sev <- inj_sev %>%
mutate(description = if_else(description %in% c("Unknown/Not Reported", "N/A"), NA, description))
man_coll <- read.csv("man_coll.csv")
man_coll <- man_coll %>%
mutate(description = if_else(description %in% c("Not Reported", "Unknown"), NA, description))
sex <- read.csv("sex.csv")
sex <- sex %>%
mutate(description = if_else(description %in% c("not reported", "unknown"), NA, description))
state_code <- read.csv("state_code.csv")
col_names <- setNames(col_names$column_name, col_names$description)
names(col_names) <- gsub(" ", "_", names(col_names))
Master_Dataframe <- Accidents_Original %>%
rename(alcohol = drinking) %>%
left_join(state_code, by = c("state" = "state_code")) %>%
left_join(sex, by = "sex") %>%
left_join(man_coll, by = "man_coll") %>%
left_join(inj_sev, by = "inj_sev") %>%
left_join(body_typ, by = "body_typ") %>%
left_join(alcohol, by = "alcohol") %>%
mutate(across(c(year, month, day, hour, minute, age), ~replace(., . == 99, NA)),
across(c(age), ~replace(., . == 999, NA))) %>%
drop_na(c("year", "month", "day", "hour", "minute")) %>%
mutate(
state = as.character(state_name),
sex = as.character(description.x),
man_coll = as.character(description.y),
alcohol = as.character(description),
inj_sev = as.character(description.x.x),
body_typ = as.character(description.y.y),
Datetime = ymd_hm(paste(year, month, day, hour, minute)),
Month_Name = month(month, label = TRUE)
) %>%
rename(!!!col_names) %>%
mutate(
State = as.factor(State),
County = as.factor(County),
Manner_of_collision = as.factor(Manner_of_collision),
Vehicle_body_type = as.factor(Vehicle_body_type),
Sex = as.factor(Sex),
Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Injured, Severity Unknown", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Non-incapacitating injury", "Fatal Injury", "Died Prior to Crash")),
Injury_Severity_Basic = if_else(Injury_severity %in% c("Fatal Injury", "Died Prior to Crash"), "Fatal", "Non-Fatal"),
Datetime = as.POSIXct(Datetime),
Date = as.Date(Datetime),
Accident_weekday = factor(wday(Datetime, label = TRUE), levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
Year = year(Datetime)
) %>%
filter(Age < 100) %>%
select(-State, -description.x, -description.y, -description.x.x, -description.y.y,-description) %>%
mutate(region = case_when(
state_name == "District of Columbia" ~ "South",
state.region[match(state_name, state.name)] == "North Central" ~ "Midwest",
TRUE ~ state.region[match(state_name, state.name)]
),
Vehicle_body_type = case_when(
grepl("sedan|coupe|convertible|hatchback|automobile|compact", Vehicle_body_type, ignore.case = TRUE) ~ "Automobiles",
grepl("motorcycle|scooter|moped", Vehicle_body_type, ignore.case = TRUE) ~ "Motorcycles and Scooters",
grepl("van|minivan", Vehicle_body_type, ignore.case = TRUE) ~ "Vans and Minivans",
grepl("pickup|utility|truck", Vehicle_body_type, ignore.case = TRUE) ~ "Trucks",
grepl("bus", Vehicle_body_type, ignore.case = TRUE) ~ "Buses",
grepl("ATV|snowmobile|golf cart|off-highway", Vehicle_body_type, ignore.case = TRUE) ~ "Specialized Vehicles",
TRUE ~ "Unknown/Other Vehicle Types"
),
Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Fatal Injury", "Injured, Severity Unknown", "Died Prior to Crash"), ordered = TRUE),
time_period = case_when(
Accident_hour >= 5 & Accident_hour < 12 ~ "morning",
Accident_hour >= 12 & Accident_hour < 17 ~ "midday",
Accident_hour >= 17 & Accident_hour < 20 ~ "evening",
Accident_hour >= 20 & Accident_hour < 24 ~ "night",
Accident_hour >= 0 & Accident_hour < 5 ~ "late_night",
TRUE ~ "unknown"
))
rm("sex", "alcohol","state_code","man_coll","inj_sev","body_typ")
# https://www.cdc.gov/nchs/hus/sources-definitions/geographic-region.htm
The goal of this analysis is to use temporal analysis techniques to better understand the root causes of traffic accidents, inform policy decisions, and educate drivers to reduce the risk of injury or death. Driving is one of the most dangerous activities that we participate in on a daily basis. Each year, 40,400 people die from car accidents. On a personal level, there are a lot of factors outside of your control when on the road, but one that you often can control is when you drive. Should you drive at night or wait until noon the next day? Should you drive during the holidays (in particular those that involve all-night drinking)? We should understand the root causes of traffic accidents to educate our divers, improve road infrastructure, and inform policy decisions. Using temporal analysis techniques focused on the date and time of day, we can glean insights into accident patterns and better inform people who want to change their habits to reduce their risk of getting injured or killed. While driving will always be inherently dangerous, we can take measures to minimize that risk by using data-driven insights.
Data was collected from the U.S. Department of Transportation’s National Highway Traffic Safety Administration between 1996 and 2016. The dataset has 1,856,573 accidents recorded and gives information on factors such as the date/time, if the participant was intoxicated, the injury severity, and the location. Prior to conducting any analysis, the dataset underwent a series of data cleaning and manipulation steps. Column names were modified to make them more human-readable, and data values were converted from numeric representations to text. These improved readability and consistency. Next, the dataset was cleaned and transformed using dplyr. Additional modifications included adding columns such as those which designate what region the accident took place or if an accident happened during the day or at night.
The data manipulation and cleaning steps were performed using R version 4.2.2 (2022-10-31) on a Mac Book Pro running macOS Monterrey (12.4). All data manipulation and cleaning steps are documented in a script file for reproducibility purposes and can be viewed through drop down “code” menus throughout the document.
The majority of accidents occur in July with the fewest in February. Surprisingly, there is not as commensurate a drop in the number of fatalities in February as compared with July as one may expect based on the total number of accidents being higher. Although lower during February, the number of fatalities remains fairly constant across the year at around 67,000 deaths. There is a peak of suspected major and minor injuries in July, which is not reflected among the confirmed injuries. Fatal injuries, for instance, are elevated all summer. The number of accidents that do not result in any injury remains fairly steady across all months around 32,000 incidents.
ggplot(Master_Dataframe %>% filter(!is.na(Injury_severity)), aes(x = Month_Name, fill = Injury_severity)) +
geom_bar() +
labs(x = "Month", y = "Number of accidents", title = "Number of Accidents by Month and Injury Severity", fill = "Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values = c("lightblue", "orange", "mediumpurple1", "red1", "darkred", "tan1", "indianred3"))+
theme_minimal()
We can see that the south accounts for the majority of accidents and the northeast region accounts for the least. When adjusting the scales to not be uniform we can see a normal distribution of accidents in the Midwest, northeast, and west regions centered around August. The south remains at a similar level across months.
APM <- ggplot(Master_Dataframe %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
geom_bar() +
ggtitle("Number of Accidents per Month") +
facet_wrap( ~ region) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "Months", y = "Count")
APMRel <- ggplot(Master_Dataframe %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
geom_bar() +
ggtitle("Number of Accidents per Month") +
facet_wrap( ~ region, scales = "free_y") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "Months", y = "Count")
gridExtra::grid.arrange(APM, APMRel, ncol = 2)
rm("APM","APMRel")
Note: February 29th removed to account for leap years
When graphing the number of accidents every day over the year, we can see that New Years and July 4th are dangerous times to be on the road. In particular, after 4pm on July 4th and just after midnight on January 1st are especially dangerous. In general, mornings are reasonably safe and accidents pick up around 9:00 am. Accidents then become much more frequent after 4:00 pm as people leave from work and kids get out of school.
accidents_per_day <- Master_Dataframe %>%
mutate(DoY = yday(Datetime), Month = month(Datetime), Day = day(Datetime), Hour = hour(Datetime)) %>%
group_by(Month, Day, Hour) %>%
summarize(count = n()) %>%
ungroup()
ggplot(subset(accidents_per_day, !(Month == 2 & Day == 29)), aes(x = Day, y = Hour, fill = count - mean(count))) +
geom_tile(width = 0.5) +
scale_y_reverse() +
scale_fill_gradient2(low = "dodgerblue4", mid = "white", high = "indianred3", midpoint = 0) +
facet_wrap(~ month(Month, label = TRUE)) +
labs(x = "Day", y = "Time", fill = "Accidents", title = "Analyzing the Relationship Between Time and Accidents Across Different Months")+
theme_minimal()
We can see that very few accidents occur during the week before 5:00 am. There is a slight uptick in the number of accidents around 8:00 am as people commute to work, but this subsides until 3:00 pm when accidents pick up and continue to be high through 6:00 pm. Of the weekdays, Friday evenings and nights are the most dangerous, likely caused by people eager to get home for the weekend and those who decide to party later into the night. The elevated number of accidents on Friday continue into Saturday until 4:00am.
On the weekends it is generally pretty safe to drive between 4:00am and 11:00 am. After noon accidents tend to pick up. Similar to Friday nights, Saturday nights are periods when many accidents take place and this continues through Sunday at 4:00 am. Sundays have a large number of accidents around 6:00 pm, but there are far fewer accidents late at night than on Fridays or Saturdays, presumably because people are getting ready for the work week.
accidents_by_hour_weekday <- Master_Dataframe %>%
group_by(hour = hour(Datetime), weekday = wday(Datetime, label = TRUE)) %>%
summarize(n = n())
ggplot(accidents_by_hour_weekday, aes(x = hour, y = weekday, fill = n)) +
geom_tile() +
scale_fill_gradient(low = "#f7f7f7", high = "indianred3") +
labs(x = "Hour of day", y = "Day of week", fill = "Number of accidents", title = "Heatmap of Accidents by Hour and Day of the Week")+
theme_minimal()
Across the whole year, the majority of accidents occur at night (as defined by 8:00 pm until midnight) and the fewest occur during the evening (as defined by 5:00 pm to 8:00 pm). It is important to note that the evening window is also the smallest in terms of number of hours it encompasses which distorts accident count as periods are not uniformly long and are arbitrary. Starting in the evening, there are fewer people over the age of 60 getting in accidents which is likely a result of that group not being on the road as much as young and middle aged people.
We can see that, in general, men are involved in more accidents across the board than women. This is especially apparent between the ages of 16 (when people generally get their drivers licences) and 75. Before the age of 16 both genders are involved in similar numbers of accidents as the individuals identified as not driving and are simply passengers.
Although the percent of accidents where men are involved is elevated relative to female for the vast majority of ages, we can see that during the night/twilight hours the proportion of men to women increases among older people. The period of time when male and female are most balanced is during the mid-day period.
SexAgeGraph <- Master_Dataframe %>%
filter(!is.na(Sex) & !is.na(Age) & time_period == "morning") %>%
group_by(Age, Sex) %>%
summarise(Count = n()) %>%
mutate(Percent = Count/sum(Count)*100)
AgeSexOne <- ggplot(Master_Dataframe %>% filter(!is.na(Age) & !is.na(Sex) & time_period == "morning"), aes(x = Age, fill = Sex)) +
geom_bar(stat = "count", position = "stack") +
labs(y = "Count", title = "Distribution of Age of Persons Involved in Accidents") + ylim(0,15000)
AgeSexTwo <- ggplot(data = SexAgeGraph, aes(x = Age, y = Percent, fill = Sex)) +
geom_col(position = "stack") +
ggtitle("Percent of Each Sex Who Are In Accidents By Age")+
geom_abline(intercept = 50, slope = 0, alpha = 0.75, linetype = "dashed")
gridExtra::grid.arrange(AgeSexOne, AgeSexTwo)
SexAgeGraph <- Master_Dataframe %>%
filter(!is.na(Sex) & !is.na(Age) & time_period == "midday") %>%
group_by(Age, Sex) %>%
summarise(Count = n()) %>%
mutate(Percent = Count/sum(Count)*100)
AgeSexOne <- ggplot(Master_Dataframe %>% filter(!is.na(Age) & !is.na(Sex) & time_period == "midday"), aes(x = Age, fill = Sex)) +
geom_bar(stat = "count", position = "stack") +
labs(y = "Count", title = "Distribution of Age of Persons Involved in Accidents")+ ylim(0,15000)
AgeSexTwo <- ggplot(data = SexAgeGraph, aes(x = Age, y = Percent, fill = Sex)) +
geom_col(position = "stack") +
ggtitle("Percent of Each Sex Who Are In Accidents By Age")+
geom_abline(intercept = 50, slope = 0, alpha = 0.75, linetype = "dashed")
gridExtra::grid.arrange(AgeSexOne, AgeSexTwo)
SexAgeGraph <- Master_Dataframe %>%
filter(!is.na(Sex) & !is.na(Age) & time_period == "evening") %>%
group_by(Age, Sex) %>%
summarise(Count = n()) %>%
mutate(Percent = Count/sum(Count)*100)
AgeSexOne <- ggplot(Master_Dataframe %>% filter(!is.na(Age) & !is.na(Sex) & time_period == "evening"), aes(x = Age, fill = Sex)) +
geom_bar(stat = "count", position = "stack") +
labs(y = "Count", title = "Distribution of Age of Persons Involved in Accidents")+ ylim(0,15000)
AgeSexTwo <- ggplot(data = SexAgeGraph, aes(x = Age, y = Percent, fill = Sex)) +
geom_col(position = "stack") +
ggtitle("Percent of Each Sex Who Are In Accidents By Age")+
geom_abline(intercept = 50, slope = 0, alpha = 0.75, linetype = "dashed")
gridExtra::grid.arrange(AgeSexOne, AgeSexTwo)
SexAgeGraph <- Master_Dataframe %>%
filter(!is.na(Sex) & !is.na(Age) & time_period == "night") %>%
group_by(Age, Sex) %>%
summarise(Count = n()) %>%
mutate(Percent = Count/sum(Count)*100)
AgeSexOne <- ggplot(Master_Dataframe %>% filter(!is.na(Age) & !is.na(Sex) & time_period == "night"), aes(x = Age, fill = Sex)) +
geom_bar(stat = "count", position = "stack") +
labs(y = "Count", title = "Distribution of Age of Persons Involved in Accidents")+ ylim(0,15000)
AgeSexTwo <- ggplot(data = SexAgeGraph, aes(x = Age, y = Percent, fill = Sex)) +
geom_col(position = "stack") +
ggtitle("Percent of Each Sex Who Are In Accidents By Age")+
geom_abline(intercept = 50, slope = 0, alpha = 0.75, linetype = "dashed")
gridExtra::grid.arrange(AgeSexOne, AgeSexTwo)
SexAgeGraph <- Master_Dataframe %>%
filter(!is.na(Sex) & !is.na(Age) & time_period == "late_night") %>%
group_by(Age, Sex) %>%
summarise(Count = n()) %>%
mutate(Percent = Count/sum(Count)*100)
AgeSexOne <- ggplot(Master_Dataframe %>% filter(!is.na(Age) & !is.na(Sex) & time_period == "late_night"), aes(x = Age, fill = Sex)) +
geom_bar(stat = "count", position = "stack") +
labs(y = "Count", title = "Distribution of Age of Persons Involved in Accidents")+ ylim(0,15000)
AgeSexTwo <- ggplot(data = SexAgeGraph, aes(x = Age, y = Percent, fill = Sex)) +
geom_col(position = "stack") +
ggtitle("Percent of Each Sex Who Are In Accidents By Age")+
geom_abline(intercept = 50, slope = 0, alpha = 0.75, linetype = "dashed")
gridExtra::grid.arrange(AgeSexOne, AgeSexTwo)
We can see that the majority of accidents involving alcohol occur late at night (midnight to 5:00 am) which is followed by the mid-day (noon to 5 pm) and nighttime (8:00 pm to midnight) periods.
In the morning there are very few accidents involving alcohol. There is a noticeable spike in accidents among 24 year old men getting into accidents in the morning, but more than 3/4 of the other accidents are from sober drivers.
During the middle of the day there is a less noticeable peak around 25 and instead men between 25 and 60 are getting into accidents from alcohol. It is important to note again that this is of a very small number of accidents involving alcohol at this time.
Progressing from the evening period through the late night period we can see a significant uptick in accidents, in particular those involving alcohol. Men between the ages of 20 and 60 remain the most at risk of getting into an accident while intoxicated during these periods. Late at night, when the majority of accidents take place, we can see that more than half of all accidents are caused by drunk men.
Across the board, women are generally involved in very few accidents involving alcohol, but at night and very late into the night there is a decent sized segment of the female population getting into accidents due to alcohol. Most of these are from female drivers between the ages of 20 and 60, with a spike around 26.
Master_DataframeMini <- Master_Dataframe %>%
filter(time_period == "morning")
LongAlcSexAge <- Master_DataframeMini %>%
filter(!is.na(Alcohol_involvement)) %>%
filter(!is.na(Sex)) %>%
filter(!is.na(Age)) %>%
mutate(grp = paste0(Sex,Alcohol_involvement)) %>%
mutate(grp = factor(paste0(Sex, Alcohol_involvement),
levels = c("femaleNo (Alcohol Not Involved)",
"femaleYes (Alcohol Involved)",
"maleNo (Alcohol Not Involved)",
"maleYes (Alcohol Involved)"
))) %>%
group_by(Age, grp) %>%
summarise(Accident_count = n()) %>%
mutate(perc = Accident_count / sum(Accident_count) * 100) %>%
mutate(grp = fct_recode(grp,
"Female, No Alcohol" = "femaleNo (Alcohol Not Involved)",
"Female, Alcohol Involved" = "femaleYes (Alcohol Involved)",
"Male, No Alcohol" = "maleNo (Alcohol Not Involved)",
"Male, Alcohol Involved" = "maleYes (Alcohol Involved)"
))
AgeSexAlcOne <- ggplot(Master_DataframeMini %>% filter(!is.na(Sex)) %>% filter(!is.na(Alcohol_involvement)) %>% filter(!is.na(Age)), aes(x = Age, fill = paste0(Sex,Alcohol_involvement))) +
geom_bar(stat = "count", position = "stack") +
labs(fill = "Group")+
scale_fill_manual(labels = c("femaleNo (Alcohol Not Involved)" = "Female, No Alcohol",
"femaleYes (Alcohol Involved)" = "Female, Alcohol Involved",
"maleNo (Alcohol Not Involved)" = "Male, No Alcohol",
"maleYes (Alcohol Involved)" = "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) + ylim(0,8000)
AgeSexAlcTwo <- ggplot(LongAlcSexAge, aes(x = Age, y = perc, fill = grp)) +
geom_col(position = "stack") +
scale_fill_manual(breaks = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
labels = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) +
labs(title = "Percent of Accidents by Age, Sex, and Intoxication Level",
x = "Age",
y = "Percent of Accidents",
fill = "Group") +
theme_minimal()
rm("Master_DataframeMini")
gridExtra::grid.arrange(AgeSexAlcOne, AgeSexAlcTwo)
Master_DataframeMini <- Master_Dataframe %>%
filter(time_period == "midday")
LongAlcSexAge <- Master_DataframeMini %>%
filter(!is.na(Alcohol_involvement)) %>%
filter(!is.na(Sex)) %>%
filter(!is.na(Age)) %>%
mutate(grp = paste0(Sex,Alcohol_involvement)) %>%
mutate(grp = factor(paste0(Sex, Alcohol_involvement),
levels = c("femaleNo (Alcohol Not Involved)",
"femaleYes (Alcohol Involved)",
"maleNo (Alcohol Not Involved)",
"maleYes (Alcohol Involved)"
))) %>%
group_by(Age, grp) %>%
summarise(Accident_count = n()) %>%
mutate(perc = Accident_count / sum(Accident_count) * 100) %>%
mutate(grp = fct_recode(grp,
"Female, No Alcohol" = "femaleNo (Alcohol Not Involved)",
"Female, Alcohol Involved" = "femaleYes (Alcohol Involved)",
"Male, No Alcohol" = "maleNo (Alcohol Not Involved)",
"Male, Alcohol Involved" = "maleYes (Alcohol Involved)"
))
AgeSexAlcOne <- ggplot(Master_DataframeMini %>% filter(!is.na(Sex)) %>% filter(!is.na(Alcohol_involvement)) %>% filter(!is.na(Age)), aes(x = Age, fill = paste0(Sex,Alcohol_involvement))) +
geom_bar(stat = "count", position = "stack") +
labs(fill = "Group")+
scale_fill_manual(labels = c("femaleNo (Alcohol Not Involved)" = "Female, No Alcohol",
"femaleYes (Alcohol Involved)" = "Female, Alcohol Involved",
"maleNo (Alcohol Not Involved)" = "Male, No Alcohol",
"maleYes (Alcohol Involved)" = "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4"))+ ylim(0,8000)
AgeSexAlcTwo <- ggplot(LongAlcSexAge, aes(x = Age, y = perc, fill = grp)) +
geom_col(position = "stack") +
scale_fill_manual(breaks = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
labels = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) +
labs(title = "Percent of Accidents by Age, Sex, and Intoxication Level",
x = "Age",
y = "Percent of Accidents",
fill = "Group") +
theme_minimal()
rm("Master_DataframeMini")
gridExtra::grid.arrange(AgeSexAlcOne, AgeSexAlcTwo)
Master_DataframeMini <- Master_Dataframe %>%
filter(time_period == "evening")
LongAlcSexAge <- Master_DataframeMini %>%
filter(!is.na(Alcohol_involvement)) %>%
filter(!is.na(Sex)) %>%
filter(!is.na(Age)) %>%
mutate(grp = paste0(Sex,Alcohol_involvement)) %>%
mutate(grp = factor(paste0(Sex, Alcohol_involvement),
levels = c("femaleNo (Alcohol Not Involved)",
"femaleYes (Alcohol Involved)",
"maleNo (Alcohol Not Involved)",
"maleYes (Alcohol Involved)"
))) %>%
group_by(Age, grp) %>%
summarise(Accident_count = n()) %>%
mutate(perc = Accident_count / sum(Accident_count) * 100) %>%
mutate(grp = fct_recode(grp,
"Female, No Alcohol" = "femaleNo (Alcohol Not Involved)",
"Female, Alcohol Involved" = "femaleYes (Alcohol Involved)",
"Male, No Alcohol" = "maleNo (Alcohol Not Involved)",
"Male, Alcohol Involved" = "maleYes (Alcohol Involved)"
))
AgeSexAlcOne <- ggplot(Master_DataframeMini %>% filter(!is.na(Sex)) %>% filter(!is.na(Alcohol_involvement)) %>% filter(!is.na(Age)), aes(x = Age, fill = paste0(Sex,Alcohol_involvement))) +
geom_bar(stat = "count", position = "stack") +
labs(fill = "Group")+
scale_fill_manual(labels = c("femaleNo (Alcohol Not Involved)" = "Female, No Alcohol",
"femaleYes (Alcohol Involved)" = "Female, Alcohol Involved",
"maleNo (Alcohol Not Involved)" = "Male, No Alcohol",
"maleYes (Alcohol Involved)" = "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4"))+ ylim(0,8000)
AgeSexAlcTwo <- ggplot(LongAlcSexAge, aes(x = Age, y = perc, fill = grp)) +
geom_col(position = "stack") +
scale_fill_manual(breaks = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
labels = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) +
labs(title = "Percent of Accidents by Age, Sex, and Intoxication Level",
x = "Age",
y = "Percent of Accidents",
fill = "Group") +
theme_minimal()
rm("Master_DataframeMini")
gridExtra::grid.arrange(AgeSexAlcOne, AgeSexAlcTwo)
Master_DataframeMini <- Master_Dataframe %>%
filter(time_period == "night")
LongAlcSexAge <- Master_DataframeMini %>%
filter(!is.na(Alcohol_involvement)) %>%
filter(!is.na(Sex)) %>%
filter(!is.na(Age)) %>%
mutate(grp = paste0(Sex,Alcohol_involvement)) %>%
mutate(grp = factor(paste0(Sex, Alcohol_involvement),
levels = c("femaleNo (Alcohol Not Involved)",
"femaleYes (Alcohol Involved)",
"maleNo (Alcohol Not Involved)",
"maleYes (Alcohol Involved)"
))) %>%
group_by(Age, grp) %>%
summarise(Accident_count = n()) %>%
mutate(perc = Accident_count / sum(Accident_count) * 100) %>%
mutate(grp = fct_recode(grp,
"Female, No Alcohol" = "femaleNo (Alcohol Not Involved)",
"Female, Alcohol Involved" = "femaleYes (Alcohol Involved)",
"Male, No Alcohol" = "maleNo (Alcohol Not Involved)",
"Male, Alcohol Involved" = "maleYes (Alcohol Involved)"
))
AgeSexAlcOne <- ggplot(Master_DataframeMini %>% filter(!is.na(Sex)) %>% filter(!is.na(Alcohol_involvement)) %>% filter(!is.na(Age)), aes(x = Age, fill = paste0(Sex,Alcohol_involvement))) +
geom_bar(stat = "count", position = "stack") +
labs(fill = "Group")+
scale_fill_manual(labels = c("femaleNo (Alcohol Not Involved)" = "Female, No Alcohol",
"femaleYes (Alcohol Involved)" = "Female, Alcohol Involved",
"maleNo (Alcohol Not Involved)" = "Male, No Alcohol",
"maleYes (Alcohol Involved)" = "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4"))+ ylim(0,8000)
AgeSexAlcTwo <- ggplot(LongAlcSexAge, aes(x = Age, y = perc, fill = grp)) +
geom_col(position = "stack") +
scale_fill_manual(breaks = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
labels = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) +
labs(title = "Percent of Accidents by Age, Sex, and Intoxication Level",
x = "Age",
y = "Percent of Accidents",
fill = "Group") +
theme_minimal()
rm("Master_DataframeMini")
gridExtra::grid.arrange(AgeSexAlcOne, AgeSexAlcTwo)
Master_DataframeMini <- Master_Dataframe %>%
filter(time_period == "late_night")
LongAlcSexAge <- Master_DataframeMini %>%
filter(!is.na(Alcohol_involvement)) %>%
filter(!is.na(Sex)) %>%
filter(!is.na(Age)) %>%
mutate(grp = paste0(Sex,Alcohol_involvement)) %>%
mutate(grp = factor(paste0(Sex, Alcohol_involvement),
levels = c("femaleNo (Alcohol Not Involved)",
"femaleYes (Alcohol Involved)",
"maleNo (Alcohol Not Involved)",
"maleYes (Alcohol Involved)"
))) %>%
group_by(Age, grp) %>%
summarise(Accident_count = n()) %>%
mutate(perc = Accident_count / sum(Accident_count) * 100) %>%
mutate(grp = fct_recode(grp,
"Female, No Alcohol" = "femaleNo (Alcohol Not Involved)",
"Female, Alcohol Involved" = "femaleYes (Alcohol Involved)",
"Male, No Alcohol" = "maleNo (Alcohol Not Involved)",
"Male, Alcohol Involved" = "maleYes (Alcohol Involved)"
))
AgeSexAlcOne <- ggplot(Master_DataframeMini %>% filter(!is.na(Sex)) %>% filter(!is.na(Alcohol_involvement)) %>% filter(!is.na(Age)), aes(x = Age, fill = paste0(Sex,Alcohol_involvement))) +
geom_bar(stat = "count", position = "stack") +
labs(fill = "Group")+
scale_fill_manual(labels = c("femaleNo (Alcohol Not Involved)" = "Female, No Alcohol",
"femaleYes (Alcohol Involved)" = "Female, Alcohol Involved",
"maleNo (Alcohol Not Involved)" = "Male, No Alcohol",
"maleYes (Alcohol Involved)" = "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4"))+ ylim(0,8000)
AgeSexAlcTwo <- ggplot(LongAlcSexAge, aes(x = Age, y = perc, fill = grp)) +
geom_col(position = "stack") +
scale_fill_manual(breaks = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
labels = c("Female, No Alcohol", "Female, Alcohol Involved", "Male, No Alcohol", "Male, Alcohol Involved"),
values = c("coral4", "coral1", "dodgerblue1", "dodgerblue4")) +
labs(title = "Percent of Accidents by Age, Sex, and Intoxication Level",
x = "Age",
y = "Percent of Accidents",
fill = "Group") +
theme_minimal()
rm("Master_DataframeMini")
gridExtra::grid.arrange(AgeSexAlcOne, AgeSexAlcTwo)
We can see that both non-fatal and fatal accidents tend to involve younger individuals over the course of the day. As such, the spread decreases throughout the day and especially small late at night. The spread of age is largest during the midday period. At night and during the late night periods, the median is near the 25th percentile for age which indicates that the majority of the data points are clustered towards the lower end of the distribution, with relatively few extreme values towards the upper end of the age distribution.
Across all times, the median age is higher for fatal accidents than non-fatal accidents. This makes sense since older individuals may have more health complications and more frail bodies overall.
temp_df <- Master_Dataframe %>%
filter(!is.na(Sex) & time_period != "unknown") %>%
mutate(time_period = fct_relevel(time_period, "morning", "midday", "evening", "night", "late_night")) %>%
mutate(time_period = fct_recode(time_period,
"Morning" = "morning",
"Midday" = "midday",
"Evening" = "evening",
"Night" = "night",
"Late Night" = "late_night"))
ggplot(temp_df, aes(y = Age, x = Injury_Severity_Basic)) +
geom_boxplot() +
facet_wrap(~ time_period) +
labs(y = "Age", x = "Was The Accident Fatal", title = "Relationship between Age, Injury Severity, and Hour of Day") +
guides(color = guide_legend(override.aes = list(alpha = 1)))
Over the course of the day more of the accidents involve younger individuals. For instance, the age of accidents which do not result in an injury decreases by about a decade from the morning to late in the night.
Generally, the spread of age is largest during the mid-day period as people of all ages are awake. The spread narrows significantly as we enter the late night where most accidents are involve younger individuals (usually around the age of 25). One notable exception to this are individuals who died prior to the crash which remains to be an older demographic with a larger spread throughout the day.
ggplot(Master_Dataframe %>% filter(time_period == "morning"), aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity Morning", x = "Injury Severity", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Boxplot of Age by Injury Severity Morning
ggplot(Master_Dataframe%>% filter(time_period == "midday"), aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity Midday", x = "Injury Severity", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Boxplot of Age by Injury Severity midday
ggplot(Master_Dataframe%>% filter(time_period == "evening"), aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity Evening", x = "Injury Severity", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Boxplot of Age by Injury Severity evening
ggplot(Master_Dataframe%>% filter(time_period == "night"), aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity Night", x = "Injury Severity", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Boxplot of Age by Injury Severity night
ggplot(Master_Dataframe%>% filter(time_period == "late_night"), aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity Late Night", x = "Injury Severity", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Boxplot of Age by Injury Severity Late Night
During the morning, midday, and the evening the distribution of men and women is almost identical, with medians that are almost equal around 35 years old and a large number of accidents involving those around the age of 25 years old. Starting in the evening and continuing through the late night period we can see the median age lower to 25 years old, a decade drop from earlier in the day. We can also see a much more pronounced peak of accidents among young individuals around the age of 17 (presumably those who are new to driving, in particular driving at night). Accordingly, the spread of ages decreases throughout the day. Throughout all time periods, men and women are mostly on par with each other in terms of the distribution of accidents among their groups.
temp_df <- Master_Dataframe %>%
filter(!is.na(Sex) & time_period != "unknown") %>%
mutate(time_period = fct_relevel(time_period, "morning", "midday", "evening", "night", "late_night")) %>%
mutate(time_period = fct_recode(time_period,
"Morning" = "morning",
"Midday" = "midday",
"Evening" = "evening",
"Night" = "night",
"Late Night" = "late_night"))
ggplot(temp_df %>% filter(!is.na(Sex) & !is.na(time_period)), aes(x = Sex, y = Age)) +
geom_boxplot(aes(fill = "indianred3")) +
geom_violin(aes(fill = "dodgerblue"), alpha = 0.5) +
labs(x = "Sex", y = "Age", title = "Distribution of Age by Sex Involved in Accidents by time") +
theme_minimal() +
guides(fill = "none") +
facet_wrap( ~ time_period)
We can see that across all time periods automobiles are involved in the most accidents. This is unsurprising since the majority of vehicles on the road are automobiles. Most of the serious/fatal injuries of those in automobiles take place during the midday period when nearly 140,000 accidents occur in which there is a serious or fatal injury (or suspected). During the evening there are generally far fewer serious or fatal accidents which is commensurate with the generally fewer number of accidents at that time.
Trucks are involved in a significant number of accidents during the morning and at midday, but many of them do not result in any injury. Vans and minivans also get into accidents in the morning and midday, but many more of those result in more injuries or deaths than with trucks (as a proportion of accidents vans are involved in).
temp_df <- Master_Dataframe %>%
filter(time_period != "unknown") %>%
mutate(time_period = fct_relevel(time_period, "morning", "midday", "evening", "night", "late_night")) %>%
mutate(time_period = fct_recode(time_period,
"Morning" = "morning",
"Midday" = "midday",
"Evening" = "evening",
"Night" = "night",
"Late Night" = "late_night"))
ggplot(temp_df , aes(y = Vehicle_body_type, fill = Injury_severity)) +
geom_bar(position = "stack") +
labs(y = "Vehicle body type", x = "Number of accidents", title = "Number of Accidents by Vehicle Body Type and Injury Severity")+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values = c("lightblue", "orange", "mediumpurple1", "red1", "darkred", "tan1", "indianred3"))+
theme_minimal()+
theme(legend.position = "bottom", # Move legend to bottom
legend.title = element_blank()) +
facet_wrap( ~ time_period, nrow = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Below we will begin an exploration of the trends in the number of accidents.
accidents_per_day <- Master_Dataframe %>%
group_by(Date) %>%
summarise(per_day = n())
# Plot 1: Accidents per Day with Linear Regression
ggplot(accidents_per_day, aes(x = Date, y = per_day)) + geom_line() +
stat_smooth(method = "lm", se = FALSE, col = "red") +
ggtitle("Accidents per Day with Linear Regression")
# Generate a model object from the regression fit
M1 <- lm(per_day ~ as.numeric(Date), accidents_per_day)
accidents_per_day$res1 <- M1$residuals
# Plot 2: Residuals from Linear Regression
ggplot(accidents_per_day, aes(x = Date, y = res1)) + geom_point(alpha = 0.1) +
ggtitle("Residuals from Linear Regression")+
stat_smooth(method = "loess", se = FALSE)
There is a large curve that has critical points around 2005 and 2012 with a linear regression fit suggesting that the linear model does not accurately reflect the dataset. Despite our attempt to control for the trend in the data, our simple linear model does not capture the trend well.
# Plot 3: Accidents per Day with Quadratic Regression
ggplot(accidents_per_day, aes(x = Date, y = per_day)) + geom_line() +
stat_smooth(method = "lm", formula = y ~ x + I(x^2),
se = FALSE, col = "red") +
ggtitle("Accidents per Day with Quadratic Regression")
M2 <- lm(per_day ~ as.numeric(Date) + I(as.numeric(Date)^2), data = accidents_per_day)
accidents_per_day$res2 <- M2$residuals
# Plot 4: Residuals from Quadratic Regression
ggplot(accidents_per_day , aes(x = Date, y = res2)) + geom_point(alpha = 0.1) +
stat_smooth(method = "loess", se = FALSE) +
ggtitle("Residuals from Quadratic Regression")
Although an improvement over the first order polynomial model, there
remains a slight curve that has critical points around 2005 and 2011
with a quadratic regression fit. This indicates that the trend is not
linear and instead follows a polynomial trajectory.
# Plot 5: Accidents per Day with Cubic Regression
ggplot(accidents_per_day, aes(x = Date, y = per_day)) + geom_line() +
stat_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3),
se = FALSE, col = "red") +
ggtitle("Accidents per Day with Cubic Regression")
M2 <- lm(per_day ~ as.numeric(Date) + I(as.numeric(Date)^2) + I(as.numeric(Date)^3), data = accidents_per_day)
accidents_per_day$res2 <- M2$residuals
# Plot 6: Residuals from Cubic Regression
ggplot(accidents_per_day , aes(x = Date, y = res2)) + geom_point(alpha = 0.1) +
stat_smooth(method = "loess", se = FALSE) +
ggtitle("Residuals from Cubic Regression")
The residuals still retains a “w” shape with a cubic regression fit.
This can hamper our analysis of the oscillatory patterns in the data. We
can try a non-parametric fit and see if it can capture the 2003 to 2010
hump. That being said, it is reasonably close to a straight line and
could be used as a reasonable model if future investigation wishes to
use a parametric fit.
# Plot 7: Accidents per Day with LOESS Smoothing
ggplot(accidents_per_day, aes(x = Date, y = per_day)) + geom_line() +
stat_smooth(method = "loess", span = 1/4, se = FALSE, col = "red") +
ggtitle("Accidents per Day with LOESS Smoothing")
lo <- loess(per_day ~ as.numeric(Date), accidents_per_day, span=1/4)
accidents_per_day$res3 <- lo$residuals
# Plot 9: Residuals from LOESS Smoothing
ggplot(accidents_per_day, aes(x = Date, y = res3)) + geom_point(alpha = 0.1) +
stat_smooth(method = "loess", se = FALSE, span = 0.2) +
ggtitle("Residuals from LOESS Smoothing")
The non-parametric fit is a significant improvement compared to the
linear or polynomial fits (as the loess is almost a straight line),
suggesting the number of accidents has some non-linear and/or
non-monotonic components.
accidents_per_day$MonthTwo <- month(accidents_per_day$Date, label=TRUE)
ggplot(accidents_per_day, aes(x = MonthTwo, y = res3)) +
geom_boxplot() +
labs(y = "Loess Residual", x = "Month")
We can see that the plot follows a yearly oscillation cycle which peaks
during the summer time and is the lowest during the winter. Since
driving conditions are highly regional, it is difficult to glean to what
extent the model is accurately accounting for changes in driving
conditions or habits. Therefore, it is important to understand the
regional factors before moving forward.
Master_DataframePerDay <- Master_Dataframe %>%
group_by(region, Month_Name, Date) %>%
summarize(per_day = n(), .groups = 'drop')
lo2 <- loess(per_day ~ as.numeric(Date), Master_DataframePerDay, span = 1/4)
Master_DataframePerDay$res3 <- lo2$residuals
ggplot(Master_DataframePerDay, aes(x = factor(Month_Name), y = res3)) +
geom_boxplot() +
facet_wrap(~ region) +
labs(title = "Box Plots of Residuals by Month and Region",
x = "Month",
y = "Residuals") +
theme_minimal()
There exist large discrepancies between how well the model fits each region. The western states fit the national model best, but other regions such as the south are not modeled well. We can also make out a noticeable oscillation in the Midwest which peaks near zero in August (meaning that the model fits the data well during that period and falls off on either end).
In the south the observed values are far greater than the predicted values based on the model. This is balanced out by the northeast, west, and to a lesser extent Midwest regions which are underestimated by the model.
With these discrepancies in mind, we will continue our exploration without the seasonal component by subtracting the median residual to see if another pattern emerges.
d3 <- accidents_per_day %>%
group_by(month(Date)) %>%
mutate(Resid.lev = res3 - median(res3))
ggplot(d3, aes(x = MonthTwo, y = Resid.lev)) + geom_boxplot() + labs(x = "Month")
The spreads among months are generally consistent across months with slightly larger spreads in May, July, and September. We can also identify that the data in September is skewed towards lower values than predicted.
d3$YearTwo <- year(accidents_per_day$Date)
ggplot(d3, aes(x = YearTwo, y = Resid.lev)) + geom_point(col = "grey", cex = .6) +
stat_smooth(se = FALSE, method = "lm") + facet_wrap(~ MonthTwo, nrow = 1) +
theme(axis.text.x=element_text(angle=45, hjust = 1.5, vjust = 1.6, size = 7)) + labs(x = "Year")
We can see that the residuals generally fit the data well from September until May, but during the summer there has been a decline in data values as compared to predicted values over time. This could suggest a gradual offset in the periodicity of the two decades of data. We will fit a loess model to see if the trends for each month are monotonic by adopting a robust loess which controls for possible outliers in the data.
ggplot(d3, aes(x = YearTwo, y = Resid.lev)) + geom_point(col = "grey", cex = .6) +
stat_smooth(method = "loess", se = FALSE,
method.args = list(family = "symmetric")) +
facet_wrap(~ MonthTwo, nrow = 1) +
theme(axis.text.x=element_text(angle=45, hjust = 1.5, vjust = 1.6, size = 7))+ labs(x = "Year")
The trends do not appear to be monotonic (except for May, October, and
to a lesser extent September) which suggests that the cyclical nature of
accidents occurring is not systemic.
Across all regions we can see that the number of accidents per day (mean in window) has been decreasing over the past two decades and has recently seen an uptick. The spread of mean values per window had a lot of variation in the late 20th century and became much more consistent around 2004 which has remained reasonably consistent with some windows being less uniform than in the 2004-2008 period. The recent uptick is notable, but remains significantly lower than two decades ago.
SingleWindow <- function(statistic, data, colname = NULL) {
if (!is.data.frame(data)) {
stop("The input data must be a dataframe.")
}
switch(statistic,
"correlation" = list(Value = cor(data[,2], data[,3]), Midpoint = data[ceiling(nrow(data)/2), 1]),
"p value" = list(Value = cor.test(data[,2], data[,3])$p.value, Midpoint = data[ceiling(nrow(data)/2), 1]),
"OLS" = list(Value = summary(lm(data[,3] ~ data[,2]))$coefficients, Midpoint = data[ceiling(nrow(data)/2), 1]),
"mean" = list(Value = mean(data[[colname]]), Midpoint = data[ceiling(nrow(data)/2), 1]), # Second column only
stop("The input statistic must be one of 'correlation', 'p value', 'mean', or 'OLS'.")
)
}
MovingWindow <- function(data, posix_col, col1, col2, window_days, step_size, statistic) {
# Ensure the input POSIX column is in POSIXct format
data[[posix_col]] <- as.POSIXct(data[[posix_col]])
# Calculate the minimum and maximum POSIX values in the dataset
min_date <- min(data[[posix_col]])
max_date <- max(data[[posix_col]])
# Convert window_days and step_size to seconds
window_seconds <- window_days * 24 * 60 * 60
step_seconds <- step_size * 24 * 60 * 60
# Set the starting date for the moving window
current_date <- min_date
results = list()
# Iterate through the dataset using the moving window
while (current_date + window_seconds <= max_date) {
# Filter the data within the moving window
window_data <- data[data[[posix_col]] >= current_date & data[[posix_col]] <= current_date + window_seconds, c(posix_col, col1, col2)]
# Select the posix_col, col1, and col2 using dplyr and set it equal to win_val
win_val <- window_data %>%
select(all_of(c(posix_col, col1, col2)))
# Print the win_val dataframe for the current window
# print(colnames(win_val))
if (statistic != "mean") {
result <- SingleWindow(statistic, win_val)
}
if (statistic == "mean") {
result <- SingleWindow(statistic, win_val, colname = col1)
}
results <- rbind(results, data.frame(DateTime = current_date,
Value = result$Value[1],
Midpoint = as.POSIXct(result$Midpoint$Datetime)))
# Move the window by step_size
current_date <- current_date + step_seconds
}
# print(results)
ggplot(data = results, aes(x=Midpoint,Value))+
geom_point()
return(results)
}
# MovingWindow(Master_Dataframe, "Datetime", "Number_of_persons_involved", "Number_of_vehicles_involved", 30, 15, "correlation")
DayMeanFind <- Master_Dataframe %>%
group_by(Datetime) %>%
summarize(rows_on_date = n()) %>%
filter(!is.na(rows_on_date))%>%
mutate(rows_on_date = as.numeric(rows_on_date))
MW_Num_Per_Day <- MovingWindow(DayMeanFind, "Datetime", "rows_on_date", "rows_on_date", 60, 15, "mean")
ggplot(data = MW_Num_Per_Day, aes(x=Midpoint,Value))+
geom_point() +
labs(
title = "Mean Accidents Per Day in 60 Day Window With 15 Day Overlap",
x = "Date",
y = "Mean Number of Accidents Per Day"
) +
geom_smooth(method = "loess")
rm("MW_Num_Per_Day","DayMeanFind")
We can see that the northeastern states have consistently been safer than the other regions and that the western states have been the greatest risk over the past twenty years. The south saw a noticeable decrease in accidents between 1996 and 2011 which the other regions echoed but to a lesser extent. The Midwest has not seen as significant an uptick recently as compared to the other regions and has nearly flat lined over the past six years.
# filter by region and calculate daily mean
south <- Master_Dataframe %>%
filter(region == "South") %>%
group_by(Datetime) %>%
summarize(rows_on_date = n()) %>%
filter(!is.na(rows_on_date)) %>%
mutate(rows_on_date = as.numeric(rows_on_date))
southMW <- MovingWindow(south, "Datetime", "rows_on_date", "rows_on_date", 60, 15, "mean")
west <- Master_Dataframe %>%
filter(region == "West") %>%
group_by(Datetime) %>%
summarize(rows_on_date = n()) %>%
filter(!is.na(rows_on_date)) %>%
mutate(rows_on_date = as.numeric(rows_on_date))
westMW <- MovingWindow(west, "Datetime", "rows_on_date", "rows_on_date", 60, 15, "mean")
northeast <- Master_Dataframe %>%
filter(region == "Northeast") %>%
group_by(Datetime) %>%
summarize(rows_on_date = n()) %>%
filter(!is.na(rows_on_date)) %>%
mutate(rows_on_date = as.numeric(rows_on_date))
northeastMW <- MovingWindow(northeast, "Datetime", "rows_on_date", "rows_on_date", 60, 15, "mean")
midwest <- Master_Dataframe %>%
filter(region == "Midwest") %>%
group_by(Datetime) %>%
summarize(rows_on_date = n()) %>%
filter(!is.na(rows_on_date)) %>%
mutate(rows_on_date = as.numeric(rows_on_date))
midwestMW <- MovingWindow(midwest, "Datetime", "rows_on_date", "rows_on_date", 60, 15, "mean")
AllRegionsLong <- bind_rows(
southMW %>% mutate(region = "South") %>% slice_sample(n = 25000, replace = FALSE),
westMW %>% mutate(region = "West") %>% slice_sample(n = 25000, replace = FALSE),
northeastMW %>% mutate(region = "Northeast") %>% slice_sample(n = 25000, replace = FALSE),
midwestMW %>% mutate(region = "Midwest") %>% slice_sample(n = 25000, replace = FALSE)
)
ggplot(data = AllRegionsLong, aes(x=Midpoint,y=Value, col = region))+
# geom_point(alpha = 1/4) +
labs(
title = "Regional Mean Accidents (loess) Per Day in 60 Day Window (15 Day Overlap)",
x = "Date",
y = "Mean Number of Accidents Per Day",
caption = "To reduce render time, sample sizes have been capped at 25,000 per region"
) +
scale_x_datetime(date_breaks = "2 years", date_labels = "%Y") +
geom_smooth(method = "loess", aes(group = region), formula = "y ~ x", se = FALSE, span = 0.6) +
theme(legend.position = "top",
panel.grid.major = element_line(color = "gray", linetype = "dashed"),
panel.grid.minor = element_blank()) +
guides(col = guide_legend(title.position = "top", nrow = 1))+
scale_color_discrete(name = "Regions:")
rm("south", "west", "midwest","northeast","southMW", "westMW", "northeastMW", "midwestMW","AllRegionsLong")
There are clear patterns and trends in car accidents that can be identified based on the time of year and time of the day which can be made more specific using factors such as age or gender. The majority of accidents occur in July, with the fewest in February, following a cyclic pattern. Additionally, New Years and July 4th are very dangerous times to be on the road. Most car accidents occur midday, with vehicles used for commercial purposes such as trucks involved in more accidents during the morning and midday, which are more likely to result in a death. Across the entire day, accidents tend to involve younger individuals, whereas fatal accidents tend to involve older individuals than non-fatal accidents.
The majority of accidents occur in the South and the fewest in the Northeast, with the Northeast, Midwest, and Western regions following a cyclic pattern of accidents by month, while the South remains high year-round. Across the country, accidents start to occur around 9:00 am (8:00 am on weekdays), kicking into high gear around 4:00 pm, and driving becomes more dangerous during the twilight hours. A significant number of young people get into accidents late at night, with the majority being men. During the twilight hours, there are also the most accidents involving alcohol. Fridays and Saturdays are particularly dangerous, especially during the evening and early morning hours, while mornings through midday on weekends are generally safer.
No matter the time of day, men are generally more at risk of being in a car accident than women, with the jump in risk starting as soon as they get their license. Across all hours, women are less likely to get into an accident due to intoxication, but the largest number of women in accidents due to alcohol is late at night. Between 5 am and 8 pm, the distribution of ages of men and women is very similar, with medians around 35 years old and a large number of accidents involving those around 22 years old. However, starting at 8 pm, the median drops a decade towards 25 years old with a much smaller distribution of ages, mostly around the age of 20. These factors should be taken into account when driving to minimize the risk of accidents.
[1] R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
[2] Wickham H, François R, Henry L, Müller K, Vaughan D (2023). dplyr: A Grammar of Data Manipulation. R package version 1.1.0, https://CRAN.R-project.org/package=dplyr.
[3] H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
[4] Auguie B (2017). gridExtra: Miscellaneous Functions for “Grid” Graphics. R package version 2.3, https://CRAN.R-project.org/package=gridExtra.
[5] Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). “Welcome to the tidyverse.” Journal of Open Source Software, 4(43), 1686. doi:10.21105/joss.01686 https://doi.org/10.21105/joss.01686.
[6] Garrett Grolemund, Hadley Wickham (2011). Dates and Times Made Easy with lubridate. Journal of Statistical Software, 40(3), 1-25. URL https://www.jstatsoft.org/v40/i03/.