library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(usmap)
We will examine the relationship between the number of drivers and the incidence of fatal accidents in the USA. Our focus will be on fatality data from the National Highway Traffic Safety Administration and driving population statistics from the Federal Highway Administration for the years 2018, 2020, and 2022. Our analysis will investigate whether the number of fatalities increases as the driving population grows. We believe that an increase in the number of drivers is likely to lead to an increase in the number of fatalities as well.
Data of fatal accidents per state
Population of drivers per state:
Fatality_data <- read.csv("https://raw.githubusercontent.com/ZanetaP02/DAta607-Final-Project/refs/heads/main/Fatalities%20and%20Fatality%20Rates%20by%20STATE%2C%201994%20-%202022%20-%20State%20USA.csv")
Fatality_data
Fatalitydata <- Fatality_data[, c("State", "X2018", "X2020", "X2022")]
names(Fatalitydata) <- c("State", "Y18", "Y20", "Y22")
Fatalitydata <- Fatalitydata[-c(52), ]
head(Fatalitydata)
data_2022 <- read.csv("https://raw.githubusercontent.com/ZanetaP02/DAta607-Final-Project/refs/heads/main/Licensed%20Drivers%20by%20Sex%20and%20Ratio%20to%20Population%20-%202022.csv")
data_2022
names(data_2022) <- c("State", "Male_Drivers_Y22", "Male_Driver_%_Y22", "Female_Driver_Y22", "Female_Driver_%_Y22", "Total_Drivers_Y22", "Commercial_Motor_Vehicles_Registered_Y22", "Population_Resident_Y22", "Population_Male_Y22", "Population_Female_Y22", "Population_Total_Y22", "Drivers_Per_1K_Total_Resident_Polulation_Y22", "Drivers_Per_1K_Age_Population_Y22")
data_2022
data22 <- data_2022[-c(1,2,3,4,5,57,58,59,60,61,62,63,64), ]
data22
Remove fotenote from State
data22$State[data22$State=="Alaska(2)"] <-"Alaska"
data22$State[data22$State=="Arkansas(4)"] <-"Arkansas"
data22$State[data22$State=="Connecticut(7)"] <-"Connecticut"
data22$State[data22$State=="Dist. of Col.(4)"] <-"District of Columbia"
data22$State[data22$State=="Colorado(6)"] <-"Colorado"
data22$State[data22$State=="Hawaii(7)"] <-"Hawaii"
data22$State[data22$State=="Iowa(5)"] <-"Iowa"
data22$State[data22$State=="Maryland(6)"] <-"Maryland"
data22$State[data22$State=="Michigan(6)"] <-"Michigan"
data22$State[data22$State=="Minnesota(6)"] <-"Minnesota"
data22$State[data22$State=="Nevada(6)"] <-"Nevada"
data22$State[data22$State=="New Hampshire(4)(6)"] <-"New Hampshire"
data22$State[data22$State=="New Jersey(6)"] <-"New Jersey"
data22$State[data22$State=="New Mexico(6)"] <-"New Mexico"
data22$State[data22$State=="Oregon(6)"] <-"Oregon"
data22$State[data22$State=="Vermont(6)"] <-"Vermont"
data22$State[data22$State=="Washington(6)"] <-"Washington"
data22$State[data22$State=="Wyoming(5)"] <-"Wyoming"
head(data22)
data_2020 <- read.csv("https://raw.githubusercontent.com/ZanetaP02/DAta607-Final-Project/refs/heads/main/Licensed%20Drivers%20by%20Sex%20and%20Ratio%20to%20Population%20-%202020.csv")
head(data_2020)
names(data_2020) <- c("State", "Male_Drivers_Y20", "Male_Driver_%_Y20", "Female_Driver_Y20", "Female_Driver_%_Y20", "Total_Drivers_Y20", "Commercial_Motor_Vehicles_Registered_Y20", "Population_Resident_Y20", "Population_Male_Y20", "Population_Female_Y20", "Population_Total_Y20", "Drivers_Per_1K_Total_Resident_Polulation_Y20", "Drivers_Per_1K_Age_Population_Y20")
data20 <- data_2020[-c(1,2,3,4,5,6,58,59,60,61,62,63,64), ]
head(data20)
data20$State[data20$State=="Arkansas(5)"] <-"Arkansas"
data20$State[data20$State=="Dist. of Col."] <-"District of Columbia"
data20$State[data20$State=="New York(5)"] <-"New York"
head(data20)
data_2018 <- read.csv("https://raw.githubusercontent.com/ZanetaP02/DAta607-Final-Project/refs/heads/main/Licensed%20Drivers%20by%20Sex%20and%20Ratio%20to%20Population%20-%202018.csv")
head(data_2018)
names(data_2018) <- c("State", "Male_Drivers_Y18", "Male_Driver_%_Y18", "Female_Driver_Y18", "Female_Driver_%_Y18", "Total_Drivers_Y18", "Commercial_Motor_Vehicles_Registered_Y18", "Population_Resident_Y18", "Population_Male_Y18", "Population_Female_Y18", "Population_Total_Y18", "Drivers_Per_1K_Total_Resident_Polulation_Y18", "Drivers_Per_1K_Age_Population_Y18")
data18 <- data_2018[-c(1,2,3,4,5,6,58,59,60), ]
data18
data18$State[data18$State=="Alaska2/"] <-"Alaska"
data18$State[data18$State=="Dist. of Col."] <-"District of Columbia"
data18$State[data18$State=="Illinois2/"] <-"Illinois"
data18
df22 <- data22[, c("State", "Total_Drivers_Y22")]
df20 <- data20[, c("State", "Total_Drivers_Y20")]
df18 <- data18[, c("State", "Total_Drivers_Y18")]
df_pop <- Reduce(merge,list(df18,df20,df22))
fd_pop <- Reduce(merge,list(df18,df20,df22))
head(df_pop)
dp <- df_pop %>%
pivot_longer(cols = c('Total_Drivers_Y18', 'Total_Drivers_Y20', 'Total_Drivers_Y22'), names_to = "Total_Drivers_Per_Years", values_to = "Drivers_Population")
head(dp)
summary(dp)
## State Total_Drivers_Per_Years Drivers_Population
## Length:153 Length:153 Length:153
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
fd <- Fatalitydata %>%
pivot_longer(cols = c('Y18', 'Y20', 'Y22'), names_to = "Years", values_to = "Fatalities_Rate")
ggplot((data = fd), aes(x = State, y = Fatalities_Rate, fill = Years)) +
geom_col(position = position_dodge()) +
ggtitle("Fatalities Rate for Years 2018, 2020, & 2022") +
theme(axis.text.x = element_text(angle = 66, hjust = 1))
states <- map_data("state")
counties <- map_data("county")
usa <- subset(states)
head(usa)
fd <- Fatalitydata %>% as.data.frame() %>% rownames_to_column("state")
fd$State <- tolower(fd$State)
head(fd)
fd_long <- fd %>%
pivot_longer(cols = starts_with("Y"), names_to = "Year", values_to = "Fatalities_Rate")
head(fd_long)
ggplot(fd_long %>% filter(Year == "Y18"), aes(map_id = State)) +
geom_map(aes(fill = Fatalities_Rate), map = states) +
expand_limits(x = states$long, y = states$lat) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Fatal Accidents Per State - 2018", x = "Longitude", y = "Latitude") +
coord_fixed(1.3) +
theme_minimal()
ggplot(fd_long %>% filter(Year == "Y20"), aes(map_id = State)) +
geom_map(aes(fill = Fatalities_Rate), map = states) +
expand_limits(x = states$long, y = states$lat) +
scale_fill_gradient(low = "white", high = "green") +
labs(title = "Fatal Accidents Per State - 2020", x = "Longitude", y = "Latitude") +
coord_fixed(1.3) +
theme_minimal()
ggplot(fd_long %>% filter(Year == "Y22"), aes(map_id = State)) +
geom_map(aes(fill = Fatalities_Rate), map = states) +
expand_limits(x = states$long, y = states$lat) +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Fatal Accidents Per State - 2022", x = "Longitude", y = "Latitude") +
coord_fixed(1.3) +
theme_minimal()
fd %>% top_n(3)
## Selecting by Y22
fd %>% top_n(-3)
## Selecting by Y22
fd_pop$Total_Drivers_Y22 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y22))
fd_pop$Total_Drivers_Y20 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y20))
fd_pop$Total_Drivers_Y18 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y18))
fd_pop
fd_pop %>% top_n(3)
## Selecting by Total_Drivers_Y22
fd_pop %>% top_n(-3)
## Selecting by Total_Drivers_Y22
This graph provides a clear visual representation of fatalities rates across states and years, allowing for comparisons and analysis of trends.
The graph effectively visualizes traffic fatalities across US states for 2018, 2020, and 2022 and the tables of High and Low Fatalities States highlighting significant variations between populous states like California, Texas, and Florida compared to smaller states.
Key Observations:
High Fatalities States: California, Florida, and Texas consistently have the highest fatalities rates.
Low Fatalities States: Vermont, District of Columbia, and Rhode Island have the lowest rates.
Yearly Changes: While some states show slight increases or decreases, the overall trend varies by state.
dp <- df_pop %>%
pivot_longer(cols = c('Total_Drivers_Y18', 'Total_Drivers_Y20', 'Total_Drivers_Y22'), names_to = "Total_Drivers_Per_Years", values_to = "Drivers_Population")
head(dp)
ggplot(data = dp, aes(x = State, y = Drivers_Population, fill = Total_Drivers_Per_Years)) +
geom_col(position = position_dodge()) +
ggtitle("Graph of Drivers Population") +
theme(axis.text.x = element_text(angle = 66, hjust = 1)) +
labs(y = "Drivers Population")
df_p <- df_pop %>%
pivot_longer(cols = c('Total_Drivers_Y18', 'Total_Drivers_Y20', 'Total_Drivers_Y22'), names_to = "Total_Drivers_Per_Years", values_to = "Drivers_Population") %>%
mutate(Total_Drivers_Per_Years = recode(Total_Drivers_Per_Years,
"Total_Drivers_Y18" = "2018",
"Total_Drivers_Y20" = "2020",
"Total_Drivers_Y22" = "2022"),
Drivers_Population = as.numeric(gsub("[^0-9]", "", Drivers_Population)))
ggplot(data = df_p, aes(x = State, y = Drivers_Population, fill = Total_Drivers_Per_Years)) +
geom_col(position = position_dodge()) +
ggtitle("Graph of Drivers Population") +
theme(axis.text.x = element_text(angle = 66, hjust = 1)) +
labs(y = "Drivers Population")
df_pop %>% top_n(3)
## Selecting by Total_Drivers_Y22
df_pop %>% top_n(-3)
## Selecting by Total_Drivers_Y22
fd_pop$Total_Drivers_Y22 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y22))
fd_pop$Total_Drivers_Y20 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y20))
fd_pop$Total_Drivers_Y18 <- as.numeric(gsub(",", "", df_pop$Total_Drivers_Y18))
fd_pop
fd_pop %>% top_n(3)
## Selecting by Total_Drivers_Y22
fd_pop %>% top_n(-3)
## Selecting by Total_Drivers_Y22
The bar graph “Drivers Population for the Years 2018, 2020, & 2022” shows driver population across different U.S. states and the tables of the highs and lows State Driver Population. Here are the key observations:
Key Observations:
The largest populations are around 27 million drivers
The smallest populations appear to be less than 1 million drivers
Most states fall in the range of 2-8 million drivers
High Driver Population States: California, Florida, and Texas consistently have the highest fatalities rates.
Low Driver Population States: Vermont, District of Columbia, and Wyoming have the lowest rates.
Analysis of the collected data shows a relationship between states with high driver populations and high fatal accident rates, similar to the trend observed in states with low driver populations.
To better examinethe relationship between the number of drivers and the incidence of fatal accidents in the USA, we can set up the null and alternative hypotheses:
Null Hypothesis (H0): There is no relationship between the number of drivers and the incidence of fatal accidents. Alternative Hypothesis (H1): There is a positive relationship between the number of drivers and the incidence of fatal accidents.
Let’s have a look at the code that should help us to test the hypothesis and visualize the relationship between the number of drivers and the incidence of fatal accidents.
# Pivot the Fatalitydata dataframe (there was a change into fd dataframe previous in code)
head(fd)
fd <- Fatalitydata %>%
pivot_longer(cols = c('Y18', 'Y20', 'Y22'), names_to = "Years", values_to = "Fatalities_Rate") %>%
mutate(Years = recode(Years, "Y18" = "2018", "Y20" = "2020", "Y22" = "2022"))
head(fd)
# Pivot the df_pop dataframe
head(dp)
dp <- df_pop %>%
pivot_longer(cols = c('Total_Drivers_Y18', 'Total_Drivers_Y20', 'Total_Drivers_Y22'), names_to = "Total_Drivers_Per_Years", values_to = "Drivers_Population") %>%
mutate(Total_Drivers_Per_Years = recode(Total_Drivers_Per_Years,
"Total_Drivers_Y18" = "2018",
"Total_Drivers_Y20" = "2020",
"Total_Drivers_Y22" = "2022"),
Drivers_Population = as.numeric(gsub("[^0-9]", "", Drivers_Population)))
head(dp)
fd$State <- tolower(fd$State)
dp$State <- tolower(dp$State)
# Merge the two dataframes on the appropriate columns
merged_data <- merge(fd, dp, by.x = c("State", "Years"), by.y = c("State", "Total_Drivers_Per_Years"))
head(merged_data)
model <- lm(Fatalities_Rate ~ Drivers_Population, data = merged_data)
summary(model)
##
## Call:
## lm(formula = Fatalities_Rate ~ Drivers_Population, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1073.53 -128.75 -27.83 169.64 1293.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.036e+01 3.424e+01 0.887 0.377
## Drivers_Population 1.646e-04 5.131e-06 32.077 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 311.8 on 151 degrees of freedom
## Multiple R-squared: 0.872, Adjusted R-squared: 0.8712
## F-statistic: 1029 on 1 and 151 DF, p-value: < 2.2e-16
ggplot(merged_data, aes(x = Drivers_Population, y = Fatalities_Rate)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
labs(title = "Relationship between Drivers Population and Fatalities Rate",
x = "Drivers Population",
y = "Fatalities Rate")
## `geom_smooth()` using formula = 'y ~ x'
There is a clear statistical relationship between the size of the driving population and the number of fatalities - The relationship appears to be roughly proportional, suggesting that larger driving populations are associated with higher fatality rates - The visualization suggests that fatality rates are strongly correlated with the size of the driving population, which could be valuable for traffic safety planning and policy-making.
Based on the results of the linear regression analysis, we can draw the following conclusions:
Positive Relationship: The coefficient for Drivers_Population is positive and highly significant (p-value < 2e-16), indicating a strong positive relationship between the number of drivers and the incidence of fatal accidents. This means that as the number of drivers increases, the number of fatal accidents also increases.
Model Fit: The R-squared value is 0.8703, which means that approximately 87% of the variability in the fatality rate can be explained by the number of drivers. This suggests that the model fits the data well.
Statistical Significance: The very low p-value (< 2.2e-16) for the Drivers_Population variable indicates that the relationship between the number of drivers and the fatality rate is statistically significant.
the analysis supports the hypothesis that an increase in the number of drivers is likely to lead to an increase in the number of fatalities. This finding is consistent with the expectation that more drivers on the road can lead to a higher incidence of fatal accidents.
The analysis shows a significant positive relationship between the number of drivers and the incidence of fatal accidents in the USA. As the driving population increases, the number of fatalities also increases. This supports the hypothesis that more drivers lead to more fatal accidents.