Final Project: Exploring the Relationship Between Population and Accident Data in the US

Hyreen Alice, Pujan Rijal, Banabas Kariuki

2022-12-12

# Loading the required package

library(skimr)
library(ggplot2)
library(knitr)
library(tidyverse)
library(data.table)
library(knitr)
library(arrow)
library(bench)
library(haven)
library(googlesheets4)
library(plyr)
library(scales)
library(ggdist)
library(patchwork)
library(ggthemes)
library(stringr)
set.seed(1994)
#Importing the US Accidents dataset
US_Accidents <- data.table::fread("Accidents.csv") %>% as_tibble()
# Importing the Population dataset
Population_data <- read.csv("Population.csv")
# Using R's `lubridate` function to manipulate 2021 Accidents Data
US_Accidents_2021 <- US_Accidents %>% mutate(Start_time=lubridate::mdy_hm(Start_Time)) %>% filter(year( Start_time)==2021) 

#Extracting the number of accidents per month using the newly created variable for StarTtime
monthly_accidents_2021<-US_Accidents_2021 %>% mutate(accident_month=lubridate::month(Start_time, label=T, abbr=T))%>% dplyr::count(accident_month) %>% dplyr::rename(accidents=n) %>% flextable::flextable()
#Counting the number of accidents per weather condition and arranging them in a descending order
pie_data<-dplyr::count(US_Accidents_2021, Weather_Condition) %>% dplyr::rename(accidents=n) %>% arrange(desc(accidents))
flextable::flextable(pie_data, cwidth = 2)
#An interactive visualization showing accidents during different weather conditions

library(plotly)
plot_ly(pie_data,values=~accidents,labels=~factor(Weather_Condition),marker=list(colors=c("blue","green")),type="pie")

From the visualization, we can infer that most accidents happened when the weather condition was fair while the least number of accidents occurred during the instances of hail, heavy rain shower/windy, dust-storm, and so on.

#grouping accidents per month in year 2021 for all US states
monthly_US_Accidents_2021<-US_Accidents_2021%>% mutate(Month=lubridate::month(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(Month, State) %>% dplyr::rename(no_of_accidents=n)
flextable::flextable(monthly_US_Accidents_2021, cwidth = 2)
#Visualizing the total accidents per month using geom_col

Monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
                                       y=no_of_accidents, fill=State))+geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Monthly 2021 Accidents in the US",
         y = "Number of accidents",
         x = "Month",
         caption = "Kaggle Data: US Accidents 2016 - 2021") +
  scale_fill_viridis_d()+
    theme_minimal() + 
  theme_grey(base_size = 11,
  base_family = "")+
    theme(legend.position = "none",
          text = element_text(face = "bold"))
ggplotly(Monthly_accidents)

The chart above indicates that the highest number of cumulative accidents in the US occurred in the month of December while the lowest number of accidents occurred in the month of February.

#Pivoting longer() to create the variable 'accidentLocation' and using filter() to get accidents for the three States

US_Accidents_2021_3States<-US_Accidents_2021 %>% 
  pivot_longer(cols=c(Amenity:Turning_Loop),
               names_to ="accidentLocation",
               values_to = "TRUE_FALSE")%>% filter(State==c("MI", "NY","OH"))
#Exploratory data analysis
glimpse(US_Accidents_2021_3States)
## Rows: 148,534
## Columns: 37
## $ ID                    <chr> "A-224995", "A-224995", "A-224995", "A-224995", …
## $ Severity              <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Start_Time            <chr> "7/31/2021 23:35", "7/31/2021 23:35", "7/31/2021…
## $ End_Time              <chr> "8/1/2021 2:45", "8/1/2021 2:45", "8/1/2021 2:45…
## $ Start_Lat             <dbl> 40.81538, 40.81538, 40.81538, 40.81538, 40.81538…
## $ Start_Lng             <dbl> -73.83590, -73.83590, -73.83590, -73.83590, -73.…
## $ End_Lat               <dbl> 40.81768, 40.81768, 40.81768, 40.81768, 40.81768…
## $ End_Lng               <dbl> -73.83604, -73.83604, -73.83604, -73.83604, -73.…
## $ `Distance(mi)`        <dbl> 0.159, 0.159, 0.159, 0.159, 0.159, 0.083, 0.083,…
## $ Description           <chr> "Crash on I-678 ramp northbound Hutchinson River…
## $ Number                <int> 620, 620, 620, 620, 620, 1066, 1066, 1066, 1066,…
## $ Street                <chr> "Hutchinson River Pkwy", "Hutchinson River Pkwy"…
## $ Side                  <chr> "L", "L", "L", "L", "L", "R", "R", "R", "R", "R"…
## $ City                  <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Ca…
## $ County                <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Pu…
## $ State                 <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", …
## $ Zipcode               <chr> "10465", "10465", "10465", "10465", "10465", "10…
## $ Country               <chr> "US", "US", "US", "US", "US", "US", "US", "US", …
## $ Timezone              <chr> "US/Eastern", "US/Eastern", "US/Eastern", "US/Ea…
## $ Airport_Code          <chr> "KLGA", "KLGA", "KLGA", "KLGA", "KLGA", "KDXR", …
## $ Weather_Timestamp     <chr> "7/31/2021 23:51", "7/31/2021 23:51", "7/31/2021…
## $ `Temperature(F)`      <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 35, 35, 35, …
## $ `Wind_Chill(F)`       <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 26, 26, 26, …
## $ `Humidity(%)`         <int> 64, 64, 64, 64, 64, 49, 49, 49, 49, 54, 54, 54, …
## $ `Pressure(in)`        <dbl> 29.92, 29.92, 29.92, 29.92, 29.92, 29.70, 29.70,…
## $ `Visibility(mi)`      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
## $ Wind_Direction        <chr> "S", "S", "S", "S", "S", "NNW", "NNW", "NNW", "N…
## $ `Wind_Speed(mph)`     <dbl> 6, 6, 6, 6, 6, 8, 8, 8, 8, 13, 13, 13, 13, 15, 1…
## $ `Precipitation(in)`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Weather_Condition     <chr> "Fair", "Fair", "Fair", "Fair", "Fair", "Fair", …
## $ Sunrise_Sunset        <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Civil_Twilight        <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Nautical_Twilight     <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Astronomical_Twilight <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Start_time            <dttm> 2021-07-31 23:35:00, 2021-07-31 23:35:00, 2021-…
## $ accidentLocation      <chr> "Amenity", "Give_Way", "Railway", "Stop", "Turni…
## $ TRUE_FALSE            <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
skim(US_Accidents_2021_3States)
Data summary
Name US_Accidents_2021_3States
Number of rows 148534
Number of columns 37
_______________________
Column type frequency:
character 21
logical 1
numeric 14
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ID 0 1 8 9 0 34275 0
Start_Time 0 1 13 16 0 26512 0
End_Time 0 1 13 16 0 30182 0
Description 0 1 16 375 0 25700 0
Street 0 1 4 50 0 4683 0
Side 0 1 1 1 0 2 0
City 0 1 3 22 0 1002 0
County 0 1 3 14 0 128 0
State 0 1 2 2 0 3 0
Zipcode 0 1 0 10 9 8201 0
Country 0 1 2 2 0 1 0
Timezone 0 1 0 10 9 3 0
Airport_Code 0 1 0 4 282 138 0
Weather_Timestamp 0 1 0 16 685 16607 0
Wind_Direction 0 1 0 4 2242 19 0
Weather_Condition 0 1 0 28 872 54 0
Sunrise_Sunset 0 1 0 5 248 3 0
Civil_Twilight 0 1 0 5 248 3 0
Nautical_Twilight 0 1 0 5 248 3 0
Astronomical_Twilight 0 1 0 5 248 3 0
accidentLocation 0 1 4 15 0 13 0

Variable type: logical

skim_variable n_missing complete_rate mean count
TRUE_FALSE 0 1 0.02 FAL: 145012, TRU: 3522

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Severity 0 1.00 2.05 0.32 2.00 2.00 2.00 2.00 4.00 ▇▁▁▁▁
Start_Lat 0 1.00 41.89 1.11 38.84 40.81 42.27 42.96 47.13 ▁▇▇▁▁
Start_Lng 0 1.00 -77.35 4.49 -90.17 -83.08 -74.07 -73.84 -71.95 ▁▃▁▂▇
End_Lat 0 1.00 41.89 1.11 38.83 40.81 42.27 42.96 47.13 ▁▇▇▁▁
End_Lng 0 1.00 -77.35 4.49 -90.17 -83.09 -74.08 -73.84 -71.95 ▁▃▁▂▇
Distance(mi) 0 1.00 0.85 1.25 0.00 0.11 0.42 1.06 39.63 ▇▁▁▁▁
Number 102743 0.31 2585.06 3565.84 1.00 329.00 1282.00 3346.00 44801.00 ▇▁▁▁▁
Temperature(F) 847 0.99 57.99 17.19 8.00 44.00 59.00 72.00 98.00 ▁▆▆▇▂
Wind_Chill(F) 2320 0.98 56.10 19.54 -6.00 39.00 59.00 72.00 98.00 ▁▃▅▇▃
Humidity(%) 895 0.99 66.04 19.78 8.00 51.00 67.00 83.00 100.00 ▁▅▇▇▇
Pressure(in) 822 0.99 29.51 0.43 19.75 29.20 29.51 29.85 30.65 ▁▁▁▁▇
Visibility(mi) 1164 0.99 9.14 2.51 0.00 10.00 10.00 10.00 20.00 ▁▁▇▁▁
Wind_Speed(mph) 2242 0.98 8.37 5.40 0.00 5.00 8.00 12.00 38.00 ▇▇▂▁▁
Precipitation(in) 879 0.99 0.01 0.04 0.00 0.00 0.00 0.00 1.32 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
Start_time 0 1 2021-02-27 22:41:00 2021-12-31 22:07:00 2021-09-10 13:21:30 26512

The dataset has a completion rate of 0.98 - 1.0.

# Calculating the number of accidents during days and nights in the 3 states using the variable 'Sunrise_Sunset'

dayAndNight<-US_Accidents_2021_3States %>%
   dplyr::count(Sunrise_Sunset,State) %>% filter(Sunrise_Sunset %in% c("Day","Night"))
flextable::flextable(dayAndNight, cwidth = 2)

NY had the most number of accidents during the day and night (68486, 38850) while Ohio had the least(5013, 2114).

# Summary statistics of the accidents that occurred at the Right and the Left side of the road

leftAndRightSideAccidents<-US_Accidents_2021_3States %>%
   dplyr::count(Side,State)  %>% filter(State %in% c("OH","MI","NY"))
flextable::flextable(leftAndRightSideAccidents, cwidth = 2)

From the tibble above, we can see that most of the accidents happened on the right side of the road with NY having the highest number of accidents among the 3 states.

# Summary statistics of accidents caused by the selective weather types 
accidents_weather_condition<-US_Accidents_2021_3States %>%
   group_by(Weather_Condition, State, Severity) %>%
   dplyr::summarise(Count = n()) %>%
   dplyr::rename(accident = Count) %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% arrange(desc(accident))
flextable::flextable(accidents_weather_condition, cwidth = 2)

Most of the accidents happened in NY when the weather was Fair and the least number of accidents happened in Ohio when the weather was mostly cloudy.

#Counting monthly accidents in the three states
monthly_US_Accidents_2021 <- US_Accidents_2021_3States %>% mutate(Month = lubridate::month(Start_time, label = T, abbr = T)) %>% dplyr::count(Month,State,Start_time) %>% dplyr::rename(no_of_Accidents = n)


#visualizing accidents per month in the three states
monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
                                       y=no_of_Accidents, fill=State))+
  geom_col()+
  scale_fill_viridis_d()+
  labs(title = "Monthly accidents in Three States",
       x="Month",
       y="Number of accidents",
       caption = "Data source: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents")+
   theme_bw(base_size = 10)+
  theme(text = element_text(face = "bold"), legend.position = c(.95, .95),
  legend.justification = c("right", "top"),
  legend.box.just = "right",
  legend.margin = margin(6, 6, 6, 6))+
  scale_y_continuous(labels = comma,
                     expand=expansion(mult=c(0, .1)))
                                                                                         
ggplotly(monthly_accidents)

As indicated above, the total number of cumulative accidents occurred in December in the 3 States while the least instances of accidents occurred in February.

Dec_Accidents = monthly_US_Accidents_2021 %>% mutate(day = lubridate::day(Start_time)) %>% filter(Month == "Dec", day %in% c(1:31)) %>% dplyr::count(day) %>% dplyr::rename(No_of_Accidents = n)

flextable::flextable(Dec_Accidents, cwidth = 2)
ggplotly(Dec_Accidents %>% ggplot(aes(x = day,
                             y = No_of_Accidents)) +
  geom_col(fill="dodgerblue") +
  scale_x_continuous(breaks = c(1:31))+
  scale_y_continuous(breaks = seq(0,250,10))+
  theme_bw(base_size = 10) +
  scale_fill_viridis_d() +
    theme(legend.position = "none",
          text = element_text(face = "bold")) +
  labs(title = "Dec 2021 Daily Accidents in MI, NY & OH",
       x = "Day",
       y = "Number of Accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021"))

The most number of accidents in December for the 3 states for the year 2021 occurred in the 23rd of December (210).

#Visualizing state accidents on various weather conditions
US_Accidents_2021_3States %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% ggplot(aes(Weather_Condition, fill=Weather_Condition)) + geom_histogram(stat="count")+facet_wrap(.~State)+
  labs(title = " MI, NY, and OH Accidents as during various weather conditions",
       x="Weather conditions",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  scale_fill_viridis_d()+
  theme_bw()+
  theme_bw()+
  scale_fill_viridis_d() +
  theme(text = element_text(face = "bold"))+
  theme(legend.position = "None",
        axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10))

As depicted by the faceted chart above, NY has the most number of accidents during selective weather conditions while OH has the lowest.

accidents_weather_conditions<- accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
                  y=accident, color=Severity)) +geom_jitter()+
  labs(title = " MI, NY, and OH Accidents as per the weather conditions",
       x="Weather_Condition",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  theme(axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10)) +
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))

ggplotly(accidents_weather_conditions)

As seen above, most accidents with a severity of 2 happened when the weather was fair.

#Visualizing accidents vs weather conditions
accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
                  y=accident, color=Severity)) +geom_boxplot()+
  labs(title = " MI, NY, and OH Accidents as per the weather conditions",
       x="Weather_Condition",
       y="Number of accidents")+
  scale_y_continuous(label=comma)+
  theme(axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10))+
  theme_minimal() + 
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))+
  theme_bw()

This is an alternative visualization in the form of a box-plot which indicates the most accidents with a severity of 2 happening when the weather was fair.

#Visualizing the number of Accidents in the three state
accidents_weather_condition %>% ggplot(aes(x=State,
                  y=accident, fill=State)) +geom_col() +
     scale_y_continuous(labels = comma,
                     expand=expansion(mult=c(0, .1)))+
  labs(title="Number of accidents per state",
       x="State",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  scale_fill_viridis_d()+
    theme_bw() +
  theme(legend.position = "none")

The visualization shows the total accidents segregated by the states with NY having the highest number of accidents.

#Creating a frequency table
day_and_night_accidents<-US_Accidents_2021_3States  %>% dplyr::count(Sunrise_Sunset, State) %>% filter(Sunrise_Sunset %in% c("Day","Night")) %>% dplyr::rename(Accidents=n)
flextable::flextable(day_and_night_accidents, cwidth = 2)
day_and_night_accidents %>% ggplot(aes(x=Sunrise_Sunset,
                                 y=Accidents, fill=State))+geom_col()+facet_wrap(.~State)+
  scale_fill_viridis_d()+
  labs(title = "A histogram displaying Sunrise and Sunset Accidents",
     x="Time of accident",
     y="Number of accidents",
     caption = "Kaggle Data: US Accidents 2016 - 2021")+
  theme_bw()

Most of the accidents occurred in NY during the day and the least number of accidents occurred in NY during the night.

#Daily accidents in Michigan
weekday_Michigan_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="MI")
flextable::flextable(weekday_Michigan_Accidents_2021, cwidth = 2)

The table shows that most of the accidents occurred in MI on Fridays.

#Daily accidents in New York
weekday_NewYork_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="NY")
flextable::flextable(weekday_NewYork_Accidents_2021, cwidth = 2)

The table shows that most of the accidents occurred in NY on Fridays.

#Daily accidents in Ohio
weekday_Ohio_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="OH")
flextable::flextable(weekday_Ohio_Accidents_2021, cwidth = 2)

The table shows that most of the accidents occurred in Ohio on Fridays.

#A table showing accidents in the three states in a week
weekdayAccidents3States<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(weekday) %>% dplyr::rename(no_of_accidents=n) %>% arrange(no_of_accidents)
flextable::flextable(weekdayAccidents3States, cwidth = 2)

The table shows that most of the accidents in all the 3 states have occurred on Fridays.

hourlyAccidents<-US_Accidents_2021_3States %>% mutate(hourlyAccidents=lubridate::hour(Start_time)) %>% dplyr::count(hourlyAccidents,State) %>% dplyr::rename(accidents=n)

flextable::flextable(hourlyAccidents, cwidth = 2)
hourlyAccidents %>% ggplot(aes(x=hourlyAccidents,
                  y=accidents, color=State))+ 
  geom_line(stat="identity", position="identity")+
  geom_point()+
  scale_x_continuous(breaks=seq(0,23,1))+theme(text = element_text(face="bold"), legend.box.margin = margin(10, 10, 10, 10))+
  labs(title = "Accidents within 24 hours in MI,NY,OH",
     x="Time of accident",
     y="Number of accidents")+
  theme(legend.box.margin = margin(6, 6, 6, 6))

The line-chart indicates that most of the accidents happened at 3 pm and 5 pm in all the 3 states. NY has the highest number of accidents within 24 hours.

#Frequency table showing the number of day and night accidents
count(US_Accidents_2021_3States, 'Sunrise_Sunset', 'Severity') %>% filter(Sunrise_Sunset %in% c("Day", "Night")) %>% dplyr::rename(Accidents = freq)
##   Sunrise_Sunset Accidents
## 1            Day    197886
## 2          Night    106154

The number of accidents happening during the day (197886) is nearly twice the accidents at night.

#Frequency table showing the number of accidents for the Michigan, New York and Ohio
count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(Accidents = freq)
##   State Accidents
## 1    MI     69618
## 2    NY    220774
## 3    OH     14342

New York has the highest number of accidents (220774) amongst the 3 states.

#Joining State names with equivalent Abbreviations using the `State.name` and `state.abb` R fucntions
State_Abb<- data.frame(State = state.name, StateAbb = state.abb)

#Merging `State_Abb` with `PopData` using full_join()
FullPopData<- full_join(Population_data, State_Abb, by = c("STNAME" = "State"))

#Selecting three states from the population Data (MN, MI & NY)

populationTotals<- filter(FullPopData, CTYNAME %in% c("Michigan", "Ohio", "New York"))

FullPopData_3_States<- populationTotals %>% select(c("StateAbb","STNAME","CTYNAME", "POPESTIMATE2021"))

#Joining Accident Data `US_Accidents_2021_3States` and the Population Data `FullPopData_3_States`
Accident_Pop_Data <- US_Accidents_2021_3States %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))
#Counting the number of accidents per State
US_Accidents_2021_3States1 <- count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(accidents_2021 = freq)
US_Accidents_2021_3States_df <- data.frame(US_Accidents_2021_3States1)

# Creating a table showing the number of accidents and population of the respective states
US_accident_population <- US_Accidents_2021_3States_df %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))
flextable::flextable(US_accident_population, cwidth = 2)

NY has the highest number of accidents alongside having the highest population estimate. Ohio has a higher population estimate compared to Michigan. However, it has a lower record of accidents in comparison to the latter.

#Interactive Visualizations for Population estimate (2021) vs accidents in NY, OH, MI

ggplotly(US_accident_population %>% ggplot(aes(x = STNAME,
                               y = POPESTIMATE2021,
                               fill= accidents_2021)) +
  geom_col()+
    scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
    theme_grey()+
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))+
  labs(title="Population estimate (2021) vs accidents in NY, OH, MI",
      X="State Name",
      y="Population",
      caption = "https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents"))
#Using str_detect and mutate() to create a new boolean variable name to indicate whether accident description contains "slow traffic"
slow_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_slow_traffic= str_detect(str_to_lower(Description), pattern = "slow traffic")) 

#number of  accidents  when there was slow traffic
slow_traffic_accidents %>% pull(contains_slow_traffic) %>% sum()
## [1] 24371

The number of accidents that led to slow traffic was 24371.

#number of  accidents that caused stationary traffic
stationary_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_stationary_traffic= str_detect(str_to_lower(Description), pattern = "stationary traffic"))
stationary_traffic_accidents %>% pull(contains_stationary_traffic) %>% sum()
## [1] 23227

23227 led to stationary traffic.

#Creating a Data Dictionary for the US Accidents and Population Data
myvariables<-Accident_Pop_Data %>% select(c("State", 
                         "Sunrise_Sunset", 
                         "Weather_Condition", 
                         "Side",
                         "Severity",
                         "CTYNAME",
                         "POPESTIMATE2021",
                         "accidentLocation",
                         "Start_time",
                         "Description",
                         "STNAME"))
dataDictionary <- tibble(Variable = colnames(myvariables),
                         Description = c("US States", 
                         "Day or Night", 
                         "Weather Conditions", 
                         " Side of the Road",
                         "Impact of the accident on traffic on a scale of 1 to 4",
                         "Name of the city",
                         "2021 Population Estimate",
                         "Location of the accident",
                         "Start time of the Accident",
                         "Description of the accident aftermath",
                         "The name of the State"),
Type = map_chr(myvariables, .f = function(x){typeof(x)[1]}),
Class = map_chr(myvariables, .f = function(x){class(x)[1]}))
flextable::flextable(dataDictionary, cwidth = 2)
# Randomization test example ---------------------------------

library(broom)

# Loading the dataset

data(US_Accidents_2021_3States)
myData <- US_Accidents_2021_3States %>% select(Sunrise_Sunset, `Visibility(mi)`) 

# Fitting One-Way ANOVA model
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = US_Accidents_2021_3States)
Fstatistic <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)

# Getting the number of accidents during the day and night

groupCounts <- US_Accidents_2021_3States %>% dplyr::count(Sunrise_Sunset) %>% filter(Sunrise_Sunset%in%c("Day", "Night"))
flextable::flextable(groupCounts, cwidth = 2)
# Overall sample size
N <- nrow(US_Accidents_2021_3States)

# Number of permutations
nperms <- 1000

# Instantiating vector for test statistics
permFs <- vector(length = nperms)
mean(permFs>=Fstatistic)
## [1] 0
# Create vector of the number of accidents during the day and night
groups <- rep(groupCounts$Sunrise_Sunset, times = groupCounts$n)

for(p in 1:nperms) {
permData <- US_Accidents_2021_3States %>% mutate(Sunrise_Sunset = groups[sample(1:N, size = N, replace = FALSE)])

# Calculate accidents during the day and night
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = permData)
permFs[p] <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)
}
permFs[p]
## [1] 0.008839787

96,800 accidents happened during the day and 51,486 accidents during the night. The number of permFs was 0.008839787.

# Calculating the standard error

myname<-US_Accidents_2021_3States %>% dplyr::rename(Visibility = `Visibility(mi)`) %>% filter(!is.na(Visibility))

set.seed(1994)

n <-nrow(myname) 
visibility <- myname$Visibility
median(visibility)
## [1] 10
B<- 100

# Instantiating matrix for bootstrap samples
paramboots <- matrix(NA, nrow = n, ncol = B)
xBar<-mean(visibility)
s<-sd(visibility)

# Simulating a normal set of 30 values, B times
for(b in 1:B) {
paramboots[, b] <- rnorm(n=n, mean=xBar, sd=s)
}

#Installing vector for bootstrap medians
bootparamMedians<-vector(length=B)

#Calculating median for each simulated data set
for(b in 1:B) {
  bootparamMedians[b]<-median(paramboots[,b])
}

#Obtain a parametric bootstrap estimate of the standard error of the sample median.
  SEparamestimate<-sd(bootparamMedians[b])

The parametric bootstrap estimate of the standard error of the sample median is 10.