# Loading the required package
library(skimr)
library(ggplot2)
library(knitr)
library(tidyverse)
library(data.table)
library(knitr)
library(arrow)
library(bench)
library(haven)
library(googlesheets4)
library(plyr)
library(scales)
library(ggdist)
library(patchwork)
library(ggthemes)
library(stringr)
#Importing the US Accidents dataset
US_Accidents <- data.table::fread("Accidents.csv") %>% as_tibble()
# Using R's `lubridate` function to manipulate 2021 Accidents Data
US_Accidents_2021 <- US_Accidents %>% mutate(Start_time=lubridate::mdy_hm(Start_Time)) %>% filter(year( Start_time)==2021)
#Extracting the number of accidents per month using the newly created variable for StarTtime
monthly_accidents_2021<-US_Accidents_2021 %>% mutate(accident_month=lubridate::month(Start_time, label=T, abbr=T))%>% dplyr::count(accident_month) %>% dplyr::rename(accidents=n) %>% flextable::flextable()
#Counting the number of accidents per weather condition and arranging them in a descending order
pie_data<-dplyr::count(US_Accidents_2021, Weather_Condition) %>% dplyr::rename(accidents=n) %>% arrange(desc(accidents))
flextable::flextable(pie_data, cwidth = 2)
Weather_Condition | accidents |
---|---|
Fair | 324,867 |
Mostly Cloudy | 87,358 |
Cloudy | 86,303 |
Partly Cloudy | 60,348 |
Light Rain | 28,410 |
13,932 | |
Fog | 9,725 |
Haze | 9,352 |
Rain | 7,029 |
Light Snow | 6,792 |
Fair / Windy | 3,892 |
Heavy Rain | 2,991 |
Thunder in the Vicinity | 2,927 |
Thunder | 2,588 |
T-Storm | 2,573 |
Smoke | 2,170 |
Light Rain with Thunder | 2,130 |
Cloudy / Windy | 1,594 |
Heavy T-Storm | 1,514 |
Mostly Cloudy / Windy | 1,359 |
Light Drizzle | 1,226 |
Snow | 925 |
Partly Cloudy / Windy | 844 |
Light Rain / Windy | 790 |
Light Snow / Windy | 428 |
Wintry Mix | 428 |
Rain / Windy | 324 |
Drizzle | 313 |
Heavy Snow | 235 |
Heavy Rain / Windy | 230 |
Patches of Fog | 225 |
Shallow Fog | 220 |
Showers in the Vicinity | 177 |
Mist | 171 |
N/A Precipitation | 166 |
Heavy T-Storm / Windy | 155 |
Haze / Windy | 151 |
T-Storm / Windy | 100 |
Thunder / Windy | 89 |
Snow / Windy | 81 |
Fog / Windy | 77 |
Heavy Snow / Windy | 49 |
Blowing Dust / Windy | 32 |
Drizzle and Fog | 31 |
Light Rain Shower | 31 |
Light Freezing Rain | 26 |
Light Drizzle / Windy | 24 |
Heavy Drizzle | 22 |
Light Snow and Sleet | 11 |
Blowing Dust | 9 |
Light Freezing Drizzle | 9 |
Smoke / Windy | 9 |
Light Snow Shower | 7 |
Light Sleet | 6 |
Widespread Dust / Windy | 6 |
Wintry Mix / Windy | 6 |
Blowing Snow / Windy | 5 |
Rain Shower | 5 |
Tornado | 5 |
Sand / Dust Whirlwinds | 4 |
Squalls / Windy | 4 |
Widespread Dust | 4 |
Drizzle / Windy | 3 |
Light Sleet / Windy | 3 |
Snow and Sleet | 3 |
Squalls | 3 |
Blowing Snow Nearby | 1 |
Duststorm | 1 |
Hail | 1 |
Heavy Rain Shower / Windy | 1 |
Light Freezing Rain / Windy | 1 |
Light Snow and Sleet / Windy | 1 |
Partial Fog | 1 |
Sand / Windy | 1 |
Small Hail | 1 |
Snow and Thunder / Windy | 1 |
Thunder / Wintry Mix | 1 |
#An interactive visualization showing accidents during different weather conditions
library(plotly)
plot_ly(pie_data,values=~accidents,labels=~factor(Weather_Condition),marker=list(colors=c("blue","green")),type="pie")
From the visualization, we can infer that most accidents happened when the weather condition was fair while the least number of accidents occurred during the instances of hail, heavy rain shower/windy, dust-storm, and so on.
#grouping accidents per month in year 2021 for all US states
monthly_US_Accidents_2021<-US_Accidents_2021%>% mutate(Month=lubridate::month(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(Month, State) %>% dplyr::rename(no_of_accidents=n)
flextable::flextable(monthly_US_Accidents_2021, cwidth = 2)
Month | State | no_of_accidents |
---|---|---|
Jan | MD | 1 |
Jan | MT | 1 |
Jan | OR | 6 |
Jan | SC | 2 |
Feb | AL | 6 |
Feb | AR | 12 |
Feb | AZ | 21 |
Feb | CA | 299 |
Feb | CO | 1 |
Feb | CT | 1 |
Feb | DC | 3 |
Feb | DE | 3 |
Feb | FL | 157 |
Feb | IA | 4 |
Feb | ID | 1 |
Feb | IN | 11 |
Feb | KS | 1 |
Feb | KY | 3 |
Feb | LA | 26 |
Feb | MA | 1 |
Feb | MD | 15 |
Feb | MI | 8 |
Feb | MN | 78 |
Feb | MO | 5 |
Feb | MS | 1 |
Feb | MT | 10 |
Feb | NC | 30 |
Feb | NJ | 6 |
Feb | NM | 1 |
Feb | NV | 1 |
Feb | NY | 13 |
Feb | OH | 4 |
Feb | OR | 25 |
Feb | PA | 71 |
Feb | SC | 35 |
Feb | TN | 28 |
Feb | TX | 71 |
Feb | UT | 13 |
Feb | VA | 30 |
Feb | WA | 5 |
Mar | AL | 362 |
Mar | AR | 255 |
Mar | AZ | 695 |
Mar | CA | 9,221 |
Mar | CO | 90 |
Mar | CT | 298 |
Mar | DC | 109 |
Mar | DE | 64 |
Mar | FL | 6,143 |
Mar | GA | 29 |
Mar | IA | 106 |
Mar | ID | 211 |
Mar | IL | 130 |
Mar | IN | 280 |
Mar | KS | 111 |
Mar | KY | 36 |
Mar | LA | 676 |
Mar | MA | 58 |
Mar | MD | 770 |
Mar | MI | 352 |
Mar | MN | 1,054 |
Mar | MO | 299 |
Mar | MS | 81 |
Mar | MT | 183 |
Mar | NC | 1,159 |
Mar | ND | 59 |
Mar | NE | 32 |
Mar | NH | 17 |
Mar | NJ | 550 |
Mar | NM | 50 |
Mar | NV | 77 |
Mar | NY | 1,146 |
Mar | OH | 57 |
Mar | OR | 1,052 |
Mar | PA | 1,618 |
Mar | RI | 14 |
Mar | SC | 1,202 |
Mar | TN | 889 |
Mar | TX | 1,830 |
Mar | UT | 302 |
Mar | VA | 1,534 |
Mar | WA | 194 |
Mar | WV | 111 |
Apr | AL | 466 |
Apr | AR | 288 |
Apr | AZ | 798 |
Apr | CA | 9,862 |
Apr | CO | 119 |
Apr | CT | 438 |
Apr | DC | 151 |
Apr | DE | 76 |
Apr | FL | 6,278 |
Apr | GA | 40 |
Apr | IA | 78 |
Apr | ID | 267 |
Apr | IL | 199 |
Apr | IN | 350 |
Apr | KS | 46 |
Apr | KY | 48 |
Apr | LA | 774 |
Apr | MA | 22 |
Apr | MD | 939 |
Apr | MI | 448 |
Apr | MN | 977 |
Apr | MO | 293 |
Apr | MS | 63 |
Apr | MT | 136 |
Apr | NC | 1,157 |
Apr | ND | 50 |
Apr | NE | 40 |
Apr | NJ | 741 |
Apr | NM | 45 |
Apr | NV | 83 |
Apr | NY | 1,519 |
Apr | OH | 96 |
Apr | OK | 5 |
Apr | OR | 928 |
Apr | PA | 1,686 |
Apr | RI | 24 |
Apr | SC | 1,422 |
Apr | TN | 850 |
Apr | TX | 2,121 |
Apr | UT | 337 |
Apr | VA | 1,908 |
Apr | WA | 181 |
Apr | WV | 140 |
May | AL | 476 |
May | AR | 299 |
May | AZ | 928 |
May | CA | 10,020 |
May | CO | 126 |
May | CT | 548 |
May | DC | 107 |
May | DE | 79 |
May | FL | 6,669 |
May | GA | 40 |
May | IA | 127 |
May | ID | 218 |
May | IL | 260 |
May | IN | 307 |
May | KS | 31 |
May | KY | 32 |
May | LA | 808 |
May | MD | 1,017 |
May | MI | 571 |
May | MN | 1,442 |
May | MO | 248 |
May | MS | 108 |
May | MT | 8 |
May | NC | 1,316 |
May | ND | 55 |
May | NE | 41 |
May | NH | 1 |
May | NJ | 1,000 |
May | NM | 51 |
May | NV | 74 |
May | NY | 1,948 |
May | OH | 72 |
May | OR | 1,079 |
May | PA | 1,979 |
May | RI | 27 |
May | SC | 1,771 |
May | TN | 935 |
May | TX | 2,636 |
May | UT | 637 |
May | VA | 2,318 |
May | WA | 220 |
May | WI | 4 |
May | WV | 171 |
Jun | AL | 528 |
Jun | AR | 422 |
Jun | AZ | 1,136 |
Jun | CA | 15,137 |
Jun | CO | 130 |
Jun | CT | 825 |
Jun | DC | 220 |
Jun | DE | 179 |
Jun | FL | 10,449 |
Jun | GA | 40 |
Jun | IA | 222 |
Jun | ID | 245 |
Jun | IL | 405 |
Jun | IN | 456 |
Jun | KS | 188 |
Jun | KY | 67 |
Jun | LA | 1,132 |
Jun | MA | 13 |
Jun | MD | 1,542 |
Jun | ME | 1 |
Jun | MI | 764 |
Jun | MN | 2,042 |
Jun | MO | 551 |
Jun | MS | 119 |
Jun | MT | 7 |
Jun | NC | 1,944 |
Jun | ND | 84 |
Jun | NE | 93 |
Jun | NJ | 1,130 |
Jun | NM | 63 |
Jun | NV | 117 |
Jun | NY | 2,534 |
Jun | OH | 150 |
Jun | OR | 1,780 |
Jun | PA | 2,782 |
Jun | RI | 37 |
Jun | SC | 3,096 |
Jun | TN | 1,263 |
Jun | TX | 3,901 |
Jun | UT | 968 |
Jun | VA | 3,154 |
Jun | WA | 335 |
Jun | WI | 2 |
Jun | WV | 205 |
Jul | AL | 522 |
Jul | AR | 263 |
Jul | AZ | 1,030 |
Jul | CA | 14,525 |
Jul | CO | 130 |
Jul | CT | 750 |
Jul | DC | 294 |
Jul | DE | 175 |
Jul | FL | 9,330 |
Jul | GA | 27 |
Jul | IA | 188 |
Jul | ID | 86 |
Jul | IL | 345 |
Jul | IN | 490 |
Jul | KS | 105 |
Jul | KY | 73 |
Jul | LA | 1,034 |
Jul | MA | 38 |
Jul | MD | 1,416 |
Jul | ME | 2 |
Jul | MI | 704 |
Jul | MN | 1,453 |
Jul | MO | 505 |
Jul | MS | 124 |
Jul | MT | 118 |
Jul | NC | 1,768 |
Jul | ND | 81 |
Jul | NE | 77 |
Jul | NH | 1 |
Jul | NJ | 1,213 |
Jul | NM | 51 |
Jul | NV | 102 |
Jul | NY | 2,348 |
Jul | OH | 161 |
Jul | OR | 1,626 |
Jul | PA | 2,400 |
Jul | RI | 37 |
Jul | SC | 2,747 |
Jul | TN | 1,388 |
Jul | TX | 3,656 |
Jul | UT | 797 |
Jul | VA | 2,893 |
Jul | WA | 324 |
Jul | WI | 3 |
Jul | WV | 187 |
Aug | AL | 438 |
Aug | AR | 277 |
Aug | AZ | 1,150 |
Aug | CA | 15,251 |
Aug | CO | 130 |
Aug | CT | 709 |
Aug | DC | 272 |
Aug | DE | 174 |
Aug | FL | 10,251 |
Aug | GA | 635 |
Aug | IA | 162 |
Aug | ID | 129 |
Aug | IL | 367 |
Aug | IN | 465 |
Aug | KS | 236 |
Aug | KY | 69 |
Aug | LA | 1,306 |
Aug | MA | 40 |
Aug | MD | 1,572 |
Aug | MI | 809 |
Aug | MN | 1,643 |
Aug | MO | 599 |
Aug | MS | 103 |
Aug | MT | 552 |
Aug | NC | 1,951 |
Aug | ND | 80 |
Aug | NE | 31 |
Aug | NH | 1 |
Aug | NJ | 1,290 |
Aug | NM | 48 |
Aug | NV | 107 |
Aug | NY | 2,352 |
Aug | OH | 131 |
Aug | OK | 204 |
Aug | OR | 1,613 |
Aug | PA | 2,797 |
Aug | RI | 41 |
Aug | SC | 2,694 |
Aug | TN | 1,466 |
Aug | TX | 4,159 |
Aug | UT | 859 |
Aug | VA | 3,081 |
Aug | VT | 2 |
Aug | WA | 314 |
Aug | WI | 1 |
Aug | WV | 192 |
Aug | WY | 1 |
Sep | AL | 515 |
Sep | AR | 320 |
Sep | AZ | 1,442 |
Sep | CA | 17,247 |
Sep | CO | 151 |
Sep | CT | 832 |
Sep | DC | 393 |
Sep | DE | 146 |
Sep | FL | 13,181 |
Sep | GA | 825 |
Sep | IA | 171 |
Sep | ID | 157 |
Sep | IL | 460 |
Sep | IN | 564 |
Sep | KS | 217 |
Sep | KY | 68 |
Sep | LA | 1,467 |
Sep | MA | 42 |
Sep | MD | 1,798 |
Sep | ME | 1 |
Sep | MI | 857 |
Sep | MN | 1,696 |
Sep | MO | 784 |
Sep | MS | 128 |
Sep | MT | 618 |
Sep | NC | 2,238 |
Sep | ND | 58 |
Sep | NE | 21 |
Sep | NJ | 1,225 |
Sep | NM | 16 |
Sep | NV | 133 |
Sep | NY | 2,339 |
Sep | OH | 182 |
Sep | OK | 242 |
Sep | OR | 1,844 |
Sep | PA | 3,056 |
Sep | RI | 55 |
Sep | SC | 3,198 |
Sep | TN | 1,431 |
Sep | TX | 3,989 |
Sep | UT | 632 |
Sep | VA | 3,173 |
Sep | VT | 5 |
Sep | WA | 349 |
Sep | WI | 1 |
Sep | WV | 207 |
Sep | WY | 29 |
Oct | AL | 490 |
Oct | AR | 246 |
Oct | AZ | 1,527 |
Oct | CA | 18,930 |
Oct | CO | 597 |
Oct | CT | 970 |
Oct | DC | 357 |
Oct | DE | 114 |
Oct | FL | 14,991 |
Oct | GA | 1,050 |
Oct | IA | 164 |
Oct | ID | 132 |
Oct | IL | 438 |
Oct | IN | 475 |
Oct | KS | 217 |
Oct | KY | 49 |
Oct | LA | 1,538 |
Oct | MA | 26 |
Oct | MD | 1,712 |
Oct | MI | 861 |
Oct | MN | 1,900 |
Oct | MO | 713 |
Oct | MS | 90 |
Oct | MT | 708 |
Oct | NC | 2,547 |
Oct | ND | 69 |
Oct | NE | 56 |
Oct | NH | 5 |
Oct | NJ | 1,377 |
Oct | NM | 21 |
Oct | NV | 131 |
Oct | NY | 2,644 |
Oct | OH | 193 |
Oct | OK | 255 |
Oct | OR | 1,706 |
Oct | PA | 3,082 |
Oct | RI | 36 |
Oct | SC | 3,288 |
Oct | SD | 1 |
Oct | TN | 1,723 |
Oct | TX | 3,769 |
Oct | UT | 903 |
Oct | VA | 3,649 |
Oct | VT | 2 |
Oct | WA | 612 |
Oct | WI | 5 |
Oct | WV | 230 |
Oct | WY | 97 |
Nov | AL | 613 |
Nov | AR | 391 |
Nov | AZ | 2,033 |
Nov | CA | 22,192 |
Nov | CO | 664 |
Nov | CT | 1,012 |
Nov | DC | 400 |
Nov | DE | 160 |
Nov | FL | 19,853 |
Nov | GA | 1,602 |
Nov | IA | 182 |
Nov | ID | 194 |
Nov | IL | 466 |
Nov | IN | 537 |
Nov | KS | 380 |
Nov | KY | 27 |
Nov | LA | 1,973 |
Nov | MA | 42 |
Nov | MD | 2,213 |
Nov | MI | 1,083 |
Nov | MN | 2,575 |
Nov | MO | 879 |
Nov | MS | 152 |
Nov | MT | 1,164 |
Nov | NC | 3,628 |
Nov | ND | 154 |
Nov | NE | 34 |
Nov | NH | 7 |
Nov | NJ | 1,593 |
Nov | NM | 18 |
Nov | NV | 142 |
Nov | NY | 3,575 |
Nov | OH | 250 |
Nov | OK | 408 |
Nov | OR | 2,191 |
Nov | PA | 4,156 |
Nov | RI | 43 |
Nov | SC | 4,292 |
Nov | SD | 1 |
Nov | TN | 2,192 |
Nov | TX | 4,726 |
Nov | UT | 1,021 |
Nov | VA | 5,262 |
Nov | WA | 1,005 |
Nov | WI | 58 |
Nov | WV | 257 |
Nov | WY | 25 |
Dec | AL | 702 |
Dec | AR | 409 |
Dec | AZ | 2,793 |
Dec | CA | 36,378 |
Dec | CO | 1,221 |
Dec | CT | 1,174 |
Dec | DC | 476 |
Dec | DE | 143 |
Dec | FL | 26,635 |
Dec | GA | 1,905 |
Dec | IA | 358 |
Dec | ID | 387 |
Dec | IL | 521 |
Dec | IN | 487 |
Dec | KS | 320 |
Dec | KY | 31 |
Dec | LA | 3,298 |
Dec | MA | 68 |
Dec | MD | 2,338 |
Dec | ME | 1 |
Dec | MI | 1,370 |
Dec | MN | 6,575 |
Dec | MO | 1,210 |
Dec | MS | 189 |
Dec | MT | 2,591 |
Dec | NC | 5,003 |
Dec | ND | 137 |
Dec | NE | 69 |
Dec | NH | 5 |
Dec | NJ | 1,831 |
Dec | NM | 43 |
Dec | NV | 394 |
Dec | NY | 4,389 |
Dec | OH | 345 |
Dec | OK | 577 |
Dec | OR | 4,697 |
Dec | PA | 4,701 |
Dec | RI | 59 |
Dec | SC | 5,852 |
Dec | SD | 7 |
Dec | TN | 2,932 |
Dec | TX | 5,104 |
Dec | UT | 1,806 |
Dec | VA | 6,200 |
Dec | WA | 1,549 |
Dec | WI | 18 |
Dec | WV | 415 |
Dec | WY | 192 |
#Visualizing the total accidents per month using geom_col
Monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
y=no_of_accidents, fill=State))+geom_col() +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(title = "Monthly 2021 Accidents in the US",
y = "Number of accidents",
x = "Month",
caption = "Kaggle Data: US Accidents 2016 - 2021") +
scale_fill_viridis_d()+
theme_minimal() +
theme_grey(base_size = 11,
base_family = "")+
theme(legend.position = "none",
text = element_text(face = "bold"))
ggplotly(Monthly_accidents)
The chart above indicates that the highest number of cumulative accidents in the US occurred in the month of December while the lowest number of accidents occurred in the month of February.
#Pivoting longer() to create the variable 'accidentLocation' and using filter() to get accidents for the three States
US_Accidents_2021_3States<-US_Accidents_2021 %>%
pivot_longer(cols=c(Amenity:Turning_Loop),
names_to ="accidentLocation",
values_to = "TRUE_FALSE")%>% filter(State==c("MI", "NY","OH"))
## Rows: 148,534
## Columns: 37
## $ ID <chr> "A-224995", "A-224995", "A-224995", "A-224995", …
## $ Severity <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Start_Time <chr> "7/31/2021 23:35", "7/31/2021 23:35", "7/31/2021…
## $ End_Time <chr> "8/1/2021 2:45", "8/1/2021 2:45", "8/1/2021 2:45…
## $ Start_Lat <dbl> 40.81538, 40.81538, 40.81538, 40.81538, 40.81538…
## $ Start_Lng <dbl> -73.83590, -73.83590, -73.83590, -73.83590, -73.…
## $ End_Lat <dbl> 40.81768, 40.81768, 40.81768, 40.81768, 40.81768…
## $ End_Lng <dbl> -73.83604, -73.83604, -73.83604, -73.83604, -73.…
## $ `Distance(mi)` <dbl> 0.159, 0.159, 0.159, 0.159, 0.159, 0.083, 0.083,…
## $ Description <chr> "Crash on I-678 ramp northbound Hutchinson River…
## $ Number <int> 620, 620, 620, 620, 620, 1066, 1066, 1066, 1066,…
## $ Street <chr> "Hutchinson River Pkwy", "Hutchinson River Pkwy"…
## $ Side <chr> "L", "L", "L", "L", "L", "R", "R", "R", "R", "R"…
## $ City <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Ca…
## $ County <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Pu…
## $ State <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", …
## $ Zipcode <chr> "10465", "10465", "10465", "10465", "10465", "10…
## $ Country <chr> "US", "US", "US", "US", "US", "US", "US", "US", …
## $ Timezone <chr> "US/Eastern", "US/Eastern", "US/Eastern", "US/Ea…
## $ Airport_Code <chr> "KLGA", "KLGA", "KLGA", "KLGA", "KLGA", "KDXR", …
## $ Weather_Timestamp <chr> "7/31/2021 23:51", "7/31/2021 23:51", "7/31/2021…
## $ `Temperature(F)` <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 35, 35, 35, …
## $ `Wind_Chill(F)` <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 26, 26, 26, …
## $ `Humidity(%)` <int> 64, 64, 64, 64, 64, 49, 49, 49, 49, 54, 54, 54, …
## $ `Pressure(in)` <dbl> 29.92, 29.92, 29.92, 29.92, 29.92, 29.70, 29.70,…
## $ `Visibility(mi)` <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
## $ Wind_Direction <chr> "S", "S", "S", "S", "S", "NNW", "NNW", "NNW", "N…
## $ `Wind_Speed(mph)` <dbl> 6, 6, 6, 6, 6, 8, 8, 8, 8, 13, 13, 13, 13, 15, 1…
## $ `Precipitation(in)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Weather_Condition <chr> "Fair", "Fair", "Fair", "Fair", "Fair", "Fair", …
## $ Sunrise_Sunset <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Civil_Twilight <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Nautical_Twilight <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Astronomical_Twilight <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Start_time <dttm> 2021-07-31 23:35:00, 2021-07-31 23:35:00, 2021-…
## $ accidentLocation <chr> "Amenity", "Give_Way", "Railway", "Stop", "Turni…
## $ TRUE_FALSE <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
Name | US_Accidents_2021_3States |
Number of rows | 148534 |
Number of columns | 37 |
_______________________ | |
Column type frequency: | |
character | 21 |
logical | 1 |
numeric | 14 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
ID | 0 | 1 | 8 | 9 | 0 | 34275 | 0 |
Start_Time | 0 | 1 | 13 | 16 | 0 | 26512 | 0 |
End_Time | 0 | 1 | 13 | 16 | 0 | 30182 | 0 |
Description | 0 | 1 | 16 | 375 | 0 | 25700 | 0 |
Street | 0 | 1 | 4 | 50 | 0 | 4683 | 0 |
Side | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
City | 0 | 1 | 3 | 22 | 0 | 1002 | 0 |
County | 0 | 1 | 3 | 14 | 0 | 128 | 0 |
State | 0 | 1 | 2 | 2 | 0 | 3 | 0 |
Zipcode | 0 | 1 | 0 | 10 | 9 | 8201 | 0 |
Country | 0 | 1 | 2 | 2 | 0 | 1 | 0 |
Timezone | 0 | 1 | 0 | 10 | 9 | 3 | 0 |
Airport_Code | 0 | 1 | 0 | 4 | 282 | 138 | 0 |
Weather_Timestamp | 0 | 1 | 0 | 16 | 685 | 16607 | 0 |
Wind_Direction | 0 | 1 | 0 | 4 | 2242 | 19 | 0 |
Weather_Condition | 0 | 1 | 0 | 28 | 872 | 54 | 0 |
Sunrise_Sunset | 0 | 1 | 0 | 5 | 248 | 3 | 0 |
Civil_Twilight | 0 | 1 | 0 | 5 | 248 | 3 | 0 |
Nautical_Twilight | 0 | 1 | 0 | 5 | 248 | 3 | 0 |
Astronomical_Twilight | 0 | 1 | 0 | 5 | 248 | 3 | 0 |
accidentLocation | 0 | 1 | 4 | 15 | 0 | 13 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
TRUE_FALSE | 0 | 1 | 0.02 | FAL: 145012, TRU: 3522 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Severity | 0 | 1.00 | 2.05 | 0.32 | 2.00 | 2.00 | 2.00 | 2.00 | 4.00 | ▇▁▁▁▁ |
Start_Lat | 0 | 1.00 | 41.89 | 1.11 | 38.84 | 40.81 | 42.27 | 42.96 | 47.13 | ▁▇▇▁▁ |
Start_Lng | 0 | 1.00 | -77.35 | 4.49 | -90.17 | -83.08 | -74.07 | -73.84 | -71.95 | ▁▃▁▂▇ |
End_Lat | 0 | 1.00 | 41.89 | 1.11 | 38.83 | 40.81 | 42.27 | 42.96 | 47.13 | ▁▇▇▁▁ |
End_Lng | 0 | 1.00 | -77.35 | 4.49 | -90.17 | -83.09 | -74.08 | -73.84 | -71.95 | ▁▃▁▂▇ |
Distance(mi) | 0 | 1.00 | 0.85 | 1.25 | 0.00 | 0.11 | 0.42 | 1.06 | 39.63 | ▇▁▁▁▁ |
Number | 102743 | 0.31 | 2585.06 | 3565.84 | 1.00 | 329.00 | 1282.00 | 3346.00 | 44801.00 | ▇▁▁▁▁ |
Temperature(F) | 847 | 0.99 | 57.99 | 17.19 | 8.00 | 44.00 | 59.00 | 72.00 | 98.00 | ▁▆▆▇▂ |
Wind_Chill(F) | 2320 | 0.98 | 56.10 | 19.54 | -6.00 | 39.00 | 59.00 | 72.00 | 98.00 | ▁▃▅▇▃ |
Humidity(%) | 895 | 0.99 | 66.04 | 19.78 | 8.00 | 51.00 | 67.00 | 83.00 | 100.00 | ▁▅▇▇▇ |
Pressure(in) | 822 | 0.99 | 29.51 | 0.43 | 19.75 | 29.20 | 29.51 | 29.85 | 30.65 | ▁▁▁▁▇ |
Visibility(mi) | 1164 | 0.99 | 9.14 | 2.51 | 0.00 | 10.00 | 10.00 | 10.00 | 20.00 | ▁▁▇▁▁ |
Wind_Speed(mph) | 2242 | 0.98 | 8.37 | 5.40 | 0.00 | 5.00 | 8.00 | 12.00 | 38.00 | ▇▇▂▁▁ |
Precipitation(in) | 879 | 0.99 | 0.01 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.32 | ▇▁▁▁▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
Start_time | 0 | 1 | 2021-02-27 22:41:00 | 2021-12-31 22:07:00 | 2021-09-10 13:21:30 | 26512 |
The dataset has a completion rate of 0.98 - 1.0.
# Calculating the number of accidents during days and nights in the 3 states using the variable 'Sunrise_Sunset'
dayAndNight<-US_Accidents_2021_3States %>%
dplyr::count(Sunrise_Sunset,State) %>% filter(Sunrise_Sunset %in% c("Day","Night"))
flextable::flextable(dayAndNight, cwidth = 2)
Sunrise_Sunset | State | n |
---|---|---|
Day | MI | 23,301 |
Day | NY | 68,486 |
Day | OH | 5,013 |
Night | MI | 10,522 |
Night | NY | 38,850 |
Night | OH | 2,114 |
NY had the most number of accidents during the day and night (68486, 38850) while Ohio had the least(5013, 2114).
# Summary statistics of the accidents that occurred at the Right and the Left side of the road
leftAndRightSideAccidents<-US_Accidents_2021_3States %>%
dplyr::count(Side,State) %>% filter(State %in% c("OH","MI","NY"))
flextable::flextable(leftAndRightSideAccidents, cwidth = 2)
Side | State | n |
---|---|---|
L | MI | 5,142 |
L | NY | 9,817 |
L | OH | 1,951 |
R | MI | 28,735 |
R | NY | 97,705 |
R | OH | 5,184 |
From the tibble above, we can see that most of the accidents happened on the right side of the road with NY having the highest number of accidents among the 3 states.
# Summary statistics of accidents caused by the selective weather types
accidents_weather_condition<-US_Accidents_2021_3States %>%
group_by(Weather_Condition, State, Severity) %>%
dplyr::summarise(Count = n()) %>%
dplyr::rename(accident = Count) %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% arrange(desc(accident))
flextable::flextable(accidents_weather_condition, cwidth = 2)
Weather_Condition | State | Severity | accident |
---|---|---|---|
Fair | NY | 2 | 35,000 |
Mostly Cloudy | NY | 2 | 20,781 |
Cloudy | NY | 2 | 19,356 |
Fair | MI | 2 | 11,793 |
Partly Cloudy | NY | 2 | 10,908 |
Cloudy | MI | 2 | 7,152 |
Light Rain | NY | 2 | 6,396 |
Mostly Cloudy | MI | 2 | 4,770 |
Light Snow | NY | 2 | 2,631 |
Partly Cloudy | MI | 2 | 2,604 |
Fair | OH | 2 | 2,417 |
Light Rain | MI | 2 | 2,193 |
Fog | NY | 2 | 1,545 |
Light Snow | MI | 2 | 1,506 |
Rain | NY | 2 | 1,374 |
Cloudy | OH | 2 | 1,356 |
Mostly Cloudy | OH | 2 | 1,135 |
Fair | NY | 4 | 919 |
Partly Cloudy | OH | 2 | 847 |
Cloudy | NY | 4 | 603 |
Light Rain | OH | 2 | 583 |
Rain | MI | 2 | 526 |
Mostly Cloudy | NY | 4 | 496 |
Fair | MI | 4 | 381 |
Fog | MI | 2 | 281 |
Partly Cloudy | NY | 4 | 260 |
Light Rain | NY | 4 | 222 |
Cloudy | MI | 4 | 203 |
Mostly Cloudy | MI | 4 | 95 |
Light Snow | NY | 4 | 91 |
Rain | OH | 2 | 82 |
Light Snow | OH | 2 | 73 |
Fog | OH | 2 | 65 |
Light Rain | MI | 4 | 61 |
Partly Cloudy | MI | 4 | 49 |
Light Snow | MI | 4 | 40 |
Fog | NY | 4 | 31 |
Fog | MI | 4 | 25 |
Rain | MI | 4 | 21 |
Cloudy | OH | 4 | 13 |
Fair | OH | 4 | 13 |
Rain | NY | 4 | 12 |
Fog | OH | 4 | 5 |
Mostly Cloudy | OH | 4 | 5 |
Most of the accidents happened in NY when the weather was Fair and the least number of accidents happened in Ohio when the weather was mostly cloudy.
#Counting monthly accidents in the three states
monthly_US_Accidents_2021 <- US_Accidents_2021_3States %>% mutate(Month = lubridate::month(Start_time, label = T, abbr = T)) %>% dplyr::count(Month,State,Start_time) %>% dplyr::rename(no_of_Accidents = n)
#visualizing accidents per month in the three states
monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
y=no_of_Accidents, fill=State))+
geom_col()+
scale_fill_viridis_d()+
labs(title = "Monthly accidents in Three States",
x="Month",
y="Number of accidents",
caption = "Data source: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents")+
theme_bw(base_size = 10)+
theme(text = element_text(face = "bold"), legend.position = c(.95, .95),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(6, 6, 6, 6))+
scale_y_continuous(labels = comma,
expand=expansion(mult=c(0, .1)))
ggplotly(monthly_accidents)
As indicated above, the total number of cumulative accidents occurred in December in the 3 States while the least instances of accidents occurred in February.
Dec_Accidents = monthly_US_Accidents_2021 %>% mutate(day = lubridate::day(Start_time)) %>% filter(Month == "Dec", day %in% c(1:31)) %>% dplyr::count(day) %>% dplyr::rename(No_of_Accidents = n)
flextable::flextable(Dec_Accidents, cwidth = 2)
day | No_of_Accidents |
---|---|
1 | 125 |
2 | 169 |
3 | 148 |
4 | 108 |
5 | 91 |
6 | 144 |
7 | 139 |
8 | 200 |
9 | 124 |
10 | 166 |
11 | 170 |
12 | 135 |
13 | 124 |
14 | 135 |
15 | 158 |
16 | 138 |
17 | 183 |
18 | 207 |
19 | 109 |
20 | 152 |
21 | 156 |
22 | 187 |
23 | 210 |
24 | 163 |
25 | 86 |
26 | 65 |
27 | 195 |
28 | 155 |
29 | 133 |
30 | 151 |
31 | 109 |
ggplotly(Dec_Accidents %>% ggplot(aes(x = day,
y = No_of_Accidents)) +
geom_col(fill="dodgerblue") +
scale_x_continuous(breaks = c(1:31))+
scale_y_continuous(breaks = seq(0,250,10))+
theme_bw(base_size = 10) +
scale_fill_viridis_d() +
theme(legend.position = "none",
text = element_text(face = "bold")) +
labs(title = "Dec 2021 Daily Accidents in MI, NY & OH",
x = "Day",
y = "Number of Accidents",
caption = "Kaggle Data: US Accidents 2016 - 2021"))
The most number of accidents in December for the 3 states for the year 2021 occurred in the 23rd of December (210).
#Visualizing state accidents on various weather conditions
US_Accidents_2021_3States %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% ggplot(aes(Weather_Condition, fill=Weather_Condition)) + geom_histogram(stat="count")+facet_wrap(.~State)+
labs(title = " MI, NY, and OH Accidents as during various weather conditions",
x="Weather conditions",
y="Number of accidents",
caption = "Kaggle Data: US Accidents 2016 - 2021")+
scale_fill_viridis_d()+
theme_bw()+
theme_bw()+
scale_fill_viridis_d() +
theme(text = element_text(face = "bold"))+
theme(legend.position = "None",
axis.text.x=element_text(angle = 45,
hjust=.9, size=10))
As
depicted by the faceted chart above, NY has the most number of accidents
during selective weather conditions while OH has the lowest.
accidents_weather_conditions<- accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
y=accident, color=Severity)) +geom_jitter()+
labs(title = " MI, NY, and OH Accidents as per the weather conditions",
x="Weather_Condition",
y="Number of accidents",
caption = "Kaggle Data: US Accidents 2016 - 2021")+
theme(axis.text.x=element_text(angle = 45,
hjust=.9, size=10)) +
theme(legend.position = "bottom",
text = element_text(face = "bold"))
ggplotly(accidents_weather_conditions)
As seen above, most accidents with a severity of 2 happened when the weather was fair.
#Visualizing accidents vs weather conditions
accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
y=accident, color=Severity)) +geom_boxplot()+
labs(title = " MI, NY, and OH Accidents as per the weather conditions",
x="Weather_Condition",
y="Number of accidents")+
scale_y_continuous(label=comma)+
theme(axis.text.x=element_text(angle = 45,
hjust=.9, size=10))+
theme_minimal() +
theme(legend.position = "bottom",
text = element_text(face = "bold"))+
theme_bw()
This is an alternative visualization in the form of a box-plot which indicates the most accidents with a severity of 2 happening when the weather was fair.
#Visualizing the number of Accidents in the three state
accidents_weather_condition %>% ggplot(aes(x=State,
y=accident, fill=State)) +geom_col() +
scale_y_continuous(labels = comma,
expand=expansion(mult=c(0, .1)))+
labs(title="Number of accidents per state",
x="State",
y="Number of accidents",
caption = "Kaggle Data: US Accidents 2016 - 2021")+
scale_fill_viridis_d()+
theme_bw() +
theme(legend.position = "none")
The visualization shows the total accidents segregated by the states
with NY having the highest number of accidents.
#Creating a frequency table
day_and_night_accidents<-US_Accidents_2021_3States %>% dplyr::count(Sunrise_Sunset, State) %>% filter(Sunrise_Sunset %in% c("Day","Night")) %>% dplyr::rename(Accidents=n)
flextable::flextable(day_and_night_accidents, cwidth = 2)
Sunrise_Sunset | State | Accidents |
---|---|---|
Day | MI | 23,301 |
Day | NY | 68,486 |
Day | OH | 5,013 |
Night | MI | 10,522 |
Night | NY | 38,850 |
Night | OH | 2,114 |
day_and_night_accidents %>% ggplot(aes(x=Sunrise_Sunset,
y=Accidents, fill=State))+geom_col()+facet_wrap(.~State)+
scale_fill_viridis_d()+
labs(title = "A histogram displaying Sunrise and Sunset Accidents",
x="Time of accident",
y="Number of accidents",
caption = "Kaggle Data: US Accidents 2016 - 2021")+
theme_bw()
Most of the accidents occurred in NY during the day and the least number of accidents occurred in NY during the night.
#Daily accidents in Michigan
weekday_Michigan_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="MI")
flextable::flextable(weekday_Michigan_Accidents_2021, cwidth = 2)
State | weekday | no_of_accidents |
---|---|---|
MI | Sun | 3,534 |
MI | Mon | 5,051 |
MI | Tue | 4,975 |
MI | Wed | 4,727 |
MI | Thu | 5,171 |
MI | Fri | 5,768 |
MI | Sat | 4,651 |
The table shows that most of the accidents occurred in MI on Fridays.
#Daily accidents in New York
weekday_NewYork_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="NY")
flextable::flextable(weekday_NewYork_Accidents_2021, cwidth = 2)
State | weekday | no_of_accidents |
---|---|---|
NY | Sun | 12,868 |
NY | Mon | 14,297 |
NY | Tue | 15,810 |
NY | Wed | 16,936 |
NY | Thu | 16,592 |
NY | Fri | 17,621 |
NY | Sat | 13,398 |
The table shows that most of the accidents occurred in NY on Fridays.
#Daily accidents in Ohio
weekday_Ohio_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="OH")
flextable::flextable(weekday_Ohio_Accidents_2021, cwidth = 2)
State | weekday | no_of_accidents |
---|---|---|
OH | Sun | 752 |
OH | Mon | 863 |
OH | Tue | 1,054 |
OH | Wed | 1,034 |
OH | Thu | 1,119 |
OH | Fri | 1,217 |
OH | Sat | 1,096 |
The table shows that most of the accidents occurred in Ohio on Fridays.
#A table showing accidents in the three states in a week
weekdayAccidents3States<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(weekday) %>% dplyr::rename(no_of_accidents=n) %>% arrange(no_of_accidents)
flextable::flextable(weekdayAccidents3States, cwidth = 2)
weekday | no_of_accidents |
---|---|
Sun | 17,154 |
Sat | 19,145 |
Mon | 20,211 |
Tue | 21,839 |
Wed | 22,697 |
Thu | 22,882 |
Fri | 24,606 |
The table shows that most of the accidents in all the 3 states have occurred on Fridays.
hourlyAccidents<-US_Accidents_2021_3States %>% mutate(hourlyAccidents=lubridate::hour(Start_time)) %>% dplyr::count(hourlyAccidents,State) %>% dplyr::rename(accidents=n)
flextable::flextable(hourlyAccidents, cwidth = 2)
hourlyAccidents | State | accidents |
---|---|---|
0 | MI | 662 |
0 | NY | 2,440 |
0 | OH | 130 |
1 | MI | 491 |
1 | NY | 1,974 |
1 | OH | 131 |
2 | MI | 590 |
2 | NY | 1,502 |
2 | OH | 104 |
3 | MI | 506 |
3 | NY | 1,629 |
3 | OH | 62 |
4 | MI | 356 |
4 | NY | 2,142 |
4 | OH | 58 |
5 | MI | 741 |
5 | NY | 3,708 |
5 | OH | 76 |
6 | MI | 1,281 |
6 | NY | 4,848 |
6 | OH | 91 |
7 | MI | 1,703 |
7 | NY | 5,713 |
7 | OH | 317 |
8 | MI | 1,256 |
8 | NY | 4,672 |
8 | OH | 151 |
9 | MI | 1,252 |
9 | NY | 4,441 |
9 | OH | 220 |
10 | MI | 1,235 |
10 | NY | 3,925 |
10 | OH | 192 |
11 | MI | 1,505 |
11 | NY | 4,622 |
11 | OH | 299 |
12 | MI | 1,730 |
12 | NY | 5,574 |
12 | OH | 430 |
13 | MI | 2,089 |
13 | NY | 5,484 |
13 | OH | 380 |
14 | MI | 2,421 |
14 | NY | 6,712 |
14 | OH | 582 |
15 | MI | 3,326 |
15 | NY | 8,304 |
15 | OH | 758 |
16 | MI | 3,042 |
16 | NY | 7,722 |
16 | OH | 732 |
17 | MI | 2,765 |
17 | NY | 8,358 |
17 | OH | 780 |
18 | MI | 1,974 |
18 | NY | 5,367 |
18 | OH | 455 |
19 | MI | 1,427 |
19 | NY | 4,131 |
19 | OH | 309 |
20 | MI | 1,149 |
20 | NY | 4,102 |
20 | OH | 244 |
21 | MI | 903 |
21 | NY | 3,740 |
21 | OH | 231 |
22 | MI | 802 |
22 | NY | 3,499 |
22 | OH | 269 |
23 | MI | 671 |
23 | NY | 2,913 |
23 | OH | 134 |
hourlyAccidents %>% ggplot(aes(x=hourlyAccidents,
y=accidents, color=State))+
geom_line(stat="identity", position="identity")+
geom_point()+
scale_x_continuous(breaks=seq(0,23,1))+theme(text = element_text(face="bold"), legend.box.margin = margin(10, 10, 10, 10))+
labs(title = "Accidents within 24 hours in MI,NY,OH",
x="Time of accident",
y="Number of accidents")+
theme(legend.box.margin = margin(6, 6, 6, 6))
The line-chart indicates that most of the accidents happened at 3 pm and
5 pm in all the 3 states. NY has the highest number of accidents within
24 hours.
#Frequency table showing the number of day and night accidents
count(US_Accidents_2021_3States, 'Sunrise_Sunset', 'Severity') %>% filter(Sunrise_Sunset %in% c("Day", "Night")) %>% dplyr::rename(Accidents = freq)
## Sunrise_Sunset Accidents
## 1 Day 197886
## 2 Night 106154
The number of accidents happening during the day (197886) is nearly twice the accidents at night.
#Frequency table showing the number of accidents for the Michigan, New York and Ohio
count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(Accidents = freq)
## State Accidents
## 1 MI 69618
## 2 NY 220774
## 3 OH 14342
New York has the highest number of accidents (220774) amongst the 3 states.
#Joining State names with equivalent Abbreviations using the `State.name` and `state.abb` R fucntions
State_Abb<- data.frame(State = state.name, StateAbb = state.abb)
#Merging `State_Abb` with `PopData` using full_join()
FullPopData<- full_join(Population_data, State_Abb, by = c("STNAME" = "State"))
#Selecting three states from the population Data (MN, MI & NY)
populationTotals<- filter(FullPopData, CTYNAME %in% c("Michigan", "Ohio", "New York"))
FullPopData_3_States<- populationTotals %>% select(c("StateAbb","STNAME","CTYNAME", "POPESTIMATE2021"))
#Joining Accident Data `US_Accidents_2021_3States` and the Population Data `FullPopData_3_States`
Accident_Pop_Data <- US_Accidents_2021_3States %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))
#Counting the number of accidents per State
US_Accidents_2021_3States1 <- count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(accidents_2021 = freq)
US_Accidents_2021_3States_df <- data.frame(US_Accidents_2021_3States1)
# Creating a table showing the number of accidents and population of the respective states
US_accident_population <- US_Accidents_2021_3States_df %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))
flextable::flextable(US_accident_population, cwidth = 2)
State | accidents_2021 | STNAME | CTYNAME | POPESTIMATE2021 |
---|---|---|---|---|
MI | 69,618 | Michigan | Michigan | 10,050,811 |
NY | 220,774 | New York | New York | 19,835,913 |
OH | 14,342 | Ohio | Ohio | 11,780,017 |
NY has the highest number of accidents alongside having the highest population estimate. Ohio has a higher population estimate compared to Michigan. However, it has a lower record of accidents in comparison to the latter.
#Interactive Visualizations for Population estimate (2021) vs accidents in NY, OH, MI
ggplotly(US_accident_population %>% ggplot(aes(x = STNAME,
y = POPESTIMATE2021,
fill= accidents_2021)) +
geom_col()+
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme_grey()+
theme(legend.position = "bottom",
text = element_text(face = "bold"))+
labs(title="Population estimate (2021) vs accidents in NY, OH, MI",
X="State Name",
y="Population",
caption = "https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents"))
#Using str_detect and mutate() to create a new boolean variable name to indicate whether accident description contains "slow traffic"
slow_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_slow_traffic= str_detect(str_to_lower(Description), pattern = "slow traffic"))
#number of accidents when there was slow traffic
slow_traffic_accidents %>% pull(contains_slow_traffic) %>% sum()
## [1] 24371
The number of accidents that led to slow traffic was 24371.
#number of accidents that caused stationary traffic
stationary_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_stationary_traffic= str_detect(str_to_lower(Description), pattern = "stationary traffic"))
stationary_traffic_accidents %>% pull(contains_stationary_traffic) %>% sum()
## [1] 23227
23227 led to stationary traffic.
#Creating a Data Dictionary for the US Accidents and Population Data
myvariables<-Accident_Pop_Data %>% select(c("State",
"Sunrise_Sunset",
"Weather_Condition",
"Side",
"Severity",
"CTYNAME",
"POPESTIMATE2021",
"accidentLocation",
"Start_time",
"Description",
"STNAME"))
dataDictionary <- tibble(Variable = colnames(myvariables),
Description = c("US States",
"Day or Night",
"Weather Conditions",
" Side of the Road",
"Impact of the accident on traffic on a scale of 1 to 4",
"Name of the city",
"2021 Population Estimate",
"Location of the accident",
"Start time of the Accident",
"Description of the accident aftermath",
"The name of the State"),
Type = map_chr(myvariables, .f = function(x){typeof(x)[1]}),
Class = map_chr(myvariables, .f = function(x){class(x)[1]}))
flextable::flextable(dataDictionary, cwidth = 2)
Variable | Description | Type | Class |
---|---|---|---|
State | US States | character | character |
Sunrise_Sunset | Day or Night | character | character |
Weather_Condition | Weather Conditions | character | character |
Side | Side of the Road | character | character |
Severity | Impact of the accident on traffic on a scale of 1 to 4 | integer | integer |
CTYNAME | Name of the city | character | character |
POPESTIMATE2021 | 2021 Population Estimate | integer | integer |
accidentLocation | Location of the accident | character | character |
Start_time | Start time of the Accident | double | POSIXct |
Description | Description of the accident aftermath | character | character |
STNAME | The name of the State | character | character |
# Randomization test example ---------------------------------
library(broom)
# Loading the dataset
data(US_Accidents_2021_3States)
myData <- US_Accidents_2021_3States %>% select(Sunrise_Sunset, `Visibility(mi)`)
# Fitting One-Way ANOVA model
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = US_Accidents_2021_3States)
Fstatistic <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)
# Getting the number of accidents during the day and night
groupCounts <- US_Accidents_2021_3States %>% dplyr::count(Sunrise_Sunset) %>% filter(Sunrise_Sunset%in%c("Day", "Night"))
flextable::flextable(groupCounts, cwidth = 2)
Sunrise_Sunset | n |
---|---|
Day | 96,800 |
Night | 51,486 |
# Overall sample size
N <- nrow(US_Accidents_2021_3States)
# Number of permutations
nperms <- 1000
# Instantiating vector for test statistics
permFs <- vector(length = nperms)
mean(permFs>=Fstatistic)
## [1] 0
# Create vector of the number of accidents during the day and night
groups <- rep(groupCounts$Sunrise_Sunset, times = groupCounts$n)
for(p in 1:nperms) {
permData <- US_Accidents_2021_3States %>% mutate(Sunrise_Sunset = groups[sample(1:N, size = N, replace = FALSE)])
# Calculate accidents during the day and night
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = permData)
permFs[p] <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)
}
permFs[p]
## [1] 0.008839787
96,800 accidents happened during the day and 51,486 accidents during the night. The number of permFs was 0.008839787.
# Calculating the standard error
myname<-US_Accidents_2021_3States %>% dplyr::rename(Visibility = `Visibility(mi)`) %>% filter(!is.na(Visibility))
set.seed(1994)
n <-nrow(myname)
visibility <- myname$Visibility
median(visibility)
## [1] 10
B<- 100
# Instantiating matrix for bootstrap samples
paramboots <- matrix(NA, nrow = n, ncol = B)
xBar<-mean(visibility)
s<-sd(visibility)
# Simulating a normal set of 30 values, B times
for(b in 1:B) {
paramboots[, b] <- rnorm(n=n, mean=xBar, sd=s)
}
#Installing vector for bootstrap medians
bootparamMedians<-vector(length=B)
#Calculating median for each simulated data set
for(b in 1:B) {
bootparamMedians[b]<-median(paramboots[,b])
}
#Obtain a parametric bootstrap estimate of the standard error of the sample median.
SEparamestimate<-sd(bootparamMedians[b])
The parametric bootstrap estimate of the standard error of the sample median is 10.