Final Project: Exploring the Relationship Between Population and Accident Data in the US

# Loading the required package

library(skimr)
library(ggplot2)
library(knitr)
library(tidyverse)
library(data.table)
library(knitr)
library(arrow)
library(bench)
library(haven)
library(googlesheets4)
library(plyr)
library(scales)
library(ggdist)
library(patchwork)
library(ggthemes)
library(stringr)

set.seed(1994)

#Importing the US Accidents dataset
US_Accidents <- data.table::fread("Accidents.csv") %>% as_tibble()

# Importing the Population dataset
Population_data <- read.csv("Population.csv")

# Using R's `lubridate` function to manipulate 2021 Accidents Data
US_Accidents_2021 <- US_Accidents %>% mutate(Start_time=lubridate::mdy_hm(Start_Time)) %>% filter(year( Start_time)==2021) 

#Extracting the number of accidents per month using the newly created variable for StarTtime
monthly_accidents_2021<-US_Accidents_2021 %>% mutate(accident_month=lubridate::month(Start_time, label=T, abbr=T))%>% dplyr::count(accident_month) %>% dplyr::rename(accidents=n) %>% flextable::flextable()

#Counting the number of accidents per weather condition and arranging them in a descending order
pie_data<-dplyr::count(US_Accidents_2021, Weather_Condition) %>% dplyr::rename(accidents=n) %>% arrange(desc(accidents))
flextable::flextable(pie_data, cwidth = 2)

Weather_Condition	accidents
Fair	324,867
Mostly Cloudy	87,358
Cloudy	86,303
Partly Cloudy	60,348
Light Rain	28,410
	13,932
Fog	9,725
Haze	9,352
Rain	7,029
Light Snow	6,792
Fair / Windy	3,892
Heavy Rain	2,991
Thunder in the Vicinity	2,927
Thunder	2,588
T-Storm	2,573
Smoke	2,170
Light Rain with Thunder	2,130
Cloudy / Windy	1,594
Heavy T-Storm	1,514
Mostly Cloudy / Windy	1,359
Light Drizzle	1,226
Snow	925
Partly Cloudy / Windy	844
Light Rain / Windy	790
Light Snow / Windy	428
Wintry Mix	428
Rain / Windy	324
Drizzle	313
Heavy Snow	235
Heavy Rain / Windy	230
Patches of Fog	225
Shallow Fog	220
Showers in the Vicinity	177
Mist	171
N/A Precipitation	166
Heavy T-Storm / Windy	155
Haze / Windy	151
T-Storm / Windy	100
Thunder / Windy	89
Snow / Windy	81
Fog / Windy	77
Heavy Snow / Windy	49
Blowing Dust / Windy	32
Drizzle and Fog	31
Light Rain Shower	31
Light Freezing Rain	26
Light Drizzle / Windy	24
Heavy Drizzle	22
Light Snow and Sleet	11
Blowing Dust	9
Light Freezing Drizzle	9
Smoke / Windy	9
Light Snow Shower	7
Light Sleet	6
Widespread Dust / Windy	6
Wintry Mix / Windy	6
Blowing Snow / Windy	5
Rain Shower	5
Tornado	5
Sand / Dust Whirlwinds	4
Squalls / Windy	4
Widespread Dust	4
Drizzle / Windy	3
Light Sleet / Windy	3
Snow and Sleet	3
Squalls	3
Blowing Snow Nearby	1
Duststorm	1
Hail	1
Heavy Rain Shower / Windy	1
Light Freezing Rain / Windy	1
Light Snow and Sleet / Windy	1
Partial Fog	1
Sand / Windy	1
Small Hail	1
Snow and Thunder / Windy	1
Thunder / Wintry Mix	1

#An interactive visualization showing accidents during different weather conditions

library(plotly)
plot_ly(pie_data,values=~accidents,labels=~factor(Weather_Condition),marker=list(colors=c("blue","green")),type="pie")

From the visualization, we can infer that most accidents happened when the weather condition was fair while the least number of accidents occurred during the instances of hail, heavy rain shower/windy, dust-storm, and so on.

#grouping accidents per month in year 2021 for all US states
monthly_US_Accidents_2021<-US_Accidents_2021%>% mutate(Month=lubridate::month(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(Month, State) %>% dplyr::rename(no_of_accidents=n)
flextable::flextable(monthly_US_Accidents_2021, cwidth = 2)

Month	State	no_of_accidents
Jan	MD	1
Jan	MT	1
Jan	OR	6
Jan	SC	2
Feb	AL	6
Feb	AR	12
Feb	AZ	21
Feb	CA	299
Feb	CO	1
Feb	CT	1
Feb	DC	3
Feb	DE	3
Feb	FL	157
Feb	IA	4
Feb	ID	1
Feb	IN	11
Feb	KS	1
Feb	KY	3
Feb	LA	26
Feb	MA	1
Feb	MD	15
Feb	MI	8
Feb	MN	78
Feb	MO	5
Feb	MS	1
Feb	MT	10
Feb	NC	30
Feb	NJ	6
Feb	NM	1
Feb	NV	1
Feb	NY	13
Feb	OH	4
Feb	OR	25
Feb	PA	71
Feb	SC	35
Feb	TN	28
Feb	TX	71
Feb	UT	13
Feb	VA	30
Feb	WA	5
Mar	AL	362
Mar	AR	255
Mar	AZ	695
Mar	CA	9,221
Mar	CO	90
Mar	CT	298
Mar	DC	109
Mar	DE	64
Mar	FL	6,143
Mar	GA	29
Mar	IA	106
Mar	ID	211
Mar	IL	130
Mar	IN	280
Mar	KS	111
Mar	KY	36
Mar	LA	676
Mar	MA	58
Mar	MD	770
Mar	MI	352
Mar	MN	1,054
Mar	MO	299
Mar	MS	81
Mar	MT	183
Mar	NC	1,159
Mar	ND	59
Mar	NE	32
Mar	NH	17
Mar	NJ	550
Mar	NM	50
Mar	NV	77
Mar	NY	1,146
Mar	OH	57
Mar	OR	1,052
Mar	PA	1,618
Mar	RI	14
Mar	SC	1,202
Mar	TN	889
Mar	TX	1,830
Mar	UT	302
Mar	VA	1,534
Mar	WA	194
Mar	WV	111
Apr	AL	466
Apr	AR	288
Apr	AZ	798
Apr	CA	9,862
Apr	CO	119
Apr	CT	438
Apr	DC	151
Apr	DE	76
Apr	FL	6,278
Apr	GA	40
Apr	IA	78
Apr	ID	267
Apr	IL	199
Apr	IN	350
Apr	KS	46
Apr	KY	48
Apr	LA	774
Apr	MA	22
Apr	MD	939
Apr	MI	448
Apr	MN	977
Apr	MO	293
Apr	MS	63
Apr	MT	136
Apr	NC	1,157
Apr	ND	50
Apr	NE	40
Apr	NJ	741
Apr	NM	45
Apr	NV	83
Apr	NY	1,519
Apr	OH	96
Apr	OK	5
Apr	OR	928
Apr	PA	1,686
Apr	RI	24
Apr	SC	1,422
Apr	TN	850
Apr	TX	2,121
Apr	UT	337
Apr	VA	1,908
Apr	WA	181
Apr	WV	140
May	AL	476
May	AR	299
May	AZ	928
May	CA	10,020
May	CO	126
May	CT	548
May	DC	107
May	DE	79
May	FL	6,669
May	GA	40
May	IA	127
May	ID	218
May	IL	260
May	IN	307
May	KS	31
May	KY	32
May	LA	808
May	MD	1,017
May	MI	571
May	MN	1,442
May	MO	248
May	MS	108
May	MT	8
May	NC	1,316
May	ND	55
May	NE	41
May	NH	1
May	NJ	1,000
May	NM	51
May	NV	74
May	NY	1,948
May	OH	72
May	OR	1,079
May	PA	1,979
May	RI	27
May	SC	1,771
May	TN	935
May	TX	2,636
May	UT	637
May	VA	2,318
May	WA	220
May	WI	4
May	WV	171
Jun	AL	528
Jun	AR	422
Jun	AZ	1,136
Jun	CA	15,137
Jun	CO	130
Jun	CT	825
Jun	DC	220
Jun	DE	179
Jun	FL	10,449
Jun	GA	40
Jun	IA	222
Jun	ID	245
Jun	IL	405
Jun	IN	456
Jun	KS	188
Jun	KY	67
Jun	LA	1,132
Jun	MA	13
Jun	MD	1,542
Jun	ME	1
Jun	MI	764
Jun	MN	2,042
Jun	MO	551
Jun	MS	119
Jun	MT	7
Jun	NC	1,944
Jun	ND	84
Jun	NE	93
Jun	NJ	1,130
Jun	NM	63
Jun	NV	117
Jun	NY	2,534
Jun	OH	150
Jun	OR	1,780
Jun	PA	2,782
Jun	RI	37
Jun	SC	3,096
Jun	TN	1,263
Jun	TX	3,901
Jun	UT	968
Jun	VA	3,154
Jun	WA	335
Jun	WI	2
Jun	WV	205
Jul	AL	522
Jul	AR	263
Jul	AZ	1,030
Jul	CA	14,525
Jul	CO	130
Jul	CT	750
Jul	DC	294
Jul	DE	175
Jul	FL	9,330
Jul	GA	27
Jul	IA	188
Jul	ID	86
Jul	IL	345
Jul	IN	490
Jul	KS	105
Jul	KY	73
Jul	LA	1,034
Jul	MA	38
Jul	MD	1,416
Jul	ME	2
Jul	MI	704
Jul	MN	1,453
Jul	MO	505
Jul	MS	124
Jul	MT	118
Jul	NC	1,768
Jul	ND	81
Jul	NE	77
Jul	NH	1
Jul	NJ	1,213
Jul	NM	51
Jul	NV	102
Jul	NY	2,348
Jul	OH	161
Jul	OR	1,626
Jul	PA	2,400
Jul	RI	37
Jul	SC	2,747
Jul	TN	1,388
Jul	TX	3,656
Jul	UT	797
Jul	VA	2,893
Jul	WA	324
Jul	WI	3
Jul	WV	187
Aug	AL	438
Aug	AR	277
Aug	AZ	1,150
Aug	CA	15,251
Aug	CO	130
Aug	CT	709
Aug	DC	272
Aug	DE	174
Aug	FL	10,251
Aug	GA	635
Aug	IA	162
Aug	ID	129
Aug	IL	367
Aug	IN	465
Aug	KS	236
Aug	KY	69
Aug	LA	1,306
Aug	MA	40
Aug	MD	1,572
Aug	MI	809
Aug	MN	1,643
Aug	MO	599
Aug	MS	103
Aug	MT	552
Aug	NC	1,951
Aug	ND	80
Aug	NE	31
Aug	NH	1
Aug	NJ	1,290
Aug	NM	48
Aug	NV	107
Aug	NY	2,352
Aug	OH	131
Aug	OK	204
Aug	OR	1,613
Aug	PA	2,797
Aug	RI	41
Aug	SC	2,694
Aug	TN	1,466
Aug	TX	4,159
Aug	UT	859
Aug	VA	3,081
Aug	VT	2
Aug	WA	314
Aug	WI	1
Aug	WV	192
Aug	WY	1
Sep	AL	515
Sep	AR	320
Sep	AZ	1,442
Sep	CA	17,247
Sep	CO	151
Sep	CT	832
Sep	DC	393
Sep	DE	146
Sep	FL	13,181
Sep	GA	825
Sep	IA	171
Sep	ID	157
Sep	IL	460
Sep	IN	564
Sep	KS	217
Sep	KY	68
Sep	LA	1,467
Sep	MA	42
Sep	MD	1,798
Sep	ME	1
Sep	MI	857
Sep	MN	1,696
Sep	MO	784
Sep	MS	128
Sep	MT	618
Sep	NC	2,238
Sep	ND	58
Sep	NE	21
Sep	NJ	1,225
Sep	NM	16
Sep	NV	133
Sep	NY	2,339
Sep	OH	182
Sep	OK	242
Sep	OR	1,844
Sep	PA	3,056
Sep	RI	55
Sep	SC	3,198
Sep	TN	1,431
Sep	TX	3,989
Sep	UT	632
Sep	VA	3,173
Sep	VT	5
Sep	WA	349
Sep	WI	1
Sep	WV	207
Sep	WY	29
Oct	AL	490
Oct	AR	246
Oct	AZ	1,527
Oct	CA	18,930
Oct	CO	597
Oct	CT	970
Oct	DC	357
Oct	DE	114
Oct	FL	14,991
Oct	GA	1,050
Oct	IA	164
Oct	ID	132
Oct	IL	438
Oct	IN	475
Oct	KS	217
Oct	KY	49
Oct	LA	1,538
Oct	MA	26
Oct	MD	1,712
Oct	MI	861
Oct	MN	1,900
Oct	MO	713
Oct	MS	90
Oct	MT	708
Oct	NC	2,547
Oct	ND	69
Oct	NE	56
Oct	NH	5
Oct	NJ	1,377
Oct	NM	21
Oct	NV	131
Oct	NY	2,644
Oct	OH	193
Oct	OK	255
Oct	OR	1,706
Oct	PA	3,082
Oct	RI	36
Oct	SC	3,288
Oct	SD	1
Oct	TN	1,723
Oct	TX	3,769
Oct	UT	903
Oct	VA	3,649
Oct	VT	2
Oct	WA	612
Oct	WI	5
Oct	WV	230
Oct	WY	97
Nov	AL	613
Nov	AR	391
Nov	AZ	2,033
Nov	CA	22,192
Nov	CO	664
Nov	CT	1,012
Nov	DC	400
Nov	DE	160
Nov	FL	19,853
Nov	GA	1,602
Nov	IA	182
Nov	ID	194
Nov	IL	466
Nov	IN	537
Nov	KS	380
Nov	KY	27
Nov	LA	1,973
Nov	MA	42
Nov	MD	2,213
Nov	MI	1,083
Nov	MN	2,575
Nov	MO	879
Nov	MS	152
Nov	MT	1,164
Nov	NC	3,628
Nov	ND	154
Nov	NE	34
Nov	NH	7
Nov	NJ	1,593
Nov	NM	18
Nov	NV	142
Nov	NY	3,575
Nov	OH	250
Nov	OK	408
Nov	OR	2,191
Nov	PA	4,156
Nov	RI	43
Nov	SC	4,292
Nov	SD	1
Nov	TN	2,192
Nov	TX	4,726
Nov	UT	1,021
Nov	VA	5,262
Nov	WA	1,005
Nov	WI	58
Nov	WV	257
Nov	WY	25
Dec	AL	702
Dec	AR	409
Dec	AZ	2,793
Dec	CA	36,378
Dec	CO	1,221
Dec	CT	1,174
Dec	DC	476
Dec	DE	143
Dec	FL	26,635
Dec	GA	1,905
Dec	IA	358
Dec	ID	387
Dec	IL	521
Dec	IN	487
Dec	KS	320
Dec	KY	31
Dec	LA	3,298
Dec	MA	68
Dec	MD	2,338
Dec	ME	1
Dec	MI	1,370
Dec	MN	6,575
Dec	MO	1,210
Dec	MS	189
Dec	MT	2,591
Dec	NC	5,003
Dec	ND	137
Dec	NE	69
Dec	NH	5
Dec	NJ	1,831
Dec	NM	43
Dec	NV	394
Dec	NY	4,389
Dec	OH	345
Dec	OK	577
Dec	OR	4,697
Dec	PA	4,701
Dec	RI	59
Dec	SC	5,852
Dec	SD	7
Dec	TN	2,932
Dec	TX	5,104
Dec	UT	1,806
Dec	VA	6,200
Dec	WA	1,549
Dec	WI	18
Dec	WV	415
Dec	WY	192

#Visualizing the total accidents per month using geom_col

Monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
                                       y=no_of_accidents, fill=State))+geom_col() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Monthly 2021 Accidents in the US",
         y = "Number of accidents",
         x = "Month",
         caption = "Kaggle Data: US Accidents 2016 - 2021") +
  scale_fill_viridis_d()+
    theme_minimal() + 
  theme_grey(base_size = 11,
  base_family = "")+
    theme(legend.position = "none",
          text = element_text(face = "bold"))
ggplotly(Monthly_accidents)

The chart above indicates that the highest number of cumulative accidents in the US occurred in the month of December while the lowest number of accidents occurred in the month of February.

#Pivoting longer() to create the variable 'accidentLocation' and using filter() to get accidents for the three States

US_Accidents_2021_3States<-US_Accidents_2021 %>% 
  pivot_longer(cols=c(Amenity:Turning_Loop),
               names_to ="accidentLocation",
               values_to = "TRUE_FALSE")%>% filter(State==c("MI", "NY","OH"))

#Exploratory data analysis
glimpse(US_Accidents_2021_3States)

## Rows: 148,534
## Columns: 37
## $ ID                    <chr> "A-224995", "A-224995", "A-224995", "A-224995", …
## $ Severity              <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Start_Time            <chr> "7/31/2021 23:35", "7/31/2021 23:35", "7/31/2021…
## $ End_Time              <chr> "8/1/2021 2:45", "8/1/2021 2:45", "8/1/2021 2:45…
## $ Start_Lat             <dbl> 40.81538, 40.81538, 40.81538, 40.81538, 40.81538…
## $ Start_Lng             <dbl> -73.83590, -73.83590, -73.83590, -73.83590, -73.…
## $ End_Lat               <dbl> 40.81768, 40.81768, 40.81768, 40.81768, 40.81768…
## $ End_Lng               <dbl> -73.83604, -73.83604, -73.83604, -73.83604, -73.…
## $ `Distance(mi)`        <dbl> 0.159, 0.159, 0.159, 0.159, 0.159, 0.083, 0.083,…
## $ Description           <chr> "Crash on I-678 ramp northbound Hutchinson River…
## $ Number                <int> 620, 620, 620, 620, 620, 1066, 1066, 1066, 1066,…
## $ Street                <chr> "Hutchinson River Pkwy", "Hutchinson River Pkwy"…
## $ Side                  <chr> "L", "L", "L", "L", "L", "R", "R", "R", "R", "R"…
## $ City                  <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Ca…
## $ County                <chr> "Bronx", "Bronx", "Bronx", "Bronx", "Bronx", "Pu…
## $ State                 <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", …
## $ Zipcode               <chr> "10465", "10465", "10465", "10465", "10465", "10…
## $ Country               <chr> "US", "US", "US", "US", "US", "US", "US", "US", …
## $ Timezone              <chr> "US/Eastern", "US/Eastern", "US/Eastern", "US/Ea…
## $ Airport_Code          <chr> "KLGA", "KLGA", "KLGA", "KLGA", "KLGA", "KDXR", …
## $ Weather_Timestamp     <chr> "7/31/2021 23:51", "7/31/2021 23:51", "7/31/2021…
## $ `Temperature(F)`      <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 35, 35, 35, …
## $ `Wind_Chill(F)`       <dbl> 72, 72, 72, 72, 72, 67, 67, 67, 67, 26, 26, 26, …
## $ `Humidity(%)`         <int> 64, 64, 64, 64, 64, 49, 49, 49, 49, 54, 54, 54, …
## $ `Pressure(in)`        <dbl> 29.92, 29.92, 29.92, 29.92, 29.92, 29.70, 29.70,…
## $ `Visibility(mi)`      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
## $ Wind_Direction        <chr> "S", "S", "S", "S", "S", "NNW", "NNW", "NNW", "N…
## $ `Wind_Speed(mph)`     <dbl> 6, 6, 6, 6, 6, 8, 8, 8, 8, 13, 13, 13, 13, 15, 1…
## $ `Precipitation(in)`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Weather_Condition     <chr> "Fair", "Fair", "Fair", "Fair", "Fair", "Fair", …
## $ Sunrise_Sunset        <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Civil_Twilight        <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Nautical_Twilight     <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Astronomical_Twilight <chr> "Night", "Night", "Night", "Night", "Night", "Da…
## $ Start_time            <dttm> 2021-07-31 23:35:00, 2021-07-31 23:35:00, 2021-…
## $ accidentLocation      <chr> "Amenity", "Give_Way", "Railway", "Stop", "Turni…
## $ TRUE_FALSE            <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…

skim(US_Accidents_2021_3States)

Data summary
Name	US_Accidents_2021_3States
Number of rows	148534
Number of columns	37
_______________________
Column type frequency:
character	21
logical	1
numeric	14
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	empty	n_unique
ID	1	8	9	0	34275
Start_Time	1	13	16	0	26512
End_Time	1	13	16	0	30182
Description	1	16	375	0	25700
Street	1	4	50	0	4683
Side	1	1	1	0	2
City	1	3	22	0	1002
County	1	3	14	0	128
State	1	2	2	0	3
Zipcode	1	0	10	9	8201
Country	1	2	2	0	1
Timezone	1	0	10	9	3
Airport_Code	1	0	4	282	138
Weather_Timestamp	1	0	16	685	16607
Wind_Direction	1	0	4	2242	19
Weather_Condition	1	0	28	872	54
Sunrise_Sunset	1	0	5	248	3
Civil_Twilight	1	0	5	248	3
Nautical_Twilight	1	0	5	248	3
Astronomical_Twilight	1	0	5	248	3
accidentLocation	1	4	15	0	13

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
TRUE_FALSE	0	1	0.02	FAL: 145012, TRU: 3522

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Severity	0	1.00	2.05	0.32	2.00	2.00	2.00	2.00	4.00	▇▁▁▁▁
Start_Lat	0	1.00	41.89	1.11	38.84	40.81	42.27	42.96	47.13	▁▇▇▁▁
Start_Lng	0	1.00	-77.35	4.49	-90.17	-83.08	-74.07	-73.84	-71.95	▁▃▁▂▇
End_Lat	0	1.00	41.89	1.11	38.83	40.81	42.27	42.96	47.13	▁▇▇▁▁
End_Lng	0	1.00	-77.35	4.49	-90.17	-83.09	-74.08	-73.84	-71.95	▁▃▁▂▇
Distance(mi)	0	1.00	0.85	1.25	0.00	0.11	0.42	1.06	39.63	▇▁▁▁▁
Number	102743	0.31	2585.06	3565.84	1.00	329.00	1282.00	3346.00	44801.00	▇▁▁▁▁
Temperature(F)	847	0.99	57.99	17.19	8.00	44.00	59.00	72.00	98.00	▁▆▆▇▂
Wind_Chill(F)	2320	0.98	56.10	19.54	-6.00	39.00	59.00	72.00	98.00	▁▃▅▇▃
Humidity(%)	895	0.99	66.04	19.78	8.00	51.00	67.00	83.00	100.00	▁▅▇▇▇
Pressure(in)	822	0.99	29.51	0.43	19.75	29.20	29.51	29.85	30.65	▁▁▁▁▇
Visibility(mi)	1164	0.99	9.14	2.51	0.00	10.00	10.00	10.00	20.00	▁▁▇▁▁
Wind_Speed(mph)	2242	0.98	8.37	5.40	0.00	5.00	8.00	12.00	38.00	▇▇▂▁▁
Precipitation(in)	879	0.99	0.01	0.04	0.00	0.00	0.00	0.00	1.32	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
Start_time	0	1	2021-02-27 22:41:00	2021-12-31 22:07:00	2021-09-10 13:21:30	26512

The dataset has a completion rate of 0.98 - 1.0.

# Calculating the number of accidents during days and nights in the 3 states using the variable 'Sunrise_Sunset'

dayAndNight<-US_Accidents_2021_3States %>%
   dplyr::count(Sunrise_Sunset,State) %>% filter(Sunrise_Sunset %in% c("Day","Night"))
flextable::flextable(dayAndNight, cwidth = 2)

Sunrise_Sunset	State	n
Day	MI	23,301
Day	NY	68,486
Day	OH	5,013
Night	MI	10,522
Night	NY	38,850
Night	OH	2,114

NY had the most number of accidents during the day and night (68486, 38850) while Ohio had the least(5013, 2114).

# Summary statistics of the accidents that occurred at the Right and the Left side of the road

leftAndRightSideAccidents<-US_Accidents_2021_3States %>%
   dplyr::count(Side,State)  %>% filter(State %in% c("OH","MI","NY"))
flextable::flextable(leftAndRightSideAccidents, cwidth = 2)

Side	State	n
L	MI	5,142
L	NY	9,817
L	OH	1,951
R	MI	28,735
R	NY	97,705
R	OH	5,184

From the tibble above, we can see that most of the accidents happened on the right side of the road with NY having the highest number of accidents among the 3 states.

# Summary statistics of accidents caused by the selective weather types 
accidents_weather_condition<-US_Accidents_2021_3States %>%
   group_by(Weather_Condition, State, Severity) %>%
   dplyr::summarise(Count = n()) %>%
   dplyr::rename(accident = Count) %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% arrange(desc(accident))
flextable::flextable(accidents_weather_condition, cwidth = 2)

Weather_Condition	State	Severity	accident
Fair	NY	2	35,000
Mostly Cloudy	NY	2	20,781
Cloudy	NY	2	19,356
Fair	MI	2	11,793
Partly Cloudy	NY	2	10,908
Cloudy	MI	2	7,152
Light Rain	NY	2	6,396
Mostly Cloudy	MI	2	4,770
Light Snow	NY	2	2,631
Partly Cloudy	MI	2	2,604
Fair	OH	2	2,417
Light Rain	MI	2	2,193
Fog	NY	2	1,545
Light Snow	MI	2	1,506
Rain	NY	2	1,374
Cloudy	OH	2	1,356
Mostly Cloudy	OH	2	1,135
Fair	NY	4	919
Partly Cloudy	OH	2	847
Cloudy	NY	4	603
Light Rain	OH	2	583
Rain	MI	2	526
Mostly Cloudy	NY	4	496
Fair	MI	4	381
Fog	MI	2	281
Partly Cloudy	NY	4	260
Light Rain	NY	4	222
Cloudy	MI	4	203
Mostly Cloudy	MI	4	95
Light Snow	NY	4	91
Rain	OH	2	82
Light Snow	OH	2	73
Fog	OH	2	65
Light Rain	MI	4	61
Partly Cloudy	MI	4	49
Light Snow	MI	4	40
Fog	NY	4	31
Fog	MI	4	25
Rain	MI	4	21
Cloudy	OH	4	13
Fair	OH	4	13
Rain	NY	4	12
Fog	OH	4	5
Mostly Cloudy	OH	4	5

Most of the accidents happened in NY when the weather was Fair and the least number of accidents happened in Ohio when the weather was mostly cloudy.

#Counting monthly accidents in the three states
monthly_US_Accidents_2021 <- US_Accidents_2021_3States %>% mutate(Month = lubridate::month(Start_time, label = T, abbr = T)) %>% dplyr::count(Month,State,Start_time) %>% dplyr::rename(no_of_Accidents = n)


#visualizing accidents per month in the three states
monthly_accidents<-monthly_US_Accidents_2021 %>% ggplot(aes(x=Month,
                                       y=no_of_Accidents, fill=State))+
  geom_col()+
  scale_fill_viridis_d()+
  labs(title = "Monthly accidents in Three States",
       x="Month",
       y="Number of accidents",
       caption = "Data source: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents")+
   theme_bw(base_size = 10)+
  theme(text = element_text(face = "bold"), legend.position = c(.95, .95),
  legend.justification = c("right", "top"),
  legend.box.just = "right",
  legend.margin = margin(6, 6, 6, 6))+
  scale_y_continuous(labels = comma,
                     expand=expansion(mult=c(0, .1)))
                                                                                         
ggplotly(monthly_accidents)

As indicated above, the total number of cumulative accidents occurred in December in the 3 States while the least instances of accidents occurred in February.

Dec_Accidents = monthly_US_Accidents_2021 %>% mutate(day = lubridate::day(Start_time)) %>% filter(Month == "Dec", day %in% c(1:31)) %>% dplyr::count(day) %>% dplyr::rename(No_of_Accidents = n)

flextable::flextable(Dec_Accidents, cwidth = 2)

day	No_of_Accidents
1	125
2	169
3	148
4	108
5	91
6	144
7	139
8	200
9	124
10	166
11	170
12	135
13	124
14	135
15	158
16	138
17	183
18	207
19	109
20	152
21	156
22	187
23	210
24	163
25	86
26	65
27	195
28	155
29	133
30	151
31	109

ggplotly(Dec_Accidents %>% ggplot(aes(x = day,
                             y = No_of_Accidents)) +
  geom_col(fill="dodgerblue") +
  scale_x_continuous(breaks = c(1:31))+
  scale_y_continuous(breaks = seq(0,250,10))+
  theme_bw(base_size = 10) +
  scale_fill_viridis_d() +
    theme(legend.position = "none",
          text = element_text(face = "bold")) +
  labs(title = "Dec 2021 Daily Accidents in MI, NY & OH",
       x = "Day",
       y = "Number of Accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021"))

The most number of accidents in December for the 3 states for the year 2021 occurred in the 23rd of December (210).

#Visualizing state accidents on various weather conditions
US_Accidents_2021_3States %>% filter(Weather_Condition %in% c("Clear", "Mostly Cloudy", "Overcast", "Partly Cloudy", "Scattterd Clouds", "Fair", "Light Rain", "Light Snow", "Cloudy", "Rain", "Fog")) %>% ggplot(aes(Weather_Condition, fill=Weather_Condition)) + geom_histogram(stat="count")+facet_wrap(.~State)+
  labs(title = " MI, NY, and OH Accidents as during various weather conditions",
       x="Weather conditions",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  scale_fill_viridis_d()+
  theme_bw()+
  theme_bw()+
  scale_fill_viridis_d() +
  theme(text = element_text(face = "bold"))+
  theme(legend.position = "None",
        axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10))

As depicted by the faceted chart above, NY has the most number of accidents during selective weather conditions while OH has the lowest.

accidents_weather_conditions<- accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
                  y=accident, color=Severity)) +geom_jitter()+
  labs(title = " MI, NY, and OH Accidents as per the weather conditions",
       x="Weather_Condition",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  theme(axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10)) +
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))

ggplotly(accidents_weather_conditions)

As seen above, most accidents with a severity of 2 happened when the weather was fair.

#Visualizing accidents vs weather conditions
accidents_weather_condition %>% ggplot(aes(x=Weather_Condition,
                  y=accident, color=Severity)) +geom_boxplot()+
  labs(title = " MI, NY, and OH Accidents as per the weather conditions",
       x="Weather_Condition",
       y="Number of accidents")+
  scale_y_continuous(label=comma)+
  theme(axis.text.x=element_text(angle = 45,
                              hjust=.9, size=10))+
  theme_minimal() + 
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))+
  theme_bw()

This is an alternative visualization in the form of a box-plot which indicates the most accidents with a severity of 2 happening when the weather was fair.

#Visualizing the number of Accidents in the three state
accidents_weather_condition %>% ggplot(aes(x=State,
                  y=accident, fill=State)) +geom_col() +
     scale_y_continuous(labels = comma,
                     expand=expansion(mult=c(0, .1)))+
  labs(title="Number of accidents per state",
       x="State",
       y="Number of accidents",
       caption = "Kaggle Data: US Accidents 2016 - 2021")+
  scale_fill_viridis_d()+
    theme_bw() +
  theme(legend.position = "none")

The visualization shows the total accidents segregated by the states with NY having the highest number of accidents.

#Creating a frequency table
day_and_night_accidents<-US_Accidents_2021_3States  %>% dplyr::count(Sunrise_Sunset, State) %>% filter(Sunrise_Sunset %in% c("Day","Night")) %>% dplyr::rename(Accidents=n)
flextable::flextable(day_and_night_accidents, cwidth = 2)

Sunrise_Sunset	State	Accidents
Day	MI	23,301
Day	NY	68,486
Day	OH	5,013
Night	MI	10,522
Night	NY	38,850
Night	OH	2,114

day_and_night_accidents %>% ggplot(aes(x=Sunrise_Sunset,
                                 y=Accidents, fill=State))+geom_col()+facet_wrap(.~State)+
  scale_fill_viridis_d()+
  labs(title = "A histogram displaying Sunrise and Sunset Accidents",
     x="Time of accident",
     y="Number of accidents",
     caption = "Kaggle Data: US Accidents 2016 - 2021")+
  theme_bw()

Most of the accidents occurred in NY during the day and the least number of accidents occurred in NY during the night.

#Daily accidents in Michigan
weekday_Michigan_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="MI")
flextable::flextable(weekday_Michigan_Accidents_2021, cwidth = 2)

State	weekday	no_of_accidents
MI	Sun	3,534
MI	Mon	5,051
MI	Tue	4,975
MI	Wed	4,727
MI	Thu	5,171
MI	Fri	5,768
MI	Sat	4,651

The table shows that most of the accidents occurred in MI on Fridays.

#Daily accidents in New York
weekday_NewYork_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="NY")
flextable::flextable(weekday_NewYork_Accidents_2021, cwidth = 2)

State	weekday	no_of_accidents
NY	Sun	12,868
NY	Mon	14,297
NY	Tue	15,810
NY	Wed	16,936
NY	Thu	16,592
NY	Fri	17,621
NY	Sat	13,398

The table shows that most of the accidents occurred in NY on Fridays.

#Daily accidents in Ohio
weekday_Ohio_Accidents_2021<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(State, weekday) %>% dplyr::rename(no_of_accidents=n) %>% filter(State=="OH")
flextable::flextable(weekday_Ohio_Accidents_2021, cwidth = 2)

State	weekday	no_of_accidents
OH	Sun	752
OH	Mon	863
OH	Tue	1,054
OH	Wed	1,034
OH	Thu	1,119
OH	Fri	1,217
OH	Sat	1,096

The table shows that most of the accidents occurred in Ohio on Fridays.

#A table showing accidents in the three states in a week
weekdayAccidents3States<-US_Accidents_2021_3States%>% mutate(weekday=lubridate::wday(Start_time, label=TRUE, abbr=TRUE)) %>% dplyr::count(weekday) %>% dplyr::rename(no_of_accidents=n) %>% arrange(no_of_accidents)
flextable::flextable(weekdayAccidents3States, cwidth = 2)

weekday	no_of_accidents
Sun	17,154
Sat	19,145
Mon	20,211
Tue	21,839
Wed	22,697
Thu	22,882
Fri	24,606

The table shows that most of the accidents in all the 3 states have occurred on Fridays.

hourlyAccidents<-US_Accidents_2021_3States %>% mutate(hourlyAccidents=lubridate::hour(Start_time)) %>% dplyr::count(hourlyAccidents,State) %>% dplyr::rename(accidents=n)

flextable::flextable(hourlyAccidents, cwidth = 2)

hourlyAccidents	State	accidents
0	MI	662
0	NY	2,440
0	OH	130
1	MI	491
1	NY	1,974
1	OH	131
2	MI	590
2	NY	1,502
2	OH	104
3	MI	506
3	NY	1,629
3	OH	62
4	MI	356
4	NY	2,142
4	OH	58
5	MI	741
5	NY	3,708
5	OH	76
6	MI	1,281
6	NY	4,848
6	OH	91
7	MI	1,703
7	NY	5,713
7	OH	317
8	MI	1,256
8	NY	4,672
8	OH	151
9	MI	1,252
9	NY	4,441
9	OH	220
10	MI	1,235
10	NY	3,925
10	OH	192
11	MI	1,505
11	NY	4,622
11	OH	299
12	MI	1,730
12	NY	5,574
12	OH	430
13	MI	2,089
13	NY	5,484
13	OH	380
14	MI	2,421
14	NY	6,712
14	OH	582
15	MI	3,326
15	NY	8,304
15	OH	758
16	MI	3,042
16	NY	7,722
16	OH	732
17	MI	2,765
17	NY	8,358
17	OH	780
18	MI	1,974
18	NY	5,367
18	OH	455
19	MI	1,427
19	NY	4,131
19	OH	309
20	MI	1,149
20	NY	4,102
20	OH	244
21	MI	903
21	NY	3,740
21	OH	231
22	MI	802
22	NY	3,499
22	OH	269
23	MI	671
23	NY	2,913
23	OH	134

hourlyAccidents %>% ggplot(aes(x=hourlyAccidents,
                  y=accidents, color=State))+ 
  geom_line(stat="identity", position="identity")+
  geom_point()+
  scale_x_continuous(breaks=seq(0,23,1))+theme(text = element_text(face="bold"), legend.box.margin = margin(10, 10, 10, 10))+
  labs(title = "Accidents within 24 hours in MI,NY,OH",
     x="Time of accident",
     y="Number of accidents")+
  theme(legend.box.margin = margin(6, 6, 6, 6))

The line-chart indicates that most of the accidents happened at 3 pm and 5 pm in all the 3 states. NY has the highest number of accidents within 24 hours.

#Frequency table showing the number of day and night accidents
count(US_Accidents_2021_3States, 'Sunrise_Sunset', 'Severity') %>% filter(Sunrise_Sunset %in% c("Day", "Night")) %>% dplyr::rename(Accidents = freq)

##   Sunrise_Sunset Accidents
## 1            Day    197886
## 2          Night    106154

The number of accidents happening during the day (197886) is nearly twice the accidents at night.

#Frequency table showing the number of accidents for the Michigan, New York and Ohio
count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(Accidents = freq)

##   State Accidents
## 1    MI     69618
## 2    NY    220774
## 3    OH     14342

New York has the highest number of accidents (220774) amongst the 3 states.

#Joining State names with equivalent Abbreviations using the `State.name` and `state.abb` R fucntions
State_Abb<- data.frame(State = state.name, StateAbb = state.abb)

#Merging `State_Abb` with `PopData` using full_join()
FullPopData<- full_join(Population_data, State_Abb, by = c("STNAME" = "State"))

#Selecting three states from the population Data (MN, MI & NY)

populationTotals<- filter(FullPopData, CTYNAME %in% c("Michigan", "Ohio", "New York"))

FullPopData_3_States<- populationTotals %>% select(c("StateAbb","STNAME","CTYNAME", "POPESTIMATE2021"))

#Joining Accident Data `US_Accidents_2021_3States` and the Population Data `FullPopData_3_States`
Accident_Pop_Data <- US_Accidents_2021_3States %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))

#Counting the number of accidents per State
US_Accidents_2021_3States1 <- count(US_Accidents_2021_3States, 'State', 'Severity')%>% dplyr::rename(accidents_2021 = freq)
US_Accidents_2021_3States_df <- data.frame(US_Accidents_2021_3States1)

# Creating a table showing the number of accidents and population of the respective states
US_accident_population <- US_Accidents_2021_3States_df %>% full_join(FullPopData_3_States, by = c("State" ="StateAbb"))
flextable::flextable(US_accident_population, cwidth = 2)

State	accidents_2021	STNAME	CTYNAME	POPESTIMATE2021
MI	69,618	Michigan	Michigan	10,050,811
NY	220,774	New York	New York	19,835,913
OH	14,342	Ohio	Ohio	11,780,017

NY has the highest number of accidents alongside having the highest population estimate. Ohio has a higher population estimate compared to Michigan. However, it has a lower record of accidents in comparison to the latter.

#Interactive Visualizations for Population estimate (2021) vs accidents in NY, OH, MI

ggplotly(US_accident_population %>% ggplot(aes(x = STNAME,
                               y = POPESTIMATE2021,
                               fill= accidents_2021)) +
  geom_col()+
    scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
    theme_grey()+
    theme(legend.position = "bottom",
          text = element_text(face = "bold"))+
  labs(title="Population estimate (2021) vs accidents in NY, OH, MI",
      X="State Name",
      y="Population",
      caption = "https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents"))

#Using str_detect and mutate() to create a new boolean variable name to indicate whether accident description contains "slow traffic"
slow_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_slow_traffic= str_detect(str_to_lower(Description), pattern = "slow traffic")) 

#number of  accidents  when there was slow traffic
slow_traffic_accidents %>% pull(contains_slow_traffic) %>% sum()

## [1] 24371

The number of accidents that led to slow traffic was 24371.

#number of  accidents that caused stationary traffic
stationary_traffic_accidents<-US_Accidents_2021_3States %>% mutate(contains_stationary_traffic= str_detect(str_to_lower(Description), pattern = "stationary traffic"))
stationary_traffic_accidents %>% pull(contains_stationary_traffic) %>% sum()

## [1] 23227

23227 led to stationary traffic.

#Creating a Data Dictionary for the US Accidents and Population Data
myvariables<-Accident_Pop_Data %>% select(c("State", 
                         "Sunrise_Sunset", 
                         "Weather_Condition", 
                         "Side",
                         "Severity",
                         "CTYNAME",
                         "POPESTIMATE2021",
                         "accidentLocation",
                         "Start_time",
                         "Description",
                         "STNAME"))
dataDictionary <- tibble(Variable = colnames(myvariables),
                         Description = c("US States", 
                         "Day or Night", 
                         "Weather Conditions", 
                         " Side of the Road",
                         "Impact of the accident on traffic on a scale of 1 to 4",
                         "Name of the city",
                         "2021 Population Estimate",
                         "Location of the accident",
                         "Start time of the Accident",
                         "Description of the accident aftermath",
                         "The name of the State"),
Type = map_chr(myvariables, .f = function(x){typeof(x)[1]}),
Class = map_chr(myvariables, .f = function(x){class(x)[1]}))
flextable::flextable(dataDictionary, cwidth = 2)

Variable	Description	Type	Class
State	US States	character	character
Sunrise_Sunset	Day or Night	character	character
Weather_Condition	Weather Conditions	character	character
Side	Side of the Road	character	character
Severity	Impact of the accident on traffic on a scale of 1 to 4	integer	integer
CTYNAME	Name of the city	character	character
POPESTIMATE2021	2021 Population Estimate	integer	integer
accidentLocation	Location of the accident	character	character
Start_time	Start time of the Accident	double	POSIXct
Description	Description of the accident aftermath	character	character
STNAME	The name of the State	character	character

# Randomization test example ---------------------------------

library(broom)

# Loading the dataset

data(US_Accidents_2021_3States)
myData <- US_Accidents_2021_3States %>% select(Sunrise_Sunset, `Visibility(mi)`) 

# Fitting One-Way ANOVA model
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = US_Accidents_2021_3States)
Fstatistic <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)

# Getting the number of accidents during the day and night

groupCounts <- US_Accidents_2021_3States %>% dplyr::count(Sunrise_Sunset) %>% filter(Sunrise_Sunset%in%c("Day", "Night"))
flextable::flextable(groupCounts, cwidth = 2)

Sunrise_Sunset	n
Day	96,800
Night	51,486

# Overall sample size
N <- nrow(US_Accidents_2021_3States)

# Number of permutations
nperms <- 1000

# Instantiating vector for test statistics
permFs <- vector(length = nperms)
mean(permFs>=Fstatistic)

## [1] 0

# Create vector of the number of accidents during the day and night
groups <- rep(groupCounts$Sunrise_Sunset, times = groupCounts$n)

for(p in 1:nperms) {
permData <- US_Accidents_2021_3States %>% mutate(Sunrise_Sunset = groups[sample(1:N, size = N, replace = FALSE)])

# Calculate accidents during the day and night
modFit <- aov(`Visibility(mi)` ~ Sunrise_Sunset, data = permData)
permFs[p] <- modFit %>% tidy() %>% slice_head(n = 1) %>% pull(statistic)
}
permFs[p]

## [1] 0.008839787

96,800 accidents happened during the day and 51,486 accidents during the night. The number of permFs was 0.008839787.

# Calculating the standard error

myname<-US_Accidents_2021_3States %>% dplyr::rename(Visibility = `Visibility(mi)`) %>% filter(!is.na(Visibility))

set.seed(1994)

n <-nrow(myname) 
visibility <- myname$Visibility
median(visibility)

## [1] 10

B<- 100

# Instantiating matrix for bootstrap samples
paramboots <- matrix(NA, nrow = n, ncol = B)
xBar<-mean(visibility)
s<-sd(visibility)

# Simulating a normal set of 30 values, B times
for(b in 1:B) {
paramboots[, b] <- rnorm(n=n, mean=xBar, sd=s)
}

#Installing vector for bootstrap medians
bootparamMedians<-vector(length=B)

#Calculating median for each simulated data set
for(b in 1:B) {
  bootparamMedians[b]<-median(paramboots[,b])
}

#Obtain a parametric bootstrap estimate of the standard error of the sample median.
  SEparamestimate<-sd(bootparamMedians[b])

The parametric bootstrap estimate of the standard error of the sample median is 10.

Final Project: Exploring the Relationship Between Population and Accident Data in the US

Hyreen Alice, Pujan Rijal, Banabas Kariuki

2022-12-12

day	No_of_Accidents
1	125
2	169
3	148
4	108
5	91
6	144
7	139
8	200
9	124
10	166
11	170
12	135
13	124
14	135
15	158
16	138
17	183
18	207
19	109
20	152
21	156
22	187
23	210
24	163
25	86
26	65
27	195
28	155
29	133
30	151
31	109

day	No_of_Accidents
1	125
2	169
3	148
4	108
5	91
6	144
7	139
8	200
9	124
10	166
11	170
12	135
13	124
14	135
15	158
16	138
17	183
18	207
19	109
20	152
21	156
22	187
23	210
24	163
25	86
26	65
27	195
28	155
29	133
30	151
31	109

day	No_of_Accidents
1	125
2	169
3	148
4	108
5	91
6	144
7	139
8	200
9	124
10	166
11	170
12	135
13	124
14	135
15	158
16	138
17	183
18	207
19	109
20	152
21	156
22	187
23	210
24	163
25	86
26	65
27	195
28	155
29	133
30	151
31	109