1 Introduction
In this R Markdown, historical data of motor vehicle collisions happened in New York is analyzed using practical statistic.
2 Input Data
Data was downloaded from NYC Open Data (https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95).
nyc_crashes <- read.csv("data/Motor_Vehicle_Collisions_-_Crashes.csv")2.1 Data Inspection
Checking data types:
glimpse(nyc_crashes)## Rows: 1,809,646
## Columns: 29
## $ CRASH.DATE <chr> "04/14/2021", "04/13/2021", "04/15/2021"~
## $ CRASH.TIME <chr> "5:32", "21:35", "16:15", "16:00", "8:25~
## $ BOROUGH <chr> "", "BROOKLYN", "", "BROOKLYN", "", "", ~
## $ ZIP.CODE <int> NA, 11217, NA, 11222, NA, NA, 11106, NA,~
## $ LATITUDE <dbl> NA, 40.68358, NA, NA, 0.00000, NA, NA, N~
## $ LONGITUDE <dbl> NA, -73.97617, NA, NA, 0.00000, NA, NA, ~
## $ LOCATION <chr> "", "(40.68358, -73.97617)", "", "", "(0~
## $ ON.STREET.NAME <chr> "BRONX WHITESTONE BRIDGE", "", "HUTCHINS~
## $ CROSS.STREET.NAME <chr> "", "", "", "ANTHONY STREET", "", "", "3~
## $ OFF.STREET.NAME <chr> "", "620 ATLANTIC AVENUE ~
## $ NUMBER.OF.PERSONS.INJURED <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0~
## $ NUMBER.OF.PERSONS.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.PEDESTRIANS.INJURED <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0~
## $ NUMBER.OF.PEDESTRIANS.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.CYCLIST.INJURED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.CYCLIST.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.MOTORIST.INJURED <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0~
## $ NUMBER.OF.MOTORIST.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ CONTRIBUTING.FACTOR.VEHICLE.1 <chr> "Following Too Closely", "Unspecified", ~
## $ CONTRIBUTING.FACTOR.VEHICLE.2 <chr> "Unspecified", "", "", "Unspecified", "U~
## $ CONTRIBUTING.FACTOR.VEHICLE.3 <chr> "", "", "", "", "", "", "", "", "", "", ~
## $ CONTRIBUTING.FACTOR.VEHICLE.4 <chr> "", "", "", "", "", "", "", "", "", "", ~
## $ CONTRIBUTING.FACTOR.VEHICLE.5 <chr> "", "", "", "", "", "", "", "", "", "", ~
## $ COLLISION_ID <int> 4407480, 4407147, 4407665, 4407811, 4406~
## $ VEHICLE.TYPE.CODE.1 <chr> "Sedan", "Sedan", "Station Wagon/Sport U~
## $ VEHICLE.TYPE.CODE.2 <chr> "Sedan", "", "", "", "Sedan", "Box Truck~
## $ VEHICLE.TYPE.CODE.3 <chr> "", "", "", "", "", "", "", "", "", "", ~
## $ VEHICLE.TYPE.CODE.4 <chr> "", "", "", "", "", "", "", "", "", "", ~
## $ VEHICLE.TYPE.CODE.5 <chr> "", "", "", "", "", "", "", "", "", "", ~
Trying to identify unique values:
length(unique(nyc_crashes$BOROUGH))## [1] 6
length(unique(nyc_crashes$CONTRIBUTING.FACTOR.VEHICLE.1))## [1] 62
length(unique(nyc_crashes$VEHICLE.TYPE.CODE.1))## [1] 1274
Trying to find any missing values:
anyNA(nyc_crashes)## [1] TRUE
Conclusion:
- Need to derived CRASH.DATE into MONTH and YEAR to analyze seasonality
- Need to group CRASH.TIME into MORNING, AFTERNOON and EVENING
- Keep only CONTRIBUTING.FACTOR and VEHICLE.TYPE number 1 for simplicity
- Remove missing values
2.2 Data Cleansing and Coertions
Remove rows with missing values:
nyc_crashes <- na.omit(nyc_crashes)
anyNA(nyc_crashes)## [1] FALSE
Drop columns CONTRIBUTING.FACTOR.VEHICLE and VEHICLE.TYPE.CODE other than numbered 1:
nyc_crashes <- nyc_crashes[,-c(20,21,22,23,26,27,28,29)]
colnames(nyc_crashes)## [1] "CRASH.DATE" "CRASH.TIME"
## [3] "BOROUGH" "ZIP.CODE"
## [5] "LATITUDE" "LONGITUDE"
## [7] "LOCATION" "ON.STREET.NAME"
## [9] "CROSS.STREET.NAME" "OFF.STREET.NAME"
## [11] "NUMBER.OF.PERSONS.INJURED" "NUMBER.OF.PERSONS.KILLED"
## [13] "NUMBER.OF.PEDESTRIANS.INJURED" "NUMBER.OF.PEDESTRIANS.KILLED"
## [15] "NUMBER.OF.CYCLIST.INJURED" "NUMBER.OF.CYCLIST.KILLED"
## [17] "NUMBER.OF.MOTORIST.INJURED" "NUMBER.OF.MOTORIST.KILLED"
## [19] "CONTRIBUTING.FACTOR.VEHICLE.1" "COLLISION_ID"
## [21] "VEHICLE.TYPE.CODE.1"
Change type:
nyc_crashes$CRASH.DATE <- as.Date(nyc_crashes$CRASH.DATE,"%m/%d/%Y")
nyc_crashes$MONTH <- format(nyc_crashes$CRASH.DATE,"%m")
nyc_crashes$YEAR <- format(nyc_crashes$CRASH.DATE,"%Y")
nyc_crashes$CRASH.TIME <- hm(nyc_crashes$CRASH.TIME)
nyc_crashes$BOROUGH <- as.factor(nyc_crashes$BOROUGH)
nyc_crashes$CONTRIBUTING.FACTOR.VEHICLE.1 <- as.factor(nyc_crashes$BOROUGH)
nyc_crashes$VEHICLE.TYPE.CODE.1 <- as.factor(nyc_crashes$VEHICLE.TYPE.CODE.1)
nyc_crashes$MONTH <- as.factor(nyc_crashes$MONTH)
nyc_crashes$YEAR <- as.factor(nyc_crashes$YEAR)
nyc_crashes %>% mutate(PERIOD = ifelse(hour(CRASH.TIME) >= 18,"EVENING",ifelse(hour(CRASH.TIME)<12,"MORNING","AFTERNOON"))) -> nyc_crashes
glimpse(nyc_crashes)## Rows: 1,218,979
## Columns: 24
## $ CRASH.DATE <date> 2021-04-13, 2019-05-21, 2021-02-26, 202~
## $ CRASH.TIME <Period> 21H 35M 0S, 22H 50M 0S, 14H 50M 0S, 2~
## $ BOROUGH <fct> BROOKLYN, BROOKLYN, BRONX, BROOKLYN, STA~
## $ ZIP.CODE <int> 11217, 11201, 10461, 11234, 10312, 10461~
## $ LATITUDE <dbl> 40.68358, 40.69754, 40.84346, 40.62646, ~
## $ LONGITUDE <dbl> -73.97617, -73.98312, -73.83600, -73.918~
## $ LOCATION <chr> "(40.68358, -73.97617)", "(40.69754, -73~
## $ ON.STREET.NAME <chr> "", "GOLD STREET", "", "RALPH AVENUE", "~
## $ CROSS.STREET.NAME <chr> "", "CONCORD STREET", "", "AVENUE K", "H~
## $ OFF.STREET.NAME <chr> "620 ATLANTIC AVENUE ~
## $ NUMBER.OF.PERSONS.INJURED <int> 1, 0, 0, 1, 7, 0, 0, 0, 0, 2, 1, 0, 0, 0~
## $ NUMBER.OF.PERSONS.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.PEDESTRIANS.INJURED <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.PEDESTRIANS.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.CYCLIST.INJURED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.CYCLIST.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ NUMBER.OF.MOTORIST.INJURED <int> 0, 0, 0, 1, 7, 0, 0, 0, 0, 2, 1, 0, 0, 0~
## $ NUMBER.OF.MOTORIST.KILLED <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ CONTRIBUTING.FACTOR.VEHICLE.1 <fct> BROOKLYN, BROOKLYN, BRONX, BROOKLYN, STA~
## $ COLLISION_ID <int> 4407147, 4136992, 4395664, 4403773, 4405~
## $ VEHICLE.TYPE.CODE.1 <fct> Sedan, �MBU, Station Wagon/Sport Utili~
## $ MONTH <fct> 04, 05, 02, 03, 04, 04, 04, 04, 04, 04, ~
## $ YEAR <fct> 2021, 2019, 2021, 2021, 2021, 2021, 2021~
## $ PERIOD <chr> "EVENING", "EVENING", "AFTERNOON", "EVEN~
3 Analysis
Summary:
summary(nyc_crashes)## CRASH.DATE CRASH.TIME
## Min. :2012-07-01 Min. :0S
## 1st Qu.:2014-06-18 1st Qu.:10H 0M 0S
## Median :2016-06-10 Median :14H 20M 0S
## Mean :2016-07-25 Mean :13H 45M 2.91691653424641S
## 3rd Qu.:2018-08-06 3rd Qu.:18H 0M 0S
## Max. :2021-08-10 Max. :23H 59M 0S
##
## BOROUGH ZIP.CODE LATITUDE LONGITUDE
## BRONX :175115 Min. :10000 Min. : 0.00 Min. :-74.25
## BROOKLYN :384678 1st Qu.:10306 1st Qu.:40.67 1st Qu.:-73.98
## MANHATTAN :280109 Median :11207 Median :40.72 Median :-73.93
## QUEENS :327589 Mean :10838 Mean :40.67 Mean :-73.83
## STATEN ISLAND: 51488 3rd Qu.:11237 3rd Qu.:40.77 3rd Qu.:-73.87
## Max. :11697 Max. :41.13 Max. : 0.00
##
## LOCATION ON.STREET.NAME CROSS.STREET.NAME OFF.STREET.NAME
## Length:1218979 Length:1218979 Length:1218979 Length:1218979
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## Min. : 0.0000 Min. :0.00000
## 1st Qu.: 0.0000 1st Qu.:0.00000
## Median : 0.0000 Median :0.00000
## Mean : 0.2648 Mean :0.00116
## 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :43.0000 Max. :8.00000
##
## NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## Min. : 0.00000 Min. :0.000000
## 1st Qu.: 0.00000 1st Qu.:0.000000
## Median : 0.00000 Median :0.000000
## Mean : 0.06072 Mean :0.000667
## 3rd Qu.: 0.00000 3rd Qu.:0.000000
## Max. :27.00000 Max. :6.000000
##
## NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED NUMBER.OF.MOTORIST.INJURED
## Min. :0.00000 Min. :0.00e+00 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.00e+00 1st Qu.: 0.0000
## Median :0.00000 Median :0.00e+00 Median : 0.0000
## Mean :0.02668 Mean :9.68e-05 Mean : 0.1766
## 3rd Qu.:0.00000 3rd Qu.:0.00e+00 3rd Qu.: 0.0000
## Max. :4.00000 Max. :2.00e+00 Max. :43.0000
##
## NUMBER.OF.MOTORIST.KILLED CONTRIBUTING.FACTOR.VEHICLE.1 COLLISION_ID
## Min. :0.00000 BRONX :175115 Min. : 22
## 1st Qu.:0.00000 BROOKLYN :384678 1st Qu.: 354722
## Median :0.00000 MANHATTAN :280109 Median :3462186
## Mean :0.00039 QUEENS :327589 Mean :2736171
## 3rd Qu.:0.00000 STATEN ISLAND: 51488 3rd Qu.:3961528
## Max. :5.00000 Max. :4446512
##
## VEHICLE.TYPE.CODE.1 MONTH YEAR
## PASSENGER VEHICLE :310312 07 :114707 2015 :163428
## Sedan :281012 10 :108166 2014 :156327
## Station Wagon/Sport Utility Vehicle:218084 08 :106842 2013 :155959
## SPORT UTILITY / STATION WAGON :134188 12 :105681 2018 :144752
## TAXI : 28136 09 :104973 2017 :138461
## Taxi : 27758 06 :104850 2016 :136859
## (Other) :219489 (Other):573760 (Other):323193
## PERIOD
## Length:1218979
## Class :character
## Mode :character
##
##
##
##
3.1 Univariate Analysis
Check fatality distribution:
boxplot(nyc_crashes$NUMBER.OF.PERSONS.KILLED)
Check injury case distribution:
boxplot(nyc_crashes$NUMBER.OF.PERSONS.INJURED)Check fatality and injury percentage:
crash_with_fatality <- aggregate(COLLISION_ID ~ NUMBER.OF.PERSONS.KILLED,nyc_crashes,length)
crash_with_injury <- aggregate(COLLISION_ID ~ NUMBER.OF.PERSONS.INJURED,nyc_crashes,length)
crash_with_fatality$Percentage <- round(crash_with_fatality$COLLISION_ID / nrow(nyc_crashes) * 100,2)
crash_with_injury$Percentage <- round(crash_with_injury$COLLISION_ID / nrow(nyc_crashes) * 100,2)
crash_with_fatality## NUMBER.OF.PERSONS.KILLED COLLISION_ID Percentage
## 1 0 1217615 99.89
## 2 1 1332 0.11
## 3 2 23 0.00
## 4 3 5 0.00
## 5 4 2 0.00
## 6 5 1 0.00
## 7 8 1 0.00
crash_with_injury## NUMBER.OF.PERSONS.INJURED COLLISION_ID Percentage
## 1 0 975563 80.03
## 2 1 193375 15.86
## 3 2 32870 2.70
## 4 3 10567 0.87
## 5 4 3871 0.32
## 6 5 1532 0.13
## 7 6 622 0.05
## 8 7 273 0.02
## 9 8 120 0.01
## 10 9 56 0.00
## 11 10 48 0.00
## 12 11 23 0.00
## 13 12 16 0.00
## 14 13 10 0.00
## 15 14 5 0.00
## 16 15 5 0.00
## 17 16 4 0.00
## 18 17 4 0.00
## 19 18 3 0.00
## 20 19 3 0.00
## 21 20 1 0.00
## 22 22 3 0.00
## 23 24 2 0.00
## 24 27 1 0.00
## 25 32 1 0.00
## 26 43 1 0.00
Fortunately, fatality and injuries from crashes in NYC relatively small compares to the total of crashes, with both average below 1.
3.2 Bivariate Analysis
Crash freq by Year and Month:
freq_yearmonth <- aggregate(COLLISION_ID ~ YEAR + MONTH,nyc_crashes,length)
freq_yearmonth$COLLISION_ID <- round(freq_yearmonth$COLLISION_ID/1000,2)
colnames(freq_yearmonth)[3] <- "CRASH"
freq_yearmonth[with(freq_yearmonth,order(YEAR,MONTH)),] %>% ggplot(aes(YEAR,MONTH,fill = CRASH)) + geom_tile() +scale_fill_distiller(palette = "Reds", direction = +1)+ geom_text(aes(label=CRASH),size=2.5,color = "black") +ggtitle("Crashes by Year and Month (in Thousands)")Fatality by Year and Month:
fatal_yearmonth <- aggregate(NUMBER.OF.PERSONS.KILLED ~ YEAR + MONTH,nyc_crashes,sum)
colnames(fatal_yearmonth)[3] <- "FATALITY"
fatal_yearmonth[with(fatal_yearmonth,order(YEAR,MONTH)),] %>% ggplot(aes(YEAR,MONTH,fill = FATALITY)) + geom_tile() +scale_fill_distiller(palette = "Reds", direction = +1)+ geom_text(aes(label=FATALITY),size=2.5,color = "black") +ggtitle("Fatality by Year and Month")Check crashes happened in Jan 2013:
nyc_crashes[ nyc_crashes$YEAR == 2013 & nyc_crashes$MONTH == "01" & nyc_crashes$NUMBER.OF.PERSONS.KILLED > 0,c("CRASH.DATE","COLLISION_ID","NUMBER.OF.PERSONS.KILLED","PERIOD")]## CRASH.DATE COLLISION_ID NUMBER.OF.PERSONS.KILLED PERIOD
## 1689763 2013-01-25 145617 1 MORNING
## 1692681 2013-01-29 248775 1 AFTERNOON
## 1693503 2013-01-19 229627 1 EVENING
## 1695221 2013-01-30 136979 1 EVENING
## 1695519 2013-01-30 222269 1 AFTERNOON
## 1697245 2013-01-26 122858 1 AFTERNOON
## 1697483 2013-01-30 62619 1 MORNING
## 1699969 2013-01-11 222056 1 EVENING
## 1701541 2013-01-22 131557 1 AFTERNOON
## 1702269 2013-01-29 110757 1 AFTERNOON
## 1702593 2013-01-31 215286 1 EVENING
## 1702731 2013-01-17 215162 1 EVENING
## 1703127 2013-01-29 136972 1 EVENING
## 1703343 2013-01-20 23074 1 MORNING
## 1703434 2013-01-18 27940 1 AFTERNOON
## 1703727 2013-01-23 222196 1 EVENING
## 1703733 2013-01-29 47363 1 EVENING
## 1704273 2013-01-12 215118 1 MORNING
## 1708272 2013-01-04 96432 1 EVENING
## 1710918 2013-01-07 248436 1 MORNING
## 1711575 2013-01-05 100720 1 AFTERNOON
## 1712994 2013-01-06 153400 1 EVENING
## 1714103 2013-01-04 22964 1 AFTERNOON
## 1716529 2013-01-04 73636 1 MORNING
## 1717539 2013-01-05 117875 3 MORNING
head(nyc_crashes[order(-nyc_crashes$NUMBER.OF.PERSONS.KILLED),c("CRASH.DATE","COLLISION_ID","NUMBER.OF.PERSONS.KILLED","PERIOD")],10)## CRASH.DATE COLLISION_ID NUMBER.OF.PERSONS.KILLED PERIOD
## 666895 2017-10-31 3782508 8 AFTERNOON
## 1794311 2012-07-22 208863 5 MORNING
## 703346 2017-09-18 3752786 4 MORNING
## 1470206 2014-04-04 316814 4 EVENING
## 104624 2020-10-06 4355333 3 MORNING
## 112205 2020-07-11 4327676 3 EVENING
## 1192397 2015-07-01 3251308 3 EVENING
## 1674949 2013-03-03 195451 3 MORNING
## 1717539 2013-01-05 117875 3 MORNING
## 51263 2020-12-12 4375377 2 EVENING
Crash freq by Month and time period:
freq_period <- aggregate(COLLISION_ID ~ MONTH + PERIOD,nyc_crashes,length)
colnames(freq_period)[3] <- "CRASH"
freq_period %>% ggplot(aes(MONTH,CRASH,fill = PERIOD)) + geom_bar(position = "dodge", stat = "identity") + scale_fill_brewer(palette = "Blues") +ggtitle("Crash Freq by Month and Time Period")Crash freq by Borough and time period:
freq_borperiod <- aggregate(COLLISION_ID ~ BOROUGH + PERIOD,nyc_crashes,length)
colnames(freq_borperiod)[3] <- "CRASH"
freq_borperiod %>% ggplot(aes(BOROUGH,CRASH,fill = PERIOD)) + geom_bar(position = "dodge", stat = "identity") + scale_fill_brewer(palette = "Blues") +ggtitle("Crash Freq by Month and Time Period")From the heatmap, looks like most of the crashes happened between the months of May and June. Highest count of crashes happen in 2015 with all-time high happened in October 2015. Most fatality occurred in a month happened in Jan 2013 with Jan 5^th 2013 had most fatalities of 3, although still under record-breaking fatalities of 8 people in Oct 31st 2017.Afternoon consistently record the most frequent crashes happens in NYC, where most of the time happened in Brooklyn.
3.3 Conclusion
NYC still manage to keep crash fatalities and injuries low year-by-year. Trends of crashes start increasing in mid-year and slows down in early months. NYC officials may want to put more traffic management effort in the afternoon, since crashes constantly happens at that time of the day. Crashes significantly decrease in 2020 and 2021, which we may find it interesting to relate it with Covid19 pandemic.