This dataset is based on approximately the last 5 years of NYPD motor vehicle collisions in New York City.
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
data2 <- read.csv("NYPD_Motor_Vehicle_Collisions.csv")
head(data2)
## DATE TIME BOROUGH ZIP.CODE LATITUDE LONGITUDE
## 1 10/01/2017 0:00 BROOKLYN 11236 40.64423 -73.89980
## 2 10/01/2017 0:00 MANHATTAN 10021 40.77075 -73.95695
## 3 10/01/2017 0:00 MANHATTAN 10040 40.85522 -73.93587
## 4 10/01/2017 0:00 QUEENS 11109 NA NA
## 5 10/01/2017 0:00 QUEENS 11419 40.68653 -73.83005
## 6 10/01/2017 0:00 NA 40.65008 -73.91544
## LOCATION ON.STREET.NAME
## 1 (40.644226, -73.899796) FLATLANDS AVENUE
## 2 (40.770752, -73.95695) 2 AVENUE
## 3 (40.855217, -73.935875)
## 4
## 5 (40.686527, -73.83005) 103 AVENUE
## 6 (40.65008, -73.915436) AVENUE B
## CROSS.STREET.NAME OFF.STREET.NAME
## 1 EAST 98 STREET
## 2 EAST 75 STREET
## 3 66 OVERLOOK TERRACE
## 4 46-30 CENTER BOULEVARD
## 5 113 STREET
## 6
## NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NUMBER.OF.MOTORIST.INJURED NUMBER.OF.MOTORIST.KILLED
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2
## 1 Aggressive Driving/Road Rage Unspecified
## 2 Driver Inattention/Distraction Unspecified
## 3 Driver Inattention/Distraction Unspecified
## 4 Unspecified
## 5 Driver Inattention/Distraction Driver Inattention/Distraction
## 6 Unsafe Lane Changing
## CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4
## 1
## 2
## 3 Unspecified
## 4
## 5 Unspecified
## 6
## CONTRIBUTING.FACTOR.VEHICLE.5 UNIQUE.KEY VEHICLE.TYPE.CODE.1
## 1 3761222 PASSENGER VEHICLE
## 2 3761824 PICK-UP TRUCK
## 3 3761939 SPORT UTILITY / STATION WAGON
## 4 3760797 SPORT UTILITY / STATION WAGON
## 5 3761556 PASSENGER VEHICLE
## 6 3761132 SPORT UTILITY / STATION WAGON
## VEHICLE.TYPE.CODE.2 VEHICLE.TYPE.CODE.3 VEHICLE.TYPE.CODE.4
## 1 PASSENGER VEHICLE
## 2 TAXI
## 3 PASSENGER VEHICLE PASSENGER VEHICLE
## 4
## 5 SPORT UTILITY / STATION WAGON PASSENGER VEHICLE
## 6 PASSENGER VEHICLE
## VEHICLE.TYPE.CODE.5
## 1
## 2
## 3
## 4
## 5
## 6
checking the data types
sapply(data2, mode)
## DATE TIME
## "numeric" "numeric"
## BOROUGH ZIP.CODE
## "numeric" "numeric"
## LATITUDE LONGITUDE
## "numeric" "numeric"
## LOCATION ON.STREET.NAME
## "numeric" "numeric"
## CROSS.STREET.NAME OFF.STREET.NAME
## "numeric" "numeric"
## NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## "numeric" "numeric"
## NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## "numeric" "numeric"
## NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED
## "numeric" "numeric"
## NUMBER.OF.MOTORIST.INJURED NUMBER.OF.MOTORIST.KILLED
## "numeric" "numeric"
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2
## "numeric" "numeric"
## CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4
## "numeric" "numeric"
## CONTRIBUTING.FACTOR.VEHICLE.5 UNIQUE.KEY
## "numeric" "numeric"
## VEHICLE.TYPE.CODE.1 VEHICLE.TYPE.CODE.2
## "numeric" "numeric"
## VEHICLE.TYPE.CODE.3 VEHICLE.TYPE.CODE.4
## "numeric" "numeric"
## VEHICLE.TYPE.CODE.5
## "numeric"
sapply(data2, class)
## DATE TIME
## "factor" "factor"
## BOROUGH ZIP.CODE
## "factor" "integer"
## LATITUDE LONGITUDE
## "numeric" "numeric"
## LOCATION ON.STREET.NAME
## "factor" "factor"
## CROSS.STREET.NAME OFF.STREET.NAME
## "factor" "factor"
## NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## "integer" "integer"
## NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## "integer" "integer"
## NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED
## "integer" "integer"
## NUMBER.OF.MOTORIST.INJURED NUMBER.OF.MOTORIST.KILLED
## "integer" "integer"
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2
## "factor" "factor"
## CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4
## "factor" "factor"
## CONTRIBUTING.FACTOR.VEHICLE.5 UNIQUE.KEY
## "factor" "integer"
## VEHICLE.TYPE.CODE.1 VEHICLE.TYPE.CODE.2
## "factor" "factor"
## VEHICLE.TYPE.CODE.3 VEHICLE.TYPE.CODE.4
## "factor" "factor"
## VEHICLE.TYPE.CODE.5
## "factor"
select the relvant variables, clean up the column names
Accidents <- data2 %>%
select(DATE,
TIME,
BOROUGH,
ZIP.CODE,
NUMBER.OF.PERSONS.INJURED,
NUMBER.OF.PERSONS.KILLED,
NUMBER.OF.PEDESTRIANS.INJURED,
NUMBER.OF.PEDESTRIANS.KILLED,
NUMBER.OF.CYCLIST.INJURED,
NUMBER.OF.CYCLIST.KILLED,
NUMBER.OF.MOTORIST.INJURED,
NUMBER.OF.MOTORIST.KILLED,
CONTRIBUTING.FACTOR.VEHICLE.1,
CONTRIBUTING.FACTOR.VEHICLE.2,
VEHICLE.TYPE.CODE.1,
VEHICLE.TYPE.CODE.2) %>%
rename('PERS.INJ' = NUMBER.OF.PERSONS.INJURED,
'PERS.KILLED' = NUMBER.OF.PERSONS.KILLED,
'PED.INJ' = NUMBER.OF.PEDESTRIANS.INJURED,
'PED.KILLED' = NUMBER.OF.PEDESTRIANS.KILLED,
'CYC.INJ' = NUMBER.OF.CYCLIST.INJURED,
'CYC.KILLED' = NUMBER.OF.CYCLIST.KILLED,
'MOT.INJ' = NUMBER.OF.MOTORIST.INJURED,
'MOT.KILLED' = NUMBER.OF.MOTORIST.KILLED)
head(Accidents)
## DATE TIME BOROUGH ZIP.CODE PERS.INJ PERS.KILLED PED.INJ
## 1 10/01/2017 0:00 BROOKLYN 11236 0 0 0
## 2 10/01/2017 0:00 MANHATTAN 10021 0 0 0
## 3 10/01/2017 0:00 MANHATTAN 10040 0 0 0
## 4 10/01/2017 0:00 QUEENS 11109 0 0 0
## 5 10/01/2017 0:00 QUEENS 11419 0 0 0
## 6 10/01/2017 0:00 NA 0 0 0
## PED.KILLED CYC.INJ CYC.KILLED MOT.INJ MOT.KILLED
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2
## 1 Aggressive Driving/Road Rage Unspecified
## 2 Driver Inattention/Distraction Unspecified
## 3 Driver Inattention/Distraction Unspecified
## 4 Unspecified
## 5 Driver Inattention/Distraction Driver Inattention/Distraction
## 6 Unsafe Lane Changing
## VEHICLE.TYPE.CODE.1 VEHICLE.TYPE.CODE.2
## 1 PASSENGER VEHICLE PASSENGER VEHICLE
## 2 PICK-UP TRUCK TAXI
## 3 SPORT UTILITY / STATION WAGON PASSENGER VEHICLE
## 4 SPORT UTILITY / STATION WAGON
## 5 PASSENGER VEHICLE SPORT UTILITY / STATION WAGON
## 6 SPORT UTILITY / STATION WAGON PASSENGER VEHICLE
Build a new table grouped by borough
By.Borough <- Accidents %>%
select(BOROUGH,
PERS.INJ,
PERS.KILLED,
PED.INJ,
PED.KILLED,
CYC.INJ,
CYC.KILLED,
MOT.INJ,
MOT.KILLED) %>%
group_by(BOROUGH) %>%
summarise(TOTAL.INJ = sum(PERS.INJ),
TOTAL.KILLED = sum(PERS.KILLED),
PED.INJ = sum(PED.INJ),
PED.KILLED = sum(PED.KILLED),
CYC.INJ = sum(CYC.INJ),
CYC.KILLED = sum(CYC.KILLED),
MOT.INJ = sum(MOT.INJ),
MOT.KILLED = sum(MOT.KILLED))
By.Borough
## # A tibble: 6 x 9
## BOROUGH TOTAL.INJ TOTAL.KILLED PED.INJ PED.KILLED CYC.INJ
## <fctr> <int> <int> <int> <int> <int>
## 1 85153 452 8833 202 3707
## 2 BRONX 31012 127 7697 75 1795
## 3 BROOKLYN 73174 294 16824 170 7824
## 4 MANHATTAN 35269 169 12546 130 5880
## 5 QUEENS 54966 260 11275 147 3904
## 6 STATEN ISLAND 9099 52 1407 22 216
## # ... with 3 more variables: CYC.KILLED <int>, MOT.INJ <int>,
## # MOT.KILLED <int>
Compare total injured by zip code, drop the NA rows
By.Zip <- Accidents %>%
select(ZIP.CODE,
PERS.INJ,
PERS.KILLED) %>%
group_by(ZIP.CODE) %>%
summarise(Total.Injured = sum(PERS.INJ),
Total.Killed = sum(PERS.KILLED)) %>%
drop_na(ZIP.CODE)
# get summary data for total injured and total killed
summary(By.Zip$Total.Injured)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 157.5 773.5 908.4 1449.0 5301.0
summary(By.Zip$Total.Killed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 3.000 4.027 6.000 24.000