This dataset is based on approximately the last 5 years of NYPD motor vehicle collisions in New York City.

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
data2 <- read.csv("NYPD_Motor_Vehicle_Collisions.csv")
head(data2)
##         DATE TIME   BOROUGH ZIP.CODE LATITUDE LONGITUDE
## 1 10/01/2017 0:00  BROOKLYN    11236 40.64423 -73.89980
## 2 10/01/2017 0:00 MANHATTAN    10021 40.77075 -73.95695
## 3 10/01/2017 0:00 MANHATTAN    10040 40.85522 -73.93587
## 4 10/01/2017 0:00    QUEENS    11109       NA        NA
## 5 10/01/2017 0:00    QUEENS    11419 40.68653 -73.83005
## 6 10/01/2017 0:00                 NA 40.65008 -73.91544
##                  LOCATION                   ON.STREET.NAME
## 1 (40.644226, -73.899796) FLATLANDS AVENUE                
## 2  (40.770752, -73.95695) 2 AVENUE                        
## 3 (40.855217, -73.935875)                                 
## 4                                                         
## 5  (40.686527, -73.83005) 103 AVENUE                      
## 6  (40.65008, -73.915436) AVENUE B                        
##   CROSS.STREET.NAME                          OFF.STREET.NAME
## 1    EAST 98 STREET                                         
## 2    EAST 75 STREET                                         
## 3                   66        OVERLOOK TERRACE              
## 4                   46-30     CENTER BOULEVARD              
## 5        113 STREET                                         
## 6                                                           
##   NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## 1                         0                        0
## 2                         0                        0
## 3                         0                        0
## 4                         0                        0
## 5                         0                        0
## 6                         0                        0
##   NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## 1                             0                            0
## 2                             0                            0
## 3                             0                            0
## 4                             0                            0
## 5                             0                            0
## 6                             0                            0
##   NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED
## 1                         0                        0
## 2                         0                        0
## 3                         0                        0
## 4                         0                        0
## 5                         0                        0
## 6                         0                        0
##   NUMBER.OF.MOTORIST.INJURED NUMBER.OF.MOTORIST.KILLED
## 1                          0                         0
## 2                          0                         0
## 3                          0                         0
## 4                          0                         0
## 5                          0                         0
## 6                          0                         0
##    CONTRIBUTING.FACTOR.VEHICLE.1  CONTRIBUTING.FACTOR.VEHICLE.2
## 1   Aggressive Driving/Road Rage                    Unspecified
## 2 Driver Inattention/Distraction                    Unspecified
## 3 Driver Inattention/Distraction                    Unspecified
## 4                    Unspecified                               
## 5 Driver Inattention/Distraction Driver Inattention/Distraction
## 6           Unsafe Lane Changing                               
##   CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4
## 1                                                            
## 2                                                            
## 3                   Unspecified                              
## 4                                                            
## 5                   Unspecified                              
## 6                                                            
##   CONTRIBUTING.FACTOR.VEHICLE.5 UNIQUE.KEY           VEHICLE.TYPE.CODE.1
## 1                                  3761222             PASSENGER VEHICLE
## 2                                  3761824                 PICK-UP TRUCK
## 3                                  3761939 SPORT UTILITY / STATION WAGON
## 4                                  3760797 SPORT UTILITY / STATION WAGON
## 5                                  3761556             PASSENGER VEHICLE
## 6                                  3761132 SPORT UTILITY / STATION WAGON
##             VEHICLE.TYPE.CODE.2 VEHICLE.TYPE.CODE.3 VEHICLE.TYPE.CODE.4
## 1             PASSENGER VEHICLE                                        
## 2                          TAXI                                        
## 3             PASSENGER VEHICLE   PASSENGER VEHICLE                    
## 4                                                                      
## 5 SPORT UTILITY / STATION WAGON   PASSENGER VEHICLE                    
## 6             PASSENGER VEHICLE                                        
##   VEHICLE.TYPE.CODE.5
## 1                    
## 2                    
## 3                    
## 4                    
## 5                    
## 6

checking the data types

sapply(data2, mode)
##                          DATE                          TIME 
##                     "numeric"                     "numeric" 
##                       BOROUGH                      ZIP.CODE 
##                     "numeric"                     "numeric" 
##                      LATITUDE                     LONGITUDE 
##                     "numeric"                     "numeric" 
##                      LOCATION                ON.STREET.NAME 
##                     "numeric"                     "numeric" 
##             CROSS.STREET.NAME               OFF.STREET.NAME 
##                     "numeric"                     "numeric" 
##     NUMBER.OF.PERSONS.INJURED      NUMBER.OF.PERSONS.KILLED 
##                     "numeric"                     "numeric" 
## NUMBER.OF.PEDESTRIANS.INJURED  NUMBER.OF.PEDESTRIANS.KILLED 
##                     "numeric"                     "numeric" 
##     NUMBER.OF.CYCLIST.INJURED      NUMBER.OF.CYCLIST.KILLED 
##                     "numeric"                     "numeric" 
##    NUMBER.OF.MOTORIST.INJURED     NUMBER.OF.MOTORIST.KILLED 
##                     "numeric"                     "numeric" 
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2 
##                     "numeric"                     "numeric" 
## CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4 
##                     "numeric"                     "numeric" 
## CONTRIBUTING.FACTOR.VEHICLE.5                    UNIQUE.KEY 
##                     "numeric"                     "numeric" 
##           VEHICLE.TYPE.CODE.1           VEHICLE.TYPE.CODE.2 
##                     "numeric"                     "numeric" 
##           VEHICLE.TYPE.CODE.3           VEHICLE.TYPE.CODE.4 
##                     "numeric"                     "numeric" 
##           VEHICLE.TYPE.CODE.5 
##                     "numeric"
sapply(data2, class)
##                          DATE                          TIME 
##                      "factor"                      "factor" 
##                       BOROUGH                      ZIP.CODE 
##                      "factor"                     "integer" 
##                      LATITUDE                     LONGITUDE 
##                     "numeric"                     "numeric" 
##                      LOCATION                ON.STREET.NAME 
##                      "factor"                      "factor" 
##             CROSS.STREET.NAME               OFF.STREET.NAME 
##                      "factor"                      "factor" 
##     NUMBER.OF.PERSONS.INJURED      NUMBER.OF.PERSONS.KILLED 
##                     "integer"                     "integer" 
## NUMBER.OF.PEDESTRIANS.INJURED  NUMBER.OF.PEDESTRIANS.KILLED 
##                     "integer"                     "integer" 
##     NUMBER.OF.CYCLIST.INJURED      NUMBER.OF.CYCLIST.KILLED 
##                     "integer"                     "integer" 
##    NUMBER.OF.MOTORIST.INJURED     NUMBER.OF.MOTORIST.KILLED 
##                     "integer"                     "integer" 
## CONTRIBUTING.FACTOR.VEHICLE.1 CONTRIBUTING.FACTOR.VEHICLE.2 
##                      "factor"                      "factor" 
## CONTRIBUTING.FACTOR.VEHICLE.3 CONTRIBUTING.FACTOR.VEHICLE.4 
##                      "factor"                      "factor" 
## CONTRIBUTING.FACTOR.VEHICLE.5                    UNIQUE.KEY 
##                      "factor"                     "integer" 
##           VEHICLE.TYPE.CODE.1           VEHICLE.TYPE.CODE.2 
##                      "factor"                      "factor" 
##           VEHICLE.TYPE.CODE.3           VEHICLE.TYPE.CODE.4 
##                      "factor"                      "factor" 
##           VEHICLE.TYPE.CODE.5 
##                      "factor"

select the relvant variables, clean up the column names

Accidents <- data2 %>% 
  select(DATE,
         TIME,
         BOROUGH, 
         ZIP.CODE,
         NUMBER.OF.PERSONS.INJURED,
         NUMBER.OF.PERSONS.KILLED,
         NUMBER.OF.PEDESTRIANS.INJURED,
         NUMBER.OF.PEDESTRIANS.KILLED,
         NUMBER.OF.CYCLIST.INJURED,
         NUMBER.OF.CYCLIST.KILLED,
         NUMBER.OF.MOTORIST.INJURED,
         NUMBER.OF.MOTORIST.KILLED,
         CONTRIBUTING.FACTOR.VEHICLE.1,
         CONTRIBUTING.FACTOR.VEHICLE.2,
         VEHICLE.TYPE.CODE.1,
         VEHICLE.TYPE.CODE.2) %>% 

  rename('PERS.INJ'    = NUMBER.OF.PERSONS.INJURED,
         'PERS.KILLED' = NUMBER.OF.PERSONS.KILLED,
         'PED.INJ'     = NUMBER.OF.PEDESTRIANS.INJURED,
         'PED.KILLED'  = NUMBER.OF.PEDESTRIANS.KILLED,
         'CYC.INJ'     = NUMBER.OF.CYCLIST.INJURED,
         'CYC.KILLED'  = NUMBER.OF.CYCLIST.KILLED,
         'MOT.INJ'     = NUMBER.OF.MOTORIST.INJURED,
         'MOT.KILLED'  = NUMBER.OF.MOTORIST.KILLED)

head(Accidents)
##         DATE TIME   BOROUGH ZIP.CODE PERS.INJ PERS.KILLED PED.INJ
## 1 10/01/2017 0:00  BROOKLYN    11236        0           0       0
## 2 10/01/2017 0:00 MANHATTAN    10021        0           0       0
## 3 10/01/2017 0:00 MANHATTAN    10040        0           0       0
## 4 10/01/2017 0:00    QUEENS    11109        0           0       0
## 5 10/01/2017 0:00    QUEENS    11419        0           0       0
## 6 10/01/2017 0:00                 NA        0           0       0
##   PED.KILLED CYC.INJ CYC.KILLED MOT.INJ MOT.KILLED
## 1          0       0          0       0          0
## 2          0       0          0       0          0
## 3          0       0          0       0          0
## 4          0       0          0       0          0
## 5          0       0          0       0          0
## 6          0       0          0       0          0
##    CONTRIBUTING.FACTOR.VEHICLE.1  CONTRIBUTING.FACTOR.VEHICLE.2
## 1   Aggressive Driving/Road Rage                    Unspecified
## 2 Driver Inattention/Distraction                    Unspecified
## 3 Driver Inattention/Distraction                    Unspecified
## 4                    Unspecified                               
## 5 Driver Inattention/Distraction Driver Inattention/Distraction
## 6           Unsafe Lane Changing                               
##             VEHICLE.TYPE.CODE.1           VEHICLE.TYPE.CODE.2
## 1             PASSENGER VEHICLE             PASSENGER VEHICLE
## 2                 PICK-UP TRUCK                          TAXI
## 3 SPORT UTILITY / STATION WAGON             PASSENGER VEHICLE
## 4 SPORT UTILITY / STATION WAGON                              
## 5             PASSENGER VEHICLE SPORT UTILITY / STATION WAGON
## 6 SPORT UTILITY / STATION WAGON             PASSENGER VEHICLE

Build a new table grouped by borough

By.Borough <- Accidents %>% 
  select(BOROUGH, 
         PERS.INJ,
         PERS.KILLED,
         PED.INJ,
         PED.KILLED,
         CYC.INJ,
         CYC.KILLED,
         MOT.INJ,
         MOT.KILLED) %>% 
  
  group_by(BOROUGH) %>% 
  
  summarise(TOTAL.INJ    = sum(PERS.INJ),
            TOTAL.KILLED = sum(PERS.KILLED),
            PED.INJ      = sum(PED.INJ),
            PED.KILLED   = sum(PED.KILLED),
            CYC.INJ      = sum(CYC.INJ),
            CYC.KILLED   = sum(CYC.KILLED),
            MOT.INJ      = sum(MOT.INJ),
            MOT.KILLED   = sum(MOT.KILLED)) 
  

By.Borough
## # A tibble: 6 x 9
##         BOROUGH TOTAL.INJ TOTAL.KILLED PED.INJ PED.KILLED CYC.INJ
##          <fctr>     <int>        <int>   <int>      <int>   <int>
## 1                   85153          452    8833        202    3707
## 2         BRONX     31012          127    7697         75    1795
## 3      BROOKLYN     73174          294   16824        170    7824
## 4     MANHATTAN     35269          169   12546        130    5880
## 5        QUEENS     54966          260   11275        147    3904
## 6 STATEN ISLAND      9099           52    1407         22     216
## # ... with 3 more variables: CYC.KILLED <int>, MOT.INJ <int>,
## #   MOT.KILLED <int>

Plot the total killed by borough. There is a huge contingent of empty borough data. The zipcode data was also empty. It seems the onlyi thing you could do is figure out a way to map the GPS data to the relevant borough to fill in the raw data.

ggplot(By.Borough,mapping = aes(BOROUGH, TOTAL.KILLED)) +
  geom_bar(stat = 'identity')

Compare total injured by zip code, drop the NA rows

By.Zip <- Accidents %>% 
  select(ZIP.CODE,
         PERS.INJ,
         PERS.KILLED) %>% 
  
  group_by(ZIP.CODE) %>% 
  
  summarise(Total.Injured = sum(PERS.INJ),
            Total.Killed  = sum(PERS.KILLED)) %>% 
  
  drop_na(ZIP.CODE)

# get summary data for total injured and total killed
summary(By.Zip$Total.Injured)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   157.5   773.5   908.4  1449.0  5301.0
summary(By.Zip$Total.Killed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   3.000   4.027   6.000  24.000