Project2 - Data Set1

Import NYC Crash Data

nyc_crash_data <- "https://raw.githubusercontent.com/smithchad17/Class607/master/qiz3-axqb.csv"
crash <- read.csv(nyc_crash_data, stringsAsFactors = F)
head(crash)

##   X..computed_region_92fq_4b7q X..computed_region_efsh_h5xi
## 1                           47                        20529
## 2                           37                        17215
## 3                           NA                           NA
## 4                           47                        20529
## 5                           18                        13513
## 6                           NA                           NA
##   X..computed_region_f5dn_yrer X..computed_region_sbqj_enih
## 1                           51                           59
## 2                           45                           47
## 3                           NA                           NA
## 4                           51                           59
## 5                           60                           43
## 6                           NA                           NA
##   X..computed_region_yeji_bk3q borough  contributing_factor_vehicle_1
## 1                            3  QUEENS Driver Inattention/Distraction
## 2                            2                View Obstructed/Limited
## 3                           NA  QUEENS               Backing Unsafely
## 4                            3  QUEENS           Unsafe Lane Changing
## 5                            2         Driver Inattention/Distraction
## 6                           NA                        Other Vehicular
##   contributing_factor_vehicle_2 contributing_factor_vehicle_3
## 1                   Unspecified                              
## 2                   Unspecified                              
## 3                   Unspecified                              
## 4               Other Vehicular                              
## 5                   Unspecified                              
## 6                                                            
##   contributing_factor_vehicle_4 contributing_factor_vehicle_5
## 1                                                            
## 2                                                            
## 3                                                            
## 4                                                            
## 5                                                            
## 6                                                            
##                          cross_street_name                    date
## 1 51-45     ALMEDA AVENUE                  2017-10-01T00:00:00.000
## 2                                          2017-10-01T00:00:00.000
## 3                                          2017-10-01T00:00:00.000
## 4 19-30     MOTT AVENUE                    2017-10-01T00:00:00.000
## 5                                          2017-10-01T00:00:00.000
## 6                                          2017-10-01T00:00:00.000
##   latitude                     location location_address location_city
## 1 40.59782  POINT (-73.78255 40.597824)               NA            NA
## 2 40.67884  POINT (-73.86609 40.678844)               NA            NA
## 3       NA                                            NA            NA
## 4 40.60395 POINT (-73.752335 40.603947)               NA            NA
## 5 40.62488  POINT (-73.96418 40.624878)               NA            NA
## 6       NA                                            NA            NA
##   location_state location_zip longitude number_of_cyclist_injured
## 1             NA           NA -73.78255                         0
## 2             NA           NA -73.86609                         0
## 3             NA           NA        NA                         0
## 4             NA           NA -73.75234                         0
## 5             NA           NA -73.96418                         0
## 6             NA           NA        NA                         0
##   number_of_cyclist_killed number_of_motorist_injured
## 1                        0                          0
## 2                        0                          0
## 3                        0                          0
## 4                        0                          0
## 5                        0                          0
## 6                        0                          0
##   number_of_motorist_killed number_of_pedestrians_injured
## 1                         0                             0
## 2                         0                             0
## 3                         0                             0
## 4                         0                             0
## 5                         0                             0
## 6                         0                             0
##   number_of_pedestrians_killed number_of_persons_injured
## 1                            0                         0
## 2                            0                         0
## 3                            0                         0
## 4                            0                         0
## 5                            0                         0
## 6                            0                         0
##   number_of_persons_killed          off_street_name
## 1                        0                         
## 2                        0             GRANT AVENUE
## 3                        0              queens blvd
## 4                        0                         
## 5                        0                         
## 6                        0 HUTCHINSON RIVER PARKWAY
##                     on_street_name  time unique_key
## 1                                  13:00    3761053
## 2 LIBERTY AVENUE                   19:55    3762078
## 3 68 DRIVE                         18:50    3761295
## 4                                  16:40    3761054
## 5 AVENUE J                         20:40    3760606
## 6 PELHAM PARKWAY                   20:30    3762190
##              vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3
## 1             PASSENGER VEHICLE                                       
## 2             PASSENGER VEHICLE                                       
## 3 SPORT UTILITY / STATION WAGON  PASSENGER VEHICLE                    
## 4             PASSENGER VEHICLE                                       
## 5             PASSENGER VEHICLE  PASSENGER VEHICLE                    
## 6             PASSENGER VEHICLE                                       
##   vehicle_type_code_4 vehicle_type_code_5 zip_code
## 1                                            11691
## 2                                               NA
## 3                                            11375
## 4                                            11691
## 5                                               NA
## 6                                               NA

Tidy Crash Data

Create subset by eliminating columns that we’re not going to analyze

#All the 'location' columns are empty
crash_data <- crash %>% 
  select(-c(latitude, longitude), -starts_with("x"), -starts_with("l"))  #remove unneccessary columns

colnames(crash_data)

##  [1] "borough"                       "contributing_factor_vehicle_1"
##  [3] "contributing_factor_vehicle_2" "contributing_factor_vehicle_3"
##  [5] "contributing_factor_vehicle_4" "contributing_factor_vehicle_5"
##  [7] "cross_street_name"             "date"                         
##  [9] "number_of_cyclist_injured"     "number_of_cyclist_killed"     
## [11] "number_of_motorist_injured"    "number_of_motorist_killed"    
## [13] "number_of_pedestrians_injured" "number_of_pedestrians_killed" 
## [15] "number_of_persons_injured"     "number_of_persons_killed"     
## [17] "off_street_name"               "on_street_name"               
## [19] "time"                          "unique_key"                   
## [21] "vehicle_type_code1"            "vehicle_type_code2"           
## [23] "vehicle_type_code_3"           "vehicle_type_code_4"          
## [25] "vehicle_type_code_5"           "zip_code"

Display column names and change them for future tidying

colnames(crash_data) <- gsub("^([[:alpha:]]+_[[:alpha:]]+)_", "\\1.", colnames(crash_data)) #Replace the second '_' with a '.' 
colnames(crash_data)

##  [1] "borough"                       "contributing_factor.vehicle_1"
##  [3] "contributing_factor.vehicle_2" "contributing_factor.vehicle_3"
##  [5] "contributing_factor.vehicle_4" "contributing_factor.vehicle_5"
##  [7] "cross_street.name"             "date"                         
##  [9] "number_of.cyclist_injured"     "number_of.cyclist_killed"     
## [11] "number_of.motorist_injured"    "number_of.motorist_killed"    
## [13] "number_of.pedestrians_injured" "number_of.pedestrians_killed" 
## [15] "number_of.persons_injured"     "number_of.persons_killed"     
## [17] "off_street.name"               "on_street.name"               
## [19] "time"                          "unique_key"                   
## [21] "vehicle_type.code1"            "vehicle_type.code2"           
## [23] "vehicle_type.code_3"           "vehicle_type.code_4"          
## [25] "vehicle_type.code_5"           "zip_code"

Clean up ‘date’ column. Unite the on, off, cross street columns.

crash_data$date <- gsub("T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}", " ", crash_data$date) 

crash_data <- crash_data%>% 
  unite("on_off", on_street.name, off_street.name, sep = ", ") %>%
  unite("on_off_cross street names", on_off, cross_street.name, sep = ", ")

head(crash_data$date_time)

## NULL

head(crash_data$`on_off_cross street names`)

## [1] ", , 51-45     ALMEDA AVENUE                 "                
## [2] "LIBERTY AVENUE                  , GRANT AVENUE, "            
## [3] "68 DRIVE                        , queens blvd, "             
## [4] ", , 19-30     MOTT AVENUE                   "                
## [5] "AVENUE J                        , , "                        
## [6] "PELHAM PARKWAY                  , HUTCHINSON RIVER PARKWAY, "

Create subset for type.injured and type.killed

killed <- crash_data %>% 
  select(c(unique_key, ends_with("killed"))) %>%  #select the 'unique_key', and any columns ending with 'killed'
  gather("type", "number_killed", ends_with("killed")) #create 'drug' and 'percent_used' columns
killed$type <-  gsub(".+(cyclist|pedestrians|persons|motorist).+", "\\1", killed$type) #clean the type column

head(killed)

##   unique_key    type number_killed
## 1    3761053 cyclist             0
## 2    3762078 cyclist             0
## 3    3761295 cyclist             0
## 4    3761054 cyclist             0
## 5    3760606 cyclist             0
## 6    3762190 cyclist             0

injured <- crash_data %>% 
  select(c(unique_key, ends_with("injured"))) %>% #select the 'unique_key', and any columns ending with 'injured'
  gather("type", "number_injured", ends_with("injured")) %>%  #create 'drug' and 'frequency' columns
  select(-type)#remove the type column since we will use the 'unique_key' column to merge later

head(injured)

##   unique_key number_injured
## 1    3761053              0
## 2    3762078              0
## 3    3761295              0
## 4    3761054              0
## 5    3760606              0
## 6    3762190              0

Tidy the contributing vehicles columns

#filter only the columns we want
contr <- select(crash_data, unique_key, c(starts_with("c")))

#There were some operators like '/' and '-' that was keeping the gather() function from working. I had to replace them in every column
i <- 2
while(i <= dim(contr)[2]){
  contr[,i] <- str_replace_all(contr[,i],  "[/|-]", "_" )
  i <- i + 1
}

#create 'contributing_vehicle' and 'contributing_description' columns
contr <- gather(contr, "vehicle", "contributing_description", matches("\\d$")) 

#clean the 'vehicle' column by extracting the vehicle number.
contr$vehicle <- gsub(".+\\.(\\w+_\\d)", "\\1", contr$vehicle)

head(contr)

##   unique_key   vehicle       contributing_description
## 1    3761053 vehicle_1 Driver Inattention_Distraction
## 2    3762078 vehicle_1        View Obstructed_Limited
## 3    3761295 vehicle_1               Backing Unsafely
## 4    3761054 vehicle_1           Unsafe Lane Changing
## 5    3760606 vehicle_1 Driver Inattention_Distraction
## 6    3762190 vehicle_1                Other Vehicular

Tidy the vehicle type columns

#filter only the columns we want
vehicle <- select(crash_data, unique_key, c(starts_with("v")))

#There were some operators like '/' and '-' that was keeping the gather() function from working. I had to replace them in every column
i <- 2
while(i <= dim(vehicle)[2]){
  vehicle[,i] <- str_replace_all(vehicle[,i],  "[/|-]", "_" )
  i <- i + 1
}

#create 'vehicletype' and 'vehicle_description' columns
vehicle <- gather(vehicle, "vehicletype", "vehicle_description", matches("\\d$")) 

#Remove vehicle column since it will match the unique key of other subset
vehicle <- select(vehicle, -vehicletype)

head(vehicle)

##   unique_key           vehicle_description
## 1    3761053             PASSENGER VEHICLE
## 2    3762078             PASSENGER VEHICLE
## 3    3761295 SPORT UTILITY _ STATION WAGON
## 4    3761054             PASSENGER VEHICLE
## 5    3760606             PASSENGER VEHICLE
## 6    3762190             PASSENGER VEHICLE

Merge subsets into data frame

#filter unused columns in crash_data so we can merge the rest
crash_data <- select(crash_data, c("borough", "date", "time", "on_off_cross street names", "unique_key", "zip_code"))

joined1 <- merge(contr, vehicle, by.x = "unique_key", by.y = "unique_key")
joined2 <- merge(killed, injured, by.x = "unique_key", by.y = "unique_key")
joined3 <- merge(joined1, joined2, by.x = "unique_key", by.y = "unique_key")

crash_data <- merge(joined3, crash_data, by.x = "unique_key", by.y = "unique_key")
head(crash_data)

##   unique_key   vehicle contributing_description vehicle_description
## 1    3759676 vehicle_4                                             
## 2    3759676 vehicle_4                                             
## 3    3759676 vehicle_4                                             
## 4    3759676 vehicle_4                                             
## 5    3759676 vehicle_4                                             
## 6    3759676 vehicle_4                                             
##          type number_killed number_injured borough        date  time
## 1    motorist             0              0         2017-09-29  12:50
## 2    motorist             0              0         2017-09-29  12:50
## 3    motorist             0              0         2017-09-29  12:50
## 4    motorist             0              0         2017-09-29  12:50
## 5 pedestrians             0              0         2017-09-29  12:50
## 6 pedestrians             0              0         2017-09-29  12:50
##   on_off_cross street names zip_code
## 1                      , ,        NA
## 2                      , ,        NA
## 3                      , ,        NA
## 4                      , ,        NA
## 5                      , ,        NA
## 6                      , ,        NA

Analyze Crash Data

Summary

summary(crash_data)

##    unique_key        vehicle          contributing_description
##  Min.   :3759676   Length:400000      Length:400000           
##  1st Qu.:3760621   Class :character   Class :character        
##  Median :3761101   Mode  :character   Mode  :character        
##  Mean   :3761124                                              
##  3rd Qu.:3761510                                              
##  Max.   :3762843                                              
##                                                               
##  vehicle_description     type           number_killed    number_injured  
##  Length:400000       Length:400000      Min.   :0.0000   Min.   :0.0000  
##  Class :character    Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character    Mode  :character   Median :0.0000   Median :0.0000  
##                                         Mean   :0.0025   Mean   :0.1403  
##                                         3rd Qu.:0.0000   3rd Qu.:0.0000  
##                                         Max.   :1.0000   Max.   :6.0000  
##                                                                          
##    borough              date               time          
##  Length:400000      Length:400000      Length:400000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  on_off_cross street names    zip_code     
##  Length:400000             Min.   :10000   
##  Class :character          1st Qu.:10454   
##  Mode  :character          Median :11211   
##                            Mean   :10897   
##                            3rd Qu.:11354   
##                            Max.   :11693   
##                            NA's   :157600

Total number of deaths and injuries between 9/29/2017 and 10/1/2017

print(paste0("Total number of people killed: ",sum(crash_data$number_killed)))

## [1] "Total number of people killed: 1000"

print(paste0("Total number of people injured: ",sum(crash_data$number_injured)))

## [1] "Total number of people injured: 56100"

Deaths and Injuries per zip code

I had some problems with this. There are 153 unique zip codes but only two zip codes had any deaths reported. I tried using while loops, aggregate, and arrange methods and they were all the same. This could be becuase the data was already a subset of a larger data set.

dead <- aggregate(number_killed ~ zip_code, data = crash_data, sum)
injured <- aggregate(number_injured ~ zip_code, data = crash_data, sum)

d <- data.frame(dead$zip_code, dead$number_killed, injured$number_injured)

arrange(d, desc(d$dead.number_killed))

##     dead.zip_code dead.number_killed injured.number_injured
## 1           10037                200                    200
## 2           11233                200                    800
## 3           10000                  0                      0
## 4           10001                  0                   1000
## 5           10002                  0                    800
## 6           10003                  0                    200
## 7           10004                  0                      0
## 8           10007                  0                    400
## 9           10009                  0                    200
## 10          10011                  0                      0
## 11          10012                  0                      0
## 12          10013                  0                      0
## 13          10016                  0                    400
## 14          10018                  0                      0
## 15          10019                  0                      0
## 16          10021                  0                    600
## 17          10022                  0                    400
## 18          10023                  0                    400
## 19          10024                  0                      0
## 20          10025                  0                      0
## 21          10026                  0                      0
## 22          10028                  0                      0
## 23          10031                  0                    200
## 24          10032                  0                      0
## 25          10033                  0                    400
## 26          10034                  0                      0
## 27          10035                  0                    200
## 28          10036                  0                    200
## 29          10038                  0                      0
## 30          10040                  0                    200
## 31          10065                  0                      0
## 32          10075                  0                      0
## 33          10128                  0                      0
## 34          10301                  0                      0
## 35          10304                  0                      0
## 36          10306                  0                      0
## 37          10308                  0                      0
## 38          10309                  0                    200
## 39          10310                  0                      0
## 40          10312                  0                      0
## 41          10314                  0                      0
## 42          10451                  0                    600
## 43          10452                  0                      0
## 44          10453                  0                    200
## 45          10454                  0                      0
## 46          10455                  0                    200
## 47          10456                  0                    200
## 48          10457                  0                    200
## 49          10458                  0                    200
## 50          10459                  0                      0
## 51          10460                  0                    400
## 52          10461                  0                    200
## 53          10462                  0                      0
## 54          10463                  0                      0
## 55          10464                  0                    200
## 56          10465                  0                    200
## 57          10466                  0                   1200
## 58          10467                  0                    200
## 59          10468                  0                    400
## 60          10469                  0                    200
## 61          10472                  0                    200
## 62          10473                  0                      0
## 63          10474                  0                      0
## 64          11004                  0                      0
## 65          11005                  0                      0
## 66          11101                  0                    400
## 67          11102                  0                      0
## 68          11103                  0                      0
## 69          11104                  0                      0
## 70          11105                  0                      0
## 71          11106                  0                      0
## 72          11109                  0                      0
## 73          11201                  0                    200
## 74          11203                  0                    400
## 75          11204                  0                      0
## 76          11205                  0                    200
## 77          11206                  0                    600
## 78          11207                  0                    400
## 79          11208                  0                    200
## 80          11209                  0                    400
## 81          11210                  0                      0
## 82          11211                  0                    400
## 83          11212                  0                      0
## 84          11213                  0                    400
## 85          11214                  0                    400
## 86          11215                  0                    200
## 87          11216                  0                      0
## 88          11217                  0                      0
## 89          11218                  0                      0
## 90          11219                  0                    200
## 91          11220                  0                    200
## 92          11221                  0                    400
## 93          11222                  0                      0
## 94          11223                  0                      0
## 95          11224                  0                    400
## 96          11225                  0                      0
## 97          11226                  0                      0
## 98          11228                  0                    800
## 99          11229                  0                    400
## 100         11230                  0                    200
## 101         11231                  0                      0
## 102         11232                  0                    200
## 103         11234                  0                      0
## 104         11235                  0                      0
## 105         11236                  0                    800
## 106         11237                  0                    200
## 107         11238                  0                    200
## 108         11239                  0                      0
## 109         11249                  0                    200
## 110         11354                  0                      0
## 111         11355                  0                      0
## 112         11356                  0                    200
## 113         11357                  0                    200
## 114         11358                  0                    200
## 115         11362                  0                    200
## 116         11365                  0                      0
## 117         11366                  0                   1000
## 118         11367                  0                    200
## 119         11368                  0                    400
## 120         11369                  0                    400
## 121         11370                  0                      0
## 122         11372                  0                    400
## 123         11373                  0                    200
## 124         11374                  0                    200
## 125         11375                  0                      0
## 126         11377                  0                    400
## 127         11378                  0                      0
## 128         11379                  0                    200
## 129         11385                  0                    200
## 130         11411                  0                      0
## 131         11412                  0                    200
## 132         11413                  0                      0
## 133         11414                  0                      0
## 134         11416                  0                    400
## 135         11417                  0                    400
## 136         11418                  0                   1000
## 137         11419                  0                      0
## 138         11420                  0                      0
## 139         11421                  0                    200
## 140         11422                  0                    200
## 141         11423                  0                    200
## 142         11427                  0                    800
## 143         11428                  0                    200
## 144         11429                  0                    200
## 145         11432                  0                    200
## 146         11433                  0                    200
## 147         11434                  0                    200
## 148         11435                  0                    200
## 149         11436                  0                      0
## 150         11691                  0                      0
## 151         11692                  0                      0
## 152         11693                  0                      0

Deaths and Injuries by Person Type

dead <- aggregate(number_killed ~ borough, data = crash_data, sum)
injured <- aggregate(number_injured ~ borough, data = crash_data, sum)

d <- data.frame(person_type = dead$borough, number_killed = dead$number_killed, number_injured = injured$number_injured)

arrange(d, desc(d$number_killed))

##     person_type number_killed number_injured
## 1                         600          27100
## 2      BROOKLYN           200           8800
## 3     MANHATTAN           200           5800
## 4         BRONX             0           4800
## 5        QUEENS             0           9400
## 6 STATEN ISLAND             0            200

Project2 - Data Set1

Chad Smith

October 7, 2017

Import NYC Crash Data

Tidy Crash Data

Analyze Crash Data