nyc_crash_data <- "https://raw.githubusercontent.com/smithchad17/Class607/master/qiz3-axqb.csv"
crash <- read.csv(nyc_crash_data, stringsAsFactors = F)
head(crash)
## X..computed_region_92fq_4b7q X..computed_region_efsh_h5xi
## 1 47 20529
## 2 37 17215
## 3 NA NA
## 4 47 20529
## 5 18 13513
## 6 NA NA
## X..computed_region_f5dn_yrer X..computed_region_sbqj_enih
## 1 51 59
## 2 45 47
## 3 NA NA
## 4 51 59
## 5 60 43
## 6 NA NA
## X..computed_region_yeji_bk3q borough contributing_factor_vehicle_1
## 1 3 QUEENS Driver Inattention/Distraction
## 2 2 View Obstructed/Limited
## 3 NA QUEENS Backing Unsafely
## 4 3 QUEENS Unsafe Lane Changing
## 5 2 Driver Inattention/Distraction
## 6 NA Other Vehicular
## contributing_factor_vehicle_2 contributing_factor_vehicle_3
## 1 Unspecified
## 2 Unspecified
## 3 Unspecified
## 4 Other Vehicular
## 5 Unspecified
## 6
## contributing_factor_vehicle_4 contributing_factor_vehicle_5
## 1
## 2
## 3
## 4
## 5
## 6
## cross_street_name date
## 1 51-45 ALMEDA AVENUE 2017-10-01T00:00:00.000
## 2 2017-10-01T00:00:00.000
## 3 2017-10-01T00:00:00.000
## 4 19-30 MOTT AVENUE 2017-10-01T00:00:00.000
## 5 2017-10-01T00:00:00.000
## 6 2017-10-01T00:00:00.000
## latitude location location_address location_city
## 1 40.59782 POINT (-73.78255 40.597824) NA NA
## 2 40.67884 POINT (-73.86609 40.678844) NA NA
## 3 NA NA NA
## 4 40.60395 POINT (-73.752335 40.603947) NA NA
## 5 40.62488 POINT (-73.96418 40.624878) NA NA
## 6 NA NA NA
## location_state location_zip longitude number_of_cyclist_injured
## 1 NA NA -73.78255 0
## 2 NA NA -73.86609 0
## 3 NA NA NA 0
## 4 NA NA -73.75234 0
## 5 NA NA -73.96418 0
## 6 NA NA NA 0
## number_of_cyclist_killed number_of_motorist_injured
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## number_of_motorist_killed number_of_pedestrians_injured
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## number_of_pedestrians_killed number_of_persons_injured
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## number_of_persons_killed off_street_name
## 1 0
## 2 0 GRANT AVENUE
## 3 0 queens blvd
## 4 0
## 5 0
## 6 0 HUTCHINSON RIVER PARKWAY
## on_street_name time unique_key
## 1 13:00 3761053
## 2 LIBERTY AVENUE 19:55 3762078
## 3 68 DRIVE 18:50 3761295
## 4 16:40 3761054
## 5 AVENUE J 20:40 3760606
## 6 PELHAM PARKWAY 20:30 3762190
## vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3
## 1 PASSENGER VEHICLE
## 2 PASSENGER VEHICLE
## 3 SPORT UTILITY / STATION WAGON PASSENGER VEHICLE
## 4 PASSENGER VEHICLE
## 5 PASSENGER VEHICLE PASSENGER VEHICLE
## 6 PASSENGER VEHICLE
## vehicle_type_code_4 vehicle_type_code_5 zip_code
## 1 11691
## 2 NA
## 3 11375
## 4 11691
## 5 NA
## 6 NA
Create subset by eliminating columns that we’re not going to analyze
#All the 'location' columns are empty
crash_data <- crash %>%
select(-c(latitude, longitude), -starts_with("x"), -starts_with("l")) #remove unneccessary columns
colnames(crash_data)
## [1] "borough" "contributing_factor_vehicle_1"
## [3] "contributing_factor_vehicle_2" "contributing_factor_vehicle_3"
## [5] "contributing_factor_vehicle_4" "contributing_factor_vehicle_5"
## [7] "cross_street_name" "date"
## [9] "number_of_cyclist_injured" "number_of_cyclist_killed"
## [11] "number_of_motorist_injured" "number_of_motorist_killed"
## [13] "number_of_pedestrians_injured" "number_of_pedestrians_killed"
## [15] "number_of_persons_injured" "number_of_persons_killed"
## [17] "off_street_name" "on_street_name"
## [19] "time" "unique_key"
## [21] "vehicle_type_code1" "vehicle_type_code2"
## [23] "vehicle_type_code_3" "vehicle_type_code_4"
## [25] "vehicle_type_code_5" "zip_code"
Display column names and change them for future tidying
colnames(crash_data) <- gsub("^([[:alpha:]]+_[[:alpha:]]+)_", "\\1.", colnames(crash_data)) #Replace the second '_' with a '.'
colnames(crash_data)
## [1] "borough" "contributing_factor.vehicle_1"
## [3] "contributing_factor.vehicle_2" "contributing_factor.vehicle_3"
## [5] "contributing_factor.vehicle_4" "contributing_factor.vehicle_5"
## [7] "cross_street.name" "date"
## [9] "number_of.cyclist_injured" "number_of.cyclist_killed"
## [11] "number_of.motorist_injured" "number_of.motorist_killed"
## [13] "number_of.pedestrians_injured" "number_of.pedestrians_killed"
## [15] "number_of.persons_injured" "number_of.persons_killed"
## [17] "off_street.name" "on_street.name"
## [19] "time" "unique_key"
## [21] "vehicle_type.code1" "vehicle_type.code2"
## [23] "vehicle_type.code_3" "vehicle_type.code_4"
## [25] "vehicle_type.code_5" "zip_code"
Clean up ‘date’ column. Unite the on, off, cross street columns.
crash_data$date <- gsub("T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}", " ", crash_data$date)
crash_data <- crash_data%>%
unite("on_off", on_street.name, off_street.name, sep = ", ") %>%
unite("on_off_cross street names", on_off, cross_street.name, sep = ", ")
head(crash_data$date_time)
## NULL
head(crash_data$`on_off_cross street names`)
## [1] ", , 51-45 ALMEDA AVENUE "
## [2] "LIBERTY AVENUE , GRANT AVENUE, "
## [3] "68 DRIVE , queens blvd, "
## [4] ", , 19-30 MOTT AVENUE "
## [5] "AVENUE J , , "
## [6] "PELHAM PARKWAY , HUTCHINSON RIVER PARKWAY, "
Create subset for type.injured and type.killed
killed <- crash_data %>%
select(c(unique_key, ends_with("killed"))) %>% #select the 'unique_key', and any columns ending with 'killed'
gather("type", "number_killed", ends_with("killed")) #create 'drug' and 'percent_used' columns
killed$type <- gsub(".+(cyclist|pedestrians|persons|motorist).+", "\\1", killed$type) #clean the type column
head(killed)
## unique_key type number_killed
## 1 3761053 cyclist 0
## 2 3762078 cyclist 0
## 3 3761295 cyclist 0
## 4 3761054 cyclist 0
## 5 3760606 cyclist 0
## 6 3762190 cyclist 0
injured <- crash_data %>%
select(c(unique_key, ends_with("injured"))) %>% #select the 'unique_key', and any columns ending with 'injured'
gather("type", "number_injured", ends_with("injured")) %>% #create 'drug' and 'frequency' columns
select(-type)#remove the type column since we will use the 'unique_key' column to merge later
head(injured)
## unique_key number_injured
## 1 3761053 0
## 2 3762078 0
## 3 3761295 0
## 4 3761054 0
## 5 3760606 0
## 6 3762190 0
Tidy the contributing vehicles columns
#filter only the columns we want
contr <- select(crash_data, unique_key, c(starts_with("c")))
#There were some operators like '/' and '-' that was keeping the gather() function from working. I had to replace them in every column
i <- 2
while(i <= dim(contr)[2]){
contr[,i] <- str_replace_all(contr[,i], "[/|-]", "_" )
i <- i + 1
}
#create 'contributing_vehicle' and 'contributing_description' columns
contr <- gather(contr, "vehicle", "contributing_description", matches("\\d$"))
#clean the 'vehicle' column by extracting the vehicle number.
contr$vehicle <- gsub(".+\\.(\\w+_\\d)", "\\1", contr$vehicle)
head(contr)
## unique_key vehicle contributing_description
## 1 3761053 vehicle_1 Driver Inattention_Distraction
## 2 3762078 vehicle_1 View Obstructed_Limited
## 3 3761295 vehicle_1 Backing Unsafely
## 4 3761054 vehicle_1 Unsafe Lane Changing
## 5 3760606 vehicle_1 Driver Inattention_Distraction
## 6 3762190 vehicle_1 Other Vehicular
Tidy the vehicle type columns
#filter only the columns we want
vehicle <- select(crash_data, unique_key, c(starts_with("v")))
#There were some operators like '/' and '-' that was keeping the gather() function from working. I had to replace them in every column
i <- 2
while(i <= dim(vehicle)[2]){
vehicle[,i] <- str_replace_all(vehicle[,i], "[/|-]", "_" )
i <- i + 1
}
#create 'vehicletype' and 'vehicle_description' columns
vehicle <- gather(vehicle, "vehicletype", "vehicle_description", matches("\\d$"))
#Remove vehicle column since it will match the unique key of other subset
vehicle <- select(vehicle, -vehicletype)
head(vehicle)
## unique_key vehicle_description
## 1 3761053 PASSENGER VEHICLE
## 2 3762078 PASSENGER VEHICLE
## 3 3761295 SPORT UTILITY _ STATION WAGON
## 4 3761054 PASSENGER VEHICLE
## 5 3760606 PASSENGER VEHICLE
## 6 3762190 PASSENGER VEHICLE
Merge subsets into data frame
#filter unused columns in crash_data so we can merge the rest
crash_data <- select(crash_data, c("borough", "date", "time", "on_off_cross street names", "unique_key", "zip_code"))
joined1 <- merge(contr, vehicle, by.x = "unique_key", by.y = "unique_key")
joined2 <- merge(killed, injured, by.x = "unique_key", by.y = "unique_key")
joined3 <- merge(joined1, joined2, by.x = "unique_key", by.y = "unique_key")
crash_data <- merge(joined3, crash_data, by.x = "unique_key", by.y = "unique_key")
head(crash_data)
## unique_key vehicle contributing_description vehicle_description
## 1 3759676 vehicle_4
## 2 3759676 vehicle_4
## 3 3759676 vehicle_4
## 4 3759676 vehicle_4
## 5 3759676 vehicle_4
## 6 3759676 vehicle_4
## type number_killed number_injured borough date time
## 1 motorist 0 0 2017-09-29 12:50
## 2 motorist 0 0 2017-09-29 12:50
## 3 motorist 0 0 2017-09-29 12:50
## 4 motorist 0 0 2017-09-29 12:50
## 5 pedestrians 0 0 2017-09-29 12:50
## 6 pedestrians 0 0 2017-09-29 12:50
## on_off_cross street names zip_code
## 1 , , NA
## 2 , , NA
## 3 , , NA
## 4 , , NA
## 5 , , NA
## 6 , , NA
Summary
summary(crash_data)
## unique_key vehicle contributing_description
## Min. :3759676 Length:400000 Length:400000
## 1st Qu.:3760621 Class :character Class :character
## Median :3761101 Mode :character Mode :character
## Mean :3761124
## 3rd Qu.:3761510
## Max. :3762843
##
## vehicle_description type number_killed number_injured
## Length:400000 Length:400000 Min. :0.0000 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000 Median :0.0000
## Mean :0.0025 Mean :0.1403
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :6.0000
##
## borough date time
## Length:400000 Length:400000 Length:400000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## on_off_cross street names zip_code
## Length:400000 Min. :10000
## Class :character 1st Qu.:10454
## Mode :character Median :11211
## Mean :10897
## 3rd Qu.:11354
## Max. :11693
## NA's :157600
Total number of deaths and injuries between 9/29/2017 and 10/1/2017
print(paste0("Total number of people killed: ",sum(crash_data$number_killed)))
## [1] "Total number of people killed: 1000"
print(paste0("Total number of people injured: ",sum(crash_data$number_injured)))
## [1] "Total number of people injured: 56100"
Deaths and Injuries per zip code
I had some problems with this. There are 153 unique zip codes but only two zip codes had any deaths reported. I tried using while loops, aggregate, and arrange methods and they were all the same. This could be becuase the data was already a subset of a larger data set.
dead <- aggregate(number_killed ~ zip_code, data = crash_data, sum)
injured <- aggregate(number_injured ~ zip_code, data = crash_data, sum)
d <- data.frame(dead$zip_code, dead$number_killed, injured$number_injured)
arrange(d, desc(d$dead.number_killed))
## dead.zip_code dead.number_killed injured.number_injured
## 1 10037 200 200
## 2 11233 200 800
## 3 10000 0 0
## 4 10001 0 1000
## 5 10002 0 800
## 6 10003 0 200
## 7 10004 0 0
## 8 10007 0 400
## 9 10009 0 200
## 10 10011 0 0
## 11 10012 0 0
## 12 10013 0 0
## 13 10016 0 400
## 14 10018 0 0
## 15 10019 0 0
## 16 10021 0 600
## 17 10022 0 400
## 18 10023 0 400
## 19 10024 0 0
## 20 10025 0 0
## 21 10026 0 0
## 22 10028 0 0
## 23 10031 0 200
## 24 10032 0 0
## 25 10033 0 400
## 26 10034 0 0
## 27 10035 0 200
## 28 10036 0 200
## 29 10038 0 0
## 30 10040 0 200
## 31 10065 0 0
## 32 10075 0 0
## 33 10128 0 0
## 34 10301 0 0
## 35 10304 0 0
## 36 10306 0 0
## 37 10308 0 0
## 38 10309 0 200
## 39 10310 0 0
## 40 10312 0 0
## 41 10314 0 0
## 42 10451 0 600
## 43 10452 0 0
## 44 10453 0 200
## 45 10454 0 0
## 46 10455 0 200
## 47 10456 0 200
## 48 10457 0 200
## 49 10458 0 200
## 50 10459 0 0
## 51 10460 0 400
## 52 10461 0 200
## 53 10462 0 0
## 54 10463 0 0
## 55 10464 0 200
## 56 10465 0 200
## 57 10466 0 1200
## 58 10467 0 200
## 59 10468 0 400
## 60 10469 0 200
## 61 10472 0 200
## 62 10473 0 0
## 63 10474 0 0
## 64 11004 0 0
## 65 11005 0 0
## 66 11101 0 400
## 67 11102 0 0
## 68 11103 0 0
## 69 11104 0 0
## 70 11105 0 0
## 71 11106 0 0
## 72 11109 0 0
## 73 11201 0 200
## 74 11203 0 400
## 75 11204 0 0
## 76 11205 0 200
## 77 11206 0 600
## 78 11207 0 400
## 79 11208 0 200
## 80 11209 0 400
## 81 11210 0 0
## 82 11211 0 400
## 83 11212 0 0
## 84 11213 0 400
## 85 11214 0 400
## 86 11215 0 200
## 87 11216 0 0
## 88 11217 0 0
## 89 11218 0 0
## 90 11219 0 200
## 91 11220 0 200
## 92 11221 0 400
## 93 11222 0 0
## 94 11223 0 0
## 95 11224 0 400
## 96 11225 0 0
## 97 11226 0 0
## 98 11228 0 800
## 99 11229 0 400
## 100 11230 0 200
## 101 11231 0 0
## 102 11232 0 200
## 103 11234 0 0
## 104 11235 0 0
## 105 11236 0 800
## 106 11237 0 200
## 107 11238 0 200
## 108 11239 0 0
## 109 11249 0 200
## 110 11354 0 0
## 111 11355 0 0
## 112 11356 0 200
## 113 11357 0 200
## 114 11358 0 200
## 115 11362 0 200
## 116 11365 0 0
## 117 11366 0 1000
## 118 11367 0 200
## 119 11368 0 400
## 120 11369 0 400
## 121 11370 0 0
## 122 11372 0 400
## 123 11373 0 200
## 124 11374 0 200
## 125 11375 0 0
## 126 11377 0 400
## 127 11378 0 0
## 128 11379 0 200
## 129 11385 0 200
## 130 11411 0 0
## 131 11412 0 200
## 132 11413 0 0
## 133 11414 0 0
## 134 11416 0 400
## 135 11417 0 400
## 136 11418 0 1000
## 137 11419 0 0
## 138 11420 0 0
## 139 11421 0 200
## 140 11422 0 200
## 141 11423 0 200
## 142 11427 0 800
## 143 11428 0 200
## 144 11429 0 200
## 145 11432 0 200
## 146 11433 0 200
## 147 11434 0 200
## 148 11435 0 200
## 149 11436 0 0
## 150 11691 0 0
## 151 11692 0 0
## 152 11693 0 0
Deaths and Injuries by Person Type
dead <- aggregate(number_killed ~ borough, data = crash_data, sum)
injured <- aggregate(number_injured ~ borough, data = crash_data, sum)
d <- data.frame(person_type = dead$borough, number_killed = dead$number_killed, number_injured = injured$number_injured)
arrange(d, desc(d$number_killed))
## person_type number_killed number_injured
## 1 600 27100
## 2 BROOKLYN 200 8800
## 3 MANHATTAN 200 5800
## 4 BRONX 0 4800
## 5 QUEENS 0 9400
## 6 STATEN ISLAND 0 200