Load required libraries
library(tidyverse)
library(lubridate)
library(plotly)
library(shiny)
Read the data
url <- "MotorVehicleCollisionsCrashes.csv"
motorCrash <- read.csv(url)
motorCrash <- as_tibble(motorCrash)
Check the head of the data
head(motorCrash)
## # A tibble: 6 x 29
## CRASH.DATE CRASH.TIME BOROUGH ZIP.CODE LATITUDE LONGITUDE LOCATION
## <chr> <chr> <chr> <int> <dbl> <dbl> <chr>
## 1 09/11/2021 2:39 "" NA NA NA ""
## 2 03/26/2022 11:45 "" NA NA NA ""
## 3 06/29/2022 6:55 "" NA NA NA ""
## 4 09/11/2021 9:35 "BROOKLYN" 11208 40.7 -73.9 "(40.667202, -73~
## 5 12/14/2021 8:13 "BROOKLYN" 11233 40.7 -73.9 "(40.683304, -73~
## 6 04/14/2021 12:47 "" NA NA NA ""
## # ... with 22 more variables: ON.STREET.NAME <chr>, CROSS.STREET.NAME <chr>,
## # OFF.STREET.NAME <chr>, NUMBER.OF.PERSONS.INJURED <int>,
## # NUMBER.OF.PERSONS.KILLED <int>, NUMBER.OF.PEDESTRIANS.INJURED <int>,
## # NUMBER.OF.PEDESTRIANS.KILLED <int>, NUMBER.OF.CYCLIST.INJURED <int>,
## # NUMBER.OF.CYCLIST.KILLED <int>, NUMBER.OF.MOTORIST.INJURED <int>,
## # NUMBER.OF.MOTORIST.KILLED <int>, CONTRIBUTING.FACTOR.VEHICLE.1 <chr>,
## # CONTRIBUTING.FACTOR.VEHICLE.2 <chr>, ...
Select required columns and rename the columns
req_columns <- motorCrash %>% select("CRASH.DATE", "CRASH.TIME", "BOROUGH", "ZIP.CODE", "LATITUDE", "LONGITUDE",
"NUMBER.OF.PERSONS.INJURED", "NUMBER.OF.PERSONS.KILLED", "CONTRIBUTING.FACTOR.VEHICLE.1",
"VEHICLE.TYPE.CODE.1")
headers <- c("crash_date", "crash_time", "borough", "zip_code", "latitude", "longitude", "number_injured", "number_killed",
"contributing_factor","vehicle_type")
names(req_columns) <- headers
head(req_columns)
## # A tibble: 6 x 10
## crash_date crash_time borough zip_code latitude longitude number_injured
## <chr> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 09/11/2021 2:39 "" NA NA NA 2
## 2 03/26/2022 11:45 "" NA NA NA 1
## 3 06/29/2022 6:55 "" NA NA NA 0
## 4 09/11/2021 9:35 "BROOKLYN" 11208 40.7 -73.9 0
## 5 12/14/2021 8:13 "BROOKLYN" 11233 40.7 -73.9 0
## 6 04/14/2021 12:47 "" NA NA NA 0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## # vehicle_type <chr>
Remove records that have missing zip_code, latitude, longitude, vehicle_type, and contributing_factor
missing <- c(NA, NULL, "")
remove_missing <- req_columns %>% filter(!zip_code %in% missing & !latitude %in% missing & !longitude %in% missing &
!vehicle_type %in% missing & !contributing_factor %in% missing)
head(remove_missing, 20)
## # A tibble: 20 x 10
## crash_date crash_time borough zip_code latitude longitude number_injured
## <chr> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 09/11/2021 9:35 BROOKLYN 11208 40.7 -73.9 0
## 2 12/14/2021 8:17 BRONX 10475 40.9 -73.8 2
## 3 12/14/2021 21:10 BROOKLYN 11207 40.7 -73.9 0
## 4 12/14/2021 14:58 MANHATTAN 10017 40.8 -74.0 0
## 5 12/14/2021 16:50 QUEENS 11413 40.7 -73.8 0
## 6 12/14/2021 23:10 QUEENS 11434 40.7 -73.8 2
## 7 12/14/2021 17:58 BROOKLYN 11217 40.7 -74.0 0
## 8 12/14/2021 20:03 BROOKLYN 11226 40.7 -74.0 4
## 9 12/11/2021 19:43 BRONX 10463 40.9 -73.9 1
## 10 12/11/2021 4:45 MANHATTAN 10001 40.7 -74.0 0
## 11 12/13/2021 6:30 QUEENS 11372 40.8 -73.9 0
## 12 12/13/2021 17:40 STATEN ISLA~ 10301 40.6 -74.1 1
## 13 12/14/2021 17:31 BROOKLYN 11230 40.6 -74.0 1
## 14 12/14/2021 20:13 BROOKLYN 11215 40.7 -74.0 0
## 15 12/14/2021 12:54 BROOKLYN 11217 40.7 -74.0 1
## 16 12/14/2021 17:15 BROOKLYN 11211 40.7 -74.0 1
## 17 12/14/2021 22:49 BRONX 10455 40.8 -73.9 0
## 18 12/12/2021 9:00 QUEENS 11385 40.7 -73.9 0
## 19 07/12/2022 17:50 BROOKLYN 11225 40.7 -74.0 0
## 20 04/24/2022 1:30 BROOKLYN 11220 40.6 -74.0 0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## # vehicle_type <chr>
Convert the crash_date column to date format
remove_missing$crash_date <- as.Date(remove_missing$crash_date, format = "%m/%d/%Y")
Filter the data for only records with year greater than 2015
motor_crash = remove_missing %>% filter(year(crash_date) > 2015)
head(motor_crash)
## # A tibble: 6 x 10
## crash_date crash_time borough zip_code latitude longitude number_injured
## <date> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 2021-09-11 9:35 BROOKLYN 11208 40.7 -73.9 0
## 2 2021-12-14 8:17 BRONX 10475 40.9 -73.8 2
## 3 2021-12-14 21:10 BROOKLYN 11207 40.7 -73.9 0
## 4 2021-12-14 14:58 MANHATTAN 10017 40.8 -74.0 0
## 5 2021-12-14 16:50 QUEENS 11413 40.7 -73.8 0
## 6 2021-12-14 23:10 QUEENS 11434 40.7 -73.8 2
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## # vehicle_type <chr>
Drop the remaining NA
motor_crash_df <- motor_crash %>% drop_na()
Check if there is any NA in the dataset
any(is.na(motor_crash_df))
## [1] FALSE
Take a glimpse of the data
glimpse(motor_crash_df)
## Rows: 745,855
## Columns: 10
## $ crash_date <date> 2021-09-11, 2021-12-14, 2021-12-14, 2021-12-14, 2~
## $ crash_time <chr> "9:35", "8:17", "21:10", "14:58", "16:50", "23:10"~
## $ borough <chr> "BROOKLYN", "BRONX", "BROOKLYN", "MANHATTAN", "QUE~
## $ zip_code <int> 11208, 10475, 11207, 10017, 11413, 11434, 11217, 1~
## $ latitude <dbl> 40.66720, 40.86816, 40.67172, 40.75144, 40.67588, ~
## $ longitude <dbl> -73.86650, -73.83148, -73.89710, -73.97397, -73.75~
## $ number_injured <int> 0, 2, 0, 0, 0, 2, 0, 4, 1, 0, 0, 1, 1, 0, 1, 1, 0,~
## $ number_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ contributing_factor <chr> "Unspecified", "Unspecified", "Driver Inexperience~
## $ vehicle_type <chr> "Sedan", "Sedan", "Sedan", "Sedan", "Sedan", "Seda~
Look at the summary statistics of the data
summary(motor_crash_df)
## crash_date crash_time borough zip_code
## Min. :2016-01-01 Length:745855 Length:745855 Min. :10000
## 1st Qu.:2017-05-19 Class :character Class :character 1st Qu.:10453
## Median :2018-09-15 Mode :character Mode :character Median :11208
## Mean :2018-11-27 Mean :10870
## 3rd Qu.:2020-02-08 3rd Qu.:11249
## Max. :2022-11-22 Max. :11697
## latitude longitude number_injured number_killed
## Min. : 0.00 Min. :-74.25 Min. : 0.0000 Min. :0.000000
## 1st Qu.:40.67 1st Qu.:-73.97 1st Qu.: 0.0000 1st Qu.:0.000000
## Median :40.72 Median :-73.92 Median : 0.0000 Median :0.000000
## Mean :40.57 Mean :-73.64 Mean : 0.2933 Mean :0.001154
## 3rd Qu.:40.77 3rd Qu.:-73.87 3rd Qu.: 0.0000 3rd Qu.:0.000000
## Max. :41.13 Max. : 0.00 Max. :27.0000 Max. :8.000000
## contributing_factor vehicle_type
## Length:745855 Length:745855
## Class :character Class :character
## Mode :character Mode :character
##
##
##
file = "C:\\Users\\exper\\OneDrive\\Documents\\CUNY SPS\\Fall 2022\\DATA_608\\project\\data608-project\\data608-project\\data\\motor_crash_df.csv"
write_csv(motor_crash_df, file = file)
Read the data from github
url_data <- "https://raw.githubusercontent.com/chinedu2301/data608-project/main/data/motor_crash_df.csv"
crash_df <- read.csv(url_data)
crash_df <- as_tibble(crash_df)
Change the crash_date data type to date format and View the head of the data
crash_df$crash_date <- as.Date(crash_df$crash_date, format = "%Y-%m-%d")
crash_df %>% head(20)
## # A tibble: 20 x 10
## crash_date crash_time borough zip_code latitude longitude number_injured
## <date> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 2021-09-11 9:35 BROOKLYN 11208 40.7 -73.9 0
## 2 2021-12-14 8:17 BRONX 10475 40.9 -73.8 2
## 3 2021-12-14 21:10 BROOKLYN 11207 40.7 -73.9 0
## 4 2021-12-14 14:58 MANHATTAN 10017 40.8 -74.0 0
## 5 2021-12-14 16:50 QUEENS 11413 40.7 -73.8 0
## 6 2021-12-14 23:10 QUEENS 11434 40.7 -73.8 2
## 7 2021-12-14 17:58 BROOKLYN 11217 40.7 -74.0 0
## 8 2021-12-14 20:03 BROOKLYN 11226 40.7 -74.0 4
## 9 2021-12-11 19:43 BRONX 10463 40.9 -73.9 1
## 10 2021-12-11 4:45 MANHATTAN 10001 40.7 -74.0 0
## 11 2021-12-13 6:30 QUEENS 11372 40.8 -73.9 0
## 12 2021-12-13 17:40 STATEN ISLA~ 10301 40.6 -74.1 1
## 13 2021-12-14 17:31 BROOKLYN 11230 40.6 -74.0 1
## 14 2021-12-14 20:13 BROOKLYN 11215 40.7 -74.0 0
## 15 2021-12-14 12:54 BROOKLYN 11217 40.7 -74.0 1
## 16 2021-12-14 17:15 BROOKLYN 11211 40.7 -74.0 1
## 17 2021-12-14 22:49 BRONX 10455 40.8 -73.9 0
## 18 2021-12-12 9:00 QUEENS 11385 40.7 -73.9 0
## 19 2022-07-12 17:50 BROOKLYN 11225 40.7 -74.0 0
## 20 2022-04-24 1:30 BROOKLYN 11220 40.6 -74.0 0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## # vehicle_type <chr>