data608-project data preparation

Load required libraries

library(tidyverse)
library(lubridate)
library(plotly)
library(shiny)

Read the data

url <- "MotorVehicleCollisionsCrashes.csv"
motorCrash <- read.csv(url)
motorCrash <- as_tibble(motorCrash)

Check the head of the data

head(motorCrash)

## # A tibble: 6 x 29
##   CRASH.DATE CRASH.TIME BOROUGH    ZIP.CODE LATITUDE LONGITUDE LOCATION         
##   <chr>      <chr>      <chr>         <int>    <dbl>     <dbl> <chr>            
## 1 09/11/2021 2:39       ""               NA     NA        NA   ""               
## 2 03/26/2022 11:45      ""               NA     NA        NA   ""               
## 3 06/29/2022 6:55       ""               NA     NA        NA   ""               
## 4 09/11/2021 9:35       "BROOKLYN"    11208     40.7     -73.9 "(40.667202, -73~
## 5 12/14/2021 8:13       "BROOKLYN"    11233     40.7     -73.9 "(40.683304, -73~
## 6 04/14/2021 12:47      ""               NA     NA        NA   ""               
## # ... with 22 more variables: ON.STREET.NAME <chr>, CROSS.STREET.NAME <chr>,
## #   OFF.STREET.NAME <chr>, NUMBER.OF.PERSONS.INJURED <int>,
## #   NUMBER.OF.PERSONS.KILLED <int>, NUMBER.OF.PEDESTRIANS.INJURED <int>,
## #   NUMBER.OF.PEDESTRIANS.KILLED <int>, NUMBER.OF.CYCLIST.INJURED <int>,
## #   NUMBER.OF.CYCLIST.KILLED <int>, NUMBER.OF.MOTORIST.INJURED <int>,
## #   NUMBER.OF.MOTORIST.KILLED <int>, CONTRIBUTING.FACTOR.VEHICLE.1 <chr>,
## #   CONTRIBUTING.FACTOR.VEHICLE.2 <chr>, ...

Data Pre-processing

Select required columns and rename the columns

req_columns <- motorCrash %>% select("CRASH.DATE", "CRASH.TIME", "BOROUGH", "ZIP.CODE", "LATITUDE", "LONGITUDE",
                                     "NUMBER.OF.PERSONS.INJURED", "NUMBER.OF.PERSONS.KILLED", "CONTRIBUTING.FACTOR.VEHICLE.1",
                                     "VEHICLE.TYPE.CODE.1")
headers <- c("crash_date", "crash_time", "borough", "zip_code", "latitude", "longitude", "number_injured", "number_killed",
            "contributing_factor","vehicle_type")
names(req_columns) <- headers
head(req_columns)

## # A tibble: 6 x 10
##   crash_date crash_time borough    zip_code latitude longitude number_injured
##   <chr>      <chr>      <chr>         <int>    <dbl>     <dbl>          <int>
## 1 09/11/2021 2:39       ""               NA     NA        NA                2
## 2 03/26/2022 11:45      ""               NA     NA        NA                1
## 3 06/29/2022 6:55       ""               NA     NA        NA                0
## 4 09/11/2021 9:35       "BROOKLYN"    11208     40.7     -73.9              0
## 5 12/14/2021 8:13       "BROOKLYN"    11233     40.7     -73.9              0
## 6 04/14/2021 12:47      ""               NA     NA        NA                0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## #   vehicle_type <chr>

Remove records that have missing zip_code, latitude, longitude, vehicle_type, and contributing_factor

missing <- c(NA, NULL, "")
remove_missing <- req_columns %>% filter(!zip_code %in% missing & !latitude %in% missing & !longitude %in% missing &
                                           !vehicle_type %in% missing & !contributing_factor %in% missing)
head(remove_missing, 20)

## # A tibble: 20 x 10
##    crash_date crash_time borough      zip_code latitude longitude number_injured
##    <chr>      <chr>      <chr>           <int>    <dbl>     <dbl>          <int>
##  1 09/11/2021 9:35       BROOKLYN        11208     40.7     -73.9              0
##  2 12/14/2021 8:17       BRONX           10475     40.9     -73.8              2
##  3 12/14/2021 21:10      BROOKLYN        11207     40.7     -73.9              0
##  4 12/14/2021 14:58      MANHATTAN       10017     40.8     -74.0              0
##  5 12/14/2021 16:50      QUEENS          11413     40.7     -73.8              0
##  6 12/14/2021 23:10      QUEENS          11434     40.7     -73.8              2
##  7 12/14/2021 17:58      BROOKLYN        11217     40.7     -74.0              0
##  8 12/14/2021 20:03      BROOKLYN        11226     40.7     -74.0              4
##  9 12/11/2021 19:43      BRONX           10463     40.9     -73.9              1
## 10 12/11/2021 4:45       MANHATTAN       10001     40.7     -74.0              0
## 11 12/13/2021 6:30       QUEENS          11372     40.8     -73.9              0
## 12 12/13/2021 17:40      STATEN ISLA~    10301     40.6     -74.1              1
## 13 12/14/2021 17:31      BROOKLYN        11230     40.6     -74.0              1
## 14 12/14/2021 20:13      BROOKLYN        11215     40.7     -74.0              0
## 15 12/14/2021 12:54      BROOKLYN        11217     40.7     -74.0              1
## 16 12/14/2021 17:15      BROOKLYN        11211     40.7     -74.0              1
## 17 12/14/2021 22:49      BRONX           10455     40.8     -73.9              0
## 18 12/12/2021 9:00       QUEENS          11385     40.7     -73.9              0
## 19 07/12/2022 17:50      BROOKLYN        11225     40.7     -74.0              0
## 20 04/24/2022 1:30       BROOKLYN        11220     40.6     -74.0              0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## #   vehicle_type <chr>

Convert the crash_date column to date format

remove_missing$crash_date <- as.Date(remove_missing$crash_date, format = "%m/%d/%Y")

Filter the data for only records with year greater than 2015

motor_crash = remove_missing %>% filter(year(crash_date) > 2015)
head(motor_crash)

## # A tibble: 6 x 10
##   crash_date crash_time borough   zip_code latitude longitude number_injured
##   <date>     <chr>      <chr>        <int>    <dbl>     <dbl>          <int>
## 1 2021-09-11 9:35       BROOKLYN     11208     40.7     -73.9              0
## 2 2021-12-14 8:17       BRONX        10475     40.9     -73.8              2
## 3 2021-12-14 21:10      BROOKLYN     11207     40.7     -73.9              0
## 4 2021-12-14 14:58      MANHATTAN    10017     40.8     -74.0              0
## 5 2021-12-14 16:50      QUEENS       11413     40.7     -73.8              0
## 6 2021-12-14 23:10      QUEENS       11434     40.7     -73.8              2
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## #   vehicle_type <chr>

Drop the remaining NA

motor_crash_df <- motor_crash %>% drop_na()

Check if there is any NA in the dataset

any(is.na(motor_crash_df))

## [1] FALSE

Take a glimpse of the data

glimpse(motor_crash_df)

## Rows: 745,855
## Columns: 10
## $ crash_date          <date> 2021-09-11, 2021-12-14, 2021-12-14, 2021-12-14, 2~
## $ crash_time          <chr> "9:35", "8:17", "21:10", "14:58", "16:50", "23:10"~
## $ borough             <chr> "BROOKLYN", "BRONX", "BROOKLYN", "MANHATTAN", "QUE~
## $ zip_code            <int> 11208, 10475, 11207, 10017, 11413, 11434, 11217, 1~
## $ latitude            <dbl> 40.66720, 40.86816, 40.67172, 40.75144, 40.67588, ~
## $ longitude           <dbl> -73.86650, -73.83148, -73.89710, -73.97397, -73.75~
## $ number_injured      <int> 0, 2, 0, 0, 0, 2, 0, 4, 1, 0, 0, 1, 1, 0, 1, 1, 0,~
## $ number_killed       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ contributing_factor <chr> "Unspecified", "Unspecified", "Driver Inexperience~
## $ vehicle_type        <chr> "Sedan", "Sedan", "Sedan", "Sedan", "Sedan", "Seda~

Look at the summary statistics of the data

summary(motor_crash_df)

##    crash_date          crash_time          borough             zip_code    
##  Min.   :2016-01-01   Length:745855      Length:745855      Min.   :10000  
##  1st Qu.:2017-05-19   Class :character   Class :character   1st Qu.:10453  
##  Median :2018-09-15   Mode  :character   Mode  :character   Median :11208  
##  Mean   :2018-11-27                                         Mean   :10870  
##  3rd Qu.:2020-02-08                                         3rd Qu.:11249  
##  Max.   :2022-11-22                                         Max.   :11697  
##     latitude       longitude      number_injured    number_killed     
##  Min.   : 0.00   Min.   :-74.25   Min.   : 0.0000   Min.   :0.000000  
##  1st Qu.:40.67   1st Qu.:-73.97   1st Qu.: 0.0000   1st Qu.:0.000000  
##  Median :40.72   Median :-73.92   Median : 0.0000   Median :0.000000  
##  Mean   :40.57   Mean   :-73.64   Mean   : 0.2933   Mean   :0.001154  
##  3rd Qu.:40.77   3rd Qu.:-73.87   3rd Qu.: 0.0000   3rd Qu.:0.000000  
##  Max.   :41.13   Max.   :  0.00   Max.   :27.0000   Max.   :8.000000  
##  contributing_factor vehicle_type      
##  Length:745855       Length:745855     
##  Class :character    Class :character  
##  Mode  :character    Mode  :character  
##                                        
##                                        
##

file = "C:\\Users\\exper\\OneDrive\\Documents\\CUNY SPS\\Fall 2022\\DATA_608\\project\\data608-project\\data608-project\\data\\motor_crash_df.csv"
write_csv(motor_crash_df, file = file)

Data Analysis

Read the data from github

url_data <- "https://raw.githubusercontent.com/chinedu2301/data608-project/main/data/motor_crash_df.csv"
crash_df <- read.csv(url_data)
crash_df <- as_tibble(crash_df)

Change the crash_date data type to date format and View the head of the data

crash_df$crash_date <- as.Date(crash_df$crash_date, format = "%Y-%m-%d")
crash_df %>% head(20)

## # A tibble: 20 x 10
##    crash_date crash_time borough      zip_code latitude longitude number_injured
##    <date>     <chr>      <chr>           <int>    <dbl>     <dbl>          <int>
##  1 2021-09-11 9:35       BROOKLYN        11208     40.7     -73.9              0
##  2 2021-12-14 8:17       BRONX           10475     40.9     -73.8              2
##  3 2021-12-14 21:10      BROOKLYN        11207     40.7     -73.9              0
##  4 2021-12-14 14:58      MANHATTAN       10017     40.8     -74.0              0
##  5 2021-12-14 16:50      QUEENS          11413     40.7     -73.8              0
##  6 2021-12-14 23:10      QUEENS          11434     40.7     -73.8              2
##  7 2021-12-14 17:58      BROOKLYN        11217     40.7     -74.0              0
##  8 2021-12-14 20:03      BROOKLYN        11226     40.7     -74.0              4
##  9 2021-12-11 19:43      BRONX           10463     40.9     -73.9              1
## 10 2021-12-11 4:45       MANHATTAN       10001     40.7     -74.0              0
## 11 2021-12-13 6:30       QUEENS          11372     40.8     -73.9              0
## 12 2021-12-13 17:40      STATEN ISLA~    10301     40.6     -74.1              1
## 13 2021-12-14 17:31      BROOKLYN        11230     40.6     -74.0              1
## 14 2021-12-14 20:13      BROOKLYN        11215     40.7     -74.0              0
## 15 2021-12-14 12:54      BROOKLYN        11217     40.7     -74.0              1
## 16 2021-12-14 17:15      BROOKLYN        11211     40.7     -74.0              1
## 17 2021-12-14 22:49      BRONX           10455     40.8     -73.9              0
## 18 2021-12-12 9:00       QUEENS          11385     40.7     -73.9              0
## 19 2022-07-12 17:50      BROOKLYN        11225     40.7     -74.0              0
## 20 2022-04-24 1:30       BROOKLYN        11220     40.6     -74.0              0
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## #   vehicle_type <chr>

data608-project data preparation

Chinedu Onyeka and Charles Ugiagbe

2022-11-28

Data Pre-processing

Data Analysis