DATA608_Project Analysis

Load Libraries

library(tidyverse)
library(readr)
library(gridExtra)
library(scales)
library(lubridate)
library(ggrepel)
library(rgdal)
library(plotly)
library(kableExtra)
library(shiny)

Read the data

url_data <- "https://raw.githubusercontent.com/chinedu2301/data608-project/main/data/motor_crash_df.csv"
crash_df <- read.csv(url_data)
crash_df <- as_tibble(crash_df)

Change the crash_date data type to date format and View the head of the data

crash_df$crash_date <- as.Date(crash_df$crash_date, format = "%Y-%m-%d")
crash_df %>% head(8)

## # A tibble: 8 x 10
##   crash_date crash_time borough   zip_code latitude longitude number_injured
##   <date>     <chr>      <chr>        <int>    <dbl>     <dbl>          <int>
## 1 2021-09-11 9:35       BROOKLYN     11208     40.7     -73.9              0
## 2 2021-12-14 8:17       BRONX        10475     40.9     -73.8              2
## 3 2021-12-14 21:10      BROOKLYN     11207     40.7     -73.9              0
## 4 2021-12-14 14:58      MANHATTAN    10017     40.8     -74.0              0
## 5 2021-12-14 16:50      QUEENS       11413     40.7     -73.8              0
## 6 2021-12-14 23:10      QUEENS       11434     40.7     -73.8              2
## 7 2021-12-14 17:58      BROOKLYN     11217     40.7     -74.0              0
## 8 2021-12-14 20:03      BROOKLYN     11226     40.7     -74.0              4
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## #   vehicle_type <chr>

Take a glimpse of the data

glimpse(crash_df)

## Rows: 745,855
## Columns: 10
## $ crash_date          <date> 2021-09-11, 2021-12-14, 2021-12-14, 2021-12-14, 2~
## $ crash_time          <chr> "9:35", "8:17", "21:10", "14:58", "16:50", "23:10"~
## $ borough             <chr> "BROOKLYN", "BRONX", "BROOKLYN", "MANHATTAN", "QUE~
## $ zip_code            <int> 11208, 10475, 11207, 10017, 11413, 11434, 11217, 1~
## $ latitude            <dbl> 40.66720, 40.86816, 40.67172, 40.75144, 40.67588, ~
## $ longitude           <dbl> -73.86650, -73.83148, -73.89710, -73.97397, -73.75~
## $ number_injured      <int> 0, 2, 0, 0, 0, 2, 0, 4, 1, 0, 0, 1, 1, 0, 1, 1, 0,~
## $ number_killed       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ contributing_factor <chr> "Unspecified", "Unspecified", "Driver Inexperience~
## $ vehicle_type        <chr> "Sedan", "Sedan", "Sedan", "Sedan", "Sedan", "Seda~

Car crashes by year

Since 2018, there has been a decline in the car crash incident in NYC but there was a sharp decline from 2019 to 2020. The decline has also continue steadily after then. The 2020 sharp decline might be due to covid-related lockdowns. Also, increase in remote work opportunities may be a reason for decline in incidents, but more studies need to be conducted to ascertain that.

crash_df$year <- year(crash_df$crash_date) #extract year from date using the lubridate year() function

crash_df %>%
        ggplot(aes(x=as.factor(year))) + geom_bar(stat='count', fill='purple') +
        scale_y_continuous(labels=comma) + labs(x='Year', y='No of Incident', title='Incidents by year') + 
        geom_label(stat = "count", aes(label = ..count.., y = ..count..)) + 
        theme(axis.title = element_text(size = 13), plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
        labs(title = "Incidents by Year", y = "No of Incidents")

Car crash by quarter

library(viridis)

## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:scales':
## 
##     viridis_pal

crash_df$quarter <- quarter(crash_df$crash_date) #extract Quarters from date

q1 <- crash_df %>% filter(year!=2013) %>% select(year, quarter) %>% group_by(year) %>% count(quarter) %>%
        ggplot(aes(x=as.factor(quarter), y=n, fill = quarter)) + 
        geom_bar(stat='identity') + scale_y_continuous(labels=comma) + facet_grid(.~year) + 
        labs(x='Quarter', y='Number of incidents', title='Incidents by Quarter') 

q1 + scale_fill_viridis() + theme(axis.title = element_text(size = 13),
      plot.title = element_text(size = 15,
      hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))

Car crashes by weekdays

As expected, Friday has the most crash incidents because it is usually the most busy day in terms of vehicular movement. People travelling for weekend.

crash_df$weekday <- wday(crash_df$crash_date, label=TRUE)

crash_df %>% count(weekday) %>%
        ggplot(aes(x=weekday, y=n)) + geom_bar(stat='identity', fill=rainbow(n=7)) +
        scale_y_continuous(labels=comma) +
        labs(x='Weekday', y='Number of incidents', title='Incidents by Weekday') + 
        theme(axis.title = element_text(size = 13),
        plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))

Crash incidents by Time of Day

# Extract time of day from the data
time <- as.POSIXct(strptime(c(crash_df$crash_time), "%H:%M"), "UTC")
x=as.POSIXct(strptime(c("0000","0500","1100","1600","1900","2359"),
                      "%H%M"),"UTC")
labs=c("night","morning","afternoon","evening","night")
day_time <- labs[findInterval(time,x)]

crash_df$day_time <- c(day_time)
print(crash_df)

## # A tibble: 745,855 x 14
##    crash_date crash_time borough   zip_code latitude longitude number_injured
##    <date>     <chr>      <chr>        <int>    <dbl>     <dbl>          <int>
##  1 2021-09-11 9:35       BROOKLYN     11208     40.7     -73.9              0
##  2 2021-12-14 8:17       BRONX        10475     40.9     -73.8              2
##  3 2021-12-14 21:10      BROOKLYN     11207     40.7     -73.9              0
##  4 2021-12-14 14:58      MANHATTAN    10017     40.8     -74.0              0
##  5 2021-12-14 16:50      QUEENS       11413     40.7     -73.8              0
##  6 2021-12-14 23:10      QUEENS       11434     40.7     -73.8              2
##  7 2021-12-14 17:58      BROOKLYN     11217     40.7     -74.0              0
##  8 2021-12-14 20:03      BROOKLYN     11226     40.7     -74.0              4
##  9 2021-12-11 19:43      BRONX        10463     40.9     -73.9              1
## 10 2021-12-11 4:45       MANHATTAN    10001     40.7     -74.0              0
## # ... with 745,845 more rows, and 7 more variables: number_killed <int>,
## #   contributing_factor <chr>, vehicle_type <chr>, year <dbl>, quarter <int>,
## #   weekday <ord>, day_time <chr>

day_time_df <- crash_df %>% group_by(day_time) %>% summarise(count_incidents = n()) 
day_time_df

## # A tibble: 5 x 2
##   day_time  count_incidents
##   <chr>               <int>
## 1 afternoon          224152
## 2 evening            151433
## 3 morning            164171
## 4 night              205991
## 5 <NA>                  108

day_time_df %>% ggplot(aes(x=reorder(day_time, count_incidents), y=count_incidents)) + 
        geom_bar(stat='identity', fill='purple') +
        scale_y_continuous(labels=comma) +
        labs(x='new_time', y='Number of incidents', title='Incidents by time') + theme(axis.title = element_text(size = 13),
        plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
        labs(title = "Incidents by time of day", x = "time_of_day")

Crash Incident by Borough.

Brooklyn has the highest number of crashes and Staten Island has the least number of crash incidents.

plotly::ggplotly(crash_df %>% count(borough) %>%
        ggplot(aes(x=reorder(borough, n), y=n, fill=n, text=borough)) +
        geom_bar(stat='identity', fill='violet') +
        labs(x='', y='Number of crashes'),
        tooltip=c("text", "y"))

Top Ten(10) incidents by number of victims

crash_df$victims <- crash_df$number_killed + crash_df$number_injured
Top10 <- crash_df %>% select(crash_date, borough, number_killed, number_injured, victims, zip_code) %>% 
         arrange(desc(victims)) %>% top_n(n=10, wt=victims)
Top10

## # A tibble: 12 x 6
##    crash_date borough       number_killed number_injured victims zip_code
##    <date>     <chr>                 <int>          <int>   <int>    <int>
##  1 2017-05-18 MANHATTAN                 1             27      28    10036
##  2 2018-04-06 BROOKLYN                  0             22      22    11225
##  3 2018-11-16 BRONX                     0             20      20    10457
##  4 2017-10-31 MANHATTAN                 8             12      20    10014
##  5 2017-09-18 QUEENS                    4             15      19    11354
##  6 2017-06-21 BRONX                     0             19      19    10473
##  7 2021-09-04 STATEN ISLAND             0             18      18    10304
##  8 2019-06-10 QUEENS                    0             18      18    11434
##  9 2017-06-14 BRONX                     0             18      18    10463
## 10 2019-06-07 STATEN ISLAND             0             17      17    10301
## 11 2018-08-06 MANHATTAN                 0             17      17    10027
## 12 2018-04-30 BRONX                     0             17      17    10461

Victims per crash incident

crash_df$victims <- crash_df$number_killed + crash_df$number_injured

VictimsByborough <- crash_df %>% group_by(borough)  %>%   
                    summarize(sumVic=sum(victims), sumInj=sum(number_injured), sumDeath=sum(number_killed),
                              PercDeath=round(sumDeath/sumVic,3), sumIncidents=n(), 
                              vicPerInc=round(sumVic/sumIncidents,1)) %>% arrange(desc(sumVic))
VictimsByborough

## # A tibble: 5 x 7
##   borough       sumVic sumInj sumDeath PercDeath sumIncidents vicPerInc
##   <chr>          <int>  <int>    <int>     <dbl>        <int>     <dbl>
## 1 BROOKLYN       78416  78136      280     0.004       243146       0.3
## 2 QUEENS         62553  62303      250     0.004       207476       0.3
## 3 BRONX          37561  37433      128     0.003       119087       0.3
## 4 MANHATTAN      32733  32576      157     0.005       147759       0.2
## 5 STATEN ISLAND   8389   8343       46     0.005        28387       0.3

ggplotly(crash_df %>% count(contributing_factor) %>%
        ggplot(aes(x=contributing_factor, y=n, fill=n, text=contributing_factor)) +
        geom_bar(stat='identity', fill='red') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        labs(x='', y='Number of crashes'),
         tooltip=c("text", "y"))

df <- crash_df
choices <- c("Crash by Year", "Crash by Quarter", "Crash by Weekday", "Crash by Time of Day", "Crash by Borough",
             "Victims per Incident")

# Build the UI for question1
ui <- fluidPage(
  # Panel for Crash by Year
  tabsetPanel( 
    tabPanel(title = "NYC Vehicular Incidents",
      sidebarPanel(
        htmlOutput('message_q1'),
        selectInput('plotType', 'Plot to Display', 
                  unique(choices), selected= choices[0],width = 600)
      ),
      mainPanel(plotlyOutput('plot1_q1'))
    )
    ))

# build the server for question1
server <- function(input, output, session) {
  # Question1 Server Side
  data_year <- reactive({
    df2 <- df %>%
        ggplot(aes(x=as.factor(year))) + geom_bar(stat='count', fill='purple') +
        scale_y_continuous(labels=comma) + labs(x='Year', y='No of Incident', title='Incidents by year') + 
        geom_label(stat = "count", aes(label = ..count.., y = ..count..)) + 
        theme(axis.title = element_text(size = 13), plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
        labs(title = "Incidents by Year", y = "No of Incidents")
    df2
  })
  
  data_quarter <- reactive({
    df2 <- df %>% filter(year!=2013) %>% select(year, quarter) %>% group_by(year) %>% count(quarter) %>%
        ggplot(aes(x=as.factor(quarter), y=n, fill = quarter)) + 
        geom_bar(stat='identity') + scale_y_continuous(labels=comma) + facet_grid(.~year) + 
        labs(x='Quarter', y='Number of incidents', title='Incidents by Quarter') + scale_fill_viridis() + 
        theme(axis.title = element_text(size = 13),
        plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
    
  })
  
  data_weekday <- reactive({
    df2 <- df %>% count(weekday) %>%
        ggplot(aes(x=weekday, y=n)) + geom_bar(stat='identity', fill=rainbow(n=7)) +
        scale_y_continuous(labels=comma) +
        labs(x='Weekday', y='Number of incidents', title='Incidents by Weekday') + 
        theme(axis.title = element_text(size = 13),
        plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
    
  })

  data_timeofday <- reactive({
    df2 <- day_time_df %>% ggplot(aes(x=reorder(day_time, count_incidents), y=count_incidents)) + 
        geom_bar(stat='identity', fill='purple') +
        scale_y_continuous(labels=comma) +
        labs(x='new_time', y='Number of incidents', title='Incidents by time') + theme(axis.title = element_text(size = 13),
        plot.title = element_text(size = 15,
        hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
        labs(title = "Incidents by time of day", x = "time_of_day")
    
  })    
  
  data_borough <- reactive({
    df2 <- plotly::ggplotly(df %>% count(borough) %>%
        ggplot(aes(x=reorder(borough, n), y=n, fill=n, text=borough)) +
        geom_bar(stat='identity', fill='violet') +
        labs(x='', y='Number of crashes'),
        tooltip=c("text", "y"))
    
  })
  
  data_victims <- reactive({
    df2 <- ggplotly(df %>% count(contributing_factor) %>%
        ggplot(aes(x=contributing_factor, y=n, fill=n, text=contributing_factor)) +
        geom_bar(stat='identity', fill='red') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        labs(x='', y='Number of crashes'),
         tooltip=c("text", "y"))
    
  })
  


  # output Plot
  
  output$plot1_q1 <- renderPlotly({
    if (input$plotType == "Crash by Year"){
      df2 <- data_year()
    } else if(input$plotType == "Crash by Quarter"){
      df2 <- data_quarter()
    } else if(input$plotType == "Crash by Weekday"){
      df2 <- data_weekday()
    } else if(input$plotType == "Crash by Time of Day"){
      df2 <- data_timeofday()
    } else if(input$plotType == "Crash by Borough"){
      df2 <- data_borough()
    } else{
      df2 <- data_victims()
    }
  }
  
  )
}

shinyApp(ui = ui, server = server)

## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.

Shiny applications not supported in static R Markdown documents