Load Libraries
library(tidyverse)
library(readr)
library(gridExtra)
library(scales)
library(lubridate)
library(ggrepel)
library(rgdal)
library(plotly)
library(kableExtra)
library(shiny)
Read the data
url_data <- "https://raw.githubusercontent.com/chinedu2301/data608-project/main/data/motor_crash_df.csv"
crash_df <- read.csv(url_data)
crash_df <- as_tibble(crash_df)
Change the crash_date data type to date format and View the head of the data
crash_df$crash_date <- as.Date(crash_df$crash_date, format = "%Y-%m-%d")
crash_df %>% head(8)
## # A tibble: 8 x 10
## crash_date crash_time borough zip_code latitude longitude number_injured
## <date> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 2021-09-11 9:35 BROOKLYN 11208 40.7 -73.9 0
## 2 2021-12-14 8:17 BRONX 10475 40.9 -73.8 2
## 3 2021-12-14 21:10 BROOKLYN 11207 40.7 -73.9 0
## 4 2021-12-14 14:58 MANHATTAN 10017 40.8 -74.0 0
## 5 2021-12-14 16:50 QUEENS 11413 40.7 -73.8 0
## 6 2021-12-14 23:10 QUEENS 11434 40.7 -73.8 2
## 7 2021-12-14 17:58 BROOKLYN 11217 40.7 -74.0 0
## 8 2021-12-14 20:03 BROOKLYN 11226 40.7 -74.0 4
## # ... with 3 more variables: number_killed <int>, contributing_factor <chr>,
## # vehicle_type <chr>
Take a glimpse of the data
glimpse(crash_df)
## Rows: 745,855
## Columns: 10
## $ crash_date <date> 2021-09-11, 2021-12-14, 2021-12-14, 2021-12-14, 2~
## $ crash_time <chr> "9:35", "8:17", "21:10", "14:58", "16:50", "23:10"~
## $ borough <chr> "BROOKLYN", "BRONX", "BROOKLYN", "MANHATTAN", "QUE~
## $ zip_code <int> 11208, 10475, 11207, 10017, 11413, 11434, 11217, 1~
## $ latitude <dbl> 40.66720, 40.86816, 40.67172, 40.75144, 40.67588, ~
## $ longitude <dbl> -73.86650, -73.83148, -73.89710, -73.97397, -73.75~
## $ number_injured <int> 0, 2, 0, 0, 0, 2, 0, 4, 1, 0, 0, 1, 1, 0, 1, 1, 0,~
## $ number_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ contributing_factor <chr> "Unspecified", "Unspecified", "Driver Inexperience~
## $ vehicle_type <chr> "Sedan", "Sedan", "Sedan", "Sedan", "Sedan", "Seda~
Since 2018, there has been a decline in the car crash incident in NYC but there was a sharp decline from 2019 to 2020. The decline has also continue steadily after then. The 2020 sharp decline might be due to covid-related lockdowns. Also, increase in remote work opportunities may be a reason for decline in incidents, but more studies need to be conducted to ascertain that.
crash_df$year <- year(crash_df$crash_date) #extract year from date using the lubridate year() function
crash_df %>%
ggplot(aes(x=as.factor(year))) + geom_bar(stat='count', fill='purple') +
scale_y_continuous(labels=comma) + labs(x='Year', y='No of Incident', title='Incidents by year') +
geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
theme(axis.title = element_text(size = 13), plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
labs(title = "Incidents by Year", y = "No of Incidents")
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
crash_df$quarter <- quarter(crash_df$crash_date) #extract Quarters from date
q1 <- crash_df %>% filter(year!=2013) %>% select(year, quarter) %>% group_by(year) %>% count(quarter) %>%
ggplot(aes(x=as.factor(quarter), y=n, fill = quarter)) +
geom_bar(stat='identity') + scale_y_continuous(labels=comma) + facet_grid(.~year) +
labs(x='Quarter', y='Number of incidents', title='Incidents by Quarter')
q1 + scale_fill_viridis() + theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
As expected, Friday has the most crash incidents because it is usually the most busy day in terms of vehicular movement. People travelling for weekend.
crash_df$weekday <- wday(crash_df$crash_date, label=TRUE)
crash_df %>% count(weekday) %>%
ggplot(aes(x=weekday, y=n)) + geom_bar(stat='identity', fill=rainbow(n=7)) +
scale_y_continuous(labels=comma) +
labs(x='Weekday', y='Number of incidents', title='Incidents by Weekday') +
theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
# Extract time of day from the data
time <- as.POSIXct(strptime(c(crash_df$crash_time), "%H:%M"), "UTC")
x=as.POSIXct(strptime(c("0000","0500","1100","1600","1900","2359"),
"%H%M"),"UTC")
labs=c("night","morning","afternoon","evening","night")
day_time <- labs[findInterval(time,x)]
crash_df$day_time <- c(day_time)
print(crash_df)
## # A tibble: 745,855 x 14
## crash_date crash_time borough zip_code latitude longitude number_injured
## <date> <chr> <chr> <int> <dbl> <dbl> <int>
## 1 2021-09-11 9:35 BROOKLYN 11208 40.7 -73.9 0
## 2 2021-12-14 8:17 BRONX 10475 40.9 -73.8 2
## 3 2021-12-14 21:10 BROOKLYN 11207 40.7 -73.9 0
## 4 2021-12-14 14:58 MANHATTAN 10017 40.8 -74.0 0
## 5 2021-12-14 16:50 QUEENS 11413 40.7 -73.8 0
## 6 2021-12-14 23:10 QUEENS 11434 40.7 -73.8 2
## 7 2021-12-14 17:58 BROOKLYN 11217 40.7 -74.0 0
## 8 2021-12-14 20:03 BROOKLYN 11226 40.7 -74.0 4
## 9 2021-12-11 19:43 BRONX 10463 40.9 -73.9 1
## 10 2021-12-11 4:45 MANHATTAN 10001 40.7 -74.0 0
## # ... with 745,845 more rows, and 7 more variables: number_killed <int>,
## # contributing_factor <chr>, vehicle_type <chr>, year <dbl>, quarter <int>,
## # weekday <ord>, day_time <chr>
day_time_df <- crash_df %>% group_by(day_time) %>% summarise(count_incidents = n())
day_time_df
## # A tibble: 5 x 2
## day_time count_incidents
## <chr> <int>
## 1 afternoon 224152
## 2 evening 151433
## 3 morning 164171
## 4 night 205991
## 5 <NA> 108
day_time_df %>% ggplot(aes(x=reorder(day_time, count_incidents), y=count_incidents)) +
geom_bar(stat='identity', fill='purple') +
scale_y_continuous(labels=comma) +
labs(x='new_time', y='Number of incidents', title='Incidents by time') + theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
labs(title = "Incidents by time of day", x = "time_of_day")
Brooklyn has the highest number of crashes and Staten Island has the least number of crash incidents.
plotly::ggplotly(crash_df %>% count(borough) %>%
ggplot(aes(x=reorder(borough, n), y=n, fill=n, text=borough)) +
geom_bar(stat='identity', fill='violet') +
labs(x='', y='Number of crashes'),
tooltip=c("text", "y"))
crash_df$victims <- crash_df$number_killed + crash_df$number_injured
Top10 <- crash_df %>% select(crash_date, borough, number_killed, number_injured, victims, zip_code) %>%
arrange(desc(victims)) %>% top_n(n=10, wt=victims)
Top10
## # A tibble: 12 x 6
## crash_date borough number_killed number_injured victims zip_code
## <date> <chr> <int> <int> <int> <int>
## 1 2017-05-18 MANHATTAN 1 27 28 10036
## 2 2018-04-06 BROOKLYN 0 22 22 11225
## 3 2018-11-16 BRONX 0 20 20 10457
## 4 2017-10-31 MANHATTAN 8 12 20 10014
## 5 2017-09-18 QUEENS 4 15 19 11354
## 6 2017-06-21 BRONX 0 19 19 10473
## 7 2021-09-04 STATEN ISLAND 0 18 18 10304
## 8 2019-06-10 QUEENS 0 18 18 11434
## 9 2017-06-14 BRONX 0 18 18 10463
## 10 2019-06-07 STATEN ISLAND 0 17 17 10301
## 11 2018-08-06 MANHATTAN 0 17 17 10027
## 12 2018-04-30 BRONX 0 17 17 10461
crash_df$victims <- crash_df$number_killed + crash_df$number_injured
VictimsByborough <- crash_df %>% group_by(borough) %>%
summarize(sumVic=sum(victims), sumInj=sum(number_injured), sumDeath=sum(number_killed),
PercDeath=round(sumDeath/sumVic,3), sumIncidents=n(),
vicPerInc=round(sumVic/sumIncidents,1)) %>% arrange(desc(sumVic))
VictimsByborough
## # A tibble: 5 x 7
## borough sumVic sumInj sumDeath PercDeath sumIncidents vicPerInc
## <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 BROOKLYN 78416 78136 280 0.004 243146 0.3
## 2 QUEENS 62553 62303 250 0.004 207476 0.3
## 3 BRONX 37561 37433 128 0.003 119087 0.3
## 4 MANHATTAN 32733 32576 157 0.005 147759 0.2
## 5 STATEN ISLAND 8389 8343 46 0.005 28387 0.3
ggplotly(crash_df %>% count(contributing_factor) %>%
ggplot(aes(x=contributing_factor, y=n, fill=n, text=contributing_factor)) +
geom_bar(stat='identity', fill='red') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x='', y='Number of crashes'),
tooltip=c("text", "y"))
df <- crash_df
choices <- c("Crash by Year", "Crash by Quarter", "Crash by Weekday", "Crash by Time of Day", "Crash by Borough",
"Victims per Incident")
# Build the UI for question1
ui <- fluidPage(
# Panel for Crash by Year
tabsetPanel(
tabPanel(title = "NYC Vehicular Incidents",
sidebarPanel(
htmlOutput('message_q1'),
selectInput('plotType', 'Plot to Display',
unique(choices), selected= choices[0],width = 600)
),
mainPanel(plotlyOutput('plot1_q1'))
)
))
# build the server for question1
server <- function(input, output, session) {
# Question1 Server Side
data_year <- reactive({
df2 <- df %>%
ggplot(aes(x=as.factor(year))) + geom_bar(stat='count', fill='purple') +
scale_y_continuous(labels=comma) + labs(x='Year', y='No of Incident', title='Incidents by year') +
geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
theme(axis.title = element_text(size = 13), plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
labs(title = "Incidents by Year", y = "No of Incidents")
df2
})
data_quarter <- reactive({
df2 <- df %>% filter(year!=2013) %>% select(year, quarter) %>% group_by(year) %>% count(quarter) %>%
ggplot(aes(x=as.factor(quarter), y=n, fill = quarter)) +
geom_bar(stat='identity') + scale_y_continuous(labels=comma) + facet_grid(.~year) +
labs(x='Quarter', y='Number of incidents', title='Incidents by Quarter') + scale_fill_viridis() +
theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
})
data_weekday <- reactive({
df2 <- df %>% count(weekday) %>%
ggplot(aes(x=weekday, y=n)) + geom_bar(stat='identity', fill=rainbow(n=7)) +
scale_y_continuous(labels=comma) +
labs(x='Weekday', y='Number of incidents', title='Incidents by Weekday') +
theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon"))
})
data_timeofday <- reactive({
df2 <- day_time_df %>% ggplot(aes(x=reorder(day_time, count_incidents), y=count_incidents)) +
geom_bar(stat='identity', fill='purple') +
scale_y_continuous(labels=comma) +
labs(x='new_time', y='Number of incidents', title='Incidents by time') + theme(axis.title = element_text(size = 13),
plot.title = element_text(size = 15,
hjust = 0.5), panel.background = element_rect(fill = "lemonchiffon")) +
labs(title = "Incidents by time of day", x = "time_of_day")
})
data_borough <- reactive({
df2 <- plotly::ggplotly(df %>% count(borough) %>%
ggplot(aes(x=reorder(borough, n), y=n, fill=n, text=borough)) +
geom_bar(stat='identity', fill='violet') +
labs(x='', y='Number of crashes'),
tooltip=c("text", "y"))
})
data_victims <- reactive({
df2 <- ggplotly(df %>% count(contributing_factor) %>%
ggplot(aes(x=contributing_factor, y=n, fill=n, text=contributing_factor)) +
geom_bar(stat='identity', fill='red') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x='', y='Number of crashes'),
tooltip=c("text", "y"))
})
# output Plot
output$plot1_q1 <- renderPlotly({
if (input$plotType == "Crash by Year"){
df2 <- data_year()
} else if(input$plotType == "Crash by Quarter"){
df2 <- data_quarter()
} else if(input$plotType == "Crash by Weekday"){
df2 <- data_weekday()
} else if(input$plotType == "Crash by Time of Day"){
df2 <- data_timeofday()
} else if(input$plotType == "Crash by Borough"){
df2 <- data_borough()
} else{
df2 <- data_victims()
}
}
)
}
shinyApp(ui = ui, server = server)
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.