This project examines inequities in traffic crashes in terms of
motorists versus non motorists, i.e.,pedestrians, bicyclists and
motorcyclists against motorists. Using and combining available data, the
analysis will explore the level of road casualties in the above
mentioned categories and will identify its leading causes. Although the
focus is New York City, the data used in this project come from various
sources both local, state, federal and international, including the The
New York Times, NYPD
Traffic Data, Vision Zero ,
NHTSA,
Bureau
Of Transportation Statistics, WHO -Global Status
on Road Safety Report 2018
Getting Started: Loading libraries
Data Wrangling Model
Importing the Data
df1 <- read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/Vehicle%20Collision%20Report%20Statistics%20Citywide.csv")
## Rows: 36 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GeoCode, GeoCodeLabel
## dbl (13): Year, Number_of_Motor_Vehicle_Collisions, Vehicles_or_Motorists_In...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spec(df1)
## cols(
## Year = col_double(),
## GeoCode = col_character(),
## GeoCodeLabel = col_character(),
## Number_of_Motor_Vehicle_Collisions = col_double(),
## Vehicles_or_Motorists_Involved = col_double(),
## Injury_or_Fatal_Collisions = col_double(),
## MotoristsInjured = col_double(),
## MotoristsKilled = col_double(),
## PassengInjured = col_double(),
## PassengKilled = col_double(),
## CyclistsInjured = col_double(),
## CyclistsKilled = col_double(),
## PedestrInjured = col_double(),
## PedestrKilled = col_double(),
## Bicycle = col_double()
## )
VehecileReportStatisticsCitywide <- df1
df2 <- read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/main/Collisions%20Contributing%20Factors.csv")
## Rows: 120 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): GeoCode, GeoCodeLabel, ContributingFactorCode, ContributingFactorDe...
## dbl (2): Year, Number_of_Vehicles
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spec(df2)
## cols(
## Year = col_double(),
## GeoCode = col_character(),
## GeoCodeLabel = col_character(),
## ContributingFactorCode = col_character(),
## ContributingFactorDescription = col_character(),
## Number_of_Vehicles = col_double()
## )
CollisionsContributingFactors <- df2
Let’s take a quick peek at df1
glimpse(df1)
## Rows: 36
## Columns: 15
## $ Year <dbl> 2014, 2014, 2014, 2014, 2014, 2014,…
## $ GeoCode <chr> "C", "M", "B", "K", "Q", "S", "C", …
## $ GeoCodeLabel <chr> "CITYWIDE", "MANHATTAN", "BRONX", "…
## $ Number_of_Motor_Vehicle_Collisions <dbl> 17720, 4026, 2455, 4960, 5195, 1084…
## $ Vehicles_or_Motorists_Involved <dbl> 34721, 7672, 4816, 9725, 10367, 214…
## $ Injury_or_Fatal_Collisions <dbl> 3249, 522, 556, 1077, 895, 199, 391…
## $ MotoristsInjured <dbl> 1522, 155, 283, 479, 471, 134, 2453…
## $ MotoristsKilled <dbl> 8, 1, 3, 1, 3, 0, 6, 0, 3, 1, 2, 0,…
## $ PassengInjured <dbl> 1677, 174, 331, 586, 485, 101, 1525…
## $ PassengKilled <dbl> 4, 0, 3, 0, 1, 0, 2, 0, 1, 0, 1, 0,…
## $ CyclistsInjured <dbl> 483, 119, 68, 182, 103, 11, 452, 11…
## $ CyclistsKilled <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ PedestrInjured <dbl> 751, 174, 117, 263, 171, 26, 778, 1…
## $ PedestrKilled <dbl> 13, 5, 3, 2, 3, 0, 8, 0, 1, 4, 3, 0…
## $ Bicycle <dbl> 645, 194, 76, 241, 121, 13, 644, 19…
dim(df1)
## [1] 36 15
str(df1)
## spc_tbl_ [36 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Year : num [1:36] 2014 2014 2014 2014 2014 ...
## $ GeoCode : chr [1:36] "C" "M" "B" "K" ...
## $ GeoCodeLabel : chr [1:36] "CITYWIDE" "MANHATTAN" "BRONX" "BROOKLYN" ...
## $ Number_of_Motor_Vehicle_Collisions: num [1:36] 17720 4026 2455 4960 5195 ...
## $ Vehicles_or_Motorists_Involved : num [1:36] 34721 7672 4816 9725 10367 ...
## $ Injury_or_Fatal_Collisions : num [1:36] 3249 522 556 1077 895 ...
## $ MotoristsInjured : num [1:36] 1522 155 283 479 471 ...
## $ MotoristsKilled : num [1:36] 8 1 3 1 3 0 6 0 3 1 ...
## $ PassengInjured : num [1:36] 1677 174 331 586 485 ...
## $ PassengKilled : num [1:36] 4 0 3 0 1 0 2 0 1 0 ...
## $ CyclistsInjured : num [1:36] 483 119 68 182 103 11 452 117 48 199 ...
## $ CyclistsKilled : num [1:36] 1 0 0 1 0 0 1 0 0 0 ...
## $ PedestrInjured : num [1:36] 751 174 117 263 171 26 778 156 146 255 ...
## $ PedestrKilled : num [1:36] 13 5 3 2 3 0 8 0 1 4 ...
## $ Bicycle : num [1:36] 645 194 76 241 121 13 644 199 71 259 ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. GeoCode = col_character(),
## .. GeoCodeLabel = col_character(),
## .. Number_of_Motor_Vehicle_Collisions = col_double(),
## .. Vehicles_or_Motorists_Involved = col_double(),
## .. Injury_or_Fatal_Collisions = col_double(),
## .. MotoristsInjured = col_double(),
## .. MotoristsKilled = col_double(),
## .. PassengInjured = col_double(),
## .. PassengKilled = col_double(),
## .. CyclistsInjured = col_double(),
## .. CyclistsKilled = col_double(),
## .. PedestrInjured = col_double(),
## .. PedestrKilled = col_double(),
## .. Bicycle = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Let’s take a quick peek at df2
glimpse(df2)
## Rows: 120
## Columns: 6
## $ Year <dbl> 2023, 2023, 2023, 2023, 2023, 2023, 2023…
## $ GeoCode <chr> "C", "C", "C", "C", "C", "C", "C", "C", …
## $ GeoCodeLabel <chr> "CITYWIDE", "CITYWIDE", "CITYWIDE", "CIT…
## $ ContributingFactorCode <chr> "28", "02", "03", "22", "04", "05", "06"…
## $ ContributingFactorDescription <chr> "AGGRESSIVE DRIVING/ROAD RAGE", "ALCOHOL…
## $ Number_of_Vehicles <dbl> 89, 161, 226, 4, 2410, 204, 8, 113, 13, …
dim(df2)
## [1] 120 6
str(df2)
## spc_tbl_ [120 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Year : num [1:120] 2023 2023 2023 2023 2023 ...
## $ GeoCode : chr [1:120] "C" "C" "C" "C" ...
## $ GeoCodeLabel : chr [1:120] "CITYWIDE" "CITYWIDE" "CITYWIDE" "CITYWIDE" ...
## $ ContributingFactorCode : chr [1:120] "28" "02" "03" "22" ...
## $ ContributingFactorDescription: chr [1:120] "AGGRESSIVE DRIVING/ROAD RAGE" "ALCOHOL INVOLVEMENT" "BACKING UNSAFELY" "CELL PHONE (HAND-HELD)" ...
## $ Number_of_Vehicles : num [1:120] 89 161 226 4 2410 204 8 113 13 612 ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. GeoCode = col_character(),
## .. GeoCodeLabel = col_character(),
## .. ContributingFactorCode = col_character(),
## .. ContributingFactorDescription = col_character(),
## .. Number_of_Vehicles = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Data Tidying and Data Transformation
# Merging df1 and df2 into one single data frame
data <- df1 %>% inner_join(df2, by = "Year")
## Warning in inner_join(., df2, by = "Year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 90 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
data
## # A tibble: 720 × 20
## Year GeoCode.x GeoCodeLabel.x Number_of_Motor_Vehic…¹ Vehicles_or_Motorist…²
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2014 C CITYWIDE 17720 34721
## 2 2014 C CITYWIDE 17720 34721
## 3 2014 C CITYWIDE 17720 34721
## 4 2014 C CITYWIDE 17720 34721
## 5 2014 C CITYWIDE 17720 34721
## 6 2014 C CITYWIDE 17720 34721
## 7 2014 C CITYWIDE 17720 34721
## 8 2014 C CITYWIDE 17720 34721
## 9 2014 C CITYWIDE 17720 34721
## 10 2014 C CITYWIDE 17720 34721
## # ℹ 710 more rows
## # ℹ abbreviated names: ¹Number_of_Motor_Vehicle_Collisions,
## # ²Vehicles_or_Motorists_Involved
## # ℹ 15 more variables: Injury_or_Fatal_Collisions <dbl>,
## # MotoristsInjured <dbl>, MotoristsKilled <dbl>, PassengInjured <dbl>,
## # PassengKilled <dbl>, CyclistsInjured <dbl>, CyclistsKilled <dbl>,
## # PedestrInjured <dbl>, PedestrKilled <dbl>, Bicycle <dbl>, …
Let’s take a peek at the new data frame
glimpse(data)
## Rows: 720
## Columns: 20
## $ Year <dbl> 2014, 2014, 2014, 2014, 2014, 2014,…
## $ GeoCode.x <chr> "C", "C", "C", "C", "C", "C", "C", …
## $ GeoCodeLabel.x <chr> "CITYWIDE", "CITYWIDE", "CITYWIDE",…
## $ Number_of_Motor_Vehicle_Collisions <dbl> 17720, 17720, 17720, 17720, 17720, …
## $ Vehicles_or_Motorists_Involved <dbl> 34721, 34721, 34721, 34721, 34721, …
## $ Injury_or_Fatal_Collisions <dbl> 3249, 3249, 3249, 3249, 3249, 3249,…
## $ MotoristsInjured <dbl> 1522, 1522, 1522, 1522, 1522, 1522,…
## $ MotoristsKilled <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,…
## $ PassengInjured <dbl> 1677, 1677, 1677, 1677, 1677, 1677,…
## $ PassengKilled <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ CyclistsInjured <dbl> 483, 483, 483, 483, 483, 483, 483, …
## $ CyclistsKilled <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ PedestrInjured <dbl> 751, 751, 751, 751, 751, 751, 751, …
## $ PedestrKilled <dbl> 13, 13, 13, 13, 13, 13, 13, 13, 13,…
## $ Bicycle <dbl> 645, 645, 645, 645, 645, 645, 645, …
## $ GeoCode.y <chr> "C", "C", "C", "C", "C", "C", "C", …
## $ GeoCodeLabel.y <chr> "CITYWIDE", "CITYWIDE", "CITYWIDE",…
## $ ContributingFactorCode <chr> "28", "02", "03", "22", "23", "04",…
## $ ContributingFactorDescription <chr> "AGGRESSIVE DRIVING/ROAD RAGE", "AL…
## $ Number_of_Vehicles <dbl> 92, 233, 731, 9, 2, 3269, 322, 21, …
names(data)
## [1] "Year" "GeoCode.x"
## [3] "GeoCodeLabel.x" "Number_of_Motor_Vehicle_Collisions"
## [5] "Vehicles_or_Motorists_Involved" "Injury_or_Fatal_Collisions"
## [7] "MotoristsInjured" "MotoristsKilled"
## [9] "PassengInjured" "PassengKilled"
## [11] "CyclistsInjured" "CyclistsKilled"
## [13] "PedestrInjured" "PedestrKilled"
## [15] "Bicycle" "GeoCode.y"
## [17] "GeoCodeLabel.y" "ContributingFactorCode"
## [19] "ContributingFactorDescription" "Number_of_Vehicles"
Let’s add and mutate some of the data frame variables for analysis
convenience
# Adding and renaming a few new variables and changing some
data_new <-data %>%
mutate(Contributing_Factor = ContributingFactorDescription, GeoCodeLabel = GeoCodeLabel.x, non_motorists_casualties = CyclistsInjured + CyclistsKilled + PedestrInjured + PedestrKilled, motorists_casualties = MotoristsInjured +MotoristsKilled +PassengInjured +PassengKilled) %>%
rename(Motorists_Involved =Vehicles_or_Motorists_Involved)
data_new
## # A tibble: 720 × 24
## Year GeoCode.x GeoCodeLabel.x Number_of_Motor_Vehicle_C…¹ Motorists_Involved
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2014 C CITYWIDE 17720 34721
## 2 2014 C CITYWIDE 17720 34721
## 3 2014 C CITYWIDE 17720 34721
## 4 2014 C CITYWIDE 17720 34721
## 5 2014 C CITYWIDE 17720 34721
## 6 2014 C CITYWIDE 17720 34721
## 7 2014 C CITYWIDE 17720 34721
## 8 2014 C CITYWIDE 17720 34721
## 9 2014 C CITYWIDE 17720 34721
## 10 2014 C CITYWIDE 17720 34721
## # ℹ 710 more rows
## # ℹ abbreviated name: ¹Number_of_Motor_Vehicle_Collisions
## # ℹ 19 more variables: Injury_or_Fatal_Collisions <dbl>,
## # MotoristsInjured <dbl>, MotoristsKilled <dbl>, PassengInjured <dbl>,
## # PassengKilled <dbl>, CyclistsInjured <dbl>, CyclistsKilled <dbl>,
## # PedestrInjured <dbl>, PedestrKilled <dbl>, Bicycle <dbl>, GeoCode.y <chr>,
## # GeoCodeLabel.y <chr>, ContributingFactorCode <chr>, …
Data Analysis:
Descriptive statistics
# Summary statistics
summary(data_new)
## Year GeoCode.x GeoCodeLabel.x
## Min. :2014 Length:720 Length:720
## 1st Qu.:2014 Class :character Class :character
## Median :2019 Mode :character Mode :character
## Mean :2019
## 3rd Qu.:2020
## Max. :2023
## Number_of_Motor_Vehicle_Collisions Motorists_Involved
## Min. : 440 Min. : 881
## 1st Qu.: 1395 1st Qu.: 2485
## Median : 2886 Median : 5737
## Mean : 4463 Mean : 8741
## 3rd Qu.: 5195 3rd Qu.:10367
## Max. :17720 Max. :34721
## Injury_or_Fatal_Collisions MotoristsInjured MotoristsKilled PassengInjured
## Min. : 139 Min. : 103 Min. : 0.00 Min. : 65
## 1st Qu.: 548 1st Qu.: 228 1st Qu.: 0.75 1st Qu.: 179
## Median : 788 Median : 457 Median : 2.00 Median : 359
## Mean :1175 Mean : 671 Mean : 2.82 Mean : 493
## 3rd Qu.:1172 3rd Qu.: 726 3rd Qu.: 3.00 3rd Qu.: 487
## Max. :3919 Max. :2453 Max. :14.00 Max. :1677
## PassengKilled CyclistsInjured CyclistsKilled PedestrInjured PedestrKilled
## Min. :0.000 Min. : 8 Min. :0.000 Min. : 20 Min. : 0.00
## 1st Qu.:0.000 1st Qu.: 66 1st Qu.:0.000 1st Qu.:117 1st Qu.: 1.00
## Median :1.000 Median :117 Median :0.000 Median :171 Median : 3.00
## Mean :0.994 Mean :178 Mean :0.892 Mean :235 Mean : 3.34
## 3rd Qu.:1.000 3rd Qu.:199 3rd Qu.:1.000 3rd Qu.:255 3rd Qu.: 4.25
## Max. :4.000 Max. :693 Max. :6.000 Max. :778 Max. :13.00
## Bicycle GeoCode.y GeoCodeLabel.y ContributingFactorCode
## Min. : 10.0 Length:720 Length:720 Length:720
## 1st Qu.: 74.8 Class :character Class :character Class :character
## Median :168.5 Mode :character Mode :character Mode :character
## Mean :219.6
## 3rd Qu.:259.0
## Max. :719.0
## ContributingFactorDescription Number_of_Vehicles Contributing_Factor
## Length:720 Min. : 1 Length:720
## Class :character 1st Qu.: 11 Class :character
## Mode :character Median : 84 Mode :character
## Mean : 326
## 3rd Qu.: 316
## Max. :5721
## GeoCodeLabel non_motorists_casualties motorists_casualties
## Length:720 Min. : 37 Min. : 174
## Class :character 1st Qu.: 195 1st Qu.: 398
## Mode :character Median : 277 Median : 864
## Mean : 417 Mean :1168
## 3rd Qu.: 456 3rd Qu.:1166
## Max. :1270 Max. :3986
# Histogram of all non motorists killed or injured
ggplot(data_new, aes(x = non_motorists_casualties)) +
geom_histogram(bindwidth = 0.3) + scale_x_log10()
## Warning in geom_histogram(bindwidth = 0.3): Ignoring unknown parameters:
## `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

xlab("Non_motorists_casualties")
## $x
## [1] "Non_motorists_casualties"
##
## attr(,"class")
## [1] "labels"
# Histogram of all motorists killed or injured
ggplot(data_new, aes(x = motorists_casualties)) +
geom_histogram(bindwidth = 0.3) + scale_x_log10()
## Warning in geom_histogram(bindwidth = 0.3): Ignoring unknown parameters:
## `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

xlab("Motorists_casualties")
## $x
## [1] "Motorists_casualties"
##
## attr(,"class")
## [1] "labels"
# scatter plot of non - motorist casualties
ggplot(data = data_new, aes( x = non_motorists_casualties, y = GeoCodeLabel)) +
geom_point(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Scatter Plot of Non - Motorists Casuaties per Borough"))
## $title
## [1] "Scatter Plot of Non - Motorists Casuaties per Borough"
##
## attr(,"class")
## [1] "labels"
# scatter plot of non - motorist casualties
ggplot(data = data_new, aes( x = non_motorists_casualties, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("BoxPlot of Non - Motorists Casuaties per Borough"))
## $title
## [1] "BoxPlot of Non - Motorists Casuaties per Borough"
##
## attr(,"class")
## [1] "labels"
# scatter plot of motorist casualties
ggplot(data = data_new, aes( x = motorists_casualties, y = GeoCodeLabel)) +
geom_point(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Scatter Plot of Motorists Casuaties per Borough "))
## $title
## [1] "Scatter Plot of Motorists Casuaties per Borough "
##
## attr(,"class")
## [1] "labels"
# Box plot of motorists casualties
ggplot(data = data_new, aes( x = motorists_casualties, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Box Plot of Motorists Casuaties per Borough "))
## $title
## [1] "Box Plot of Motorists Casuaties per Borough "
##
## attr(,"class")
## [1] "labels"
ggplot(data = data_new, aes( x = PedestrKilled, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Box Plot of Pedestrian Kiled per Borough "))
## $title
## [1] "Box Plot of Pedestrian Kiled per Borough "
##
## attr(,"class")
## [1] "labels"
ggplot(data = data_new, aes( x = PedestrInjured, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Box Plot of Pedestrian Injured per Borough "))
## $title
## [1] "Box Plot of Pedestrian Injured per Borough "
##
## attr(,"class")
## [1] "labels"
ggplot(data = data_new, aes( x = CyclistsKilled, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Box Plot of Cyclists Killed per Borough "))
## $title
## [1] "Box Plot of Cyclists Killed per Borough "
##
## attr(,"class")
## [1] "labels"
ggplot(data = data_new, aes( x = CyclistsInjured, y = GeoCodeLabel)) +
geom_boxplot(color = 4, alpha = 0.3) +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

labs(
title = ("Box Plot of Cyclists Injured per Borough "))
## $title
## [1] "Box Plot of Cyclists Injured per Borough "
##
## attr(,"class")
## [1] "labels"
Data Visualization:
Visualizing Changes in casualties Over Time since 2014
## Trends per boroughs in cyclists killed in NYC
ggplot(data_new, aes(x = Year, y = CyclistsKilled)) +
geom_line(aes(group = GeoCodeLabel), colour = "grey50") +
geom_point(aes(colour = GeoCodeLabel )) +
labs(
title = ("Trends per Borough in cyclists killed"))

## Trends per boroughs in Cyclists Injured
ggplot(data_new, aes(x = Year, y = CyclistsInjured)) +
geom_line(aes(group = GeoCodeLabel), colour = "grey50") +
geom_point(aes(colour = GeoCodeLabel )) +
labs(
title = ("Trends per Borough in Cyclists Injured"))

## Trends per borough in pedestrians killed
ggplot(data_new, aes(x = Year, y = PedestrKilled)) +
geom_line(aes(group = GeoCodeLabel), colour = "grey50") +
geom_point(aes(colour = GeoCodeLabel )) +
labs(
title = ("Trends per Borough in Pedestrians killed"))

## Trends per borough in Pedestrians Injured
ggplot(data_new, aes(x = Year, y = PedestrInjured)) +
geom_line(aes(group = GeoCodeLabel), colour = "grey50") +
geom_point(aes(colour = GeoCodeLabel )) +
labs(
title = ("Trends per Borough in Pedestrians Injured"))

Visualizing the contributing factors to road collisions in NYC
# Visualizing collisions factors in a bar chart
data2 <- as.data.frame(contributing_factors)
ggplot(data2, aes(x = Contributing_Factor, y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", freq = 500, fill = "steelblue")+ coord_flip()
## Warning in geom_bar(stat = "identity", freq = 500, fill = "steelblue"):
## Ignoring unknown parameters: `freq`

library(tm)
# Visualizing collisions factors as a wordcloud
set.seed(337)
wordcloud(data2, max.words = 2000, random.order = FALSE, min.freq = 20, colors = brewer.pal(8,"Dark2"))

data2 <- as.data.frame(contributing_factors)
ggplot(data2, aes(x = Contributing_Factor, y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_polar(theta = "x")

Narrowing things down
# Getting distinct values from data2
distinct_data2 <- distinct(data2,Contributing_Factor,Number_of_Vehicles)
# Identifying the main contributing factors in collisions in NYC
top_Contributing_Factor <- distinct_data2 %>%
arrange(desc(Number_of_Vehicles)) %>%
slice_head(n = 10)
print(top_Contributing_Factor)
## Contributing_Factor Number_of_Vehicles
## 1 DRIVER INATTENTION/DISTRACTION 5721
## 2 DRIVER INATTENTION/DISTRACTION 3269
## 3 DRIVER INATTENTION/DISTRACTION 2800
## 4 DRIVER INATTENTION/DISTRACTION 2410
## 5 FOLLOWING TOO CLOSELY 1959
## 6 FOLLOWING TOO CLOSELY 1447
## 7 FAILURE TO YIELD RIGHT-OF-WAY 1291
## 8 FAILURE TO YIELD RIGHT-OF-WAY 1046
## 9 PASSING OR LANE USAGE IMPROPER 977
## 10 PASSING TOO CLOSELY 928
Visualizing the top contributing factors to collisions in NYC
# Visualizing collisions main contributing factors in a bar chart
ggplot(top_Contributing_Factor, aes(x = Contributing_Factor, y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", freq = 500, fill = "steelblue")+ coord_flip()
## Warning in geom_bar(stat = "identity", freq = 500, fill = "steelblue"):
## Ignoring unknown parameters: `freq`

library(tm)
# Visualizing collisions main contributing factors as a wordcloud
set.seed(337)
wordcloud(top_Contributing_Factor, max.words = 1000, random.order = FALSE, min.freq = 20, colors = brewer.pal(8,"Dark2"))

# Main contributing factors in road collisions in NYC
ggplot(top_Contributing_Factor, aes(x = Contributing_Factor,y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_polar(theta = "x")

collisions_2020 <- data_new %>%
group_by(Contributing_Factor) %>%
filter(Year == 2020)
## distinct collisions factors for 2020
collisions_2020 <- distinct(collisions_2020)
# Main contributing factors of collisions in 2020
Main_Factor_2020 <- collisions_2020 %>%
arrange(desc(Number_of_Vehicles)) %>%
slice_head(n = 10)
print(Main_Factor_2020)
## # A tibble: 174 × 24
## # Groups: Contributing_Factor [29]
## Year GeoCode.x GeoCodeLabel.x Number_of_Motor_Vehicle_C…¹ Motorists_Involved
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2020 C CITYWIDE 9429 18541
## 2 2020 M MANHATTAN 1383 2485
## 3 2020 B BRONX 1868 3684
## 4 2020 K BROOKLYN 3126 6270
## 5 2020 Q QUEENS 2612 5211
## 6 2020 S STATEN ISLAND 440 891
## 7 2020 C CITYWIDE 9429 18541
## 8 2020 M MANHATTAN 1383 2485
## 9 2020 B BRONX 1868 3684
## 10 2020 K BROOKLYN 3126 6270
## # ℹ 164 more rows
## # ℹ abbreviated name: ¹Number_of_Motor_Vehicle_Collisions
## # ℹ 19 more variables: Injury_or_Fatal_Collisions <dbl>,
## # MotoristsInjured <dbl>, MotoristsKilled <dbl>, PassengInjured <dbl>,
## # PassengKilled <dbl>, CyclistsInjured <dbl>, CyclistsKilled <dbl>,
## # PedestrInjured <dbl>, PedestrKilled <dbl>, Bicycle <dbl>, GeoCode.y <chr>,
## # GeoCodeLabel.y <chr>, ContributingFactorCode <chr>, …
ggplot(collisions_2020, aes(x = Contributing_Factor, y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", freq = 500, fill = "steelblue")+ coord_flip()
## Warning in geom_bar(stat = "identity", freq = 500, fill = "steelblue"):
## Ignoring unknown parameters: `freq`

ggplot(collisions_2020 , aes(x = Contributing_Factor, y = Number_of_Vehicles
)) +
geom_bar(stat = "identity", freq = 500, fill = "steelblue")+ facet_wrap(~GeoCodeLabel)
## Warning in geom_bar(stat = "identity", freq = 500, fill = "steelblue"):
## Ignoring unknown parameters: `freq`

Correlations: Evaluating the correlation between non - motorists
casualties and the main contributing factors
Strong positive relationship between car collisions and passengers
killed
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PassengKilled))
## [1] 0.6401
Strong positive correlation between car collisions and death of
passengers
# Correlation between car collisions and passenger death
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PassengInjured))
## [1] 0.9358
A very strong and positive correlation between vehicle collisions and
death of pedestrians
# Correlation between car collisions and pedestrian death
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PedestrKilled))
## [1] 0.824
An even higher positive correlation between car collisions and
pedestrians injuries
# Correlation between car collisions and pedestrian injuries
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PedestrInjured))
## [1] 0.9332
Positive and strong correlation between car collisions and cyclists
injured
# Correlation between car collisions and cyclist injuries
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$CyclistsInjured))
## [1] 0.7874
Positive and moderate correlation between car collisions and cyclists
killed
(correlation <- cor( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$CyclistsKilled))
## [1] 0.3282
Running Correlation tests
# Correlation test of injured pedestrians
(correlation <- cor.test( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PedestrInjured))
##
## Pearson's product-moment correlation
##
## data: data_new$Number_of_Motor_Vehicle_Collisions and data_new$PedestrInjured
## t = 70, df = 718, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9231 0.9421
## sample estimates:
## cor
## 0.9332
# correlation test for pedestrians killed
(correlation <- cor.test( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PedestrKilled))
##
## Pearson's product-moment correlation
##
## data: data_new$Number_of_Motor_Vehicle_Collisions and data_new$PedestrKilled
## t = 39, df = 718, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7990 0.8461
## sample estimates:
## cor
## 0.824
# correlation test for cyclists killed
(correlation <- cor.test( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$CyclistsKilled))
##
## Pearson's product-moment correlation
##
## data: data_new$Number_of_Motor_Vehicle_Collisions and data_new$CyclistsKilled
## t = 9.3, df = 718, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2614 0.3918
## sample estimates:
## cor
## 0.3282
# Correlation test of passengers killed
(correlation <- cor.test( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PassengKilled))
##
## Pearson's product-moment correlation
##
## data: data_new$Number_of_Motor_Vehicle_Collisions and data_new$PassengKilled
## t = 22, df = 718, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5949 0.6813
## sample estimates:
## cor
## 0.6401
# Correlation test on passengers injured
(correlation <- cor.test( data_new$Number_of_Motor_Vehicle_Collisions,
data_new$PassengInjured))
##
## Pearson's product-moment correlation
##
## data: data_new$Number_of_Motor_Vehicle_Collisions and data_new$PassengInjured
## t = 71, df = 718, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9261 0.9443
## sample estimates:
## cor
## 0.9358
Main Findings
- There were more non-motorists casualties in 2020 than in any years
since 2014 , the year the city launched its vision zero initiative
- The main causes of vehicles collisions are related to drivers
inattention and following too closely
- Brooklyn, Queens and Manhattan are the boroughs where it is
dangerous to be a pedestrian or a cyclist. Staten Island is the safest
borough for both pedestrians and cyclists
- There is a Strong positive relationship between car collisions and
passengers killed :0.6401476 5.There is a Strong positive correlation
between car collisions and death of passengers : 0.9358199
- There is a very strong and positive correlation between vehicle
collisions and death of pedestrians:0.8239562
- There is a very high positive correlation between car collisions and
pedestrians injuries:0.9332216
- Thers is positive and strong correlation between car collisions and
cyclists injured:0.7873529
- There is also a positive but moderate correlation between car
collisions and cyclists killed:0.3281672
names(data_new)
## [1] "Year" "GeoCode.x"
## [3] "GeoCodeLabel.x" "Number_of_Motor_Vehicle_Collisions"
## [5] "Motorists_Involved" "Injury_or_Fatal_Collisions"
## [7] "MotoristsInjured" "MotoristsKilled"
## [9] "PassengInjured" "PassengKilled"
## [11] "CyclistsInjured" "CyclistsKilled"
## [13] "PedestrInjured" "PedestrKilled"
## [15] "Bicycle" "GeoCode.y"
## [17] "GeoCodeLabel.y" "ContributingFactorCode"
## [19] "ContributingFactorDescription" "Number_of_Vehicles"
## [21] "Contributing_Factor" "GeoCodeLabel"
## [23] "non_motorists_casualties" "motorists_casualties"
# Non motorist casualties - rate per 10000
data_new1<- data_new %>%
mutate(rate = non_motorists_casualties / Number_of_Motor_Vehicle_Collisions * 1000)
data_new1
## # A tibble: 720 × 25
## Year GeoCode.x GeoCodeLabel.x Number_of_Motor_Vehicle_C…¹ Motorists_Involved
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2014 C CITYWIDE 17720 34721
## 2 2014 C CITYWIDE 17720 34721
## 3 2014 C CITYWIDE 17720 34721
## 4 2014 C CITYWIDE 17720 34721
## 5 2014 C CITYWIDE 17720 34721
## 6 2014 C CITYWIDE 17720 34721
## 7 2014 C CITYWIDE 17720 34721
## 8 2014 C CITYWIDE 17720 34721
## 9 2014 C CITYWIDE 17720 34721
## 10 2014 C CITYWIDE 17720 34721
## # ℹ 710 more rows
## # ℹ abbreviated name: ¹Number_of_Motor_Vehicle_Collisions
## # ℹ 20 more variables: Injury_or_Fatal_Collisions <dbl>,
## # MotoristsInjured <dbl>, MotoristsKilled <dbl>, PassengInjured <dbl>,
## # PassengKilled <dbl>, CyclistsInjured <dbl>, CyclistsKilled <dbl>,
## # PedestrInjured <dbl>, PedestrKilled <dbl>, Bicycle <dbl>, GeoCode.y <chr>,
## # GeoCodeLabel.y <chr>, ContributingFactorCode <chr>, …
### 2020 rate of non-motorist casualties
rate1 <- data_new1 %>%
filter(Year == 2020) %>%
select(GeoCodeLabel.x, Number_of_Motor_Vehicle_Collisions, non_motorists_casualties, rate) %>%
rename(GeoCodeLabel = GeoCodeLabel.x,Total_Collisions = Number_of_Motor_Vehicle_Collisions, rate_per_1000 = rate )
distinct(rate1)
## # A tibble: 6 × 4
## GeoCodeLabel Total_Collisions non_motorists_casualties rate_per_1000
## <chr> <dbl> <dbl> <dbl>
## 1 CITYWIDE 9429 1270 135.
## 2 MANHATTAN 1383 267 193.
## 3 BRONX 1868 221 118.
## 4 BROOKLYN 3126 456 146.
## 5 QUEENS 2612 289 111.
## 6 STATEN ISLAND 440 37 84.1
rate1
## # A tibble: 174 × 4
## GeoCodeLabel Total_Collisions non_motorists_casualties rate_per_1000
## <chr> <dbl> <dbl> <dbl>
## 1 CITYWIDE 9429 1270 135.
## 2 CITYWIDE 9429 1270 135.
## 3 CITYWIDE 9429 1270 135.
## 4 CITYWIDE 9429 1270 135.
## 5 CITYWIDE 9429 1270 135.
## 6 CITYWIDE 9429 1270 135.
## 7 CITYWIDE 9429 1270 135.
## 8 CITYWIDE 9429 1270 135.
## 9 CITYWIDE 9429 1270 135.
## 10 CITYWIDE 9429 1270 135.
## # ℹ 164 more rows
### 2019 rate of non-motorist casualties
rate2<- data_new1 %>%
filter(Year == 2019) %>%
select(GeoCodeLabel.x, Number_of_Motor_Vehicle_Collisions, non_motorists_casualties, rate) %>%
rename(GeoCodeLabel = GeoCodeLabel.x,Total_Collisions = Number_of_Motor_Vehicle_Collisions, rate_per_1000 = rate )
distinct(rate2)
## # A tibble: 6 × 4
## GeoCodeLabel Total_Collisions non_motorists_casualties rate_per_1000
## <chr> <dbl> <dbl> <dbl>
## 1 CITYWIDE 17380 1239 71.3
## 2 MANHATTAN 3470 273 78.7
## 3 BRONX 2886 195 67.6
## 4 BROOKLYN 5155 458 88.8
## 5 QUEENS 5323 276 51.9
## 6 STATEN ISLAND 546 37 67.8
rate2
## # A tibble: 192 × 4
## GeoCodeLabel Total_Collisions non_motorists_casualties rate_per_1000
## <chr> <dbl> <dbl> <dbl>
## 1 CITYWIDE 17380 1239 71.3
## 2 CITYWIDE 17380 1239 71.3
## 3 CITYWIDE 17380 1239 71.3
## 4 CITYWIDE 17380 1239 71.3
## 5 CITYWIDE 17380 1239 71.3
## 6 CITYWIDE 17380 1239 71.3
## 7 CITYWIDE 17380 1239 71.3
## 8 CITYWIDE 17380 1239 71.3
## 9 CITYWIDE 17380 1239 71.3
## 10 CITYWIDE 17380 1239 71.3
## # ℹ 182 more rows