library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(tidyr)
library(dplyr)
# Read the motor-vehicle collisions CSV export into a base data frame.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where this exact file location exists.
Motor_accidents <- read.csv(
  file = "C:/Users/dbrusche/Desktop/Motor_Vehicle_Collisions_-_Crashes_20241005.csv"
)

# Preview the first six rows (all columns) to check the import.
head(Motor_accidents, n = 6L)
## CRASH.DATE CRASH.TIME BOROUGH ZIP.CODE LATITUDE LONGITUDE
## 1 9/11/2021 2:39 NA NA NA
## 2 3/26/2022 11:45 NA NA NA
## 3 6/29/2022 6:55 NA NA NA
## 4 9/11/2021 9:35 BROOKLYN 11208 40.6672 -73.86650
## 5 12/14/2021 8:13 BROOKLYN 11233 40.6833 -73.91727
## 6 4/14/2021 12:47 NA NA NA
## LOCATION ON.STREET.NAME CROSS.STREET.NAME
## 1 WHITESTONE EXPRESSWAY 20 AVENUE
## 2 QUEENSBORO BRIDGE UPPER
## 3 THROGS NECK BRIDGE
## 4 (40.667202, -73.8665)
## 5 (40.683304, -73.917274) SARATOGA AVENUE DECATUR STREET
## 6 MAJOR DEEGAN EXPRESSWAY RAMP
## OFF.STREET.NAME NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## 1 2 0
## 2 1 0
## 3 0 0
## 4 1211 LORING AVENUE 0 0
## 5 0 0
## 6 0 0
## NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED NUMBER.OF.MOTORIST.INJURED
## 1 0 0 2
## 2 0 0 1
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NUMBER.OF.MOTORIST.KILLED CONTRIBUTING.FACTOR.VEHICLE.1
## 1 0 Aggressive Driving/Road Rage
## 2 0 Pavement Slippery
## 3 0 Following Too Closely
## 4 0 Unspecified
## 5 0
## 6 0 Unspecified
## CONTRIBUTING.FACTOR.VEHICLE.2 CONTRIBUTING.FACTOR.VEHICLE.3
## 1 Unspecified
## 2
## 3 Unspecified
## 4
## 5
## 6 Unspecified
## CONTRIBUTING.FACTOR.VEHICLE.4 CONTRIBUTING.FACTOR.VEHICLE.5 COLLISION_ID
## 1 4455765
## 2 4513547
## 3 4541903
## 4 4456314
## 5 4486609
## 6 4407458
## VEHICLE.TYPE.CODE.1 VEHICLE.TYPE.CODE.2 VEHICLE.TYPE.CODE.3
## 1 Sedan Sedan
## 2 Sedan
## 3 Sedan Pick-up Truck
## 4 Sedan
## 5
## 6 Dump Sedan
## VEHICLE.TYPE.CODE.4 VEHICLE.TYPE.CODE.5
## 1
## 2
## 3
## 4
## 5
## 6
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# ---- Data preparation ----

# Parse the crash date. The raw column is month/day/year text (for example
# "9/11/2021" or "3/26/2022"), so the format must be given explicitly. With no
# format, as.Date() only tries "%Y-%m-%d" and "%Y/%m/%d", which reads
# "9/11/2021" as year 9 and turns every row whose day is above 12 into NA;
# those rows are then silently discarded by the NA filter below.
# NOTE: counts and weekday shares produced with the old default parse must be
# regenerated after this fix.
Motor_accidents$CRASH.DATE <- as.Date(
  Motor_accidents$CRASH.DATE,
  format = "%m/%d/%Y"
)

# Derive the weekday name of each crash (names follow the session locale).
Motor_accidents$Daysoftheweek <- weekdays(Motor_accidents$CRASH.DATE)

# Keep only the columns used in the analysis, then drop incomplete rows.
# Rows are removed when any key column is NA or when the borough is blank
# (blank text cells are read as "" rather than NA, so is.na() alone misses
# them).
# NOTE(review): blank VEHICLE.TYPE.CODE.1 and CONTRIBUTING.FACTOR.VEHICLE.1
# values are likewise "" and are NOT removed here; confirm whether they
# should be.
Motor_accidents2 <- Motor_accidents %>%
  select(
    BOROUGH, CRASH.DATE, LATITUDE, LONGITUDE,
    CONTRIBUTING.FACTOR.VEHICLE.1, VEHICLE.TYPE.CODE.1, Daysoftheweek
  ) %>%
  filter(
    !is.na(BOROUGH),
    BOROUGH != "",
    !is.na(LATITUDE),
    !is.na(LONGITUDE),
    !is.na(CRASH.DATE),
    !is.na(VEHICLE.TYPE.CODE.1),
    !is.na(CONTRIBUTING.FACTOR.VEHICLE.1)
  )
#First, for data preparation, I used the as.Date function to convert the crash.date column from character to date format. I then selected the columns I wanted and subsetted the data. Next, I removed the NAs from those columns and finally removed rows that did not list a borough.
You can also embed plots, for example:
# ---- Accidents by borough ----

# One row per borough: number of accidents and share of all cleaned rows.
borough_counts <- Motor_accidents2 %>%
  group_by(BOROUGH) %>%
  summarise(Accident.count = n(), .groups = "drop") %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100)

# Bar chart of each borough's percentage of accidents.
library(ggplot2)
borough_counts %>%
  ggplot(mapping = aes(x = BOROUGH, y = Percentage)) +
  geom_col(fill = "blue") +
  labs(
    x = "Borough",
    y = "Percentage of Accidents",
    title = "Accidents Percentage by Borough"
  ) +
  theme_minimal()
# I created a borough count to total the number of accidents per borough and provide the percentage breakdown. The results were as follows: Bronx had 41,788 (16.34%), Brooklyn had 83,901 (32.82%), Manhattan had 48,857 (19.11%), Queens had 71,345 (27.90%), and Staten Island had 9,710 (3.79%). From the bar plot, we see that Brooklyn has the highest number of accidents, followed by Queens, Manhattan, Bronx, and lastly, Staten Island.
# ---- Accidents by day of the week ----

# One row per weekday: number of accidents and share of all cleaned rows.
days_counts <- Motor_accidents2 %>%
  group_by(Daysoftheweek) %>%
  summarise(Accident.count = n(), .groups = "drop") %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100)

# Bar chart with weekdays sorted from the highest to the lowest percentage;
# x labels are tilted 45 degrees.
days_counts %>%
  ggplot(
    mapping = aes(x = reorder(Daysoftheweek, -Percentage), y = Percentage)
  ) +
  geom_col(fill = "green") +
  labs(
    x = "Day of the Week",
    y = "Percentage of Accidents",
    title = "Accidents by Day of the Week"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# I then wanted to analyze whether certain days had more accidents. From the visualization, Tuesday had the highest number of accidents with 40,628 (15.89%), followed by Thursday with 38,091 (14.90%), Sunday with 38,044 (14.88%), Friday with 36,307 (14.20%), Saturday with 35,642 (13.94%), Wednesday with 33,965 (13.28%), and lastly Monday with 32,924 (12.88%).
# ---- Accidents by contributing factor ----

# One row per primary contributing factor: number of accidents and share of
# all cleaned rows, keeping only factors with more than 10,000 accidents.
Car_accident_factor <- Motor_accidents2 %>%
  group_by(CONTRIBUTING.FACTOR.VEHICLE.1) %>%
  summarise(Accident.count = n(), .groups = "drop") %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100) %>%
  filter(Accident.count > 10000)

# Bar chart with factors sorted from the highest to the lowest percentage;
# x labels are tilted 45 degrees.
Car_accident_factor %>%
  ggplot(
    mapping = aes(
      x = reorder(CONTRIBUTING.FACTOR.VEHICLE.1, -Percentage),
      y = Percentage
    )
  ) +
  geom_col(fill = "green") +
  labs(
    x = "Contributing Factor",
    y = "Percentage of Accidents",
    title = "Accidents by Contributing Factor"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# I then wanted to explore the factors that caused the accidents. Since this is a large dataset, I focused on those with a count of more than 10,000. The visualization showed that 'unspecified' had the highest number of accidents, with 66,461 (26.00%), followed by 'driver inattention/distraction' with 61,550 (24.08%), 'failure to yield right of way' with 19,396 (7.58%), 'following too closely' with 14,604 (5.71%), 'backing unsafely' with 13,724 (5.36%), 'passing too closely' with 11,540 (4.51%), and 'passing or lane usage improper' with 10,793 (4.22%).