library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) 
library(tidyr) 
library(dplyr)
# Load the motor-vehicle collision export into a base data frame.
Motor_accidents <- read.csv(
  file = "C:/Users/dbrusche/Desktop/Motor_Vehicle_Collisions_-_Crashes_20241005.csv"
)

# Preview the first six rows across all columns.
head(Motor_accidents, n = 6L)
##   CRASH.DATE CRASH.TIME  BOROUGH ZIP.CODE LATITUDE LONGITUDE
## 1  9/11/2021       2:39                NA       NA        NA
## 2  3/26/2022      11:45                NA       NA        NA
## 3  6/29/2022       6:55                NA       NA        NA
## 4  9/11/2021       9:35 BROOKLYN    11208  40.6672 -73.86650
## 5 12/14/2021       8:13 BROOKLYN    11233  40.6833 -73.91727
## 6  4/14/2021      12:47                NA       NA        NA
##                  LOCATION               ON.STREET.NAME CROSS.STREET.NAME
## 1                                WHITESTONE EXPRESSWAY         20 AVENUE
## 2                              QUEENSBORO BRIDGE UPPER                  
## 3                                   THROGS NECK BRIDGE                  
## 4   (40.667202, -73.8665)                                               
## 5 (40.683304, -73.917274)              SARATOGA AVENUE    DECATUR STREET
## 6                         MAJOR DEEGAN EXPRESSWAY RAMP                  
##           OFF.STREET.NAME NUMBER.OF.PERSONS.INJURED NUMBER.OF.PERSONS.KILLED
## 1                                                 2                        0
## 2                                                 1                        0
## 3                                                 0                        0
## 4 1211      LORING AVENUE                         0                        0
## 5                                                 0                        0
## 6                                                 0                        0
##   NUMBER.OF.PEDESTRIANS.INJURED NUMBER.OF.PEDESTRIANS.KILLED
## 1                             0                            0
## 2                             0                            0
## 3                             0                            0
## 4                             0                            0
## 5                             0                            0
## 6                             0                            0
##   NUMBER.OF.CYCLIST.INJURED NUMBER.OF.CYCLIST.KILLED NUMBER.OF.MOTORIST.INJURED
## 1                         0                        0                          2
## 2                         0                        0                          1
## 3                         0                        0                          0
## 4                         0                        0                          0
## 5                         0                        0                          0
## 6                         0                        0                          0
##   NUMBER.OF.MOTORIST.KILLED CONTRIBUTING.FACTOR.VEHICLE.1
## 1                         0  Aggressive Driving/Road Rage
## 2                         0             Pavement Slippery
## 3                         0         Following Too Closely
## 4                         0                   Unspecified
## 5                         0                              
## 6                         0                   Unspecified
##   CONTRIBUTING.FACTOR.VEHICLE.2 CONTRIBUTING.FACTOR.VEHICLE.3
## 1                   Unspecified                              
## 2                                                            
## 3                   Unspecified                              
## 4                                                            
## 5                                                            
## 6                   Unspecified                              
##   CONTRIBUTING.FACTOR.VEHICLE.4 CONTRIBUTING.FACTOR.VEHICLE.5 COLLISION_ID
## 1                                                                  4455765
## 2                                                                  4513547
## 3                                                                  4541903
## 4                                                                  4456314
## 5                                                                  4486609
## 6                                                                  4407458
##   VEHICLE.TYPE.CODE.1 VEHICLE.TYPE.CODE.2 VEHICLE.TYPE.CODE.3
## 1               Sedan               Sedan                    
## 2               Sedan                                        
## 3               Sedan       Pick-up Truck                    
## 4               Sedan                                        
## 5                                                            
## 6                Dump               Sedan                    
##   VEHICLE.TYPE.CODE.4 VEHICLE.TYPE.CODE.5
## 1                                        
## 2                                        
## 3                                        
## 4                                        
## 5                                        
## 6

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# ---- Data preparation ----

# Convert the crash date column from character to Date.
# The raw values are month/day/year strings (e.g. "9/11/2021"), so the format
# must be given explicitly. Without it, as.Date() falls back to "%Y-%m-%d" and
# then "%Y/%m/%d", which reads "9/11/2021" as year 9 and turns values such as
# "12/14/2021" into NA (those rows would then be dropped by the filter below).
Motor_accidents$CRASH.DATE <- as.Date(
  Motor_accidents$CRASH.DATE,
  format = "%m/%d/%Y"
)

# Day of the week for each crash (names follow the session locale).
Motor_accidents$Daysoftheweek <- weekdays(Motor_accidents$CRASH.DATE)

# Keep only the columns used in the analysis, then drop incomplete rows.
# NOTE(review): read.csv() leaves blank text fields as "" rather than NA, so
# the is.na() checks on the character columns do not remove blanks. Only
# BOROUGH is additionally checked for "" here; confirm whether blank vehicle
# types / contributing factors should be kept.
Motor_accidents2 <- Motor_accidents %>%
  select(
    BOROUGH,
    CRASH.DATE,
    LATITUDE,
    LONGITUDE,
    CONTRIBUTING.FACTOR.VEHICLE.1,
    VEHICLE.TYPE.CODE.1,
    Daysoftheweek
  ) %>%
  filter(
    !is.na(BOROUGH),
    BOROUGH != "",  # remove rows that do not list a borough
    !is.na(LATITUDE),
    !is.na(LONGITUDE),
    !is.na(CRASH.DATE),
    !is.na(VEHICLE.TYPE.CODE.1),
    !is.na(CONTRIBUTING.FACTOR.VEHICLE.1)
  )

# First, for data preparation, I used the as.Date function to convert the CRASH.DATE column from character to Date format. I then selected the columns needed for the analysis into a smaller data frame. Next, I removed rows with NA values in those columns, and finally removed rows that did not list a borough.

Including Plots

You can also embed plots, for example:

# ---- Accidents by borough ----
library(ggplot2)

# One row per borough: number of accidents and its share (in percent) of all
# rows in the cleaned data.
borough_counts <- Motor_accidents2 %>%
  group_by(BOROUGH) %>%
  summarise(Accident.count = n()) %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100)

# Bar chart of each borough's percentage of accidents.
borough_counts %>%
  ggplot(aes(x = BOROUGH, y = Percentage)) +
  geom_col(fill = "blue") +
  labs(
    title = "Accidents Percentage by Borough",
    x = "Borough",
    y = "Percentage of Accidents"
  ) +
  theme_minimal()

# I created a borough count to total the number of accidents per borough and provide the percentage breakdown. The results were as follows: Bronx had 41,788 (16.34%), Brooklyn had 83,901 (32.82%), Manhattan had 48,857 (19.11%), Queens had 71,345 (27.90%), and Staten Island had 9,710 (3.79%). From the bar plot, we see that Brooklyn has the highest number of accidents, followed by Queens, Manhattan, Bronx, and lastly, Staten Island.
# ---- Accidents by day of the week ----

# One row per weekday: number of accidents and its share (in percent) of all
# rows in the cleaned data.
days_counts <- Motor_accidents2 %>%
  group_by(Daysoftheweek) %>%
  summarise(Accident.count = n()) %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100)

# Bar chart with weekdays ordered from highest to lowest percentage; the axis
# labels are tilted so they do not overlap.
days_counts %>%
  ggplot(aes(x = reorder(Daysoftheweek, -Percentage), y = Percentage)) +
  geom_col(fill = "green") +
  labs(
    title = "Accidents by Day of the Week",
    x = "Day of the Week",
    y = "Percentage of Accidents"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# I then wanted to analyze whether certain days had more accidents. From the visualization, Tuesday had the highest number of accidents with 40,628 (15.89%), followed by Thursday with 38,091 (14.90%), Sunday with 38,044 (14.88%), Friday with 36,307 (14.20%), Saturday with 35,642 (13.94%), Wednesday with 33,965 (13.28%), and lastly Monday with 32,924 (12.88%).
# ---- Accidents by primary contributing factor ----

# One row per contributing factor (vehicle 1): number of accidents and its
# share (in percent) of all rows in the cleaned data. Only factors with more
# than 10,000 accidents are kept.
Car_accident_factor <- Motor_accidents2 %>%
  group_by(CONTRIBUTING.FACTOR.VEHICLE.1) %>%
  summarise(Accident.count = n()) %>%
  mutate(Percentage = (Accident.count / nrow(Motor_accidents2)) * 100) %>%
  filter(Accident.count > 10000)

# Bar chart with factors ordered from highest to lowest percentage; the axis
# labels are tilted so the long factor names stay readable.
Car_accident_factor %>%
  ggplot(
    aes(
      x = reorder(CONTRIBUTING.FACTOR.VEHICLE.1, -Percentage),
      y = Percentage
    )
  ) +
  geom_col(fill = "green") +
  labs(
    title = "Accidents by Contributing Factor",
    x = "Contributing Factor",
    y = "Percentage of Accidents"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# I then wanted to explore the factors that caused the accidents. Since this is a large dataset, I focused on factors with a count greater than 10,000. The visualization showed that 'unspecified' had the highest number of accidents, with 66,461 (26.00%), followed by 'driver inattention/distraction' with 61,550 (24.08%), 'failure to yield right of way' with 19,396 (7.58%), 'following too closely' with 14,604 (5.71%), 'backing unsafely' with 13,724 (5.36%), 'passing too closely' with 11,540 (4.51%), and 'passing or lane usage improper' with 10,793 (4.22%).