# REPORT some details about the data.
library(dplyr)
library(plotly)
library(ggplot2)
library(scales)
library(stringr)
library(ggrepel)
library(lubridate)
library(ggthemes)
library(RColorBrewer)
library(data.table)
df <- fread("C:/Users/riley/OneDrive/Documents/DS 736/traffic_accidents.csv")
summary(df)
## crash_date traffic_control_device weather_condition
## Length:209306 Length:209306 Length:209306
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lighting_condition first_crash_type trafficway_type alignment
## Length:209306 Length:209306 Length:209306 Length:209306
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## roadway_surface_cond road_defect crash_type
## Length:209306 Length:209306 Length:209306
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## intersection_related_i damage prim_contributory_cause
## Length:209306 Length:209306 Length:209306
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## num_units most_severe_injury injuries_total injuries_fatal
## Min. : 1.000 Length:209306 Min. : 0.0000 Min. :0.000000
## 1st Qu.: 2.000 Class :character 1st Qu.: 0.0000 1st Qu.:0.000000
## Median : 2.000 Mode :character Median : 0.0000 Median :0.000000
## Mean : 2.063 Mean : 0.3827 Mean :0.001858
## 3rd Qu.: 2.000 3rd Qu.: 1.0000 3rd Qu.:0.000000
## Max. :11.000 Max. :21.0000 Max. :3.000000
## injuries_incapacitating injuries_non_incapacitating
## Min. :0.0000 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000
## Median :0.0000 Median : 0.0000
## Mean :0.0381 Mean : 0.2212
## 3rd Qu.:0.0000 3rd Qu.: 0.0000
## Max. :7.0000 Max. :21.0000
## injuries_reported_not_evident injuries_no_indication crash_hour
## Min. : 0.0000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 2.000 1st Qu.: 9.00
## Median : 0.0000 Median : 2.000 Median :14.00
## Mean : 0.1215 Mean : 2.244 Mean :13.37
## 3rd Qu.: 0.0000 3rd Qu.: 3.000 3rd Qu.:17.00
## Max. :15.0000 Max. :49.000 Max. :23.00
## crash_day_of_week crash_month
## Min. :1.000 Min. : 1.000
## 1st Qu.:2.000 1st Qu.: 4.000
## Median :4.000 Median : 7.000
## Mean :4.144 Mean : 6.772
## 3rd Qu.:6.000 3rd Qu.:10.000
## Max. :7.000 Max. :12.000
# I paste some code in here if needed. This might be manipulation of the data after reading it in, to remove bad data, for example.
crashcount <- data.frame(count(df, first_crash_type))
crashcount <- crashcount[order(crashcount$n, decreasing = TRUE),]
most_common <- crashcount[1:4,]
other <- crashcount[5:18,]
other_sum <- sum(other$n)
otherdf <- data.frame(first_crash_type = 'OTHER', n = other_sum)
top5 <- rbind(most_common, otherdf)
top5 <- top5[order(top5$n, decreasing = TRUE),]
top5$first_crash_type <- str_to_title(top5$first_crash_type)
ggplot(top5, aes(x= reorder(first_crash_type, -n), y = n)) +
geom_bar(colour = 'hotpink', fill = 'lightpink', stat = 'identity') +
labs(title = "Amount of Accidents by Crash Type", x = "Type of Crash", y = "Accident Count") +
geom_text(aes(label= comma(n)), vjust = -.5, size = 3) +
theme(plot.title = element_text(hjust=0.5)) +
theme_gray()+
scale_y_continuous(label = comma)
month_injuries <- df %>%
group_by(crash_month) %>%
summarise(total_injuries = sum(injuries_total, na.rm = TRUE)) %>%
data.frame()
month_injuries$crash_month <- as.factor(month_injuries$crash_month)
high_low <- month_injuries %>%
filter(total_injuries == min(total_injuries) | total_injuries == max(total_injuries)) %>%
data.frame()
ggplot(month_injuries, aes(x = crash_month, y = total_injuries, group=1)) +
geom_line(color = 'pink', linewidth =1) +
geom_point(shape = 21, size =3, color = 'hotpink', fill = 'hotpink') +
labs(x = 'Month', y = 'Total Injuries', title = 'Injuries from Crashes by Month')+
scale_y_continuous(labels = comma) +
theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(data = high_low, aes(x=crash_month, y=total_injuries), inherit.aes = FALSE,
shape = 21, size = 3, fill = 'black', color = 'black') +
geom_label_repel(aes(label = ifelse(total_injuries == max(total_injuries)
| total_injuries == min(total_injuries),
scales::comma(total_injuries), '')),
box.padding = 1, point.padding =0, size = 3, nudge_x = .5,
color = 'black', segment.color = 'gray') +
scale_x_discrete(breaks = 1:12, labels = month.abb)
df2 <- df %>%
group_by(weather_condition, damage) %>%
summarise(total_injuries = mean(injuries_total, na.rm = TRUE), .groups = "drop")
df2 <- df2 %>%
mutate(weather_condition = reorder(weather_condition, total_injuries, .desc = TRUE))
df2$damage <- str_to_title(df2$damage)
df2$weather_condition <- str_to_title(df2$weather_condition)
ggplot(df2, aes(x = weather_condition, y = damage, fill = total_injuries)) +
geom_tile(color = "black") +
geom_text(aes(label = round(total_injuries, 1)), color = "black", size = 3)+
coord_equal(ratio=2) +
labs(title = "Heatmap of Average Amount of Injuries by Damage & Weather Condition",
x = "Weather Condition",
y = "Cost of Damage",
fill = "Average Amount of Injuries")+
theme_minimal()+
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 8)) +
scale_fill_distiller(palette = "PuRd", direction = 1)
df$trafficway_type <- str_to_title(df$trafficway_type)
toptt <- count(df, trafficway_type)
toptt <- toptt[order(-toptt$n),]
#toptt[toptt$trafficway_type %in% c("Not Divided", "Four Way", "Divided - W/Median (Not Raised)", "One-Way", "Divided - W/Median Barrier", "T-Intersection"), "n"] / sum(toptt$n)
df3 <- df %>%
select(trafficway_type, crash_date) %>%
mutate(year = year(mdy_hms(crash_date)),
toptrafficway = ifelse(trafficway_type == "Not Divided", "Not Divided", ifelse(trafficway_type=="Four Way", "Four Way", ifelse(trafficway_type=="Divided - W/Median (Not Raised)", "Divided - W/Median (Not Raised)", ifelse(trafficway_type=="One-Way", "One-Way", ifelse(trafficway_type=="Divided - W/Median Barrier", "Divided - W/Median Barrier", ifelse(trafficway_type=="One-Way", "One-Way", "Other"))))))) %>%
group_by(year, toptrafficway) %>%
summarise(n=length(toptrafficway), .groups = 'keep') %>%
group_by(year) %>%
mutate(percent_of_total = round(100*n/sum(n), 1)) %>%
ungroup() %>%
data.frame()
df3 <- subset(df3, year >= max(df3$year-2))
ggplot(data = df3, aes(x="", y=n, fill=toptrafficway)) +
geom_bar(stat="identity", position="fill") +
coord_polar(theta="y", start=0) +
labs(fill="Traffic Way Type", x=NULL, y=NULL, title="Traffic Way Type Count by Year", caption = "Slices under 4% are not labeled") +
theme_minimal()+
theme(plot.title=element_text(hjust=0.5),
axis.text=element_blank(),
axis.ticks=element_blank(),
panel.grid=element_blank()) +
facet_wrap(~year, ncol = 3, nrow = 1) +
scale_fill_brewer(palette = "PuRd")+
geom_text(aes(x=1.9, label=ifelse(percent_of_total>4, paste0(percent_of_total, "%"), " ")), size = 3, position=position_fill(vjust=0.5))
df$prim_contributory_cause <- str_to_title(df$prim_contributory_cause)
topcontrib <- df %>%
count(prim_contributory_cause, sort = TRUE) %>%
slice_head(n = 5)
df5 <- df %>%
filter(prim_contributory_cause %in% topcontrib$prim_contributory_cause) %>%
group_by(damage, prim_contributory_cause) %>%
summarise(totalcars = sum(num_units), .groups = "drop") %>%
data.frame()
df5$damage <- as.factor(df5$damage)
ggplot(df5, aes(x = reorder(prim_contributory_cause, totalcars), y = totalcars, fill = damage)) +
geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
labs(title = "Cars Involved in Top 5 Primary Contributory Causes by Damage",
x = " ",
y = "Amount of Cars Involved in Accidents",
fill = "Cost of Damage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(labels = comma,
breaks = seq(0, 120000, by = 20000)) +
geom_text(aes(label = comma(totalcars)),
stat = "identity",
position = position_stack(vjust = 0.5, reverse = TRUE),
size = 3, color = "black", angle = 0)+
scale_fill_brewer(palette = "PuRd")
df$first_crash_type <- str_to_title(df$first_crash_type)
injuryacc <- df %>%
filter(injuries_total != 0) %>%
mutate(type = ifelse(first_crash_type == "Angle", "Angle", ifelse(first_crash_type == "Pedestrian", "Pedestrian", ifelse(first_crash_type== "Rear End", "Rear End", ifelse(first_crash_type=="Turning", "Turning", "Other"))))) %>%
group_by(type, crash_day_of_week) %>%
summarise(total_injuries = sum(injuries_total), .groups = "drop") %>%
mutate(percent_total = round(100*total_injuries/sum(total_injuries), 1)) %>%
data.frame()
injuryacc$type <- as.factor(injuryacc$type)
plot <- plot_ly(hole=0.75) %>%
layout(title="Injuries by Crash Type and Weekday") %>%
add_trace(data = injuryacc[injuryacc$crash_day_of_week == 2,],
labels= ~type,
values= ~injuryacc[injuryacc$crash_day_of_week == 2, "total_injuries"],
type="pie",
textposition="inside",
hovertemplate = "Day:Monday<br>Accident Type:%{label}<br>Total Injuries:%{value}<br>Percent:%{percent}<br><extra></extra>") %>%
add_trace(data = injuryacc[injuryacc$crash_day_of_week == 3,],
labels= ~type,
values= ~injuryacc[injuryacc$crash_day_of_week == 3, "total_injuries"],
type="pie",
textposition="inside",
hovertemplate = "Day:Tuesday<br>Accident Type:%{label}<br>Total Injuries:%{value}<br>Percent:%{percent}<br><extra></extra>",
domain=list(
x=c(0.13, 0.87),
y= c(0.13, 0.87)))%>%
add_trace(data = injuryacc[injuryacc$crash_day_of_week == 4,],
labels= ~type,
values= ~injuryacc[injuryacc$crash_day_of_week == 4, "total_injuries"],
type="pie",
textposition="inside",
hovertemplate = "Day:Wednesday<br>Accident Type:%{label}<br>Total Injuries:%{value}<br>Percent:%{percent}<br><extra></extra>",
domain=list(
x=c(0.23, 0.77),
y= c(0.23, 0.77)))%>%
add_trace(data = injuryacc[injuryacc$crash_day_of_week == 5,],
labels= ~type,
values= ~injuryacc[injuryacc$crash_day_of_week == 5, "total_injuries"],
type="pie",
textposition="inside",
hovertemplate = "Day:Thursday<br>Accident Type:%{label}<br>Total Injuries:%{value}<br>Percent:%{percent}<br><extra></extra>",
domain=list(
x=c(0.3, 0.7),
y= c(0.3, 0.7)))%>%
add_trace(data = injuryacc[injuryacc$crash_day_of_week == 6,],
labels= ~type,
values= ~injuryacc[injuryacc$crash_day_of_week == 6, "total_injuries"],
type="pie",
textposition="inside",
hovertemplate = "Day:Friday<br>Accident Type:%{label}<br>Total Injuries:%{value}<br>Percent:%{percent}<br><extra></extra>",
marker = list(colors = RColorBrewer::brewer.pal(length(unique(injuryacc$type)), "PuRd")),
domain=list(
x=c(0.355, 0.645),
y= c(0.355, 0.645)))
plot