library(tidyverse)
library(vroom)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- vroom("Rail_Equipment_Accident_Incident_Data__Form_54_.csv")
data %>% filter(Year == 2023) %>% count(`Accident Number`) %>% arrange(desc(n))
duplicates <- data %>% filter(Year == 2023) %>% count(`Accident Number`) %>% arrange(desc(n)) %>% pull(`Accident Number`)
OK, so found some examples with duplicate records. One is Accident
Number 151894.
data %>% filter(`Accident Number` == "151894")
OK for this accident number, it seems like almost all data is the same despite it being 4 rows for the same Accident Number, same day, same time, same type.
# Figure out which columns are different within this duplicate
data %>%
filter(`Accident Number` == "151894", Year == 2023) %>%
summarise_all(~ length(unique(.)) > 1) %>%
select_if(all)
The only differences are in “Remote Control Locomotive,” “First Car Initials” and other “Car” level information.
Generally, it doesn’t seem like this 4 rows should count as separate
derailments. A distinct() on Accident Number
should handle this
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023, `Accident Type` == "Derailment") %>%
distinct(`Accident Number`) %>%
as_vector() %>%
length()
## [1] 742
summarised_by_type <-
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023) %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
count(`Accident Type`) %>%
arrange(desc(n))
summarised_by_type
summarised_by_type %>%
mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>%
group_by(collision) %>%
summarise(n = sum(n))
summarised_yearly_data <-
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
filter(`Accident Type` == "Derailment") %>%
count(Year)
summarised_yearly_data
write_csv(summarised_yearly_data, "train_derailments_by_year.csv")
summarised_yearly_data %>%
summarise(avg = mean(n))
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1)
peak_derailments <-
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1) %>%
pull(n)
summarised_yearly_data %>%
filter(Year == 1990)
(summarised_yearly_data %>%
filter(Year == 1990) %>%
pull(n) - peak_derailments) / peak_derailments
## [1] -0.7244396
summarised_yearly_data %>%
filter(Year > 2006)
accidents_by_type_by_state <-
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
group_by(`State Abbreviation`) %>%
count(`Accident Type`)
accidents_by_type_by_state %>%
filter(`Accident Type` == "Derailment") %>%
arrange(desc(n))
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
count(`Accident Type`) %>%
arrange(desc(n))
data %>%
mutate(
cause_type = case_when(
startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
TRUE ~ "NA"
)
) %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
filter(cause_type != "NA") %>%
count(cause_type)
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
group_by(Year) %>%
summarise(total_deaths = sum(`Total Persons Killed`))
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
summarise(total_deaths = sum(`Total Persons Killed`))
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
group_by(Year) %>%
summarise(total_deaths = sum(`Total Persons Injured`))
data %>%
filter(Year == 2002) %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
select(`Accident Number`, Date, `Accident Type`, Division, `State Abbreviation`, `Total Persons Killed`, `Total Persons Injured`) %>%
arrange(desc(`Total Persons Injured`))
library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>%
distinct(`Accident Number`) %>%
as_vector() %>%
length()
## [1] 610
summarised_by_type <-
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023, month < 7) %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
count(`Accident Type`) %>%
arrange(desc(n))
summarised_by_type
summarised_by_type %>%
mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>%
group_by(collision) %>%
summarise(n = sum(n))
summarised_yearly_data <-
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
filter(`Accident Type` == "Derailment") %>%
count(Year)
write_csv(summarised_yearly_data, "train_derailments_by_year.csv")
summarised_yearly_data %>%
summarise(avg = mean(n))
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1)
peak_derailments <-
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1) %>%
pull(n)
summarised_yearly_data %>%
filter(Year == 1990)
(summarised_yearly_data %>%
filter(Year == 1990) %>%
pull(n) - peak_derailments) / peak_derailments
## [1] -0.7251729
summarised_yearly_data %>%
filter(Year > 2006)
accidents_by_type_by_state <-
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
group_by(`State Abbreviation`) %>%
count(`Accident Type`)
accidents_by_type_by_state %>%
filter(`Accident Type` == "Derailment") %>%
arrange(desc(n))
data %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
count(`Accident Type`) %>%
arrange(desc(n))
data %>%
mutate(
cause_type = case_when(
startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
TRUE ~ "NA"
)
) %>%
distinct(`Accident Number`, .keep_all = TRUE) %>%
filter(cause_type != "NA") %>%
count(cause_type)
library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>%
nrow()
## [1] 647
summarised_by_type <-
data %>%
mutate(month = as.double(`Accident Month`)) %>%
filter(Year == 2023, month < 7) %>%
count(`Accident Type`) %>%
arrange(desc(n))
summarised_by_type
summarised_by_type %>%
mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>%
group_by(collision) %>%
summarise(n = sum(n))
summarised_yearly_data <-
data %>%
filter(`Accident Type` == "Derailment") %>%
count(Year)
write_csv(summarised_yearly_data, "train_derailments_by_year.csv")
summarised_yearly_data %>%
summarise(avg = mean(n))
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1)
peak_derailments <-
summarised_yearly_data %>%
arrange(desc(n)) %>%
head(1) %>%
pull(n)
summarised_yearly_data %>%
filter(Year == 1990)
(summarised_yearly_data %>%
filter(Year == 1990) %>%
pull(n) - peak_derailments) / peak_derailments
## [1] -0.7533609
summarised_yearly_data %>%
filter(Year > 2006)
accidents_by_type_by_state <-
data %>%
group_by(`State Abbreviation`) %>%
count(`Accident Type`)
accidents_by_type_by_state %>%
filter(`Accident Type` == "Derailment") %>%
arrange(desc(n))
data %>%
count(`Accident Type`) %>%
arrange(desc(n))
data %>%
mutate(
cause_type = case_when(
startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
TRUE ~ "NA"
)
) %>%
filter(cause_type != "NA") %>%
count(cause_type)