library(tidyverse)
library(vroom)

Attempt 1: Read Data Direct from Source

# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- vroom("Rail_Equipment_Accident_Incident_Data__Form_54_.csv")

Examine what’s going on with duplicates

data %>% filter(Year == 2023) %>% count(`Accident Number`) %>%  arrange(desc(n))
duplicates <- data %>% filter(Year == 2023) %>% count(`Accident Number`) %>%  arrange(desc(n)) %>% pull(`Accident Number`)

OK, so found some examples with duplicate records. One is Accident Number 151894.

data %>% filter(`Accident Number` == "151894")

OK for this accident number, it seems like almost all data is the same despite it being 4 rows for the same Accident Number, same day, same time, same type.

# Figure out which columns are different within this duplicate
data %>% 
  filter(`Accident Number` == "151894", Year == 2023) %>% 
  summarise_all(~ length(unique(.)) > 1) %>% 
  select_if(all)

The only differences are in “Remote Control Locomotive,” “First Car Initials” and other “Car” level information.

Generally, it doesn’t seem like this 4 rows should count as separate derailments. A distinct() on Accident Number should handle this

How many train derailments in 2023?

Num Train Derailments

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment") %>% 
  distinct(`Accident Number`) %>% 
  as_vector() %>% 
  length()
## [1] 742

Num collisions, fires, and highway-rail crossing incidents

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

summarised_by_type
summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

summarised_yearly_data
write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)
peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)
(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments
## [1] -0.7244396

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)

Deaths

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(Year) %>% 
  summarise(total_deaths = sum(`Total Persons Killed`))
data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  summarise(total_deaths = sum(`Total Persons Killed`)) 
data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(Year) %>% 
  summarise(total_deaths = sum(`Total Persons Injured`))

What happened in 2002?

data %>% 
  filter(Year == 2002) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  select(`Accident Number`, Date, `Accident Type`, Division, `State Abbreviation`, `Total Persons Killed`, `Total Persons Injured`) %>% 
  arrange(desc(`Total Persons Injured`))

Attempt 2: Read Data from One of Analysis Spreadsheets sent via Slack

library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>% 
  distinct(`Accident Number`) %>% 
  as_vector() %>% 
  length()
## [1] 610

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, month < 7) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))
 
summarised_by_type
summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)
peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)
(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments
## [1] -0.7251729

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)

Attempt 3: Attempt 2 but ignore duplicates

library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>% 
  nrow()
## [1] 647

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, month < 7) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))
 
summarised_by_type 
summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)
peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)
(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments
## [1] -0.7533609

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)