library(tidyverse)
library(vroom)

Attempt 1: Read Data Direct from Source

# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- vroom("Rail_Equipment_Accident_Incident_Data__Form_54_.csv")

Examine what’s going on with duplicates

data %>% filter(Year == 2023) %>% count(`Accident Number`) %>%  arrange(desc(n))

duplicates <- data %>% filter(Year == 2023) %>% count(`Accident Number`) %>%  arrange(desc(n)) %>% pull(`Accident Number`)

OK, so found some examples with duplicate records. One is Accident Number 151894.

data %>% filter(`Accident Number` == "151894")

OK for this accident number, it seems like almost all data is the same despite it being 4 rows for the same Accident Number, same day, same time, same type.

# Figure out which columns are different within this duplicate
data %>% 
  filter(`Accident Number` == "151894", Year == 2023) %>% 
  summarise_all(~ length(unique(.)) > 1) %>% 
  select_if(all)

The only differences are in “Remote Control Locomotive,” “First Car Initials” and other “Car” level information.

Generally, it doesn’t seem like this 4 rows should count as separate derailments. A distinct() on Accident Number should handle this

How many train derailments in 2023?

Num Train Derailments

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment") %>% 
  distinct(`Accident Number`) %>% 
  as_vector() %>% 
  length()

## [1] 742

Num collisions, fires, and highway-rail crossing incidents

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

summarised_by_type

summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

summarised_yearly_data

write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)

peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)

(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments

## [1] -0.7244396

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)

Deaths

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(Year) %>% 
  summarise(total_deaths = sum(`Total Persons Killed`))

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  summarise(total_deaths = sum(`Total Persons Killed`))

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(Year) %>% 
  summarise(total_deaths = sum(`Total Persons Injured`))

What happened in 2002?

data %>% 
  filter(Year == 2002) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  select(`Accident Number`, Date, `Accident Type`, Division, `State Abbreviation`, `Total Persons Killed`, `Total Persons Injured`) %>% 
  arrange(desc(`Total Persons Injured`))

Attempt 2: Read Data from One of Analysis Spreadsheets sent via Slack

library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>% 
  distinct(`Accident Number`) %>% 
  as_vector() %>% 
  length()

## [1] 610

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, month < 7) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))
 
summarised_by_type

summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)

peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)

(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments

## [1] -0.7251729

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  distinct(`Accident Number`, .keep_all = TRUE) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)

Attempt 3: Attempt 2 but ignore duplicates

library(readxl)
# Download from here: https://data.transportation.gov/Railroads/Rail-Equipment-Accident-Incident-Data-Form-54-/85tf-25kj
data <- read_xlsx("Rail_Equipment_Accident_Incident_Data_common-causes.xlsx")

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, `Accident Type` == "Derailment", month < 7) %>% 
  nrow()

## [1] 647

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

summarised_by_type <- 
  data %>% 
  mutate(month = as.double(`Accident Month`)) %>% 
  filter(Year == 2023, month < 7) %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))
 
summarised_by_type

summarised_by_type %>% 
  mutate(collision = if_else(str_detect(`Accident Type`, "collision"), "Collision", "No")) %>% 
  group_by(collision) %>% 
  summarise(n = sum(n))

How many trains derail each year?

Time Series and Avg Per Year

summarised_yearly_data <-
  data %>% 
  filter(`Accident Type` == "Derailment") %>% 
  count(Year)

write_csv(summarised_yearly_data, "train_derailments_by_year.csv")

Avg Derailments Per Year

summarised_yearly_data %>% 
  summarise(avg = mean(n))

Peak Derailments

summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1)

peak_derailments <-
  summarised_yearly_data %>% 
  arrange(desc(n)) %>% 
  head(1) %>% 
  pull(n)

Derailments in 1990 and % Change

summarised_yearly_data %>% 
  filter(Year == 1990)

(summarised_yearly_data %>% 
  filter(Year == 1990) %>% 
  pull(n) - peak_derailments) / peak_derailments

## [1] -0.7533609

Derailments in 2022 and 2008

summarised_yearly_data %>% 
  filter(Year > 2006)

Geography

Most derailments by state

accidents_by_type_by_state <-
  data %>% 
  group_by(`State Abbreviation`) %>% 
  count(`Accident Type`)

accidents_by_type_by_state %>% 
  filter(`Accident Type` == "Derailment") %>% 
  arrange(desc(n))

What is the most common type of train accident?

data %>% 
  count(`Accident Type`) %>% 
  arrange(desc(n))

What are the most common causes of train accidents?

data %>% 
  mutate(
    cause_type = case_when(
      startsWith(`Accident Cause Code`, "E") ~ "Mechanical and Electrical Failures",
      startsWith(`Accident Cause Code`, "M") ~ "Miscellaneous Causes Not Otherwise Listed",
      startsWith(`Accident Cause Code`, "T") ~ "Rack, Roadbed and Structures",
      startsWith(`Accident Cause Code`, "S") ~ "Signal and Communication",
      startsWith(`Accident Cause Code`, "H") ~ "Train operation - Human Factors",
      TRUE ~ "NA"
    )
  ) %>% 
  filter(cause_type != "NA") %>% 
  count(cause_type)

Train Derailments

Olivia Martin

2023-10-04

Attempt 1: Read Data Direct from Source

Examine what’s going on with duplicates

How many train derailments in 2023?

Num Train Derailments

Num collisions, fires, and highway-rail crossing incidents

How many trains derail each year?

Time Series and Avg Per Year

Avg Derailments Per Year

Peak Derailments

Derailments in 1990 and % Change

Derailments in 2022 and 2008

Geography

Most derailments by state

What is the most common type of train accident?

What are the most common causes of train accidents?

Deaths

What happened in 2002?

Attempt 2: Read Data from One of Analysis Spreadsheets sent via Slack

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

How many trains derail each year?

Time Series and Avg Per Year

Avg Derailments Per Year

Peak Derailments

Derailments in 1990 and % Change

Derailments in 2022 and 2008

Geography

Most derailments by state

What is the most common type of train accident?

What are the most common causes of train accidents?

Attempt 3: Attempt 2 but ignore duplicates

How many train derailments in 2023?

Num Train Derailments as of June 30 2023

Num collisions, fires, and highway-rail crossing incidents as of June 30 2023

How many trains derail each year?

Time Series and Avg Per Year

Avg Derailments Per Year

Peak Derailments

Derailments in 1990 and % Change

Derailments in 2022 and 2008

Geography

Most derailments by state

What is the most common type of train accident?

What are the most common causes of train accidents?