Project Final

Introduction

This project asks: which factors most consistently predict subway delays, and has overall performance improved since 2022?. This project analyzes New York City subway delay patterns from January 2022 through March 2026, exploring which subway lines experience the most disruptions, how delays have trended over time, and which reporting categories such as infrastructure issues, operating conditions, or external factors are the most common causes of delays. Data is sourced from the MTA Open Data portal (CSV). The workflow follows the OSEMN framework: Obtain, Scrub, Explore, Model, and Interpret. The reason I choice this project is because the mta has gotten more expensive and I wanted to see if there’s been any improvement.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(ggplot2)
library(knitr)
library(corrplot)

corrplot 0.95 loaded

You can add options to executable code like this

Loading the MTA and Weather Data

mta_raw <- read.csv("https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Final%20Project/mta_delays.csv", 
                    stringsAsFactors = FALSE)

glimpse(mta_raw)

Rows: 17,895
Columns: 6
$ month              <chr> "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-…
$ division           <chr> "A DIVISION", "A DIVISION", "A DIVISION", "A DIVISI…
$ line               <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "…
$ day_type           <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, …
$ reporting_category <chr> "Crew Availability", "Infrastructure & Equipment", …
$ delays             <chr> "17", "399", "332", "314", "345", "111", "10", "18"…

Cleaning data and converting it to tidy format

#renaming header and convert delay to int
mta_clean <- mta_raw %>%
  rename_with(tolower) %>%
  rename_with(~ gsub(" ", "_", .)) %>%
  mutate(
    date    = as.Date(month, format = "%Y-%m-%d"),
    delays  = as.numeric(gsub(",", "", delays)),
    year    = as.integer(format(date, "%Y")),
    year_month = format(date, "%Y-%m")
  ) %>%
  select(-month) %>%
  filter(date >= as.Date("2022-01-01") & date <= as.Date("2026-03-31")) %>%
  drop_na()

glimpse(mta_clean)

Rows: 12,262
Columns: 8
$ division           <chr> "A DIVISION", "A DIVISION", "A DIVISION", "A DIVISI…
$ line               <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "…
$ day_type           <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, …
$ reporting_category <chr> "External Factors", "Police & Medical", "Infrastruc…
$ delays             <dbl> 14, 401, 273, 360, 75, 210, 147, 11, 6, 47, 151, 26…
$ date               <date> 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01, 20…
$ year               <int> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 202…
$ year_month         <chr> "2022-01", "2022-01", "2022-01", "2022-01", "2022-0…

Preparing for data analysis

# Yearly totals for trend analysis
delays_by_year <- mta_clean %>%
  group_by(year) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE))

# Delays by reporting category
delays_by_category <- mta_clean %>%
  group_by(reporting_category) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE)) %>%
  arrange(desc(total_delays))

# Delays by line
delays_by_line <- mta_clean %>%
  group_by(line) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE)) %>%
  arrange(desc(total_delays))

# Delays by category and year (for heatmap)
delays_category_year <- mta_clean %>%
  group_by(year, reporting_category) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE), .groups = "drop")

cat("Date range in cleaned data:\n")

Date range in cleaned data:

range(mta_clean$date)

[1] "2022-01-01" "2026-03-01"

Data exploration

# Delays by line for March 2026
kable(delays_by_year, 
      caption = "Table 1: Total Subway Delays by Year – 2022 to 2026")

Table 1: Total Subway Delays by Year – 2022 to 2026
year	total_delays
2022	498487
2023	449652
2024	488407
2025	430338
2026	112317

kable(delays_by_category, 
      caption = "Table 2: Total Subway Delays by Reporting Category – 2022 to 2026")

Table 2: Total Subway Delays by Reporting Category – 2022 to 2026
reporting_category	total_delays
Infrastructure & Equipment	586224
Police & Medical	433112
Planned ROW Work	413873
Operating Conditions	271913
Crew Availability	222752
External Factors	51327

kable(delays_by_line, 
      caption = "Table 3: Total Subway Delays by Line – 2022 to 2026")

Table 3: Total Subway Delays by Line – 2022 to 2026
line	total_delays
N	156761
F	149142
6	146033
A	140230
2	131371
E	130229
1	107459
Q	104843
4	101742
D	101106
R	85248
C	84580
7	79166
5	77749
L	68475
3	68466
B	65085
G	56299
JZ	55102
M	54737
S Rock	9925
S 42nd	2863
S Fkln	2590

ggplot(delays_by_year, aes(x = year, y = total_delays)) +
  geom_col(fill = "#2980b9", alpha = 0.85) +
  geom_text(aes(label = scales::comma(total_delays)), 
            vjust = -0.5, size = 3.5) +
  labs(
    title    = "Total NYC Subway Delays by Year – 2022 to 2026",
    subtitle = "Source: MTA Open Data Portal",
    x        = "Year",
    y        = "Total Delays"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

ggplot(delays_by_category, 
       aes(x = reorder(reporting_category, -total_delays), 
           y = total_delays, fill = total_delays)) +
  geom_col() +
  scale_fill_gradient(low = "#f7c59f", high = "#c0392b") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = "Total Delays by Reporting Category – 2022 to 2026",
    subtitle = "Source: MTA Open Data Portal",
    x        = "Reporting Category",
    y        = "Total Delays",
    fill     = "Delays"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(delays_by_line, aes(x = reorder(line, -total_delays),
                            y = total_delays, fill = total_delays)) +
  geom_col() +
  scale_fill_gradient(low = "#f7c59f", high = "#c0392b") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = "Total Delays by Subway Line – 2022 to 2026",
    subtitle = "Source: MTA Open Data Portal",
    x        = "Subway Line",
    y        = "Total Delays",
    fill     = "Delays"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Summary table: delays by line and year
delays_line_year <- mta_clean %>%
  group_by(year, line) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE), .groups = "drop") %>%
  arrange(year, desc(total_delays))

kable(delays_line_year, 
      caption = "Table 4: Subway Delays by Line and Year – 2022 to 2026")

Table 4: Subway Delays by Line and Year – 2022 to 2026
year	line	total_delays
2022	A	41776
2022	6	35775
2022	N	35679
2022	F	35356
2022	E	35300
2022	2	32003
2022	Q	30052
2022	4	26112
2022	D	25593
2022	1	24951
2022	C	22584
2022	R	21398
2022	7	19631
2022	5	18724
2022	L	16638
2022	3	16445
2022	M	15378
2022	JZ	15218
2022	B	14857
2022	G	10535
2022	S Rock	2587
2022	S 42nd	1026
2022	S Fkln	869
2023	N	38820
2023	F	36772
2023	A	35842
2023	6	34334
2023	E	31712
2023	2	28220
2023	D	24103
2023	1	24039
2023	Q	24012
2023	4	22027
2023	R	19357
2023	7	19217
2023	C	19089
2023	5	17636
2023	3	13097
2023	L	12853
2023	G	11962
2023	B	11866
2023	JZ	11283
2023	M	10246
2023	S Rock	2018
2023	S Fkln	657
2023	S 42nd	490
2024	6	39069
2024	F	37990
2024	N	37377
2024	2	33362
2024	A	32611
2024	E	28796
2024	1	28019
2024	D	26766
2024	4	24460
2024	Q	23145
2024	C	22812
2024	R	21794
2024	5	18916
2024	B	18341
2024	7	17331
2024	3	16587
2024	JZ	16287
2024	L	14813
2024	G	13430
2024	M	12855
2024	S Rock	2231
2024	S 42nd	916
2024	S Fkln	499
2025	N	34652
2025	F	32078
2025	2	31008
2025	E	27557
2025	6	27256
2025	1	25223
2025	A	23475
2025	4	22287
2025	Q	20964
2025	D	19567
2025	7	18996
2025	5	18409
2025	R	18339
2025	3	18295
2025	L	17777
2025	G	16772
2025	C	16699
2025	B	15696
2025	M	12505
2025	JZ	9313
2025	S Rock	2620
2025	S Fkln	459
2025	S 42nd	391
2026	N	10233
2026	6	9599
2026	F	6946
2026	E	6864
2026	4	6856
2026	2	6778
2026	Q	6670
2026	A	6526
2026	L	6394
2026	1	5227
2026	D	5077
2026	R	4360
2026	B	4325
2026	5	4064
2026	3	4042
2026	7	3991
2026	M	3753
2026	G	3600
2026	C	3396
2026	JZ	3001
2026	S Rock	469
2026	S Fkln	106
2026	S 42nd	40

ggplot(delays_line_year, aes(x = reorder(line, -total_delays), 
                              y = total_delays, 
                              fill = factor(year))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title    = "NYC Subway Delays by Line and Year – 2022 to 2026",
    subtitle = "Source: MTA Open Data Portal",
    x        = "Subway Line",
    y        = "Total Delays",
    fill     = "Year"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

Statistical Analysis

Kruskal-Wallis Test Across Lines. This is to see if delay are significantly different across reporting categories

kruskal_category <- kruskal.test(delays ~ reporting_category, data = mta_clean)
print(kruskal_category)


    Kruskal-Wallis rank sum test

data:  delays by reporting_category
Kruskal-Wallis chi-squared = 2254.3, df = 5, p-value < 2.2e-16

cat("\nIf p < 0.05, delay counts differ significantly across reporting categories.\n")


If p < 0.05, delay counts differ significantly across reporting categories.

This tests whether total delays have increased or decreased year over year from 2022 to 2026.

model <- lm(total_delays ~ year, data = delays_by_year)
summary(model)


Call:
lm(formula = total_delays ~ year, data = delays_by_year)

Residuals:
      1       2       3       4       5 
 -55684  -25354   92567  113663 -125192 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 160626610   74737056   2.149    0.121
year           -79165      36925  -2.144    0.121

Residual standard error: 116800 on 3 degrees of freedom
Multiple R-squared:  0.6051,    Adjusted R-squared:  0.4734 
F-statistic: 4.596 on 1 and 3 DF,  p-value: 0.1214

ggplot(delays_category_year, 
       aes(x = year, y = total_delays, fill = reporting_category)) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = "NYC Subway Delays by Reporting Category and Year",
    subtitle = "2022 to 2026 | Source: MTA Open Data Portal",
    x        = "Year",
    y        = "Total Delays",
    fill     = "Reporting Category"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 7))

delays_category_year_detail <- mta_clean %>%
  group_by(year, reporting_category) %>%
  summarise(total_delays = sum(delays, na.rm = TRUE), .groups = "drop") %>%
  arrange(year, desc(total_delays))

ggplot(delays_category_year_detail, 
       aes(x = reorder(reporting_category, -total_delays), 
           y = total_delays, 
           fill = factor(year))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title    = "NYC Subway Delays by Reporting Category and Year – 2022 to 2026",
    subtitle = "Source: MTA Open Data Portal",
    x        = "Reporting Category",
    y        = "Total Delays",
    fill     = "Year"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

ggplot(delays_by_year, aes(x = year, y = total_delays)) +
  geom_col(fill = "#8e44ad", alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "#2c3e50") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = "NYC Subway Delay Trend – 2022 to 2026",
    subtitle = "Linear trend with 95% confidence interval",
    x        = "Year",
    y        = "Total Delays"
  ) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

numeric_vars <- mta_clean %>%
  mutate(
    line_num     = as.numeric(factor(line)),
    category_num = as.numeric(factor(reporting_category))
  ) %>%
  select(delays, line_num, category_num, year) %>%
  drop_na()

cor_matrix <- cor(numeric_vars)

corrplot(cor_matrix,
         method      = "color",
         type        = "upper",
         addCoef.col = "black",
         tl.col      = "black",
         title       = "Correlation Matrix: Delays, Line, Category, Year",
         mar         = c(0, 0, 1, 0))

Conclusion

Total subway delays have shown a general decline over the study period, dropping from a peak of 498,487 delays in 2022 to 449,652 in 2023, rising slightly to 488,407 in 2024, then falling again to 430,338 in 2025. The linear regression supports a modest downward trend overall, though the p-value of 0.121 indicates this trend is not statistically significant, meaning we cannot confidently conclude delays are improving system-wide.

Infrastructure & Equipment was by far the largest driver of delays across all years, accounting for 586,224 total delays — nearly 135,000 more than the next highest category, Police & Medical at 433,112. Planned ROW Work (413,873), Operating Conditions (271,913), and Crew Availability (222,752) followed, while External Factors were the least common at 51,327. The Kruskal-Wallis test confirmed these differences are statistically significant (chi-squared = 2254.3, p < 2.2e-16), meaning delay counts differ meaningfully across reporting categories and are not due to chance. The grouped bar chart shows Infrastructure & Equipment consistently dominated delays in every year from 2022 to 2026.

Correlation Matrix: The correlation heatmap showed that reporting category had a modest positive correlation with delays (0.19), while subway line had a slight negative correlation (-0.14). Year and delays were nearly uncorrelated (-0.03), consistent with the regression finding that the year over year trend is weak. Overall the correlations are low, suggesting that no single variable fully explains delay volume and that delays are driven by a complex combination of factors.