Introduction: This data set includes NYC transportation usership
data broken out by year, beginning in March 2020, just before the
pandemic. I aimed to analyze a few different aspects of this data:
- Which transportation methods had the greatest change in ridership
from pre-pandemic to now?
- May a potential change in population over time effect our perception
of these numbers?
- How do we compare now to pre-pandemic? Has our recovery to
pre-pandemic rates plateaued? With these questions in mind, I began
working with and manipulating the data to extract these insights.
Load Packages
library(tidyr)
library(dplyr)
library(ggplot2)
library(stringr)
library(ggrepel)
Ridership: Read Data Set number 3
# Read MTA daily ridership data from CSV file
df3 <- read.csv('/Users/williamberritt/Downloads/MTA_Daily_Ridership_Data__Beginning_2020_20240228.csv')
Create separate pivoted data frames then join data back
together
# Select relevant columns containing 'Estimated' and 'Total'
rider_ship <- df3 %>%
select(Date, matches('Estimated|Total'))
# Reshape data from wide to long format using pivot_longer
rider_ship <- rider_ship |> pivot_longer(cols = (Subways..Total.Estimated.Ridership:Staten.Island.Railway..Total.Estimated.Ridership),
names_to = 'transport_method',
values_to ='value')
# Display the first few rows of the reshaped data frame
head(rider_ship)
## # A tibble: 6 × 3
## Date transport_method value
## <chr> <chr> <int>
## 1 03/01/2020 Subways..Total.Estimated.Ridership 2212965
## 2 03/01/2020 Buses..Total.Estimated.Ridership 984908
## 3 03/01/2020 LIRR..Total.Estimated.Ridership NA
## 4 03/01/2020 Metro.North..Total.Estimated.Ridership 55826
## 5 03/01/2020 Access.A.Ride..Total.Scheduled.Trips 19922
## 6 03/01/2020 Bridges.and.Tunnels..Total.Traffic 786961
# Assign transport methods based on pattern matching
rider_ship$transport_method <- ifelse(str_detect(rider_ship$transport_method, 'Subways'), 'subway',
ifelse(str_detect(rider_ship$transport_method, 'Buses'), 'buses',
ifelse(str_detect(rider_ship$transport_method, 'LIRR'), 'lirr',
ifelse(str_detect(rider_ship$transport_method, 'Metro'), 'mnr',
ifelse(str_detect(rider_ship$transport_method, 'Island'), 'staten',
ifelse(str_detect(rider_ship$transport_method, 'Access'), 'access_a_ride',
ifelse(str_detect(rider_ship$transport_method, 'Bridges'), 'bridges_tunnels', NA)))))))
# Select relevant columns for pre-pandemic data
pre_pandemic <- df3 %>%
select(Date, contains("Pre"))
# Reshape pre-pandemic data from wide to long format using pivot_longer
pre_pandemic <- pre_pandemic |> pivot_longer(cols = (Subways....of.Comparable.Pre.Pandemic.Day:Staten.Island.Railway....of.Comparable.Pre.Pandemic.Day),
names_to = 'transport_method',
values_to ='value')
# Assign transport methods based on pattern matching
pre_pandemic$transport_method <- ifelse(str_detect(pre_pandemic$transport_method, 'Subways'), 'subway',
ifelse(str_detect(pre_pandemic$transport_method, 'Buses'), 'buses',
ifelse(str_detect(pre_pandemic$transport_method, 'LIRR'), 'lirr',
ifelse(str_detect(pre_pandemic$transport_method, 'Metro'), 'mnr',
ifelse(str_detect(pre_pandemic$transport_method, 'Island'), 'staten',
ifelse(str_detect(pre_pandemic$transport_method, 'Access'), 'access_a_ride',
ifelse(str_detect(pre_pandemic$transport_method, 'Bridges'), 'bridges_tunnels', NA)))))))
# Merge the two data frames on transport method and date
base_df <- left_join(rider_ship, pre_pandemic, by=c('transport_method', 'Date'))
base_df <- na.omit(base_df)
# Display the first few rows of the merged data frame
head(base_df)
## # A tibble: 6 × 4
## Date transport_method value.x value.y
## <chr> <chr> <int> <dbl>
## 1 03/01/2020 subway 2212965 0.97
## 2 03/01/2020 buses 984908 0.99
## 3 03/01/2020 mnr 55826 0.59
## 4 03/01/2020 access_a_ride 19922 1.13
## 5 03/01/2020 bridges_tunnels 786961 0.98
## 6 03/01/2020 staten 1636 0.52
Create new date variables and rename columns
# Rename columns
colnames(base_df) <- c('date', 'transport_method', 'ridership', 'percent_change')
# Convert 'date' to Date format
base_df$date <- as.Date(base_df$date, format = "%m/%d/%Y")
# Extract month, year, and day from 'date'
base_df$month <- format(base_df$date, "%m")
base_df$year <- format(base_df$date, "%Y")
base_df$day <- weekdays(base_df$date)
What do trends in ridership look like for each method?
# Plot ridership by data for each method
subway <- subset(base_df, transport_method=='subway')
plot(subway$date, subway$ridership)

buses <- subset(base_df, transport_method=='buses')
plot(buses$date, buses$ridership)

lirr <- subset(base_df, transport_method=='lirr')
plot(lirr$date, lirr$ridership)

mnr <- subset(base_df, transport_method=='mnr')
plot(mnr$date, mnr$ridership)

staten <- subset(base_df, transport_method=='staten')
plot(staten$date, staten$ridership)

access_a_ride <- subset(base_df, transport_method=='access_a_ride')
plot(access_a_ride$date, access_a_ride$ridership)

bridges_tunnels <- subset(base_df, transport_method=='bridges_tunnels')
plot(bridges_tunnels$date, bridges_tunnels$ridership)

Create data frame with for total ridership grouped by month
# Group data by year and month, calculating total ridership
total_ridership_by_month <- base_df |> group_by(year, month) |>
summarize(total_ridership = sum(ridership))
Compare first Wednesday to average of last 3 Wednesdays (only using
first Wednesday because rates started to decline quickly in March. The
first Wednesday seems like the best proxy of pre-pandemic behavior)
# Select data for the first Wednesday of March 2020
first_wednesday <- subset(base_df, day == 'Wednesday' & date == '2020-03-04')
# Select data for all Wednesdays
last_wednesday <- subset(base_df, day == 'Wednesday')
last_wednesday <- tail(last_wednesday, 21)
# Calculate average ridership for the last 3 Wednesdays by transport method
avg_last_3_wed <- last_wednesday |> group_by(transport_method) |>
summarize(avg_ridership_last_3 = mean(ridership))
# Merge data for the first Wednesday and average ridership for the last 3 Wednesdays
combined_wednesday <- left_join(first_wednesday, avg_last_3_wed, by = 'transport_method')
# Calculate total difference and percentage difference in ridership
combined_wednesday$total_difference <- combined_wednesday$ridership - combined_wednesday$avg_ridership_last_3
combined_wednesday$percent_diff_wed <- (combined_wednesday$avg_ridership_last_3 - combined_wednesday$ridership) / combined_wednesday$ridership
# Subset data for relevant columns
change_in_ridership <- subset(combined_wednesday, select = c('transport_method', 'total_difference', 'percent_diff_wed'))
# Display the results, ordered by total difference
change_in_ridership[order(-change_in_ridership$total_difference),]
## # A tibble: 7 × 3
## transport_method total_difference percent_diff_wed
## <chr> <dbl> <dbl>
## 1 subway 1679637 -0.305
## 2 buses 895478 -0.411
## 3 lirr 81666. -0.262
## 4 staten 9665. -0.564
## 5 bridges_tunnels 6284 -0.00694
## 6 access_a_ride 569. -0.0166
## 7 mnr -9188. 0.0477
# Display the results, ordered by percentage difference
change_in_ridership[order(change_in_ridership$percent_diff_wed),]
## # A tibble: 7 × 3
## transport_method total_difference percent_diff_wed
## <chr> <dbl> <dbl>
## 1 staten 9665. -0.564
## 2 buses 895478 -0.411
## 3 subway 1679637 -0.305
## 4 lirr 81666. -0.262
## 5 access_a_ride 569. -0.0166
## 6 bridges_tunnels 6284 -0.00694
## 7 mnr -9188. 0.0477
Adjust by considering recent NYC population decline
# Create a data frame for NYC population
nyc_pop <- data.frame(
year = c(2020, 2021, 2022, 2023, 2024),
population = c(8700000, 8500000, 8300000, 8200000, 8100000))
# Convert 'year' to numeric type
nyc_pop$year <- as.numeric(nyc_pop$year)
# Convert 'year' to numeric type in the base data frame
base_df$year <- as.numeric(base_df$year)
# Merge base data frame with NYC population data
ridership_population <- left_join(base_df, nyc_pop, by='year')
# Group data by year, calculating yearly ridership and daily population sum
ridership_population <- ridership_population |> group_by(year) |>
summarize(yearly_ridership = sum(ridership), daily_pop_sum = sum(population))
# Calculate the rate of use per 1000 people
ridership_population$rate_of_use <- (ridership_population$yearly_ridership / ridership_population$daily_pop_sum) * 1000
# Group data by year, calculating the mean ridership rate per 1000 people
ridership_by_year_by_transport <- ridership_population |> group_by(year) |>
summarize(ridership_rate_per_1000 = mean(rate_of_use))
# Convert 'year' to numeric type
ridership_by_year_by_transport$year <- as.numeric(ridership_by_year_by_transport$year)
# Plot the ridership rate per 1000 people over the years
plot(ridership_by_year_by_transport$year, ridership_by_year_by_transport$ridership_rate_per_1000)

Get ridership data grouped by year adjusted for population to see
how different it is now vs pre pandemic
# Select data for the first week of March 2020
first_week_march_rate <- subset(base_df, date %in% c('2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07'))
# Merge with NYC population data
first_week_march_rate <- left_join(first_week_march_rate, nyc_pop, by='year')
# Convert 'date' to numeric type
first_week_march_rate$date <- as.numeric(first_week_march_rate$year)
# Group data by year, calculating yearly ridership and daily population sum for the first week of March
march_grouping <- first_week_march_rate |> group_by(year) |>
summarize(ridership = sum(ridership), daily_pop_sum = sum(population))
# Calculate the ridership rate per 1000 people for the first week of March
march_grouping$ridership_rate_per_1000 <- (march_grouping$ridership / march_grouping$daily_pop_sum) * 1000
# Subset relevant columns
march_grouping <- subset(march_grouping, select=c('year', 'ridership_rate_per_1000'))
# Combine data for the first week of March with the yearly ridership rate per 1000 people
march_and_covid_ridership_pop_adjusted <- union_all(march_grouping, ridership_by_year_by_transport)
# Adjust the ridership rate for the entire year by multiplying by 7 (number of days in a week)
march_and_covid_ridership_pop_adjusted$ridership_rate_per_1000 <- march_and_covid_ridership_pop_adjusted$ridership_rate_per_1000 * 7
# Display the adjusted data
march_and_covid_ridership_pop_adjusted
## # A tibble: 6 × 2
## year ridership_rate_per_1000
## <dbl> <dbl>
## 1 2020 911.
## 2 2020 285.
## 3 2021 495.
## 4 2022 618.
## 5 2023 684.
## 6 2024 648.