## New names:
## • `Score` -> `Score...2`
## • `Decile` -> `Decile...3`
## • `Score` -> `Score...4`
## • `Decile` -> `Decile...5`
## • `Score` -> `Score...6`
## • `Decile` -> `Decile...7`
## • `Score` -> `Score...8`
## • `Decile` -> `Decile...9`
bus_trip_counts <- read.csv("/Users/ryan/Desktop/Winterdatachallenge/WDAC_2025_data/busTripcounts.csv", stringsAsFactors = FALSE)
transport_routes <- read.csv("/Users/ryan/Desktop/Winterdatachallenge/WDAC_2025_data/routes_1.csv")
location_covid_cases <- read.csv("/Users/ryan/Desktop/Winterdatachallenge/WDAC_2025_data/confirmed_cases_table1_location_agg.csv")
AusPostCodes <- read.csv("/Users/ryan/Desktop/Winterdatachallenge/WDAC_2025_data/australian_postcodes.csv")
opal_patronage_path <- "/Users/ryan/Desktop/Winterdatachallenge/WDAC_2025_data/OpalPatronage"
nsw_lga_shapes <- nswgeo::lga_nsw
lga_to_ti_region_lookup <- tribble(
~lga_name, ~ti_region,
# --- Key Commercial Hubs ---
"Sydney", "Sydney CBD",
"North Sydney", "North Sydney",
"Parramatta", "Parramatta",
"Strathfield", "Strathfield",
"Burwood", "Strathfield",
"Ryde", "Macquarie Park",
"Willoughby", "Chatswood",
"Newcastle", "Newcastle and surrounds",
"Lake Macquarie", "Newcastle and surrounds",
"Cessnock", "Newcastle and surrounds",
"Maitland", "Newcastle and surrounds",
"Wollongong", "Wollongong and surrounds",
"Shellharbour", "Wollongong and surrounds"
)
bus_trip_counts <- bus_trip_counts %>%
mutate(
Time.period = str_trim(Time.period),
Time.period = dmy(paste0("01-", Time.period)),
Trip = str_trim(Trip),
Trip = na_if(Trip, ""),
Trip = na_if(Trip, "N/A"),
Trip = na_if(Trip, "unknown"),
Trip = as.numeric(Trip)
) %>%
filter(Time.period > as.Date("2019-12-01")) %>%
filter(!is.na(Trip))## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Trip = as.numeric(Trip)`.
## Caused by warning:
## ! NAs introduced by coercion
AusPostCodes <- AusPostCodes %>%
mutate(postcode = as.character(postcode))
full_opal_patronage <-list.files(path = opal_patronage_path, pattern = "\\.txt$", full.names = TRUE, recursive = TRUE)
opal_patronage_data <- map_dfr(full_opal_patronage, ~ read_delim(., delim = "|", col_types = cols(.default = "c")))
location_covid_cases <- left_join(location_covid_cases, AusPostCodes, by = "postcode", relationship = "many-to-many") %>%
mutate(notification_date = as.Date(notification_date))%>%
group_by(postcode, lat, long, locality) %>%
summarise(total_confirmed_cases = sum(confirmed_cases_count, na.rm = TRUE)) %>%
ungroup()## `summarise()` has grouped output by 'postcode', 'lat', 'long'. You can override
## using the `.groups` argument.
SIFEAL <- anti_join(SIFEA, AusPostCodes, by = "postcode")
summary_bus_trip_counts <- bus_trip_counts %>%
group_by(Time.period, Card.Type) %>%
summarise(TotalTrips = sum(Trip), .groups = "drop")
routes_clean <- transport_routes %>%
select(operator_name,operator_id, my_timetable_route_name,route_variant_type,transport_name,mot_for_interchange,
route_search_name,
service_direction_name,
start_date,
gtfs_route_id_in
)
postcode_to_lga_lookup <- AusPostCodes %>%
filter(state == "NSW") %>%
select(postcode, lga_name = lgaregion) %>%
mutate(lga_name = str_remove(lga_name, " \\(C\\)| \\(A\\)| \\(NSW\\)")) %>%
distinct(postcode, .keep_all = TRUE)
regional_covid_data <- location_covid_cases %>%
group_by(postcode) %>%
summarise(total_cases = sum(total_confirmed_cases, na.rm = TRUE), .groups = "drop") %>%
left_join(postcode_to_lga_lookup, by = "postcode") %>%
left_join(lga_to_ti_region_lookup, by = "lga_name") %>%
filter(!is.na(ti_region)) %>%
group_by(ti_region) %>%
summarise(total_cases = sum(total_cases, na.rm = TRUE), .groups = "drop")Key questions explored:
ggplot(summary_bus_trip_counts, aes(x = Time.period, y = TotalTrips, fill = Card.Type)) +
geom_bar(stat = "identity") +
labs(title = "Annual Bus Trips by Card Type (2020–2025)",
x = "Year", y = "Total Trips") +
theme_minimal() +
theme(legend.position = "right")pre_covid_start <- ymd("2020-01-01")
pre_covid_end <- ymd("2020-03-12")
post_covid_start <- ymd("2025-01-01")
post_covid_end <- ymd("2025-07-01")
hourly_summary <- opal_patronage_data %>%
mutate(
trip_origin_date = ymd(trip_origin_date),
tap_hour = as.numeric(tap_hour),
Tap_Ons = as.numeric(Tap_Ons)
) %>%
mutate(Period = case_when(
trip_origin_date >= pre_covid_start & trip_origin_date <= pre_covid_end ~ "Pre-COVID (Jan-Mar 2020)",
trip_origin_date >= post_covid_start & trip_origin_date <= post_covid_end ~ "Post-COVID (Jan-Jul 2025)",
TRUE ~ NA_character_
)) %>%
filter(!is.na(Period)) %>%
mutate(weekday = wday(trip_origin_date, label = TRUE, week_start = 1)) %>%
filter(!weekday %in% c("Sat", "Sun")) %>%
group_by(Period, ti_region, tap_hour) %>%
summarise(
average_tap_ons = mean(Tap_Ons, na.rm = TRUE),
.groups = 'drop'
)## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Tap_Ons = as.numeric(Tap_Ons)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 6 × 4
## Period ti_region tap_hour average_tap_ons
## <chr> <chr> <dbl> <dbl>
## 1 Post-COVID (Jan-Jul 2025) All - NSW 0 1257.
## 2 Post-COVID (Jan-Jul 2025) All - NSW 1 832.
## 3 Post-COVID (Jan-Jul 2025) All - NSW 2 569.
## 4 Post-COVID (Jan-Jul 2025) All - NSW 3 712.
## 5 Post-COVID (Jan-Jul 2025) All - NSW 4 4461.
## 6 Post-COVID (Jan-Jul 2025) All - NSW 5 10120.
base_data <- opal_patronage_data %>%
mutate(trip_origin_date = ymd(trip_origin_date),tap_hour = as.numeric(tap_hour),Tap_Ons = as.numeric(Tap_Ons)) %>%
mutate(Period = case_when(
trip_origin_date >= pre_covid_start & trip_origin_date <= pre_covid_end ~ "Pre-COVID",
trip_origin_date >= post_covid_start & trip_origin_date <= post_covid_end ~ "Post-COVID",
TRUE ~ NA_character_)) %>%
filter(!is.na(Period)) %>%
mutate(weekday = wday(trip_origin_date, label = TRUE, week_start = 1)) %>%
filter(!weekday %in% c("Sat", "Sun"))## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Tap_Ons = as.numeric(Tap_Ons)`.
## Caused by warning:
## ! NAs introduced by coercion
peak_summary <- base_data %>%
mutate(Peak_Type = case_when(tap_hour %in% 7:9 ~ "Morning Peak (7-9am)",tap_hour %in% 16:18 ~ "Afternoon Peak (4-6pm)",TRUE ~ "Off-Peak")) %>%
group_by(Period, ti_region, Peak_Type) %>%
summarise(Total_Taps = sum(Tap_Ons, na.rm = TRUE),Num_Days = n_distinct(trip_origin_date),.groups = 'drop') %>%
mutate(Average_Daily_Taps = Total_Taps / Num_Days)
overall_summary <- base_data %>%
group_by(Period, ti_region) %>%
summarise(Total_Taps = sum(Tap_Ons, na.rm = TRUE),Num_Days = n_distinct(trip_origin_date),.groups = 'drop') %>%
mutate(Average_Daily_Taps = Total_Taps / Num_Days) %>%
mutate(Peak_Type = "Overall Day")
final_summary <- bind_rows(peak_summary, overall_summary) %>%
select(ti_region, Period, Peak_Type, Average_Daily_Taps)
percentage_change_table <- final_summary %>%
pivot_wider(names_from = Period,values_from = Average_Daily_Taps) %>%
filter(!is.na(`Pre-COVID`) & !is.na(`Post-COVID`)) %>%
mutate(pct_change = (`Post-COVID` - `Pre-COVID`) / `Pre-COVID` * 100) %>%
select(ti_region, Peak_Type, `Pre-COVID`, `Post-COVID`, pct_change) %>%
arrange(ti_region, Peak_Type)
print(percentage_change_table, n = 40)## # A tibble: 40 × 5
## ti_region Peak_Type `Pre-COVID` `Post-COVID` pct_change
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 All - NSW Afternoon Peak … 584456. 498652. -14.7
## 2 All - NSW Morning Peak (7… 598006. 508615. -14.9
## 3 All - NSW Off-Peak 1046952. 1012403. -3.30
## 4 All - NSW Overall Day 2229413. 2019669. -9.41
## 5 Chatswood Afternoon Peak … 12123. 10701. -11.7
## 6 Chatswood Morning Peak (7… 7267. 7088. -2.47
## 7 Chatswood Off-Peak 17206. 19174. 11.4
## 8 Chatswood Overall Day 36596. 36962. 1.00
## 9 Macquarie Park Afternoon Peak … 11942. 10970 -8.14
## 10 Macquarie Park Morning Peak (7… 5267. 6608. 25.5
## 11 Macquarie Park Off-Peak 12196. 15142. 24.2
## 12 Macquarie Park Overall Day 29406. 32721. 11.3
## 13 Newcastle and surrounds Afternoon Peak … 2240. 1425. -36.4
## 14 Newcastle and surrounds Morning Peak (7… 1496. 1005. -32.8
## 15 Newcastle and surrounds Off-Peak 3723. 2726. -26.8
## 16 Newcastle and surrounds Overall Day 7460. 5156. -30.9
## 17 North Sydney Afternoon Peak … 25100 15669. -37.6
## 18 North Sydney Morning Peak (7… 8744. 6773. -22.5
## 19 North Sydney Off-Peak 20204. 16967. -16.0
## 20 North Sydney Overall Day 54048. 39409. -27.1
## 21 Other Afternoon Peak … 270569. 251583. -7.02
## 22 Other Morning Peak (7… 495410. 416847. -15.9
## 23 Other Off-Peak 697423. 667628. -4.27
## 24 Other Overall Day 1463402. 1336058. -8.70
## 25 Parramatta Afternoon Peak … 25215. 20563. -18.5
## 26 Parramatta Morning Peak (7… 17027. 13328. -21.7
## 27 Parramatta Off-Peak 32738. 32072. -2.04
## 28 Parramatta Overall Day 74981. 65962. -12.0
## 29 Strathfield Afternoon Peak … 6173. 5886. -4.65
## 30 Strathfield Morning Peak (7… 9465. 7676. -18.9
## 31 Strathfield Off-Peak 14519. 14195. -2.23
## 32 Strathfield Overall Day 30158. 27758. -7.96
## 33 Sydney CBD Afternoon Peak … 229677. 178930 -22.1
## 34 Sydney CBD Morning Peak (7… 51590. 46115. -10.6
## 35 Sydney CBD Off-Peak 243585. 234858. -3.58
## 36 Sydney CBD Overall Day 524852. 459903. -12.4
## 37 Wollongong and surrounds Afternoon Peak … 1415. 2925. 107.
## 38 Wollongong and surrounds Morning Peak (7… 1738. 3175. 82.6
## 39 Wollongong and surrounds Off-Peak 5358. 9641. 79.9
## 40 Wollongong and surrounds Overall Day 8512. 15740 84.9
plot_data <- percentage_change_table %>%
mutate(Peak_Type = factor(Peak_Type, levels = c("Morning Peak (7-9am)", "Afternoon Peak (4-6pm)", "Off-Peak", "Overall Day")))
ggplot(data = plot_data, mapping = aes(x = pct_change, y = fct_reorder(ti_region, pct_change), fill = Peak_Type)) +
geom_col(position = "dodge") +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
labs(
title = "The Uneven Recovery of Sydney's Commute",
subtitle = "Percentage Change in Average Weekday Patronage (Pre vs. Post-COVID)",
x = "Percentage Change (%)",
y = "Transport Region",
fill = "Time Period"
) +
theme_minimal()ggplot(data = hourly_summary, aes(x = tap_hour, y = average_tap_ons, color = Period, group = Period)) +
geom_line(linewidth = 1) +
geom_point() +
facet_wrap(~ ti_region, scales = "free_y", ncol = 3) +
geom_vline(xintercept = 7, linetype = "dashed", color = "grey60") +
geom_vline(xintercept = 9, linetype = "dashed", color = "grey60") +
geom_vline(xintercept = 16, linetype = "dashed", color = "grey30") +
geom_vline(xintercept = 18, linetype = "dashed", color = "grey30") +
scale_color_manual(values = c(
"Pre-COVID (Jan-Mar 2020)" = "#0072B2",
"Post-COVID (Jan-Jul 2025)" = "#D55E00" )) +
scale_x_continuous(breaks = seq(0, 23, by = 4)) +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold", size = 10)
) +
labs(
title = "The Fracturing of the 9-to-5 Commute Across Sydney",
subtitle = "Comparing Average Weekday Hourly Tap-Ons by Transport Region",
x = "Hour of the Day",
y = "Average Tap-Ons",
color = "Period"
)## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_point()`).
seifa_filepath <- "/Users/ryan/Desktop/Winterdatachallenge/SIFEA.xlsx"
regional_socio_economic_data <- read_excel(seifa_filepath, sheet = 1) %>%
select(
postcode,
seifa_score = `Score...2`,
population = `Usual Resident Population`
) %>%
mutate(postcode = as.character(postcode), across(c(seifa_score, population), as.numeric)) %>%
filter(!is.na(seifa_score) & !is.na(population)) %>%
left_join(postcode_to_lga_lookup, by = "postcode") %>%
left_join(lga_to_ti_region_lookup, by = "lga_name") %>%
filter(!is.na(ti_region)) %>%
group_by(ti_region) %>%
summarise(
avg_seifa_score = weighted.mean(seifa_score, w = population, na.rm = TRUE),
total_population = sum(population, na.rm = TRUE),
.groups = "drop" )## New names:
## • `Score` -> `Score...2`
## • `Decile` -> `Decile...3`
## • `Score` -> `Score...4`
## • `Decile` -> `Decile...5`
## • `Score` -> `Score...6`
## • `Decile` -> `Decile...7`
## • `Score` -> `Score...8`
## • `Decile` -> `Decile...9`
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(seifa_score, population), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
final_analysis_df <- percentage_change_table %>%
filter(Peak_Type == "Morning Peak (7-9am)") %>%
filter(ti_region != "Other") %>%
left_join(regional_socio_economic_data, by = "ti_region") %>%
left_join(regional_covid_data, by = "ti_region") %>%
mutate(
cases_per_10k_people = (total_cases / total_population) * 10000
) %>%
select(
ti_region,
pct_change,
avg_seifa_score,
cases_per_10k_people,
total_population
)
vars_to_correlate <- c("pct_change", "avg_seifa_score", "cases_per_10k_people")
correlation_data <- final_analysis_df %>%
filter(if_all(all_of(vars_to_correlate), is.finite)) %>%
select(all_of(vars_to_correlate))
if (nrow(correlation_data) > 3) {
corr_matrix <- round(cor(correlation_data), 2)
ggcorrplot(
corr_matrix,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 4,
colors = c("#D55E00", "white", "#0072B2"),
title = "Correlation of Commuting Shift Drivers",
ggtheme = theme_minimal(base_size = 14)
)
} else {
message("Correlation analysis cancelled: Not enough complete data rows.")
}ggplot(
data = final_analysis_df %>% filter(is.finite(avg_seifa_score) & is.finite(pct_change)),
mapping = aes(x = avg_seifa_score, y = pct_change)
) +
geom_point(aes(color = ti_region, size = total_population), alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
ggrepel::geom_text_repel(aes(label = ti_region), size = 3.5) +
labs(
title = "The Socio-Economic Divide in Sydney's Commute Recovery",
subtitle = "Higher advantage regions saw a much larger drop in morning peak patronage",
x = "Average Socio-Economic Score (SEIFA)",
y = "Percentage Change in Morning Peak Patronage (%)",
color = "Transport Region",
size = "Total Population"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none")## `geom_smooth()` using formula = 'y ~ x'
## Rows: 9
## Columns: 5
## $ ti_region <chr> "All - NSW", "Chatswood", "Macquarie Park", "Newc…
## $ pct_change <dbl> -14.948209, -2.471553, 25.461847, -32.802057, -22…
## $ avg_seifa_score <dbl> NA, 1099.2894, 1052.2391, 995.7994, 1093.4241, 10…
## $ cases_per_10k_people <dbl> NA, 32638.98, 17269.49, 68541.79, 16631.96, 22767…
## $ total_population <dbl> NA, 98902, 92741, 538902, 53405, 448404, 60101, 2…
census_data_folder <- "/Users/ryan/Desktop/Winterdatachallenge/2021_GCP_all_for_NSW_short-header/2021 Census GCP All Geographies for NSW/LGA/NSW"
file_pattern <- "G62"
files_to_load <- list.files(
path = census_data_folder,
pattern = file_pattern,
full.names = TRUE
)
travel_to_work_lga <- map_dfr(files_to_load, read.csv)
lga_transport_baseline <- travel_to_work_lga %>%
clean_names() %>%
mutate(lga_code_2021 = as.character(lga_code_2021)) %>%
select(
lga_code_2021,
train = one_method_train_p,
bus = one_method_bus_p,
ferry = one_method_ferry_p,
tram = one_met_tram_or_lt_rail_p,
worked_from_home = worked_home_p,
did_not_go_to_work = did_not_go_to_work_p,
total_workforce = tot_p
) %>%
mutate(across(-lga_code_2021, as.numeric)) %>%
mutate(across(-lga_code_2021, ~replace_na(., 0))) %>%
mutate(
total_public_transport = train + bus + ferry + tram,
total_commuters = total_workforce - worked_from_home - did_not_go_to_work,
pct_public_transport_2021 = (total_public_transport / (total_commuters + 0.001)) * 100
) %>%
mutate(join_key = str_remove(lga_code_2021, "LGA")) %>%
left_join(nswgeo::lga_nsw, by = c("join_key" = "LGA_CODE_2024")) %>%
select(lga_name = LGA_NAME_2024, pct_public_transport_2021) %>%
mutate(lga_name = str_remove(lga_name, " \\(C\\)| \\(A\\)| \\(NSW\\)")) %>%
filter(!is.na(lga_name))regional_transport_baseline <- lga_transport_baseline %>%
left_join(lga_to_ti_region_lookup, by = "lga_name") %>%
filter(!is.na(ti_region)) %>%
group_by(ti_region) %>%
summarise(
avg_pct_public_transport_2021 = mean(pct_public_transport_2021, na.rm = TRUE),
.groups = "drop"
)
master_analysis_df <- final_analysis_df %>%
left_join(regional_transport_baseline, by = "ti_region")ggplot(
data = master_analysis_df %>% filter(is.finite(avg_pct_public_transport_2021)),
mapping = aes(x = avg_pct_public_transport_2021, y = pct_change)
) +
geom_smooth(method = "lm", se = FALSE, color = "#E69F00", linetype = "dashed") +
geom_point(aes(color = avg_seifa_score, size = total_population), alpha = 0.8) +
ggrepel::geom_text_repel(aes(label = ti_region), size = 3.5, max.overlaps = 15) +
geom_hline(yintercept = 0, linetype = "solid", color = "black") +
scale_color_viridis_c() +
labs(
title = "The Vulnerability of Sydney's Commuting Habits",
subtitle = "Regions with higher pre-pandemic PT usage saw a larger drop in patronage",
x = "Baseline Public Transport Use in 2021 (%)",
y = "Percentage Change in Morning Peak Patronage (2020 vs. 2025)",
color = "SEIFA Score",
size = "Total Population"
) +
theme_minimal(base_size = 14)## `geom_smooth()` using formula = 'y ~ x'
map_data <- nsw_lga_shapes %>%
mutate(lga_name = str_remove(LGA_NAME_2024, " \\(C\\)| \\(A\\)| \\(NSW\\)")) %>%
left_join(lga_to_ti_region_lookup, by = "lga_name") %>%
left_join(master_analysis_df, by = "ti_region") %>%
filter(!is.na(pct_change))
ggplot() +
geom_sf(data = nsw_lga_shapes, fill = "grey85", color = "white", linewidth = 0.2) +
geom_sf(data = map_data, aes(fill = pct_change), color = "black", linewidth = 0.4) +
scale_fill_gradient2(
midpoint = 0,
low = "#D55E00",
mid = "white",
high = "#0072B2",
name = "Patronage \nChange (%)"
) +
labs(
title = "Sydney's Commuting Shift: A Regional View",
subtitle = "Mapping the percentage change in morning peak patronage"
) +
coord_sf(xlim = c(150.0, 151.8), ylim = c(-34.5, -32.7), expand = FALSE) +
theme_void(base_size = 14)pca_data <- master_analysis_df %>%
filter(!ti_region %in% c("All - NSW", "Other")) %>%
filter(is.finite(avg_seifa_score) & is.finite(avg_pct_public_transport_2021)) %>%
select(ti_region, pct_change, avg_seifa_score, avg_pct_public_transport_2021)
numeric_pca_data <- pca_data %>%
select(-ti_region) %>%
scale()
pca_result <- prcomp(numeric_pca_data, center = FALSE, scale. = FALSE)
pca_scores <- as.data.frame(pca_result$x)
indexed_df <- pca_data %>%
select(ti_region) %>%
mutate(disruption_index = pca_scores$PC1)
map_data_indexed <- nsw_lga_shapes %>%
mutate(lga_name = str_remove(LGA_NAME_2024, " \\(C\\)| \\(A\\)| \\(NSW\\)")) %>%
left_join(lga_to_ti_region_lookup, by = "lga_name") %>%
left_join(indexed_df, by = "ti_region") %>%
filter(!is.na(disruption_index))
ggplot() +
geom_sf(data = nsw_lga_shapes, fill = "grey85", color = "white", linewidth = 0.2) +
geom_sf(data = map_data_indexed, aes(fill = disruption_index), color = "black", linewidth = 0.4) +
scale_fill_viridis_c(
option = "plasma",
name = "Remote Work \nDisruption Index"
) +
labs(
title = "A Spectrum of Change Across Sydney's Commuting Zones",
subtitle = "Regions ranked by a PCA-derived 'Disruption Index' based on SEIFA, patronage change, and baseline usage"
) +
coord_sf(xlim = c(150.0, 151.8), ylim = c(-34.5, -32.7), expand = FALSE) +
theme_void(base_size = 14)