2. Loading the Data

library(tidyverse)

# Collect the full path of every file in the raw-data folder whose name
# ends in ".csv"; anything else in the folder is ignored.
files <- list.files(
  path = "~/Desktop/DVCFinal/dataraw",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read one raw CSV and reduce it to the columns used by the analysis.
#
# Arguments:
#   path  Path to a single CSV file.
#
# Returns a tibble holding only the required columns, with the text columns
# as character and NumberOfApprovedPlaces as numeric. If any required column
# is absent, a message is printed and NULL is returned so the caller can
# drop the file.
clean_file <- function(path) {
  df <- read_csv(path, show_col_types = FALSE)

  # strip leading/trailing whitespace from the header names
  names(df) <- trimws(names(df))

  text_cols <- c(
    "ServiceName",
    "Suburb",
    "Postcode",
    "State",
    "ServiceType"
  )
  count_col <- "NumberOfApprovedPlaces"
  required_cols <- c(text_cols, count_col)

  # skip bad files safely
  missing <- setdiff(required_cols, names(df))

  if (length(missing) > 0) {
    message("Skipping file: ", path)
    message("Missing columns: ", paste(missing, collapse = ", "))
    return(NULL)
  }

  # read_csv() guesses column types per file, so the same column can come
  # back with different types in different files. Pin every column to one
  # type here so the per-file tables can always be row-bound together.
  df %>%
    select(all_of(required_cols)) %>%
    mutate(
      across(all_of(text_cols), as.character),
      across(
        all_of(count_col),
        function(x) {
          if (is.numeric(x)) x else parse_number(as.character(x))
        }
      )
    )
}

# load + combine all states
# clean_file() returns NULL for files it skips; bind_rows() drops those.
all_clean <- files %>%
  map(clean_file) %>%
  bind_rows()

# With no files (or only skipped files) the combined table has no columns,
# and the filter below would fail with an unhelpful "object not found".
if (ncol(all_clean) == 0) {
  stop(
    "No usable CSV files were loaded; check the data folder and the ",
    "required columns.",
    call. = FALSE
  )
}

# keep only centre-based services
all_clean <- all_clean %>%
  filter(ServiceType == "Centre-Based Care")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

State Overview

# Per-state totals: approved places summed (NA ignored) and number of rows.
state_summary <- summarise(
  group_by(all_clean, State),
  total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
  centres = n(),
  .groups = "drop"
)

# Horizontal bars ordered by total places. The `text` aesthetic is not drawn
# by ggplot itself; it is what ggplotly() shows as the hover tooltip.
p1 <- ggplot(
  data = state_summary,
  mapping = aes(
    x = reorder(State, total_places),
    y = total_places,
    text = paste0(State, "<br>", total_places)
  )
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "State",
    y = "Approved places",
    title = "Childcare supply across Australia"
  )

ggplotly(p1, tooltip = "text")

Remoteness

# Bucket every service into an area type based on its postcode.
# NOTE(review): the 3000/4000/5000 cut-offs look like Australian state
# postcode ranges rather than a remoteness classification -- confirm the
# intended mapping before drawing conclusions from this chart.
all_clean <- all_clean %>%
  mutate(
    # convert once instead of once per condition
    .postcode_num = as.numeric(Postcode),
    remoteness = case_when(
      # a missing or non-numeric postcode must not fall through to "Remote"
      is.na(.postcode_num) ~ "Unknown",
      .postcode_num < 3000 ~ "Major Cities",
      .postcode_num < 4000 ~ "Inner Regional",
      .postcode_num < 5000 ~ "Outer Regional",
      TRUE ~ "Remote"
    ),
    # drop the helper column again
    .postcode_num = NULL
  )

# Approved places summed per state and area type.
remoteness_summary <- all_clean %>%
  group_by(State, remoteness) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    .groups = "drop"
  )

# Side-by-side bars per area type, one bar per state; `text` feeds the
# ggplotly() tooltip.
p2 <- ggplot(remoteness_summary,
             aes(x = remoteness,
                 y = total_places,
                 fill = State,
                 text = paste0(State, "<br>", remoteness, "<br>", total_places))) +
  geom_col(position = "dodge") +
  labs(title = "Childcare supply by remoteness",
       x = "Area type",
       y = "Places")

ggplotly(p2, tooltip = "text")

Supply Intensity

# Average number of approved places per centre, by state.
p3_data <- state_summary %>%
  mutate(avg_places_per_centre = total_places / centres)

# Horizontal bars ordered by the average; `text` feeds the ggplotly() tooltip.
p3 <- ggplot(p3_data,
             aes(x = reorder(State, avg_places_per_centre),
                 y = avg_places_per_centre,
                 text = paste0(State))) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  # library(tidyverse) does not attach scales, so qualify comma() explicitly
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Supply intensity by state",
       x = "State",
       y = "Avg places per centre")

ggplotly(p3, tooltip = "text")

Suburb Inequality

# Places and row counts per (State, Suburb), keeping the 300 combinations
# with the most approved places.
suburb_supply <- all_clean %>%
  group_by(State, Suburb) %>%
  summarise(
    centres = n(),
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  relocate(total_places, .before = centres) %>%
  arrange(desc(total_places)) %>%
  slice_head(n = 300)

# Scatter of places against centres, coloured by state; `text` feeds the
# ggplotly() tooltip.
p4 <- ggplot(
  data = suburb_supply,
  mapping = aes(
    x = total_places,
    y = centres,
    colour = State,
    text = paste0(Suburb, "<br>", State)
  )
) +
  geom_point(alpha = 0.6) +
  labs(
    x = "Places",
    y = "Centres",
    title = "Suburb-level childcare access"
  )

ggplotly(p4, tooltip = "text")

System Structure

# Per-state totals plus the average number of places per centre.
region_summary <- all_clean %>%
  group_by(State) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    centres = n(),
    .groups = "drop"
  ) %>%
  mutate(avg_places_per_centre = total_places / centres)

# Bubble chart: total places against average per centre, with point size
# mapped to the centre count; `text` feeds the ggplotly() tooltip.
p5 <- ggplot(
  data = region_summary,
  mapping = aes(
    x = total_places,
    y = avg_places_per_centre,
    size = centres,
    colour = State,
    text = paste0(State)
  )
) +
  geom_point(alpha = 0.7) +
  labs(
    x = "Total places",
    y = "Avg per centre",
    title = "Childcare system structure"
  )

ggplotly(p5, tooltip = "text")