# 2. Loading the Data ----
library(tidyverse)
# Collect the full path of every file in the raw-data folder whose name
# ends in ".csv".
files <- list.files(
  path = "~/Desktop/DVCFinal/dataraw",
  pattern = "\\.csv$",
  full.names = TRUE
)
# cleaning function
#
# Read one CSV file and reduce it to the columns the analysis needs.
#
# Arguments:
#   path  Path to a single CSV file.
#
# Returns:
#   A tibble holding only the required columns, with `Postcode` as a
#   character column and `NumberOfApprovedPlaces` as a numeric column.
#   Returns NULL (after printing a message) when the file lacks any of the
#   required columns, so callers can drop it when combining files.
clean_file <- function(path) {
  # Read every column as text. Letting readr guess types per file can turn
  # postcodes into numbers (dropping leading zeros) and can give the same
  # column different types in different files, which breaks bind_rows().
  df <- read_csv(
    path,
    col_types = cols(.default = col_character()),
    show_col_types = FALSE
  )

  # clean column names
  names(df) <- trimws(names(df))

  required_cols <- c(
    "ServiceName",
    "Suburb",
    "Postcode",
    "State",
    "ServiceType",
    "NumberOfApprovedPlaces"
  )

  # skip bad files safely
  missing <- setdiff(required_cols, names(df))
  if (length(missing) > 0) {
    message("Skipping file: ", path)
    message("Missing columns: ", paste(missing, collapse = ", "))
    return(NULL)
  }

  df %>%
    select(all_of(required_cols)) %>%
    mutate(
      # Restore leading zeros that a spreadsheet export may have stripped.
      # NOTE(review): assumes four-digit Australian postcodes - confirm.
      Postcode = str_pad(trimws(Postcode), width = 4, side = "left", pad = "0"),
      # Convert explicitly so the column has the same type in every file;
      # values that contain no number become NA.
      NumberOfApprovedPlaces = parse_number(NumberOfApprovedPlaces)
    )
}
# load + combine all states
# clean_file() returns NULL for files it skips; bind_rows() ignores those.
combined <- files %>%
  map(clean_file) %>%
  bind_rows()

# When no file produced usable rows the combined tibble has no columns at
# all, and filter() would fail with a cryptic "object not found" error.
# Stop here with a message that points at the real cause instead.
if (!"ServiceType" %in% names(combined)) {
  stop(
    "No usable CSV data was loaded; check the data folder and column names.",
    call. = FALSE
  )
}

# Keep centre-based services only.
all_clean <- combined %>%
  filter(ServiceType == "Centre-Based Care")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# State Overview ----
# Total approved places and number of centres for each state.
state_summary <- all_clean %>%
  group_by(State) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    centres = n(),
    .groups = "drop"
  )

# Horizontal bar chart of total approved places, with states ordered by
# their total. The `text` aesthetic is only used for the plotly tooltip.
p1 <- ggplot(
  state_summary,
  aes(
    x = reorder(State, total_places),
    y = total_places,
    # Format the number explicitly: pasting a large double directly can
    # produce scientific notation (e.g. "4e+05") in the tooltip.
    text = paste0(State, "<br>", scales::comma(total_places))
  )
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Childcare supply across Australia",
    x = "State",
    y = "Approved places"
  )

# Interactive version; hover shows only the custom text built above.
ggplotly(p1, tooltip = "text")
# Remoteness ----
# Display order for the area-type categories (alphabetical order would put
# "Inner Regional" before "Major Cities").
remoteness_levels <- c(
  "Major Cities",
  "Inner Regional",
  "Outer Regional",
  "Remote",
  "Unknown"
)

# Assign each service to an area type from its postcode band.
# NOTE(review): postcode bands are a rough stand-in, not an official
# remoteness classification - confirm this mapping is intended.
all_clean <- all_clean %>%
  mutate(
    # Convert once; postcodes that are not numeric become NA (the coercion
    # warning is expected and handled by the "Unknown" branch below).
    postcode_num = suppressWarnings(as.numeric(Postcode)),
    remoteness = case_when(
      # Without this branch, missing postcodes would fall through to the
      # catch-all and be counted as "Remote".
      is.na(postcode_num) ~ "Unknown",
      postcode_num < 3000 ~ "Major Cities",
      postcode_num < 4000 ~ "Inner Regional",
      postcode_num < 5000 ~ "Outer Regional",
      TRUE ~ "Remote"
    )
  ) %>%
  select(-postcode_num)

# Total approved places for every state / area-type combination.
remoteness_summary <- all_clean %>%
  group_by(State, remoteness) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(remoteness = factor(remoteness, levels = remoteness_levels))

# Grouped bar chart: one cluster per area type, one bar per state.
p2 <- ggplot(
  remoteness_summary,
  aes(
    x = remoteness,
    y = total_places,
    fill = State,
    # Format the number explicitly so large totals never appear in
    # scientific notation in the tooltip.
    text = paste0(
      State, "<br>", remoteness, "<br>", scales::comma(total_places)
    )
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Childcare supply by remoteness",
    x = "Area type",
    y = "Places"
  )

# Interactive version; hover shows only the custom text built above.
ggplotly(p2, tooltip = "text")
# Supply Intensity ----
# Average number of approved places per centre in each state.
p3_data <- state_summary %>%
  mutate(avg_places_per_centre = total_places / centres)

# Horizontal bar chart of the per-centre average, ordered by that average.
p3 <- ggplot(
  p3_data,
  aes(
    x = reorder(State, avg_places_per_centre),
    y = avg_places_per_centre,
    # Include the plotted value: the tooltip is restricted to this text, so
    # showing only the state name would hide the number on hover.
    text = paste0(
      State, "<br>",
      scales::comma(avg_places_per_centre, accuracy = 0.1),
      " places per centre"
    )
  )
) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  # Namespace-qualified, matching the other charts, so this works even when
  # the scales package is not attached.
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Supply intensity by state",
    x = "State",
    y = "Avg places per centre"
  )

# Interactive version; hover shows only the custom text built above.
ggplotly(p3, tooltip = "text")
# Suburb Inequality ----
# Places and centre counts per suburb, keeping the 300 suburbs with the
# most approved places. Suburbs are grouped within state, so identical
# suburb names in different states stay separate.
suburb_supply <- all_clean %>%
  group_by(State, Suburb) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    centres = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(total_places)) %>%
  slice_head(n = 300)

# Scatter plot of places against number of centres, coloured by state.
p4 <- ggplot(
  suburb_supply,
  aes(
    x = total_places,
    y = centres,
    colour = State,
    # Include both plotted values: the tooltip is restricted to this text,
    # so without them a point's coordinates cannot be read on hover.
    text = paste0(
      Suburb, "<br>", State,
      "<br>Places: ", scales::comma(total_places),
      "<br>Centres: ", centres
    )
  )
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Suburb-level childcare access",
    x = "Places",
    y = "Centres"
  )

# Interactive version; hover shows only the custom text built above.
ggplotly(p4, tooltip = "text")
# System Structure ----
# Per-state totals plus the average number of places per centre.
region_summary <- all_clean %>%
  group_by(State) %>%
  summarise(
    total_places = sum(NumberOfApprovedPlaces, na.rm = TRUE),
    centres = n(),
    avg_places_per_centre = total_places / centres,
    .groups = "drop"
  )

# Bubble chart: total places (x) against average places per centre (y),
# with point size showing the number of centres.
p5 <- ggplot(
  region_summary,
  aes(
    x = total_places,
    y = avg_places_per_centre,
    size = centres,
    colour = State,
    # Include every encoded value: the tooltip is restricted to this text,
    # so showing only the state name would hide all three numbers on hover.
    text = paste0(
      State,
      "<br>Total places: ", scales::comma(total_places),
      "<br>Avg per centre: ",
      scales::comma(avg_places_per_centre, accuracy = 0.1),
      "<br>Centres: ", scales::comma(centres)
    )
  )
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Childcare system structure",
    x = "Total places",
    y = "Avg per centre"
  )

# Interactive version; hover shows only the custom text built above.
ggplotly(p5, tooltip = "text")