Australian Road Death Analysis

R Visualisation 1: Temporal Risk Heatmap

# R Visualisation 1: Temporal Risk Heatmap
# --- Libraries ---
library(readxl)

## Warning: package 'readxl' was built under R version 4.3.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(viridis)

## Warning: package 'viridis' was built under R version 4.3.3

## Loading required package: viridisLite

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:scales':
## 
##     discard

# Load data
fatality <- read_excel("cleaned_data.xlsx", sheet = "Processed_Fatality")

# Filter data to ONLY include years 2010-2024
fatality <- fatality %>%
  filter(Year >= 2010 & Year <= 2024)

# Enhanced data preparation
fatality <- fatality %>%
  mutate(
    Hour = hour(hms(Time)),
    Weekday = factor(Dayweek, 
                     levels = c("Monday", "Tuesday", "Wednesday", "Thursday", 
                                "Friday", "Saturday", "Sunday"),
                     labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
    Month = factor(Month, levels = 1:12, 
                   labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")),
    Year = as.factor(Year),
    # Create high-risk time period flag
    HighRiskPeriod = case_when(
      Hour >= 12 & Hour <= 17 ~ "High Risk (12pm-17pm)",
      TRUE ~ "Standard Risk"
    ),
    # Create weekend flag
    Weekend = ifelse(Weekday %in% c("Saturday", "Sunday"), "Weekend", "Weekday")
  )

# Calculate overall statistics for annotations (2010-2024 only)
total_deaths <- nrow(fatality)
peakhour_deaths <- fatality %>% 
  filter(Hour >= 12 & Hour <= 17) %>% 
  nrow()
peakhour_percentage <- round(peakhour_deaths / total_deaths * 100, 1)

# Calculate the actual number of years in the filtered data
years_in_data <- length(unique(fatality$Year))

# Aggregate counts by Hour x Weekday with additional metrics
heat_data <- fatality %>%
  group_by(Hour, Weekday) %>%
  summarise(
    Deaths = n(),
    # Calculate rate per year using actual years in data
    AnnualRate = Deaths / years_in_data,
    .groups = "drop"
  ) %>%
  # Add risk level categorization
  mutate(
    RiskLevel = case_when(
      Deaths >= quantile(Deaths, 0.8) ~ "Very High",
      Deaths >= quantile(Deaths, 0.6) ~ "High", 
      Deaths >= quantile(Deaths, 0.4) ~ "Moderate",
      Deaths >= quantile(Deaths, 0.2) ~ "Low",
      TRUE ~ "Very Low"
    ),
    # Highlight the dangerous period
    DangerPeriod = ifelse(Hour >= 12 & Hour <= 17, "Danger Zone", "Normal")
  )

# Create the enhanced heatmap
p1 <- ggplot(heat_data, aes(x = Hour, y = Weekday, fill = Deaths)) +
  geom_tile(color = "white", linewidth = 0.3) +
  # Add border around high-risk hours
  geom_rect(data = filter(heat_data, Hour >= 12 & Hour <= 17),
            aes(xmin = Hour - 0.5, xmax = Hour + 0.5, 
                ymin = as.numeric(Weekday) - 0.5, ymax = as.numeric(Weekday) + 0.5),
            fill = NA, color = "red", linewidth = 1, inherit.aes = FALSE) +
  # Use a more sophisticated color scale
  scale_fill_viridis_c(option = "plasma", name = "Deaths\n(2010-2024)",
                       trans = "sqrt", # Square root transformation for better color distribution
                       labels = scales::comma_format()) +
  # Enhanced scales and labels
  scale_x_continuous(breaks = seq(0, 23, 2), 
                     labels = paste0(seq(0, 23, 2), ":00")) +
  scale_y_discrete() +
  
  # Professional styling
  labs(
    title = "Australian Road Deaths: Temporal Risk Analysis (2010-2024)",
    subtitle = paste0("High-risk period (12pm-17pm) accounts for ", peakhour_percentage, "% of all fatalities | Years analyzed: ", years_in_data),
    x = "Hour of Day",
    y = "Day of Week",
    caption = "Source: Australian Road Deaths Database | Red borders highlight Danger Zone (2010-2024)"
  ) +
  
  # Clean theme
  theme_minimal() +
  theme(
    axis.title.y = element_text(
      size = 11, face = "bold",
      margin = margin(r = 15)   # increase this value to push title away from tick labels
    ),
    axis.text.y = element_text(
      margin = margin(r = 5)    # keep small margin between labels and tick marks
    ),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "darkred"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "right",
    panel.grid = element_blank(),
    plot.caption = element_text(size = 9, color = "gray50")
  )

# Additional Analysis: Create a summary statistics table
risk_summary <- fatality %>%
  group_by(HighRiskPeriod) %>%
  summarise(
    Deaths = n(),
    Percentage = round(n() / nrow(fatality) * 100, 1),
    DeathsPerYear = round(n() / years_in_data, 1)
  )

# Also create hourly summary to identify actual peak hours
hourly_summary <- fatality %>%
  count(Hour, name = "Deaths") %>%
  arrange(desc(Deaths)) %>%
  mutate(
    Percentage = round(Deaths / sum(Deaths) * 100, 1),
    RankRisk = rank(-Deaths)
  )

# Print data summary
print(paste("Data filtered for years 2010-2024"))

## [1] "Data filtered for years 2010-2024"

print(paste("Total years in dataset:", years_in_data))

## [1] "Total years in dataset: 15"

print(paste("Total deaths in filtered period:", total_deaths))

## [1] "Total deaths in filtered period: 18046"

print("")

## [1] ""

print("Risk Period Analysis:")

## [1] "Risk Period Analysis:"

print(risk_summary)

## # A tibble: 2 × 4
##   HighRiskPeriod        Deaths Percentage DeathsPerYear
##   <chr>                  <int>      <dbl>         <dbl>
## 1 High Risk (12pm-17pm)   6432       35.6          429.
## 2 Standard Risk          11614       64.4          774.

print("\nTop 6 Deadliest Hours:")

## [1] "\nTop 6 Deadliest Hours:"

print(head(hourly_summary, 6))

## # A tibble: 6 × 4
##    Hour Deaths Percentage RankRisk
##   <dbl>  <int>      <dbl>    <dbl>
## 1    15   1239        6.9        1
## 2    16   1113        6.2        2
## 3    14   1088        6          3
## 4    17   1040        5.8        4
## 5    12    978        5.4        5
## 6    13    974        5.4        6

# Show the plot
print(p1)

R Visualisation 2: Bubble Plot with Known Baseline

# R Visualisation 2: Bubble Plot with Known Baseline
# --- Libraries ---
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(viridis)
library(plotly)

library(scales)
library(zoo)
library(purrr)

# Load data
fatality <- read_excel("cleaned_data.xlsx", sheet = "Processed_Fatality")

# 1. Find most common known demographic group (exclude unknowns)
fatality <- fatality %>%
  mutate(
    Year = if (inherits(Year, "Date") || inherits(Year, "POSIXt")) year(Year) else as.integer(as.character(Year))
  )

# --- pipeline ---
most_common_known <- fatality %>%
  filter(
    Gender != "Unknown",
    `National Remoteness Areas 2021` != "Unknown",
    !is.na(`Age group`)
  ) %>%
  count(`Age group`, Gender, `National Remoteness Areas 2021`,
        sort = TRUE, name = "Deaths") %>%
  slice(1)

baseline_age       <- most_common_known$`Age group`
baseline_gender    <- most_common_known$Gender
baseline_remoteness<- most_common_known$`National Remoteness Areas 2021`

# 2. Calculate risk ratio based on pre-defined baseline (2010-2024)
demo_stats <- fatality %>%
  filter(Year >= 2010, Year <= 2024) %>%     
  group_by(`Age group`, Gender, `National Remoteness Areas 2021`) %>%
  summarise(Deaths = n(), .groups = "drop") %>%
  mutate(
    baseline_deaths = most_common_known$Deaths,
    Risk_Ratio = Deaths / baseline_deaths,
    Risk_category = case_when(
      Risk_Ratio >= 2.0  ~ "Very High Risk (2x+)",
      Risk_Ratio >= 1.5  ~ "High Risk (1.5-2x)",
      Risk_Ratio >= 0.75 ~ "Moderate Risk",
      TRUE               ~ "Lower Risk"
    ),
    Has_Unknown = ifelse(Gender == "Unknown" |
                           `National Remoteness Areas 2021` == "Unknown",
                         "Unknown Data", "Known Data")
  )

# Separate known vs unknown
known_data <- demo_stats %>% filter(Has_Unknown == "Known Data")
unknown_data <- demo_stats %>% filter(Has_Unknown == "Unknown Data")

# 3. Enhanced visualisation with pre-defined baseline
p2 <- plot_ly()

# Unknown data as diamonds (bottom layer)
if(nrow(unknown_data) > 0) {
  p2 <- p2 %>% add_trace(
    data = unknown_data,
    x = ~`Age group`,
    y = ~Gender,
    size = ~Deaths,
    color = ~Risk_Ratio,
    colors = c("#0D0887", "wheat", "#F98C0A"),
    text = ~paste("<b>", `Age group`, Gender, "</b>",
                  "<br><b>Location:</b>", `National Remoteness Areas 2021`,
                  "<br><b>Deaths:</b>", Deaths,
                  "<br><b>Risk Ratio:</b>", round(Risk_Ratio, 2),
                  "<br><b>Risk Level:</b>", Risk_category,
                  "<br><b>Data Quality:</b>", Has_Unknown),
    hovertemplate = "%{text}<extra></extra>",
    type = "scatter",
    mode = "markers",
    marker = list(
      symbol = "diamond",
      sizemode = "diameter",
      sizemin = 5,
      sizemax = 50,
      line = list(width = 2, color = "darkgray"),
      opacity = 0.7,
      showscale = FALSE
    ),
    name = "Unknown Data",
    showlegend = TRUE,
    hoverinfo = "text"
  )
}

# Known data as circles (top layer)
p2 <- p2 %>% add_trace(
  data = known_data,
  x = ~`Age group`,
  y = ~Gender,
  size = ~Deaths,
  color = ~Risk_Ratio,
  colors = c("#0D0887", "wheat", "#F98C0A"),
  text = ~paste("<b>", `Age group`, Gender, "</b>",
                "<br><b>Location:</b>", `National Remoteness Areas 2021`,
                "<br><b>Deaths:</b>", Deaths,
                "<br><b>Risk Ratio:</b>", round(Risk_Ratio, 2),
                "<br><b>Risk Level:</b>", Risk_category,
                "<br><b>Data Quality:</b>", Has_Unknown),
  hovertemplate = "%{text}<extra></extra>",
  type = "scatter",
  mode = "markers",
  marker = list(
    symbol = "circle",
    sizemode = "diameter", 
    sizemin = 5,
    sizemax = 50,
    line = list(width = 0.5, color = "black"),
    colorbar = list(
      title = ""
    )
  ),
  name = "Known Data",
  showlegend = TRUE
) %>%
  layout(
    title = list(
      text = paste0("<b>Australian Road Fatalities: Demographic Risk Analysis</b><br>",
                    "<sub>Source: Australian Road Deaths Database 2010-2024 | Risk Ratios vs Pre-defined Baseline</sub>"),
      font = list(size = 16)
    ),
    xaxis = list(
      title = "<b>Age Group</b>",
      titlefont = list(size = 14),
      categoryorder = "array",
      categoryarray = c("0-17", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75-84", "85+")
    ),
    yaxis = list(
      title = "<b>Gender</b>",
      titlefont = list(size = 14)
    ),
    annotations = list(
      list(
        text = "<i>Bubble size = Total Deaths | Color = Risk Ratio vs Pre-defined Baseline<br> </i>",
        x = 0.5, y = -0.15, 
        xref = "paper", yref = "paper",
        showarrow = FALSE,
        font = list(size = 11, color = "gray")
      ),
      list(
        text = paste0("<b>Pre-defined Baseline:</b> ", baseline_gender, ", ", baseline_remoteness, 
                      ", ", baseline_age, " (Risk Ratio = 1.0)"),
        x = 1, y = 1.02, 
        xref = "paper", yref = "paper",
        showarrow = FALSE, 
        font = list(size = 10, color = "#0D0887"),
        bgcolor = "wheat",
        bordercolor = "#0D0887",
        borderwidth = 1
      )
    ),
    margin = list(t = 80, b = 100),
    plot_bgcolor = "white",
    paper_bgcolor = "white"
  )

p2

## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.

R Visualisation 3: Policy Impact Analysis - Monthly Deaths

# R Visualisation 3: Policy Impact Analysis - Monthly Deaths
# --- Libraries ---
library(dplyr)
library(lubridate)
library(ggplot2)
library(purrr)
library(zoo)

# Load data
fatality <- read_excel("cleaned_data.xlsx", sheet = "Processed_Fatality")
# 1) Clean + Constrain to 2010–2024

fatality <- fatality %>%
  mutate(
    Year_chr   = as.character(Year),
    Month_chr  = as.character(Month),
    Month_lower   = tolower(trimws(Month_chr)),
    Month_numeric = dplyr::case_when(
      grepl("^[0-9]{1,2}$", Month_lower) ~ as.integer(Month_lower),
      Month_lower %in% tolower(month.abb)  ~ match(Month_lower, tolower(month.abb)),
      Month_lower %in% tolower(month.name) ~ match(Month_lower, tolower(month.name)),
      TRUE ~ NA_integer_
    ),
    Year_numeric = suppressWarnings(as.integer(Year_chr))
  ) %>%
  filter(!is.na(Year_numeric), !is.na(Month_numeric),
         Year_numeric >= 2010, Year_numeric <= 2024) %>%
  mutate(Date = make_date(year = Year_numeric, month = Month_numeric, day = 1))

# 2) Policy events: x from Date, only manual y_position + per-line colors
policy_events <- data.frame(
  Date = as.Date(c("2011-01-01","2018-01-01","2018-07-01","2021-01-01")),
  Policy = c("National Strategy\n2011-20",
             "Vehicle Safety\nStandards",
             "Mobile Phone\nEnforcement",
             "National Strategy\n2021-30"),
  Policy_Short = c(" Strategy 2011-20\n commerces",
                   " Enhanced Vehicle\n Safety Standards",
                   " Mobile Phone\n Penalties\n Enforcement",
                   " Strategy 2021-30\n commerces"),
  Type = c("National Strategy","Vehicle Safety","Driver Behaviour","National Strategy"),
  Impact_Expected = "Reduce",
  y_position = c(125, 80, 60, 125),
  # New: color each dashed vertical line (2011–20 and 2021–30 get custom colors)
  LineColor = c("#0D0887", "#415A77", "#415A77", "#89226A"),
  stringsAsFactors = FALSE
)

# 3) Monthly series
monthly_data <- fatality %>%
  group_by(Date, Year_numeric, Month_numeric) %>%
  summarise(Deaths = n(), .groups = "drop") %>%
  arrange(Date) %>%
  mutate(
    Deaths_MA12 = zoo::rollmean(Deaths, k = 12, fill = NA, align = "right"),
    YoY_Change  = (Deaths - dplyr::lag(Deaths, 12)) / dplyr::lag(Deaths, 12) * 100
  )

# 4) Policy impact (optional analytics)
policy_impact <- policy_events %>%
  mutate(
    Pre_Period_Deaths = purrr::map_dbl(Date, function(d0) {
      pre <- monthly_data %>% filter(Date >= (d0 %m-% months(12)), Date < d0, !is.na(Deaths))
      if (nrow(pre) > 0) mean(pre$Deaths) else NA_real_
    }),
    Post_Period_Deaths = purrr::map_dbl(Date, function(d0) {
      post <- monthly_data %>% filter(Date >= d0, Date < (d0 %m+% months(12)), !is.na(Deaths))
      if (nrow(post) > 0) mean(post$Deaths) else NA_real_
    }),
    Impact_Percent = dplyr::case_when(
      is.na(Pre_Period_Deaths) | is.na(Post_Period_Deaths) ~ NA_real_,
      Pre_Period_Deaths == 0 ~ NA_real_,
      TRUE ~ round((Post_Period_Deaths - Pre_Period_Deaths) / Pre_Period_Deaths * 100, 1)
    )
  )

# 5) Targets
baseline_2018_2020 <- 1142
target_2030 <- 571
target_2030_monthly <- target_2030 / 12  # ~47.6

# 6) Plot
p3 <- ggplot(monthly_data, aes(x = Date)) +
  # Strategy backgrounds
  annotate("rect", xmin = as.Date("2011-01-01"), xmax = as.Date("2020-12-31"),
           ymin = -Inf, ymax = Inf, alpha = 0.15, fill = "#0D0887") +
  annotate("rect", xmin = as.Date("2021-01-01"), xmax = as.Date("2024-12-31"),
           ymin = -Inf, ymax = Inf, alpha = 0.15, fill = "#89226A") +
  # COVID window
  annotate("rect", xmin = as.Date("2020-03-01"), xmax = as.Date("2021-09-01"),
           ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "#F98C0A") +
  # Series
  geom_line(aes(y = Deaths), color = "#E8E8E8", alpha = 0.7, size = 0.3) +
  geom_line(aes(y = Deaths_MA12), color = "#0D1B2A", size = 1.2) +
  # Policy lines at actual dates, now with per-line colors
  geom_vline(data = policy_events,
             aes(xintercept = Date, color = LineColor),
             linetype = "dashed", size = 0.8, alpha = 0.8) +
  # Labels: x = Date, y = manual y_position (text color kept uniform)
  geom_text(data = policy_events,
            aes(x = Date, y = y_position, label = Policy_Short),
            color = "#0D1B2A", size = 2.8, fontface = "bold",
            hjust = 0, vjust = 0.5) +
  # 2030 target
  geom_hline(yintercept = target_2030_monthly, color = "#D23105",
             linetype = "dashed", size = 1, alpha = 0.8) +
  annotate("text", x = as.Date("2023-01-01"), y = target_2030_monthly + 5,
           label = paste0(" 2030 Target\n (", round(target_2030_monthly, 1), " deaths/mth)"),
           color = "#D23105", size = 3, fontface = "bold", hjust = 0) +
  # Titles on backgrounds
  annotate("text", x = as.Date("2015-11-15"), y = 138,
           label = "National Strategy 2011-2020", color = "#0D0887",
           size = 3.5, fontface = "bold", alpha = 0.8) +
  annotate("text", x = as.Date("2022-11-15"), y = 138,
           label = "National Strategy 2021-2030", color = "#89226A",
           size = 3.5, fontface = "bold", alpha = 0.8) +
  # COVID label
  annotate("text", x = as.Date("2020-09-01"), y = 110,
           label = "  COVID-19\n  PERIOD", color = "#F98C0A",
           size = 3, fontface = "bold", hjust = 0.5) +
  # Trend note
  annotate("curve", x = as.Date("2019-06-01"), xend = as.Date("2019-01-01"),
           y = 120, yend = 102, color = "#D23105", size = 1,
           arrow = arrow(length = unit(0.2, "cm")), curvature = 0.2) +
  annotate("text", x = as.Date("2019-12-01"), y = 125,
           label = "Trend Reversal\n in 2019", color = "#D23105",
           size = 3, fontface = "bold", hjust = 0.5) +
  # Scales
  scale_x_date(date_breaks = "2 years", date_labels = "%Y",
               limits = c(as.Date("2010-01-01"), as.Date("2025-01-01")),
               expand = expansion(mult = c(0.02, 0.05))) +
  scale_y_continuous(breaks = seq(40, 140, 10),
                     limits = c(40, 145),
                     expand = expansion(mult = c(0.02, 0.02))) +
  # Use the literal hex colors from LineColor and suppress legend
  scale_color_identity(guide = "none") +
  # Labels + theme
  labs(
    title = "Australian Road Deaths: Policy Impact Analysis (2010-2024)",
    subtitle = "Monthly deaths with 12-month moving average and Policy Intervention Timeline",
    x = NULL, y = "Monthly Deaths",
    caption = "Source: Australian Road Deaths Database \n2030 Target: 571 annual fatalities (50% reduction from 2018-2020 baseline of 1,142)"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0, margin = margin(b = 5)),
    plot.subtitle = element_text(size = 12, hjust = 0, color = "#415A77", margin = margin(b = 20)),
    axis.title.y = element_text(size = 12, face = "bold", margin = margin(r = 10)),
    axis.text.x = element_text(size = 10, color = "#0D1B2A"),
    axis.text.y = element_text(size = 10, color = "#0D1B2A"),
    panel.grid.major.x = element_line(color = "#E8E8E8", size = 0.3),
    panel.grid.major.y = element_line(color = "#E8E8E8", size = 0.3),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = NA),
    plot.background = element_rect(fill = "white", color = NA),
    plot.caption = element_text(size = 9, color = "#778DA9", hjust = 0, margin = margin(t = 15)),
    plot.margin = margin(20, 30, 20, 20)
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p3)

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).

Australian Road Death Analysis

Audrey Nguyen

2025-11-23

R Visualisation 1: Temporal Risk Heatmap

R Visualisation 2: Bubble Plot with Known Baseline

R Visualisation 3: Policy Impact Analysis - Monthly Deaths