assignment3_ai_divide.knit

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(janitor)

## Warning: package 'janitor' was built under R version 4.5.3

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(tidyr)
library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout




``` r
# One-time setup: run this from the console, never from the knitted script.
# install.packages() on a package that is already attached is refused
# ("package 'readxl' is in use and will not be installed") and would need a
# network connection on every knit.
# install.packages(c("readxl", "dplyr", "janitor"))

# Load libraries (each package is attached exactly once)
library(readxl)
library(dplyr)
library(janitor)

# File paths
# All five workbooks sit in the same folder. Change `data_dir` to run the
# analysis on another machine instead of editing every absolute path.
data_dir <- "C:/Users/sarat/Downloads"

# Jobs and Skills Australia GenAI capacity study (chart data workbook)
genai_file <- file.path(
  data_dir,
  "JSA Gen AI Capacity Study AP Publication Chart Data 20250930.xlsx"
)

# Employment projections, May 2025 to May 2035
employment_projection_file <- file.path(
  data_dir,
  "employment_projections_-_may_2025_to_may_2035.xlsx"
)

# 2025 Occupation Shortage List
shortage_file <- file.path(
  data_dir,
  "2025 Occupation Shortage List - 6 digit ANZSCO and OSCA.xlsx"
)

# TNV data, Feb 2026
tnv_file <- file.path(data_dir, "TNV Data Feb 2026.xlsx")

# VET graduate outcomes, 2020-21
vet_file <- file.path(data_dir, "vnda_2020-21_graduate_outcomes.xlsx")

# List the worksheets of every workbook so the sheet names used in the
# read_excel() calls further down can be copied exactly.

# GenAI capacity study
readxl::excel_sheets(path = genai_file)

##  [1] "Contents"           "Cover"              "APA Figure 1"      
##  [4] "APA Figure 2"       "APA Figure 3"       "APA Figure 4"      
##  [7] "APA Figure 5"       "APA Figure 6"       "APA Figure 7"      
## [10] "APA Figure 8"       "APA Figure 9"       "APA Figure 10"     
## [13] "APA Figure 11"      "APA Figure 12"      "APA Table 1"       
## [16] "APA Table 2"        "APA Table 3"        "APA Table 4"       
## [19] "APA Box 1"          "APB Figure 1"       "APB Figure 2"      
## [22] "APB Figure 3"       "APB Table 1"        "APB Table 2"       
## [25] "APB Table 3"        "APB Box 1"          "APC Figure 1"      
## [28] "APC Figure 2"       "APC Figure 3"       "APC Table 1"       
## [31] "APC Table 2"        "APC Table 3"        "APC Box 1"         
## [34] "APC Box 2"          "APD Figure 1"       "APD Figure 2"      
## [37] "APD Figure 3"       "APD Figure 4"       "APD Figure 5"      
## [40] "APD Table 1"        "APD Table 2"        "APD Table 3"       
## [43] "APD Table 4"        "APD Table 5"        "APD Table 6"       
## [46] "APD Table 7"        "APD Box 1"          "APD Box 2"         
## [49] "APD Box 3"          "APE Figure 1"       "APE Figure 2"      
## [52] "APE Figure 3"       "APE Figure 4"       "APE Figure 5"      
## [55] "APE Figure 6"       "APE Figure 7"       "APE Figure 8"      
## [58] "APE Figure 9"       "APE Figure 10"      "APE Figure 11"     
## [61] "APE Figure 12"      "APE Figure 13"      "APE Figure 14"     
## [64] "APE Figure 15"      "APE Figure 16"      "APE Figure 17"     
## [67] "APE Figure 18"      "APE Figure 19"      "APE Figure 20"     
## [70] "APE Figure 21"      "APE Figure 22"      "APE Table 1"       
## [73] "APE Table 2"        "APE Box 1"          "APE Box 2"         
## [76] "APE Box 3"          "APE Box 4"          "APE Box 5"         
## [79] "APF Figure 1"       "APF Figure 2"       "APF Figure 3"      
## [82] "APF Figure 4"       "APF Figure 5"       "APF Figure 6"      
## [85] "APF Table 1"        "APF Table 2"        "APF Table 3"       
## [88] "APF Table 4"        "APF Table 5"        "APF Table 6"       
## [91] "APF Table 7"        "APF Box 1"          "APF Box 2"         
## [94] "APF Box 3"          "CASE STUDY Table 1"

# Employment projections
readxl::excel_sheets(path = employment_projection_file)

## [1] "Contents"                      "Data_Dictionary"              
## [3] "Table_1 Industry Division"     "Table_2 Major Occupation"     
## [5] "Table_3 Skill Level"           "Table_4 State & Territory"    
## [7] "Table_5 Industry Group"        "Table_6 Occupation Unit Group"

# Occupation shortage list
readxl::excel_sheets(path = shortage_file)

## [1] "Contents"               "2025 OSL (ANZSCO 2022)" "2025 OSL (OSCA 2024)"

# TNV data
readxl::excel_sheets(path = tnv_file)

## [1] "Contents"    "Table_1"     "Data_1"      "Table_2"     "Data_2"     
## [6] "Concordance"

# VET graduate outcomes
readxl::excel_sheets(path = vet_file)

## [1] "Contents"                       "Data_Dictionary"               
## [3] "1. National and cohort results" "2. State results"              
## [5] "3. AQF results"                 "4. AQF-FOE (National) results" 
## [7] "5. AQF-FOE (State) results"     "6. Qualification results"      
## [9] "7. Qualification by occupation"

# 1. GenAI data
# APA Figure 2 has occupation-level automation and augmentation scores
# Load only the required packages
library(readxl)
library(janitor)
library(dplyr)
library(ggplot2)
library(plotly)

# Read the "APA Figure 2" sheet, skipping the three rows above the header
# row, then convert the column headers to snake_case.
genai_occupation <- genai_file %>%
  read_excel(sheet = "APA Figure 2", skip = 3) %>%
  janitor::clean_names()

## New names:
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`

# Confirm the object was created
exists("genai_occupation")

## [1] TRUE

# Column names after cleaning (x6 to x9 are the unnamed trailing columns)
colnames(genai_occupation)

## [1] "automation_score"   "augmentation_score" "anzsco_code"       
## [4] "anzsco_unit_title"  "colour_theme"       "x6"                
## [7] "x7"                 "x8"                 "x9"

# First six rows; note automation_score is read as text
head(genai_occupation, n = 6)

## # A tibble: 6 × 9
##   automation_score augmentation_score anzsco_code anzsco_unit_title colour_theme
##   <chr>                         <dbl>       <dbl> <chr>             <chr>       
## 1 0.26                           0.67        1111 Chief Executives… #570408     
## 2 0.31                           0.66        1112 General Managers  #570408     
## 3 0.26                           0.63        1113 Legislators       #570408     
## 4 0.41                           0.66        1211 Aquaculture Farm… #570408     
## 5 0.34                           0.67        1212 Crop Farmers      #570408     
## 6 0.34                           0.65        1213 Livestock Farmers #570408     
## # ℹ 4 more variables: x6 <lgl>, x7 <lgl>, x8 <lgl>, x9 <chr>

# Number of rows and columns
dim(genai_occupation)

## [1] 359   9

# Build the occupation-level analysis table in one pass.
# Keeps the five named columns, converts both scores to numeric, stores the
# ANZSCO code as text (it is used as a join key and for substr() later) and
# adds genai_exposure = automation + augmentation.
# Rows whose scores cannot be parsed are dropped here, once, so every chart
# below works from the same 357 occupations.
genai_occupation_clean <- genai_occupation %>%
  select(
    automation_score,
    augmentation_score,
    anzsco_code,
    anzsco_unit_title,
    colour_theme
  ) %>%
  mutate(
    # automation_score is read as text; non-numeric cells become NA
    automation_score = as.numeric(automation_score),
    augmentation_score = as.numeric(augmentation_score),
    anzsco_code = as.character(anzsco_code),
    genai_exposure = automation_score + augmentation_score
  ) %>%
  # genai_exposure is NA whenever either score is NA, so one test covers all
  filter(!is.na(genai_exposure))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `automation_score = as.numeric(automation_score)`.
## Caused by warning:
## ! NAs introduced by coercion

# Check cleaned data
head(genai_occupation_clean)

## # A tibble: 6 × 6
##   automation_score augmentation_score anzsco_code anzsco_unit_title colour_theme
##              <dbl>              <dbl> <chr>       <chr>             <chr>       
## 1             0.26               0.67 1111        Chief Executives… #570408     
## 2             0.31               0.66 1112        General Managers  #570408     
## 3             0.26               0.63 1113        Legislators       #570408     
## 4             0.41               0.66 1211        Aquaculture Farm… #570408     
## 5             0.34               0.67 1212        Crop Farmers      #570408     
## 6             0.34               0.65 1213        Livestock Farmers #570408     
## # ℹ 1 more variable: genai_exposure <dbl>

summary(genai_occupation_clean)

##  automation_score augmentation_score anzsco_code        anzsco_unit_title 
##  Min.   :0.1000   Min.   :0.3100     Length:357         Length:357        
##  1st Qu.:0.2400   1st Qu.:0.5600     Class :character   Class :character  
##  Median :0.3300   Median :0.6500     Mode  :character   Mode  :character  
##  Mean   :0.3516   Mean   :0.6221                                          
##  3rd Qu.:0.4400   3rd Qu.:0.7000                                          
##  Max.   :0.8100   Max.   :0.7900                                          
##  colour_theme       genai_exposure  
##  Length:357         Min.   :0.4100  
##  Class :character   1st Qu.:0.8000  
##  Mode  :character   Median :0.9800  
##                     Mean   :0.9737  
##                     3rd Qu.:1.1400  
##                     Max.   :1.5700

dim(genai_occupation_clean)

## [1] 357   6

# Chart 1 data: the 15 occupations with the highest combined exposure
chart1_data <- genai_occupation_clean %>%
  arrange(desc(genai_exposure)) %>%
  slice_head(n = 15)

chart1_data

## # A tibble: 15 × 6
##    automation_score augmentation_score anzsco_code anzsco_unit_title            
##               <dbl>              <dbl> <chr>       <chr>                        
##  1             0.81               0.76 5321        Keyboard Operators           
##  2             0.81               0.76 6393        Telemarketers                
##  3             0.76               0.74 5613        Filing and Registry Clerks   
##  4             0.76               0.73 5994        Human Resource Clerks        
##  5             0.73               0.75 4516        Tourism and Travel Advisers  
##  6             0.75               0.73 5411        Call or Contact Centre Worke…
##  7             0.69               0.77 2222        Financial Dealers            
##  8             0.69               0.77 5512        Bookkeepers                  
##  9             0.71               0.74 5511        Accounting Clerks            
## 10             0.68               0.75 5999        Other Miscellaneous Clerical…
## 11             0.68               0.74 5523        Insurance, Money Market and …
## 12             0.63               0.77 2613        Software and Applications Pr…
## 13             0.71               0.69 5611        Betting Clerks               
## 14             0.71               0.68 5311        General Clerks               
## 15             0.68               0.71 5522        Credit and Loans Officers    
## # ℹ 2 more variables: colour_theme <chr>, genai_exposure <dbl>

# Chart 1: horizontal bars for the 15 most exposed occupations, ordered by
# combined exposure. The `text` aesthetic carries the hover label that
# ggplotly() is asked to show through tooltip = "text".
chart1 <- ggplot(
  data = chart1_data,
  mapping = aes(
    x = reorder(anzsco_unit_title, genai_exposure),
    y = genai_exposure,
    text = paste0(
      "Occupation: ", anzsco_unit_title,
      "<br>ANZSCO Code: ", anzsco_code,
      "<br>Automation Score: ", round(automation_score, 2),
      "<br>Augmentation Score: ", round(augmentation_score, 2),
      "<br>Total GenAI Exposure: ", round(genai_exposure, 2)
    )
  )
) +
  geom_col(fill = "#007377") +
  # Flip so the occupation titles read horizontally on the y axis
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 8),
    axis.text.y = element_text(size = 9),
    plot.margin = margin(10, 20, 10, 10)
  ) +
  labs(
    title = "Top occupations most exposed to GenAI",
    subtitle = "Top 15 occupations ranked by combined automation and augmentation exposure",
    caption = "Source: Jobs and Skills Australia, Generative AI Capacity Study",
    x = NULL,
    y = "Combined GenAI exposure score"
  )

# Interactive version; only the custom hover text is shown
plotly::ggplotly(p = chart1, tooltip = "text")

# Chart 2 data
# One row per occupation with both scores and the combined exposure present.
# Defined once; the packages it needs are already attached at the top of the
# script, so no library() calls are repeated here.
chart2_data <- genai_occupation_clean %>%
  filter(
    !is.na(automation_score),
    !is.na(augmentation_score),
    !is.na(genai_exposure)
  )

# Rows and columns available for the scatter plot
dim(chart2_data)

## [1] 357   6

head(chart2_data)

## # A tibble: 6 × 6
##   automation_score augmentation_score anzsco_code anzsco_unit_title colour_theme
##              <dbl>              <dbl> <chr>       <chr>             <chr>       
## 1             0.26               0.67 1111        Chief Executives… #570408     
## 2             0.31               0.66 1112        General Managers  #570408     
## 3             0.26               0.63 1113        Legislators       #570408     
## 4             0.41               0.66 1211        Aquaculture Farm… #570408     
## 5             0.34               0.67 1212        Crop Farmers      #570408     
## 6             0.34               0.65 1213        Livestock Farmers #570408     
## # ℹ 1 more variable: genai_exposure <dbl>

# Chart 2: automation score against augmentation score, one point per
# occupation. The dashed reference lines sit at the median of each score.
chart2 <- ggplot(
  data = chart2_data,
  mapping = aes(
    x = automation_score,
    y = augmentation_score,
    text = paste0(
      "Occupation: ", anzsco_unit_title,
      "<br>ANZSCO Code: ", anzsco_code,
      "<br>Automation Score: ", round(automation_score, 2),
      "<br>Augmentation Score: ", round(augmentation_score, 2),
      "<br>Total GenAI Exposure: ", round(genai_exposure, 2)
    )
  )
) +
  geom_point(colour = "#007377", size = 2.1, alpha = 0.55) +
  geom_vline(
    xintercept = median(chart2_data[["automation_score"]], na.rm = TRUE),
    colour = "grey50",
    linetype = "dashed"
  ) +
  geom_hline(
    yintercept = median(chart2_data[["augmentation_score"]], na.rm = TRUE),
    colour = "grey50",
    linetype = "dashed"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 8),
    plot.margin = margin(10, 20, 10, 10)
  ) +
  labs(
    title = "Exposure does not mean replacement",
    subtitle = "Each dot represents one occupation; scores compare automation pressure with augmentation potential",
    caption = "Source: Jobs and Skills Australia, Generative AI Capacity Study",
    x = "Automation exposure score",
    y = "Augmentation exposure score"
  )

# Interactive version; only the custom hover text is shown
plotly::ggplotly(p = chart2, tooltip = "text")

# Load required packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)

# Map the first digit of the ANZSCO code to its major occupation group.
# Codes whose first digit is not listed (or is missing) fall back to "Other".
major_group_titles <- c(
  "1" = "Managers",
  "2" = "Professionals",
  "3" = "Technicians and Trades Workers",
  "4" = "Community and Personal Service Workers",
  "5" = "Clerical and Administrative Workers",
  "6" = "Sales Workers",
  "7" = "Machinery Operators and Drivers",
  "8" = "Labourers"
)

# Average the three scores within each major group and count the
# occupations behind each average; most exposed group first.
chart3_data <- genai_occupation_clean %>%
  mutate(
    anzsco_major_group_code = substr(anzsco_code, 1, 1),
    occupation_group = coalesce(
      unname(major_group_titles[anzsco_major_group_code]),
      "Other"
    )
  ) %>%
  group_by(occupation_group) %>%
  summarise(
    avg_automation = mean(automation_score, na.rm = TRUE),
    avg_augmentation = mean(augmentation_score, na.rm = TRUE),
    avg_genai_exposure = mean(genai_exposure, na.rm = TRUE),
    number_of_occupations = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_genai_exposure))

# Check Chart 3 data
chart3_data

## # A tibble: 8 × 5
##   occupation_group            avg_automation avg_augmentation avg_genai_exposure
##   <chr>                                <dbl>            <dbl>              <dbl>
## 1 Clerical and Administrativ…          0.600            0.692              1.29 
## 2 Sales Workers                        0.475            0.651              1.13 
## 3 Managers                             0.395            0.691              1.09 
## 4 Professionals                        0.391            0.693              1.08 
## 5 Community and Personal Ser…          0.296            0.571              0.867
## 6 Technicians and Trades Wor…          0.269            0.595              0.863
## 7 Machinery Operators and Dr…          0.275            0.548              0.823
## 8 Labourers                            0.196            0.456              0.652
## # ℹ 1 more variable: number_of_occupations <int>

# Reshape to one row per (occupation group, exposure type) for a dodged bar
# chart. The two averages are renamed to their display labels first, so the
# labels arrive directly in `exposure_type` when the columns are stacked.
chart3_long <- chart3_data %>%
  select(
    occupation_group,
    `Automation exposure` = avg_automation,
    `Augmentation exposure` = avg_augmentation
  ) %>%
  pivot_longer(
    cols = -occupation_group,
    names_to = "exposure_type",
    values_to = "score"
  )

chart3_long

## # A tibble: 16 × 3
##    occupation_group                       exposure_type         score
##    <chr>                                  <chr>                 <dbl>
##  1 Clerical and Administrative Workers    Automation exposure   0.600
##  2 Clerical and Administrative Workers    Augmentation exposure 0.692
##  3 Sales Workers                          Automation exposure   0.475
##  4 Sales Workers                          Augmentation exposure 0.651
##  5 Managers                               Automation exposure   0.395
##  6 Managers                               Augmentation exposure 0.691
##  7 Professionals                          Automation exposure   0.391
##  8 Professionals                          Augmentation exposure 0.693
##  9 Community and Personal Service Workers Automation exposure   0.296
## 10 Community and Personal Service Workers Augmentation exposure 0.571
## 11 Technicians and Trades Workers         Automation exposure   0.269
## 12 Technicians and Trades Workers         Augmentation exposure 0.595
## 13 Machinery Operators and Drivers        Automation exposure   0.275
## 14 Machinery Operators and Drivers        Augmentation exposure 0.548
## 15 Labourers                              Automation exposure   0.196
## 16 Labourers                              Augmentation exposure 0.456

# Chart 3: average automation and augmentation exposure per major ANZSCO
# group, as side-by-side horizontal bars.
# The chart is built and rendered once. reorder() sorts the groups by the
# mean of their two scores, and the `text` aesthetic carries the hover label
# that ggplotly() is asked to show through tooltip = "text".
chart3 <- ggplot(
  chart3_long,
  aes(
    x = reorder(occupation_group, score),
    y = score,
    fill = exposure_type,
    text = paste0(
      "Occupation group: ", occupation_group,
      "<br>Exposure type: ", exposure_type,
      "<br>Average score: ", round(score, 2)
    )
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "AI exposure by occupation group",
    subtitle = "Average automation and augmentation scores across major ANZSCO groups",
    x = NULL,
    y = "Average exposure score",
    fill = "Exposure type",
    caption = "Source: Jobs and Skills Australia, Generative AI Capacity Study"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 8),
    legend.position = "bottom",
    plot.margin = margin(10, 20, 10, 10)
  )

ggplotly(chart3, tooltip = "text")

# 2. Employment projections
# The sheet's header appears to span two rows: read with skip = 7 it yields
# duplicated "Projected" headers and a first row holding "May 202…" labels.
# The sheet is therefore read once without column names and explicit names
# are assigned below. With skip = 8 the first row read is that second header
# row, which is why the numeric columns arrive as text; they are converted
# (and that row dropped) in the cleaning step.
employment_projection_raw <- read_excel(
  path = employment_projection_file,
  sheet = "Table_6 Occupation Unit Group",
  skip = 8,
  col_names = FALSE
)

## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`

# Assign one name per column, in sheet order.
# NOTE(review): the 2030/2035 labels and the level/percent split of the two
# change columns are assumed from the sheet layout; confirm against the
# workbook's Data_Dictionary sheet.
employment_projection <- employment_projection_raw %>%
  setNames(c(
    "occupation_level",
    "nfd_level",
    "anzsco_code",
    "occupation_title",
    "skill_level",
    "base_employment_2025",
    "projected_employment_2030",
    "projected_employment_2035",
    "five_year_change_level",
    "five_year_change_percent",
    "ten_year_change_level",
    "ten_year_change_percent"
  ))

head(employment_projection)

## # A tibble: 6 × 12
##   occupation_level nfd_level anzsco_code occupation_title            skill_level
##   <chr>            <chr>           <dbl> <chr>                       <chr>      
## 1 <NA>             <NA>               NA <NA>                        <NA>       
## 2 1                N                   1 Managers                    -          
## 3 2                Y                  10 Managers nfd                -          
## 4 3                Y                 100 Managers nfd                -          
## 5 4                Y                1000 Managers nfd                1          
## 6 2                N                  11 Chief Executives, General … -          
## # ℹ 7 more variables: base_employment_2025 <chr>,
## #   projected_employment_2030 <chr>, projected_employment_2035 <chr>,
## #   five_year_change_level <chr>, five_year_change_percent <chr>,
## #   ten_year_change_level <chr>, ten_year_change_percent <chr>

names(employment_projection)

##  [1] "occupation_level"          "nfd_level"                
##  [3] "anzsco_code"               "occupation_title"         
##  [5] "skill_level"               "base_employment_2025"     
##  [7] "projected_employment_2030" "projected_employment_2035"
##  [9] "five_year_change_level"    "five_year_change_percent" 
## [11] "ten_year_change_level"     "ten_year_change_percent"

dim(employment_projection)

## [1] 670  12

# Clean the projections table.
# The ANZSCO code becomes text so it can be joined to the GenAI table, and
# the seven employment/change columns (read as text) become numeric; cells
# that are not numbers turn into NA, which is what the coercion warnings
# below report.
# Only rows with a 4-character code and a non-missing ten-year change are
# kept. A missing code stays NA through as.character(), so the explicit
# is.na() test is a readability guard.
# NOTE(review): 4-character "nfd" aggregates such as 1000 and 1110 pass this
# filter too; add nfd_level == "N" if they should be excluded.
employment_projection_clean <- employment_projection %>%
  mutate(
    anzsco_code = as.character(anzsco_code),
    base_employment_2025 = as.numeric(base_employment_2025),
    projected_employment_2030 = as.numeric(projected_employment_2030),
    projected_employment_2035 = as.numeric(projected_employment_2035),
    five_year_change_level = as.numeric(five_year_change_level),
    five_year_change_percent = as.numeric(five_year_change_percent),
    ten_year_change_level = as.numeric(ten_year_change_level),
    ten_year_change_percent = as.numeric(ten_year_change_percent)
  ) %>%
  filter(
    !is.na(anzsco_code),
    nchar(anzsco_code) == 4,
    !is.na(ten_year_change_percent)
  )

## Warning: There were 7 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `base_employment_2025 = as.numeric(base_employment_2025)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.

# Inspect the cleaned table once: column names, first rows, dimensions
names(employment_projection_clean)

##  [1] "occupation_level"          "nfd_level"                
##  [3] "anzsco_code"               "occupation_title"         
##  [5] "skill_level"               "base_employment_2025"     
##  [7] "projected_employment_2030" "projected_employment_2035"
##  [9] "five_year_change_level"    "five_year_change_percent" 
## [11] "ten_year_change_level"     "ten_year_change_percent"

head(employment_projection_clean)

## # A tibble: 6 × 12
##   occupation_level nfd_level anzsco_code occupation_title            skill_level
##   <chr>            <chr>     <chr>       <chr>                       <chr>      
## 1 4                Y         1000        Managers nfd                1          
## 2 4                Y         1110        Chief Executives, General … 1          
## 3 4                N         1111        Chief Executives and Manag… 1          
## 4 4                N         1112        General Managers            1          
## 5 4                N         1113        Legislators                 1          
## 6 4                Y         1210        Farmers and Farm Managers … 1          
## # ℹ 7 more variables: base_employment_2025 <dbl>,
## #   projected_employment_2030 <dbl>, projected_employment_2035 <dbl>,
## #   five_year_change_level <dbl>, five_year_change_percent <dbl>,
## #   ten_year_change_level <dbl>, ten_year_change_percent <dbl>

dim(employment_projection_clean)

## [1] 474  12

# Chart 4 data: occupations present in both tables, matched on the ANZSCO
# code, keeping only rows that have an exposure score and a ten-year change.
chart4_data <- inner_join(
  x = genai_occupation_clean,
  y = employment_projection_clean,
  by = "anzsco_code"
) %>%
  filter(!is.na(genai_exposure) & !is.na(ten_year_change_percent))

# First rows of the joined table
head(chart4_data)

## # A tibble: 6 × 17
##   automation_score augmentation_score anzsco_code anzsco_unit_title colour_theme
##              <dbl>              <dbl> <chr>       <chr>             <chr>       
## 1             0.26               0.67 1111        Chief Executives… #570408     
## 2             0.31               0.66 1112        General Managers  #570408     
## 3             0.26               0.63 1113        Legislators       #570408     
## 4             0.41               0.66 1211        Aquaculture Farm… #570408     
## 5             0.34               0.67 1212        Crop Farmers      #570408     
## 6             0.34               0.65 1213        Livestock Farmers #570408     
## # ℹ 12 more variables: genai_exposure <dbl>, occupation_level <chr>,
## #   nfd_level <chr>, occupation_title <chr>, skill_level <chr>,
## #   base_employment_2025 <dbl>, projected_employment_2030 <dbl>,
## #   projected_employment_2035 <dbl>, five_year_change_level <dbl>,
## #   five_year_change_percent <dbl>, ten_year_change_level <dbl>,
## #   ten_year_change_percent <dbl>

# Rows and columns after the join
dim(chart4_data)

## [1] 357  17

# Bin every occupation on two axes, then summarise each combination of bins.
# Exposure: below the 33rd percentile = low, below the 66th = medium,
# otherwise high. Growth: zero or negative = declining or flat, at or below
# the median change = moderate, otherwise strong.
chart4_heatmap_data <- chart4_data %>%
  mutate(
    # Cut points are computed once over all joined occupations
    exposure_cut_low = unname(quantile(genai_exposure, 0.33, na.rm = TRUE)),
    exposure_cut_high = unname(quantile(genai_exposure, 0.66, na.rm = TRUE)),
    growth_median = median(ten_year_change_percent, na.rm = TRUE),
    exposure_group = case_when(
      genai_exposure < exposure_cut_low ~ "Low AI exposure",
      genai_exposure < exposure_cut_high ~ "Medium AI exposure",
      TRUE ~ "High AI exposure"
    ),
    growth_group = case_when(
      ten_year_change_percent <= 0 ~ "Declining or flat",
      ten_year_change_percent <= growth_median ~ "Moderate growth",
      TRUE ~ "Strong growth"
    )
  ) %>%
  group_by(exposure_group, growth_group) %>%
  summarise(
    occupation_count = n(),
    avg_growth = mean(ten_year_change_percent, na.rm = TRUE),
    avg_exposure = mean(genai_exposure, na.rm = TRUE),
    .groups = "drop"
  )

chart4_heatmap_data

## # A tibble: 9 × 5
##   exposure_group     growth_group      occupation_count avg_growth avg_exposure
##   <chr>              <chr>                        <int>      <dbl>        <dbl>
## 1 High AI exposure   Declining or flat                6    -0.0322        1.42 
## 2 High AI exposure   Moderate growth                 40     0.0715        1.25 
## 3 High AI exposure   Strong growth                   81     0.192         1.20 
## 4 Low AI exposure    Declining or flat                4    -0.0223        0.57 
## 5 Low AI exposure    Moderate growth                 89     0.0545        0.692
## 6 Low AI exposure    Strong growth                   24     0.159         0.752
## 7 Medium AI exposure Declining or flat                3    -0.0131        0.973
## 8 Medium AI exposure Moderate growth                 37     0.0708        0.966
## 9 Medium AI exposure Strong growth                   73     0.210         0.975

# Turn both grouping columns into factors with explicit levels so the
# heatmap axes run low -> high rather than alphabetically.
chart4_heatmap_data$exposure_group <- factor(
  chart4_heatmap_data$exposure_group,
  levels = c("Low AI exposure", "Medium AI exposure", "High AI exposure")
)
chart4_heatmap_data$growth_group <- factor(
  chart4_heatmap_data$growth_group,
  levels = c("Declining or flat", "Moderate growth", "Strong growth")
)

# Chart 4: heatmap of occupation counts by exposure group (x) and projected
# growth group (y). Tile colour and the printed label both show the count;
# the `text` aesthetic carries the hover label that ggplotly() is asked to
# show through tooltip = "text".
chart4_heatmap <- ggplot(
  chart4_heatmap_data,
  aes(
    x = exposure_group,
    y = growth_group,
    fill = occupation_count,
    text = paste0(
      "AI exposure group: ", exposure_group,
      "<br>Growth group: ", growth_group,
      "<br>Number of occupations: ", occupation_count,
      "<br>Average GenAI exposure: ", round(avg_exposure, 2),
      # avg_growth is a proportion (e.g. 0.192), so scale it to a percentage
      # before appending "%"; otherwise 19.2% growth is shown as "0.19%"
      "<br>Average projected growth: ", round(avg_growth * 100, 1), "%"
    )
  )
) +
  geom_tile(colour = "white", linewidth = 1) +
  geom_text(aes(label = occupation_count), size = 5, fontface = "bold") +
  labs(
    title = "AI-exposed jobs are not all disappearing",
    subtitle = "Number of occupations by GenAI exposure and projected employment growth to 2035",
    x = "GenAI exposure level",
    y = "Projected employment growth level",
    fill = "Occupation count",
    caption = "Source: Jobs and Skills Australia, Generative AI Capacity Study and Employment Projections"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(size = 8),
    legend.position = "right",
    plot.margin = margin(10, 20, 10, 10)
  )

ggplotly(chart4_heatmap, tooltip = "text")

# 3. Occupation shortage list
# First read of the OSL sheet, skipping the top three rows. For this read
# readxl reports blank column names and auto-names them `...2` to `...13`.
shortage_list <- clean_names(
  read_excel(
    path = shortage_file,
    sheet = "2025 OSL (ANZSCO 2022)",
    skip = 3
  )
)

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`

# Read the OSL sheet skipping seven rows, so the first row read supplies the
# column names, then standardise those names with clean_names().
shortage_raw <- clean_names(
  read_excel(shortage_file, sheet = "2025 OSL (ANZSCO 2022)", skip = 7)
)

# List the cleaned column names.
colnames(shortage_raw)

##  [1] "occupation_code_anzsco_2022"                                   
##  [2] "occupation_title"                                              
##  [3] "ns_no_shortage_s_shortage_r_regional_shortage_m_metro_shortage"
##  [4] "nsw"                                                           
##  [5] "vic"                                                           
##  [6] "qld"                                                           
##  [7] "sa"                                                            
##  [8] "wa"                                                            
##  [9] "tas"                                                           
## [10] "nt"                                                            
## [11] "act"                                                           
## [12] "skill_level"                                                   
## [13] "major_occupation_group"

# Preview the first rows of the raw shortage sheet.
head(shortage_raw)

## # A tibble: 6 × 13
##   occupation_code_anzsco_2…¹ occupation_title ns_no_shortage_s_sho…² nsw   vic  
##                        <dbl> <chr>            <chr>                  <chr> <chr>
## 1                     111111 Chief Executive… NS                     NS    NS   
## 2                     111211 Corporate Gener… NS                     NS    NS   
## 3                     121111 Aquaculture Far… NS                     NS    NS   
## 4                     121311 Apiarist         NS                     NS    NS   
## 5                     121312 Beef Cattle Far… NS                     NS    NS   
## 6                     121313 Dairy Cattle Fa… NS                     NS    NS   
## # ℹ abbreviated names: ¹occupation_code_anzsco_2022,
## #   ²ns_no_shortage_s_shortage_r_regional_shortage_m_metro_shortage
## # ℹ 8 more variables: qld <chr>, sa <chr>, wa <chr>, tas <chr>, nt <chr>,
## #   act <chr>, skill_level <dbl>, major_occupation_group <dbl>

# Number of rows and columns read from the sheet.
dim(shortage_raw)

## [1] 916  13

# Tidy the raw shortage list:
#   * give the rating column a short name,
#   * store the 6-digit code as text and take its first four characters as
#     `anzsco_code` (the key used for later grouping and joining),
#   * translate the rating letter into a readable status ("Other" for any
#     value that is not S, R, M or NS, including missing ratings),
#   * flag any kind of shortage (S, R or M) as 1 and everything else as 0,
#   * keep only rows whose derived code is present and four characters long.
shortage_clean <- shortage_raw %>%
  rename(
    shortage_rating =
      ns_no_shortage_s_shortage_r_regional_shortage_m_metro_shortage
  ) %>%
  mutate(
    anzsco_6_digit = as.character(occupation_code_anzsco_2022),
    anzsco_code = substring(anzsco_6_digit, first = 1, last = 4),
    shortage_status = case_when(
      shortage_rating == "NS" ~ "No shortage",
      shortage_rating == "M" ~ "Metro shortage",
      shortage_rating == "R" ~ "Regional shortage",
      shortage_rating == "S" ~ "Shortage",
      TRUE ~ "Other"
    ),
    is_shortage = as.numeric(shortage_rating %in% c("S", "R", "M"))
  ) %>%
  filter(!is.na(anzsco_code) & nchar(anzsco_code) == 4)

# Preview the cleaned table.
head(shortage_clean)

## # A tibble: 6 × 17
##   occupation_code_anzsco_2022 occupation_title shortage_rating nsw   vic   qld  
##                         <dbl> <chr>            <chr>           <chr> <chr> <chr>
## 1                      111111 Chief Executive… NS              NS    NS    NS   
## 2                      111211 Corporate Gener… NS              NS    NS    NS   
## 3                      121111 Aquaculture Far… NS              NS    NS    NS   
## 4                      121311 Apiarist         NS              NS    NS    NS   
## 5                      121312 Beef Cattle Far… NS              NS    NS    NS   
## 6                      121313 Dairy Cattle Fa… NS              NS    NS    NS   
## # ℹ 11 more variables: sa <chr>, wa <chr>, tas <chr>, nt <chr>, act <chr>,
## #   skill_level <dbl>, major_occupation_group <dbl>, anzsco_6_digit <chr>,
## #   anzsco_code <chr>, shortage_status <chr>, is_shortage <dbl>

# Row and column count after cleaning; comparing the row count with
# shortage_raw shows whether the filter dropped any rows.
dim(shortage_clean)

## [1] 916  17

# Collapse the 6-digit occupations to one row per 4-digit code, recording how
# many are flagged as in shortage, how many there are in total, and the
# resulting share in shortage.
shortage_summary <- summarise(
  group_by(shortage_clean, anzsco_code),
  shortage_count = sum(is_shortage, na.rm = TRUE),
  total_6_digit_occupations = n(),
  shortage_share = shortage_count / total_6_digit_occupations,
  .groups = "drop"
)

# Preview the per-code summary.
head(shortage_summary)

## # A tibble: 6 × 4
##   anzsco_code shortage_count total_6_digit_occupations shortage_share
##   <chr>                <dbl>                     <int>          <dbl>
## 1 1111                     0                         1              0
## 2 1112                     0                         1              0
## 3 1211                     0                         1              0
## 4 1213                     0                        11              0
## 5 1215                     0                         4              0
## 6 1216                     0                         8              0

# One row per 4-digit code, so the row count is the number of distinct codes.
dim(shortage_summary)

## [1] 311   4

# Chart 5 input: keep occupations that appear in both the GenAI table and the
# shortage summary (inner join on the 4-digit code), then bucket each row.
#
# exposure_group: cut points are the 33rd and 66th percentiles of
#   genai_exposure, computed on the joined rows. A row with a missing
#   genai_exposure matches neither comparison and therefore lands in
#   "High AI exposure".
# shortage_group: based on the share of 6-digit occupations in shortage;
#   exactly 0 is "No shortage", below one half is "Partial shortage", one half
#   or more is "Strong shortage", anything else (e.g. missing) is "Unknown".
chart5_data <- inner_join(
  genai_occupation_clean,
  shortage_summary,
  by = "anzsco_code"
) %>%
  mutate(
    exposure_group = case_when(
      genai_exposure < quantile(genai_exposure, probs = 0.33, na.rm = TRUE) ~
        "Low AI exposure",
      genai_exposure < quantile(genai_exposure, probs = 0.66, na.rm = TRUE) ~
        "Medium AI exposure",
      TRUE ~ "High AI exposure"
    ),
    shortage_group = case_when(
      shortage_share >= 0.5 ~ "Strong shortage",
      shortage_share > 0 ~ "Partial shortage",
      shortage_share == 0 ~ "No shortage",
      TRUE ~ "Unknown"
    )
  )

# Preview the joined and bucketed table.
head(chart5_data)

## # A tibble: 6 × 11
##   automation_score augmentation_score anzsco_code anzsco_unit_title colour_theme
##              <dbl>              <dbl> <chr>       <chr>             <chr>       
## 1             0.26               0.67 1111        Chief Executives… #570408     
## 2             0.31               0.66 1112        General Managers  #570408     
## 3             0.41               0.66 1211        Aquaculture Farm… #570408     
## 4             0.34               0.65 1213        Livestock Farmers #570408     
## 5             0.38               0.72 1311        Advertising, Pub… #570408     
## 6             0.46               0.71 1321        Corporate Servic… #570408     
## # ℹ 6 more variables: genai_exposure <dbl>, shortage_count <dbl>,
## #   total_6_digit_occupations <int>, shortage_share <dbl>,
## #   exposure_group <chr>, shortage_group <chr>

# Rows remaining after the inner join with shortage_summary.
dim(chart5_data)

## [1] 297  11

# Summarise chart5_data to one row per exposure group x shortage group cell:
# the number of occupations plus the cell's average exposure and average
# shortage share.
chart5_heatmap_data <- summarise(
  group_by(chart5_data, exposure_group, shortage_group),
  occupation_count = n(),
  avg_exposure = mean(genai_exposure, na.rm = TRUE),
  avg_shortage_share = mean(shortage_share, na.rm = TRUE),
  .groups = "drop"
)

# Convert both grouping columns to factors with an explicit low-to-high level
# order for the heatmap axes. Any value not listed in `levels` becomes NA.
chart5_heatmap_data$exposure_group <- factor(
  chart5_heatmap_data$exposure_group,
  levels = c("Low AI exposure", "Medium AI exposure", "High AI exposure")
)
chart5_heatmap_data$shortage_group <- factor(
  chart5_heatmap_data$shortage_group,
  levels = c("No shortage", "Partial shortage", "Strong shortage")
)

# Print the summarised table.
chart5_heatmap_data

## # A tibble: 9 × 5
##   exposure_group shortage_group occupation_count avg_exposure avg_shortage_share
##   <fct>          <fct>                     <int>        <dbl>              <dbl>
## 1 High AI expos… No shortage                  89        1.23               0    
## 2 High AI expos… Partial short…                7        1.30               0.312
## 3 High AI expos… Strong shorta…               12        1.18               0.706
## 4 Low AI exposu… No shortage                  32        0.746              0    
## 5 Low AI exposu… Partial short…                9        0.774              0.268
## 6 Low AI exposu… Strong shorta…               56        0.755              0.903
## 7 Medium AI exp… No shortage                  48        1.01               0    
## 8 Medium AI exp… Partial short…                9        1.01               0.164
## 9 Medium AI exp… Strong shorta…               35        0.989              0.892

# Chart 5: heatmap of occupation counts by GenAI exposure group (x axis) and
# shortage group (y axis). The tile fill and the printed label both show the
# number of occupations in the cell. The `text` aesthetic holds the hover
# string that ggplotly() is told to display below.
chart5 <- ggplot(chart5_heatmap_data) +
  aes(
    x = exposure_group,
    y = shortage_group,
    fill = occupation_count,
    text = paste0(
      "AI exposure group: ", exposure_group,
      "<br>Shortage group: ", shortage_group,
      "<br>Number of occupations: ", occupation_count,
      "<br>Average GenAI exposure: ", round(avg_exposure, 2),
      # avg_shortage_share is a proportion, hence the scaling by 100.
      "<br>Average shortage share: ", round(avg_shortage_share * 100, 1), "%"
    )
  ) +
  # Tiles first, then the count printed on top of each tile.
  geom_tile(color = "white", linewidth = 1) +
  geom_text(
    mapping = aes(label = occupation_count),
    fontface = "bold",
    size = 5
  ) +
  # theme_minimal() has to come before theme() so the tweaks are not reset.
  theme_minimal(base_size = 12) +
  theme(
    plot.margin = margin(t = 10, r = 20, b = 10, l = 10),
    legend.position = "right",
    plot.caption = element_text(size = 8),
    plot.subtitle = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  # Light orange for low counts through to red for high counts.
  scale_fill_gradient(high = "#D7301F", low = "#FDB863") +
  labs(
    x = "GenAI exposure level",
    y = "Shortage level",
    fill = "Occupation count",
    title = "Some AI-exposed jobs are still in shortage",
    subtitle = "Occupation groups by GenAI exposure and 2025 national shortage status",
    caption = "Source: Jobs and Skills Australia, Generative AI Capacity Study and 2025 Occupation Shortage List"
  )

# Render the interactive version; only the custom `text` string is shown on
# hover.
ggplotly(p = chart5, tooltip = "text")