# install.packages(c("dplyr", "lubridate", "stringr", "tibble", "ggplot2"))
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(tibble)
library(ggplot2)

1. Data quality and investigation

# Load the patient-level incidence records and the two code-to-name lookup
# tables. Paths are built with file.path() from one data directory instead of
# calling setwd(), so the session's working directory is left untouched and
# only `data_dir` has to change when the files live somewhere else.
data_dir <- "C:/Users/zhu12/OneDrive/Career/CC Interview/Task/Task"

d_main <- read.csv(
  file.path(data_dir, "cancer_incidence_patient_record_level.csv")
)
d_lga <- read.csv(file.path(data_dir, "map_lga_code_to_name.csv"))
d_lhd <- read.csv(file.path(data_dir, "map_lhd_code_to_name.csv"))

# Structural overview of the main table: column names, types, first values.
glimpse(d_main)
## Rows: 50,000
## Columns: 10
## $ FirstName               <chr> "Nestor", "Rosalee", "Ezzard", "Dr", "Orlo", "…
## $ LastName                <chr> "Daugherty", "Beatty", "Donnelly", "IV", "Torp…
## $ Sex                     <chr> "F", "F", "F", "F", "F", "M", "F", "male", "F"…
## $ DateOfBirth             <chr> "29/12/1970", "22/12/1961", "17/10/1968", "10/…
## $ DiagnosisDate           <chr> "2/07/2025", "22/09/2023", "14/09/2021", "17/0…
## $ CancerTypeName          <chr> "Breast", "Breast", "Breast", "Breast", "Cervi…
## $ ExtentGroupName         <chr> "Regional", "Localised", "Localised", "Localis…
## $ RegionBirthCode         <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 11, -1…
## $ LocalHealthDistictCode  <chr> "X730", "X750", "X760", "X730", "X750", "X730"…
## $ LocalGovernmentAreaCode <int> 10180, 10500, 10600, 10650, 10300, 10180, 1047…
# Attach LGA and LHD names to every patient record and derive the diagnosis
# year.
#
# * Each lookup is reduced to one row per code first (keeping the first name
#   listed): a code that appears twice in a lookup would otherwise duplicate
#   every matching patient row and inflate all downstream counts.
# * left_join() keeps patient records whose code is absent from a lookup (the
#   name is NA) instead of silently dropping them, as merge()'s default inner
#   join does.
# NOTE: the summaries that follow should be re-run after this change; any
# output echoed below predates it.
d_lga_unique <- distinct(d_lga, LocalGovernmentAreaCode, .keep_all = TRUE)
d_lhd_unique <- distinct(d_lhd, LocalHealthDistictCode, .keep_all = TRUE)

d_main1 <- left_join(d_main, d_lga_unique, by = "LocalGovernmentAreaCode")
d_main2 <- left_join(d_main1, d_lhd_unique, by = "LocalHealthDistictCode")

# Diagnosis year = the standalone four-digit number in DiagnosisDate. This
# covers "d/mm/yyyy" as well as year-first strings, and yields NA (without a
# coercion warning) when no four-digit year is present.
d_main2$last_four <- as.integer(
  str_extract(d_main2$DiagnosisDate, "(?<!\\d)\\d{4}(?!\\d)")
)
## Warning: NAs introduced by coercion
# Per Local Health District: number of records for female patients diagnosed
# after 2021.
# Sex is recorded inconsistently (e.g. "F" as well as spelled-out values), so
# it is normalised to its first letter, upper-cased, before comparing; a plain
# Sex == "female" test skips every row coded "F".
# NOTE(review): no CancerTypeName filter is applied, so all cancer types are
# counted - add one if the question is about a single cancer type.
# NOTE: re-run to refresh the counts echoed below; they predate this fix.
d_main2 %>%
  group_by(LocalHealthDistrictName) %>%
  summarise(
    met_criteria = sum(
      last_four > 2021 & toupper(substr(trimws(Sex), 1, 1)) == "F",
      na.rm = TRUE
    )
  )
## # A tibble: 9 × 2
##   LocalHealthDistrictName   met_criteria
##   <chr>                            <int>
## 1 Central Coast LHD                  378
## 2 Central Coast Region LHD           378
## 3 Far West LHD                       398
## 4 Hunter New England LHD             387
## 5 Illawarra Shoalhaven LHD           406
## 6 Mid North Coast LHD                414
## 7 Murrumbidgee LHD                   421
## 8 Nepean Blue Mountains LHD          354
## 9 Northern NSW LHD                   119
# Per diagnosis year: number of records for female patients diagnosed after
# 2021 (years up to 2021 and rows with no usable year therefore show 0).
# Sex is normalised to its upper-cased first letter so that "F" and spelled-out
# values are counted alike; Sex == "female" alone skips every row coded "F".
# NOTE(review): no CancerTypeName filter is applied, so all cancer types are
# counted - add one if the question is about a single cancer type.
# NOTE: re-run to refresh the counts echoed below; they predate this fix.
d_main2 %>%
  group_by(last_four) %>%
  summarise(
    met_criteria = sum(
      last_four > 2021 & toupper(substr(trimws(Sex), 1, 1)) == "F",
      na.rm = TRUE
    )
  )
## # A tibble: 7 × 2
##   last_four met_criteria
##       <int>        <int>
## 1      2021            0
## 2      2022          753
## 3      2023          751
## 4      2024          737
## 5      2025          737
## 6      2026          277
## 7        NA            0

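Data quality issues include missing values, inconsistent coding of Sex (for example "F" and "male" both appear), titles and suffixes stored in the name fields (for example "Dr" and "IV"), diagnosis dates from which a year cannot be extracted (they become NA), and two Local Health District names ("Central Coast LHD" and "Central Coast Region LHD") that return identical counts and so appear to share one code.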

2. Visualisation

# Named palette keyed by extent-of-disease group.
# NOTE(review): the chart below does not map any variable to these colours;
# presumably intended for a fill by ExtentGroupName - confirm.
ExtentOfDiseaseColoursVector0 <- c("Localised" = "#8055f1",
                                   "Regional" = "#002664",
                                   "Distant" = "#2e808e",
                                   "Unknown" = "#441170")

# Per Local Health District: number of records for female patients diagnosed
# after 2021. Sex is normalised to its upper-cased first letter so that "F" and
# spelled-out values are counted alike (Sex == "female" skips rows coded "F").
chart1 <- d_main2 %>%
  group_by(LocalHealthDistrictName) %>%
  summarise(
    met_criteria = sum(
      last_four > 2021 & toupper(substr(trimws(Sex), 1, 1)) == "F",
      na.rm = TRUE
    )
  )

# A ggplot() call with only aesthetics draws an empty panel; geom_col() adds
# the bars. The axes are flipped so the long district names stay readable.
ggplot(chart1, aes(x = LocalHealthDistrictName, y = met_criteria)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Local Health District",
    y = "Female patients diagnosed after 2021"
  )

3. Interpretation of results

Northern NSW has the lowest occurrence of female breast cancer, while the other areas have noticeably higher counts.

The number of incidents over the past few years is stable, with no major increase.