# install.packages(c("dplyr", "lubridate", "stringr", "tibble", "ggplot2"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(tibble)
library(ggplot2)
1. Data quality and investigation
setwd("C:/Users/zhu12/OneDrive/Career/CC Interview")
setwd("C:/Users/zhu12/OneDrive/Career/CC Interview/Task/Task")
d_main<-read.csv("cancer_incidence_patient_record_level.csv")
d_lga<-read.csv("map_lga_code_to_name.csv")
d_lhd<-read.csv("map_lhd_code_to_name.csv")
glimpse(d_main)
## Rows: 50,000
## Columns: 10
## $ FirstName <chr> "Nestor", "Rosalee", "Ezzard", "Dr", "Orlo", "…
## $ LastName <chr> "Daugherty", "Beatty", "Donnelly", "IV", "Torp…
## $ Sex <chr> "F", "F", "F", "F", "F", "M", "F", "male", "F"…
## $ DateOfBirth <chr> "29/12/1970", "22/12/1961", "17/10/1968", "10/…
## $ DiagnosisDate <chr> "2/07/2025", "22/09/2023", "14/09/2021", "17/0…
## $ CancerTypeName <chr> "Breast", "Breast", "Breast", "Breast", "Cervi…
## $ ExtentGroupName <chr> "Regional", "Localised", "Localised", "Localis…
## $ RegionBirthCode <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 11, -1…
## $ LocalHealthDistictCode <chr> "X730", "X750", "X760", "X730", "X750", "X730"…
## $ LocalGovernmentAreaCode <int> 10180, 10500, 10600, 10650, 10300, 10180, 1047…
d_main1<-merge(d_main, d_lga, by="LocalGovernmentAreaCode")
d_main2<-merge(d_main1, d_lhd, by="LocalHealthDistictCode")
d_main2$last_four <- as.integer(substr(d_main2$DiagnosisDate, nchar(d_main2$DiagnosisDate) - 3,nchar(d_main2$DiagnosisDate)))
## Warning: NAs introduced by coercion
d_main2 %>% group_by(LocalHealthDistrictName) %>% summarize(met_criteria = sum(last_four > 2021 & Sex == "female", na.rm = TRUE))
## # A tibble: 9 × 2
## LocalHealthDistrictName met_criteria
## <chr> <int>
## 1 Central Coast LHD 378
## 2 Central Coast Region LHD 378
## 3 Far West LHD 398
## 4 Hunter New England LHD 387
## 5 Illawarra Shoalhaven LHD 406
## 6 Mid North Coast LHD 414
## 7 Murrumbidgee LHD 421
## 8 Nepean Blue Mountains LHD 354
## 9 Northern NSW LHD 119
d_main2 %>% group_by(last_four) %>% summarize(met_criteria = sum(last_four > 2021 & Sex == "female", na.rm = TRUE))
## # A tibble: 7 × 2
## last_four met_criteria
## <int> <int>
## 1 2021 0
## 2 2022 753
## 3 2023 751
## 4 2024 737
## 5 2025 737
## 6 2026 277
## 7 NA 0
data quality issues include missing values,
2. Visualisation
ExtentOfDiseaseColoursVector0 <- c("Localised" = "#8055f1",
"Regional" = "#002664",
"Distant" = "#2e808e",
"Unknown" = "#441170")
chart1<-d_main2 %>% group_by(LocalHealthDistrictName) %>% summarize(met_criteria = sum(last_four > 2021 & Sex == "female", na.rm = TRUE))
ggplot(chart1, aes(x = LocalHealthDistrictName, y = met_criteria))

3. Interpretation of results
Northern NSW has the lowest occurance of female breast cancer while
other areas are high.
The number of incidents over the past few years are stable - no
major increase.