This is an R Markdown
Notebook. When you execute code within the notebook, the results appear
beneath the code.
Try executing this chunk by clicking the Run button within
the chunk or by placing your cursor inside it and pressing
Ctrl+Shift+Enter.
# /////////////------------ LIMS DATA FORMATTING ------------ /////////////////
# /////////////////
# specify function to standardise format of LIMS data for analysis
lims.stnd.format.fun <- function(x,
field_names = c("specimen number", ## specify all possible field names for each of the essential indicators
"crn (hospital) number","patient hospital number",
"nhs number","patient nhs number",
"forename","patient forename",
"surname","patient surname",
"date of birth","patient date of birth","patient dob",
"sex","patient sex",
"ethnic origin desc","ethnicity",
"post code","patient postcode",
"date of specimen","specimen date",
"specimen type desc","specimen type",
"organism desc","result text","organism comment","organism species name", ## field describing organism isolated, including GBS negative and no organism
"antibiotic desc","antimicrobial",
"result","susceptibility test result")){
names(x) <- tolower(gsub("\\.", "", names(x))) # ensure all field names lower case
x <- x %>% select(any_of(field_names)) # select essential indicators
names(x) <- dplyr::case_when((names(x) == "specimen number") ~ "specno", ## standardise field names for each of the essential indicators
(names(x) == "crn (hospital) number" | names(x) == "patient hospital number") ~ "hospno",
(names(x) == "nhs number" | names(x) == "patient nhs number") ~ "nhsno",
(names(x) == "forename" | names(x) == "patient forename") ~ "forename",
(names(x) == "surname" | names(x) == "patient surname") ~ "surname",
(names(x) == "date of birth" | names(x) == "patient date of birth" | names(x) == "patient dob") ~ "dob",
(names(x) == "sex" | names(x) == "patient sex") ~ "sex",
(names(x) == "ethnic origin desc" | names(x) == "ethnicity") ~ "ethnicity",
(names(x) == "post code" | names(x) == "patient postcode") ~ "patient_postcode",
(names(x) == "date of specimen" | names(x) == "specimen date") ~ "specdate",
(names(x) == "specimen type desc" | names(x) == "specimen type") ~ "spectype",
(names(x) == "organism desc" | names(x) == "result text" | names(x) == "organism comment" | names(x) == "organism species name") ~ "species",
(names(x) == "antibiotic desc" | names(x) == "antimicrobial") ~ "abx",
(names(x) == "result" | names(x) == "susceptibility test result") ~ "ast_result")
x <- x %>%
mutate(species=case_when(
grepl("Beta Haemolytic Streptococcus Group B",species) |
grepl("^Streptococcus agalactiae",species) |
species == "Streptococcus Group B" |
species == "S.agalactiae(Group B strep)" |
species == "Group B Haemolytic Strep" |
species == "Strep.agalactiae (Group B)" |
species == "Group B Streptococcus isolated" |
species == "Group B Streptococcus ISOLATED" ~ "Group B Streptococcus",
species == "Group B Strep NOT isolated" |
species== "Streptococcus group B NOT isolated" |
species=="Group B Streptococcus Not isolated" |
species== "No organism detected" |
species=="Negative after 1 day" |
species=="Organism Species Name" |
grepl("^Candida",species) ~ "GBS not isolated",
TRUE ~ species),
ethnicity=case_when(
grepl("xa0Ethnic", ethnicity) ~ "Any other ethnic group",
TRUE ~ ethnicity)
)
x <- x[, !duplicated(colnames(x))] ## remove duplicate cols
# add in antimicrobial cols if not already in data (for GBS neg)
if(!"abx" %in% colnames(x)){
x$abx <- NA
}
if(!"ast_result" %in% colnames(x)){
x$ast_result <- NA
}
x <- x %>% mutate(abx = case_when(
abx=="" ~ NA,
TRUE ~ abx),
ast_result = case_when(
ast_result=="" ~ NA,
TRUE ~ ast_result)) %>%
mutate(ast_result=toupper(ast_result),
module="LIMS")
## converting specimen date and dob to correct date format
x <- x %>%
mutate(
dob = case_when(
inherits(dob, "POSIXct") ~ as.Date(dob), # POSIXct to Date
inherits(dob, "IDate") ~ as.Date(as.character(dob)), # IDate to Date
TRUE ~ as.Date(dob, format = "%d/%m/%Y") # Character string "DD/MM/YYYY" to Date
)
) %>%
mutate(
specdate = case_when(
inherits(specdate, "POSIXct") ~ as.Date(specdate), # POSIXct to Date
inherits(specdate, "IDate") ~ as.Date(as.character(specdate)), # IDate to Date
TRUE ~ as.Date(specdate, format = "%d/%m/%Y") # Character string "DD/MM/YYYY" to Date
)
) ## converting specimen date to correct date format
return(x) # function output
}
# specify species column formatting function
species_format_fun <- function(x){
x <- x %>%
mutate(species=case_when(
grepl("Beta Haemolytic Streptococcus Group B",species) |
grepl("^Streptococcus agalactiae",species) |
species == "Streptococcus Group B" |
species == "S.agalactiae(Group B strep)" |
species == "Group B Haemolytic Strep" |
species == "Strep.agalactiae (Group B)" |
species == "Group B Streptococcus isolated" |
species == "Group B Streptococcus ISOLATED" |
species== "beta Haemolytic Strep, group B" ~ "Group B Streptococcus",
species == "Group B Strep NOT isolated" |
species== "Streptococcus group B NOT isolated" |
species=="Group B Streptococcus Not isolated" |
species== "No organism detected" |
species=="No organism recorded" |
species=="Negative after 1 day" |
species=="Organism Species Name" |
grepl("^Candida",species) ~ "GBS not isolated",
TRUE ~ species)
)
}
# /////////////------------ GBS CARRIAGE ESTIMATION ------------ /////////////////
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.byethn.fun <- function(x){
outdf <- x %>% mutate(z=1)
# %>% filter(hes_ethnicity!="Unknown" & hes_ethnicity!="Not linked" & hes_ethnicity!="_fromLIMS")
ethn_summary <- outdf %>% group_by(hes_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byethn <- outdf %>% group_by(hes_ethnicity) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byeth <- outdf %>% group_by(hes_ethnicity) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byethn <- merge(gbspos_byethn,total_byeth,by="hes_ethnicity") %>% mutate(carriage_bythn=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byethn,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by="hes_ethnicity")
carriage_summary <- carriage_summary %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf) %>%
ungroup()
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage by site and ethnicity (% total patients in sampling timeframe)
gbs.carr.bysite.byethn.fun <- function(x){
outdf <- x %>% mutate(z=1)
# %>% filter(hes_ethnicity!="Unknown" & hes_ethnicity!="Not linked" & hes_ethnicity!="_fromLIMS")
ethn_summary <- outdf %>% group_by(site,hes_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byethn <- outdf %>% group_by(site,hes_ethnicity) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byeth <- outdf %>% group_by(site,hes_ethnicity) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byethn <- merge(gbspos_byethn,total_byeth,by= c("site","hes_ethnicity")) %>% mutate(carriage_bythn=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byethn,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by=c("site","hes_ethnicity"))
carriage_summary <- carriage_summary %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf) %>%
ungroup()
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe) among Asian ethnic subgroups
# Indian, Pakistani and Bangladeshi.
gbs.carr.bysubethn.fun <- function(x){
outdf <- x %>% mutate(z=1) %>% filter(Ethnic_group=="Indian (Asian or Asian British)" |
Ethnic_group=="Bangladeshi (Asian or Asian British)" |
Ethnic_group=="Pakistani (Asian or Asian British)")
ethn_summary <- outdf %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byethn <- outdf %>% group_by(Ethnic_group) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byeth <- outdf %>% group_by(Ethnic_group) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byethn <- merge(gbspos_byethn,total_byeth,by="Ethnic_group") %>% mutate(carriage_bythn=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byethn,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by="Ethnic_group")
# %>% mutate(across(where(is.numeric), round, 2))
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe) for all 16 ethnicity categories in HES
gbs.carr.byallsubethn.fun <- function(x){
outdf <- x %>% mutate(z=1)
# %>% filter(Ethnic_group!="NA" & Ethnic_group!="99 Not known" & Ethnic_group!="Z Not stated")
ethn_summary <- outdf %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byethn <- outdf %>% group_by(Ethnic_group) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byeth <- outdf %>% group_by(Ethnic_group) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byethn <- merge(gbspos_byethn,total_byeth,by="Ethnic_group") %>% mutate(carriage_bythn=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byethn,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by="Ethnic_group")
# %>% mutate(across(where(is.numeric), round, 2))
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage by IMD (% total patients in sampling timeframe)
gbs.carr.byimd.fun <- function(x){
outdf <- x %>% mutate(z=1)
# pivot_wider( # currently pivots wider as part of obai process
# names_from = abx,
# values_from = ast_result)
imd_summary <- outdf %>% group_by(imd_quintile) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by IMD quintile
gbspos_byimd <- outdf %>% group_by(imd_quintile) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by IMD
# total ECM by IMD quintile
total_byimd <- outdf %>% group_by(imd_quintile) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by IMD
# carriage by ethnicity
carriage_byimd <- merge(gbspos_byimd,total_byimd,by="imd_quintile") %>% mutate(carriage_byimd=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byimd,gbs_carriage)
carriage_summary <- merge(imd_summary,carriage_summary, by="imd_quintile") %>% mutate(imd_quintile = case_when(
is.na(imd_quintile) ~ "Not linked",
TRUE ~ as.character(imd_quintile)))
carriage_summary <- carriage_summary %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf) %>%
ungroup()
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.byage.fun <- function(x){
outdf <- x %>% mutate(z=1,
agegroup = cut(age,
breaks = seq(0, 100, by = 5), # Specify the breaks for the age bands
right = FALSE, # Include the left endpoint, exclude the right
labels = paste(seq(0, 95, by = 5), seq(4, 99, by = 5), sep = "-")),
agegroup = case_when(
agegroup %in% c("40-44","45-49") ~ "40-49",
TRUE ~ agegroup
))
# pivot_wider( # currently pivots wider as part of obai process
# names_from = abx,
# values_from = ast_result)
age_summary <- outdf %>% group_by(agegroup) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byage <- outdf %>% group_by(agegroup) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byage <- outdf %>% group_by(agegroup) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byage <- merge(gbspos_byage,total_byage,by="agegroup") %>% mutate(carriage_byage=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byage,gbs_carriage)
carriage_summary <- merge(age_summary,carriage_summary, by="agegroup")
carriage_summary <- carriage_summary %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf) %>%
ungroup()
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.bysite.fun <- function(x){
outdf <- x %>% mutate(z=1)
# pivot_wider( # currently pivots wider as part of obai process
# names_from = abx,
# values_from = ast_result)
site_summary <- outdf %>% group_by(site) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_bysite <- outdf %>% group_by(site) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_bysite <- outdf %>% group_by(site) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_bysite <- merge(gbspos_bysite,total_bysite,by="site") %>% mutate(carriage_bysite=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_bysite,gbs_carriage)
carriage_summary <- merge(site_summary,carriage_summary, by="site")
carriage_summary <- carriage_summary %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf) %>%
ungroup()
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients) by year
gbs.carr.byyear.fun <- function(x){
outdf <- x %>% mutate(z=1)
ethn_summary <- outdf %>% group_by(year) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byyear <- outdf %>% group_by(year) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byyear <- outdf %>% group_by(year) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byyear <- merge(gbspos_byyear,total_byyear,by="year") %>% mutate(carriage_byyear=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byyear,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by="year")
# %>% mutate(across(where(is.numeric), round, 2))
return(carriage_summary)
}
# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients) by month
gbs.carr.byym.fun <- function(x){
outdf <- x %>% mutate(z=1)
ethn_summary <- outdf %>% group_by(ym) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# GBS pos by ethnicity
gbspos_byym <- outdf %>% group_by(ym) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # GBS pos by ethnicity
# total ECM by ethnicity
total_byym <- outdf %>% group_by(ym) %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM by ethnicity
# carriage by ethnicity
carriage_byym <- merge(gbspos_byym,total_byym,by="ym") %>% mutate(carriage_byym=gbspos/totalecm*100) # carriage estimate
# all GBS pos
gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos)) # all GBS pos
# total
total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total)) # total ECM
# carriage
gbs_carriage <- gbspos_all/total*100 # carriage estimate
gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
carriage_summary <- cbind(carriage_byym,gbs_carriage)
carriage_summary <- merge(ethn_summary,carriage_summary, by="ym")
# %>% mutate(across(where(is.numeric), round, 2))
return(carriage_summary)
}
# /////////////------------ LIMS LINELIST PROCESSING AND AMR RESULT GENERATION ------------ /////////////////
# /////////////////
# specify umbrella QC function
qc.umbrella.fun <- function(x,summary=list()){
x$sn <- seq_len(nrow(x))
x$qc <- sgss.qc.fun(x$hospno, x$specno, x$forename, x$surname)
x$qc[which(!x$qc %in% c(0, NA))] <- 1
summary$qc_samples <- table(x$qc, x$module)
cat(paste0("Removed ",sum(x$qc) , " quality control samples\n\n"))
x <- x[x$qc==0,]
}
# /////////////////
# specify function to QC data (from obai::sgss_qc_records fun)
sgss.qc.fun <- function (hospno, specno, forename, surname)
{
hospno <- ifelse(is.na(hospno), "", toupper(as.character(hospno)))
specno <- ifelse(is.na(specno), "", toupper(as.character(specno)))
forename <- ifelse(is.na(forename), "", toupper(as.character(forename)))
surname <- ifelse(is.na(surname), "", toupper(as.character(surname)))
hospno <- stringr::str_trim(hospno)
hospno <- stringr::str_replace_all(hospno, "^0*0", "")
qc <- ifelse(stringr::str_detect(hospno, "SENS|NEQAS|NEQSA|^QC|QUALITY|IQA|DIST|SPEC|^TBT|LIQAS|TEST-1") |
stringr::str_detect(specno, "SENS|NEQAS|NEQSA|QC|QUALITY|^IQA|^DIST|SPEC") |
stringr::str_detect(surname, "^SENS|NEQAS|NEQSA|QC|IQA|QUALITY|ANTIMICROBIAL SUSCEPTABILITY|ANTIFUNGAL SUSCEPTIBILITY|INTERNAL QC|FEBUARY|IQAMIC2024") |
stringr::str_detect(forename, "^SENS|NEQAS|NEQSA|IQA|FAECES|QUALITY|ANTIMICROBIAL SUSCEPTABILITY|ANTIFUNGAL SUSCEPTIBILITY|ANTIMICROBIAL|BLOOD CULTURE|DIAGNOSTICS"),
1, 0)
qc <- ifelse(forename %in% toupper(month.name),1,0)
return(qc)
}
# /////////////////
# specify customised obai sgss_infections function for LIMS data deduplication and AMR results generation
obai.dedup.fun <- function(allsites,
linkage_strategy = "default",
episode_length=274, # equivalent of 9 months to capture second pregnancies within time frame of ECM testing at a given site
recurrence_length = episode_length,
episode_type = "fixed",
dedup_module_preference = NULL,
abx=TRUE,
abx_preference = "severity_all"){
allsites$mrk <- diyar::combi(
allsites$species,
allsites$nhsno,
allsites$hospno,
allsites$dob,
allsites$specdate,
allsites$sex,
allsites$site,
allsites$surname,
allsites$forename,
allsites$specno)
dedup_level = c("pids", "species")
abx=TRUE
if("LIMS" %in% dedup_level) allsites$mrk <- diyar::combi(allsites$mrk, allsites$module)
if("module" %in% dedup_level) allsites$mrk <- diyar::combi(allsites$mrk, allsites$module)
if(!isFALSE(abx)) allsites$mrk <- diyar::combi(allsites$mrk, allsites$abx, allsites$ast_result)
summary=list()
same_day_dups <- duplicated(allsites$mrk)
summary$same_day_dups <- length(same_day_dups[same_day_dups])
allsites <- allsites[!same_day_dups,]
allsites$mrk <- NULL
cat(summary$same_day_dups,"same-day depulicates removed\n\n")
cat("Creating a patient identifier: `",
linkage_strategy, "` method\n")
# Make configurable
# Generating a patient identifier
wr_nhs <- lapply(0:9, function(x){
paste0(rep(x, 10), collapse = "")
})
df <- allsites
if(linkage_strategy == "default"){
df$soundex <- stringdist::phonetic(df$surname)
# Inclusion criteria
wr_nhs <- as.character(wr_nhs)
df$cri_1 <- df$cri_2 <- df$cri_3 <- NA
lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) | nchar(df$nhsno) != 10))
# CR1 - Valid NHS no
df$cri_1[lgk] <- df$nhsno[lgk]
lgk <- which(!(df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) | df$soundex %in% c("", NA)))
# CR2 - Valid HOS no and surname Soundex
df$cri_2[lgk] <- diyar::combi(df$hospno[lgk], df$soundex[lgk])
lgk <- which(!(df$specno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) | df$soundex %in% c("", NA) | df$sex %in% c("U","Unknown", NA) | df$dob %in% c("", NA)))
# CR3 - Valid Specimen number, Soundex NHS and sex
df$cri_3[lgk] <- diyar::combi(df$specno[lgk], df$soundex[lgk], df$sex[lgk])
# AND (Valid DDMM or DDYY or MMYY)
df$cri_pt1 <- df$cri_pt2 <- df$cri_pt3 <- NA
lgk <- which(!(df$dob %in% c("", NA)))
df$cri_pt1[lgk] = diyar::combi(substr(df$dob[lgk],1,4), substr(df$dob[lgk],6,7))
df$cri_pt2[lgk] = diyar::combi(substr(df$dob[lgk],1,4), substr(df$dob[lgk],9,10))
df$cri_pt3[lgk] = diyar::combi(substr(df$dob[lgk],9,10), substr(df$dob[lgk],6,7))
linked_s_criteria <- list(cr3 = diyar::sub_criteria(df$cri_pt1, df$cri_pt2, df$cri_pt3))
# Order; CR1 to CR3
criteria <- list(df$cri_1, df$cri_2, df$cri_3)
df$soundex <- NULL
}else if(linkage_strategy == "uid_gold"){
# Inclusion criteria
wr_nhs <- as.character(wr_nhs)
df$cri_1 <- df$cri_2 <- df$cri_3 <- NA
lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) |
df$dob %in% c("", "1900-01-01", NA) |
df$surname %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) |
df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA)))
# CR1 - Valid NHS no, DOB, forename and surname
df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk], df$forename[lgk], df$surname[lgk])
lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
df$dob %in% c("", "1900-01-01", NA) |
df$sex %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", "U","Unknown", NA)))
# CR2 - Valid NHS no, DOB and sex
df$cri_2[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk], df$sex[lgk])
lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
df$dob %in% c("", "1900-01-01", NA)))
# CR3 - Valid NHS no and DOB
df$cri_3[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
# Order; CR1 to CR3
criteria <- list(df$cri_1, df$cri_2, df$cri_3)
linked_s_criteria <- NULL
}else if(linkage_strategy == "basic"){
# Inclusion criteria
wr_nhs <- as.character(wr_nhs)
df$cri_1 <- NA
lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
df$dob %in% c("", "1900-01-01", NA)))
# CR1 - Valid NHS no and DOB
df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
# Order; CR1
criteria <- list(df$cri_1)
linked_s_criteria <- NULL
}else if(linkage_strategy == "sgss_audit"){
# 9-stage ordered record linkage to identify patients
# Place holders for 9 matching criteria and attributes for sub_criteria
df$cri_1 <- df$cri_2 <- df$cri_3 <-
df$cri_4 <- df$cri_5 <- df$cri_6 <-
df$cri_7 <- df$cri_8 <- df$cri_9 <-
df$cri_dob_pt1 <- df$cri_dob_pt2 <- df$cri_dob_pt3 <- NA
# Place holders for inclusion criteria of each attribute
v_specno <- v_fname <- v_sname <- v_nhs <- v_dob <- v_hos <- v_sex <- rep(FALSE, nrow(df))
wr_nhs <- lapply(0:9, function(x){
paste0(rep(x,10), collapse = "")
})
df$soundex <- stringdist::phonetic(df$surname)
df$forename_initial <- toupper(substr(df$forename,1, 1))
v_nhs[!df$nhsno %in% c(".","","9999999999","0","123456789","9876543210", NA, wr_nhs)] <- TRUE
v_dob[!df$dob %in% c("1900-01-01","", NA)] <- TRUE
v_hos[!toupper(df$hosno) %in% c("UNKNOWN", "NO PATIENT ID", "","NOT SPECIFIED", NA)] <- TRUE
v_sex[!toupper(df$sex) %in% c("UNKNOWN","","NOT SPECIFIED", NA)] <- TRUE
v_sname[!toupper(df$surname) %in% c("", NA)] <- TRUE
v_fname[!toupper(df$forename) %in% c("", NA)] <- TRUE
v_specno[!toupper(df$specno) %in% c("UNKNOWN", "NO PATIENT ID", "","NOT SPECIFIED", NA)] <- TRUE
# Stage 1
lgk <- which(v_nhs & v_dob)
df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
# Stage 2
lgk <- which(v_nhs)
df$cri_2[lgk] <- df$nhsno[lgk]
# Stage 3
lgk <- which(v_hos & v_dob & v_sname)
df$cri_3[lgk] <- diyar::combi(df$hosno[lgk],df$dob[lgk], df$soundex[lgk])
# Stage 4
lgk <- which(v_hos)
df$cri_4[lgk] <- df$hosno[lgk]
# Stage 5
lgk <- which(v_specno & v_sex & v_dob & v_fname & v_sname)
df$cri_5[lgk] <- diyar::combi(df$specno[lgk], df$lab_cd[lgk], df$sex[lgk],
df$forename_initial[lgk], df$dob[lgk], df$soundex[lgk])
# Stage 6
lgk <- which(v_specno & v_sex)
df$cri_6[lgk] <- diyar::combi(df$specno[lgk], df$lab_cd[lgk], df$sex[lgk])
# Stage 7
lgk <- which(v_specno & v_dob)
df$cri_7[lgk] <- diyar::combi(df$specno[lgk], df$dob[lgk])
# Stage 8
# df$cri_8 <- df$cri_7
# Stage 9
lgk <- which(v_specno)
df$cri_9[lgk] <- df$specno[lgk]
lgk <- which(v_dob)
df$cri_dob_pt1[lgk] <- diyar::combi(lubridate::year(df$dob[lgk]), lubridate::month(df$dob[lgk]))
df$cri_dob_pt2[lgk] <- diyar::combi(lubridate::year(df$dob[lgk]), lubridate::day(df$dob[lgk]))
df$cri_dob_pt3[lgk] <- diyar::combi(lubridate::day(df$dob[lgk]), lubridate::month(df$dob[lgk]))
# Sub criteria for stage 6. Also used in stage 9
s_cri6 <- diyar::sub_criteria(df$forename_initial, df$soundex)
# Sub criteria for stage 9
s_cri9 <- diyar::sub_criteria(
s_cri6,
diyar::sub_criteria(df$cri_dob_pt1, df$cri_dob_pt2, df$cri_dob_pt3),
operator = "or")
# Order; CR1 to CR9
criteria <- list(df$cri_1, df$cri_2, df$cri_3, df$cri_4,
df$cri_5, df$cri_6, df$cri_7,
# df$cri_8,
df$cri_9)
linked_s_criteria <- list("cr6" = s_cri6, "cr9" = s_cri9)
df$soundex <- NULL
}
df <- df[names(df)[!grepl("cri_", names(df))]]
# Unlinked records are considered separate entities.
# This is an issue when querying AMR data as each unlinked test is considered a separate patient.
# To prevent this, the closest approximation to a patient's sample is used as the final criteria
criteria <- c(criteria,
list(diyar::combi(df$nhsno, df$hospno, df$dob, df$surname, df$forename, df$sex, df$specno))
)
df$pids <- diyar::links(criteria = criteria,
sub_criteria = linked_s_criteria,
display = "progress")
cat(paste0("Categorising episode: ",
episode_length,"-day difference ",
episode_type, " episodes\n"))
df$dt <- lubridate::ymd(df$specdate)
df$pid_str <- diyar::combi(as.list(df[dedup_level]))
if(!is.null(dedup_module_preference)){
mdls <- c("amr", "cdr")
dedup_module_preference <- factor(df$sgss_module, levels = c(mdls[mdls == dedup_module_preference],
mdls[mdls != dedup_module_preference]))
}
df$epids <- diyar::episodes(date = df$dt,
case_length = episode_length,
recurrence_length = recurrence_length,
episode_type = episode_type,
strata = df$pid_str,
group_stats = TRUE,
display = "progress",
custom_sort = dedup_module_preference)
df$dt <- df$qc <- NULL
# Most severe abx result per episode
# convert instances of "P" and "W" to "R" in ast_result field for indicible clindamycin resistance test results
df <- df %>% mutate(ast_result=case_when(
ast_result == "P" | ast_result == "W" ~ "R",
TRUE ~ ast_result
))
if(!isFALSE(abx)){
cat("Determining the most severe abx result per episode\n")
df$amr_severity <- dplyr::case_when(
df$ast_result == "R" ~ 1,
df$ast_result == "I" ~ 2,
df$ast_result == "S" ~ 3,
df$ast_result == "N" ~ 4,
TRUE ~ 5
)
if(c("severity_all") %in% abx_preference){
df$epi_amr <- jurithy::bys_val(df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
}else if (c("severity_cdr") %in% abx_preference){
tp_md <- factor(tolower(df$sgss_module), levels = c("cdr", "amr"))
df$epi_amr <- jurithy::bys_val(tp_md, df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
rm(tp_md)
}else if (c("severity_amr") %in% abx_preference){
tp_md <- factor(tolower(df$sgss_module), levels = c("amr", "cdr"))
df$epi_amr <- jurithy::bys_val(tp_md, df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
rm(tp_md)
}
df$epi_amr <- c("R","I","S","N","")[df$epi_amr]
abx_recoded <- df$amr!=df$epi_amr
summary$abx_recoded <- length(abx_recoded[abx_recoded])
abx_results <- dplyr::select(df, .data$epids, .data$abx, e = .data$epi_amr, r = .data$ast_result) %>%
dplyr::mutate(epids = as.numeric(.data$epids)) %>%
dplyr::distinct() %>%
tidyr::pivot_wider(names_from = .data$abx,
values_from = c(.data$e, .data$r),
values_fn = list(e = ~ paste0(unique(.), collapse = ";"),
r = ~ paste0(unique(.), collapse = ";")),
values_fill = list(e ="", r = ""))
df$amr_severity <- NULL
}else{
summary$abx_recoded <- NULL
abx_results <- NULL
}
df$patients <- as.numeric(!duplicated(df$pids@.Data))
df$episodes <- as.numeric(!duplicated(df$epids@.Data))
deduped_df <- df %>% filter(patients==1)
deduplication <- dplyr::select(janitor::tabyl(df, .data$species, .data$patients), "species", patients = "1")
deduplication <- dplyr::bind_cols(deduplication, dplyr::select(janitor::tabyl(df, .data$species, .data$episodes), episodes = "1"))
summary$deduplication <- deduplication
df$patients <- df$episodes <- df$sn <- df$pid_str <- NULL
output <- list(
extracts = allsites,
processed = df,
abx_results = abx_results,
summary = summary)
# processed_data <- as.data.frame(output[["processed"]])
processed_data <- deduped_df
abx_data <- as.data.frame(output[["abx_results"]])
processed_data$epids <- as.character(processed_data$epids)
processed_data$epids2 <- sapply(processed_data$epids, function(x) {
sub("^[^0-9]*0*([1-9][0-9]*)", "\\1", strsplit(x, " ")[[1]][1])
})
# merge linelist with abx results on epids
abx_results$epids <- as.character(abx_results$epids)
deduped_df <- left_join(processed_data,abx_results,by="epids") %>% select(-c("sn","pids","epi_amr","patients","abx","ast_result","episodes","epids2"))
return(deduped_df)
}
# /////////////------------ ETHNICITY GROUPINGS ------------ /////////////////
# /////////////////
# specify function to regroup LIMS ethnicity categories to espaur groupings and then to 5 categories for age standardisation
lims.ethcat.fun <- function(x){
#create char vec of unique Ethnic groups in ethnicity-linked linelist data
ethn.vec <- sort(unique(x$ethnicity)) # in alphabetical order for consistency when indexing
# vector positions for each ethnicity report in LIMS
# Unknown
# [1] ""
# [39]"Not Asked"
# [40] "Not Given on Form"
# [41]"Not given/declined"
# [42]"Not specified"
# [43] "Not Specified"
# [44]"Not stated"
# [45]"Other - not stated"
# [54]"Unable or unwilling to answer"
# [55] "UNKNOWN"
#
# Black, Black British, Caribbean or African
# [2]"African or African-Caribbean"
# [3]"Any other Black background"
# [19] "Black - any other black background"
# [20]"Black - Caribbean"
# [21]"Black African"
# [22] "Black Caribbean"
# [23]"Black or Black British - African "
# [24]"Black or Black British - Any other Black background "
# [25] "Black/Black British-Caribbean"
# [26]"Black/Black British African"
# [27]"Black/Black British Caribbean"
#
# Mixed
# [29]"Mixed - any other mixed background"
# [30]"Mixed - Any other mixed background "
# [31] "Mixed - White and Asian "
# [32]"Mixed - White and Black African "
# [33]"Mixed Any other mixed background"
# [34] "Mixed White & Black Caribbean"
# [35]"Mixed White and Asian"
# [36]"Mixed White and Black African"
# [37] "Mixed White and Black Caribbean"
#
# Other
# [4] "Any other ethnic group"
# [46] "Other Ethnic Group"
# [47]"Other Non-European"
# [49] "Other Ethnic Groups - Any other ethnic group "
# [53]"Southern & Other European"
# [38]"Northern European"
#
# Asian, Asian British
# [7] "Asian any other Asian background"
# [8]"Asian Bangladeshi"
# [9]"Asian Indian"
# [10] "Asian or Asian British - Any other Asian background"
# [11]"Asian or Asian British - Bangladeshi "
# [12]"Asian or Asian British - Indian "
# [13] "Asian or Asian British - Pakistani "
# [14]"Asian Pakistani"
# [15]"Asian/Asian British-Pakistani"
# [16] "Asian/Asian British any other"
# [17]"Asian/Asian British Bangladeshi"
# [18]"Asian/Asian British Indian"
# [28] "Chinese"
# [50]"Other Ethnic Groups - Chinese "
# [51]"South Asian"
# [52] "South East Asian"
#
#
#
# White
# [5]"Any other white background"
# [6]"Any other White background"
# [48]"Other white background"
# [56]"White"
# [57]"WHITE"
# [58] "White - Any other White background "
# [59]"White - British"
# [60]"White - Irish"
# [61] "White British"
# [62]"White Irish"
x <- x %>% mutate(Ethnicity_group="")
# for unknown group
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% c(ethn.vec[1], ethn.vec[54], ethn.vec[55]) ~ "Unknown",
ethnicity %in% ethn.vec[39:45] ~ "Unknown",
is.na(ethnicity) ~ "Unknown",
TRUE ~ Ethnicity_group
))
# for Black, Black British, Caribbean or African group
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% c(ethn.vec[2], ethn.vec[3]) ~ "Black",
ethnicity %in% ethn.vec[19:27] ~ "Black",
TRUE ~ Ethnicity_group
))
# for Mixed
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% ethn.vec[29:37] ~ "Mixed",
TRUE ~ Ethnicity_group
))
# for Other
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% c(ethn.vec[4],ethn.vec[46],ethn.vec[47],ethn.vec[49],ethn.vec[53],ethn.vec[38]) ~ "Other",
TRUE ~ Ethnicity_group
))
# for Asian, Asian British
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% ethn.vec[7:9] ~ "Asian",
ethnicity %in% ethn.vec[10:18] ~ "Asian",
ethnicity %in% ethn.vec[28] ~ "Asian",
ethnicity %in% ethn.vec[50:52] ~ "Asian",
TRUE ~ Ethnicity_group
))
# for White
x <- x %>%
mutate(Ethnicity_group = case_when(
ethnicity %in% c(ethn.vec[5], ethn.vec[6], ethn.vec[48]) ~ "White",
ethnicity %in% ethn.vec[56:62] ~ "White",
TRUE ~ Ethnicity_group
)) %>% select(-c("ethnicity")) %>% # drop reduntant Ethnicity assignment columns
mutate(z=1)
# add in count variable prior to grouping by ethnicity category
}
# /////////////------------ DBS TRACING ------------ /////////////////
# /////////////////
# specify function to perform initial data quality check on LIMS linelist and to generate susbet of data which missing one or more key fields for tracing
initial.lims.dq.fun <- function(x){
# nhsno_na <- sum(is.na(x$nhsno))
# dob_na <- sum(is.na(x$dob))
# err_date <- as.Date(c("1900-01-01"))
# dob_err <- sum(x$dob==err_date)
subset <- x %>% filter(is.na(dob) |
dob==as.Date(c("1900-01-01")) |
is.na(nhsno) |
is.na(surname) |
sex=="U" |
patient_postcode=="Z999")
subset <- subset
}
# /////////////////
# specify function to identify and retain QC sample reports from LIMS linelist, as part of DBS tracing
eqa.record.fun <- function(x){
out <- x %>% filter(dob==as.Date(c("1900-01-01")) |
surname=="LQ BLOOD CULTURE" |
surname=="IQAMIC2024" |
forename=="EQA" |
forename=="FOXTROTFIVEEIGHT" |
forename=="NOT USED" |
forename=="UNKNOWN" |
forename=="PORCINE TRACHEA(NEO)" |
surname=="5995753 CTOX" |
patient_postcode=="Z999")
out <- dbs.tracing.fields.fun(out)
}
# /////////////////
# specify nested function to prepare LIMS linelist data field formatting for DBS tracing
dbs.tracing.fields.fun <- function(x){
lims_for_trace <- x %>% arrange(pid_str) %>% # arrange by patient ID
rename(Identifier=pid_str,
Date_of_Birth=dob,
NHS_number=nhsno,
Surname=surname,
Forename=forename,
Sex=sex) %>%
mutate(null1="",
null2="",
null3="",
null4="",
null5="",
Sex=(if_else(Sex=="M","1",Sex)), # 1 = male, 2 = female, NA = unknown
Sex=(if_else(Sex=="F","2",Sex)),
Sex=(if_else(Sex=="U","",Sex)),
Sex=as.numeric(Sex),
Date_of_Birth=as.Date(Date_of_Birth, "%Y%m%d"),
Date_of_Birth=as.character(str_replace_all(string=Date_of_Birth, pattern="-", replacement = "") ),
Identifier=as.numeric(Identifier)) %>%
select(null1,Identifier,Date_of_Birth,null2,null3,NHS_number,Surname,null4,Forename,null5,Sex) %>% arrange(Identifier)
# filter_at(.vars = vars(Date_of_Birth,NHS_number), .vars_predicate = any_vars(!is.na(.))) # remove instances of episode lacking postcode & NHS
lims_for_trace$Forename <- toupper(gsub("\\.", "", lims_for_trace$Forename))
lims_for_trace$Surname <- toupper(gsub("\\.", "", lims_for_trace$Surname))
return(lims_for_trace)
}
# /////////////////
# specify wrap-around function to apply umbrella and nested functions for DBS tracing data prep
dbs.trace.prep.fun <- function(x,
mainfile=mainfile,
excludefile=excludefile){ # where main input is GroupCG_linelist
excluded_records <- initial.lims.dq.fun(x) # generate subset of data where one or more key fields missing for trace - send these for tracing if minimum necessary PII
# generate main df for tracing
lims_for_trace <- lims.fortrace.fun(x,
excluded_records)
# generate excluded records df for tracing
excluded_records_trace <- excl.records.trace.fun(x,
excluded_records)
# identify probable QC reports and exclude these from datasets to trace
eqa_records <- eqa.record.fun(x)
# exclude probable QC sample reports from data to be sent for tracing
lims_for_trace <- lims_for_trace %>% filter(!Identifier %in% eqa_records$Identifier)
excluded_records_trace <- excluded_records_trace %>% filter(!Identifier %in% eqa_records$Identifier)
# export to csv for sending to DBS team
write.csv(lims_for_trace, file = mainfile, row.names = F)
write.csv(excluded_records_trace, file = excludefile, row.names = F)
}
#////////////
#--main umbrella function to prepare linelist data for DBS tracing
lims.fortrace.fun <- function(x, # where x is GroupCG_linelist
excluded_records){
lims_for_trace <- x %>% filter(!pid_str %in% excluded_records$pid_str) # exclude above records from data to be sent for tracing
lims_for_trace <- dbs.tracing.fields.fun(lims_for_trace) } # prepare data formatting for tracing
# /////////////////
# specify function to prepare excluded data for DBS tracing
excl.records.trace.fun <- function(x, # where x is LIMS linelist
excluded_records){
excluded_records_trace <- x %>% filter((pid_str %in% excluded_records$pid_str)) # prepare excluded records for tracing for PII enrichment (these may still be traced although higher probability of failure)
excluded_records_trace <- dbs.tracing.fields.fun(excluded_records_trace) # prepare data formatting for tracing
}
# /////////////////
# specify function to format trace return colnames
header.fun <- function(x, # where x is trace response data
y=import(here("Data_carriage","DBS_tracing","trace_response_headers.xlsx"),
format = "xlsx",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)){ # where y is response headers
# # assign header colnames
# colnames(y) <- as.character(y[1,])
# retain only successful traces and drop additional column in trace response data so that ncol is 60
x <- x %>% filter(V1=="20" | V1=="30" | V1=="33" | V1=="40") %>% select(-any_of(c("V61"))) # retaining those cases that have been successfully traced
# define char vec of trace response headers
headers <- colnames(y)
# confirm length of headers is same as number of columns in trace_response
print(length(headers))
# define char vec of redundant col names (V1, V2 etc)
oldnames <- colnames(x)
# rename columns in trace response data based on headers
x <- x %>%
rename_at(vars(oldnames), ~ headers)
}
# /////////////////
# specify function to format trace return colnames for subset of return where trace unsuccessful
notrace.header.fun <- function(x, # where x is trace response data
y=import(here("Data_carriage","DBS_tracing","trace_response_headers.xlsx"),
format = "xlsx",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)){ # where y is response headers
# # assign header colnames
# colnames(y) <- as.character(y[1,])
# drop additional column in trace response data so that ncol is 60
x <- x %>% select(-any_of(c("V61")))
# define char vec of trace response headers
headers <- colnames(y)
# confirm length of headers is same as number of columns in trace_response
print(length(headers))
# define char vec of redundant col names (V1, V2 etc)
oldnames <- colnames(x)
# rename columns in trace response data based on headers
x <- x %>%
rename_at(vars(oldnames), ~ headers)
}
# /////////////////
# specify function to clean var names for traced return
clean.names.fun <- function(x){
# clean up col names (replace instances of [...] with spaces)
names(x) <- str_replace(names(x), "\\[", "")
names(x) <- str_replace(names(x), "\\]", "")
strips <- "^\\s+|\\s+$"
names(x) <- gsub(strips,"",names(x))
names(x) <- gsub("[[:space:]]","_",names(x))
x <- x
}
# /////////////////
# specify function to correctly format traced data prior to merge with original linelist
trace.reformat.fun <- function(x){
x <- x %>% mutate(Date_of_Death=as.character(Date_of_Death)) %>%
# mutate(DOB_S=as.character(DOB_S)) %>%
mutate(DOB_S=as.Date(as.character(DOB_S), format = "%Y%m%d")) %>% # takes DBS trace date format (e.g., 19891222) and converts to date e.g., 1989-12-22
rename(traced=`INSERT_INTO_DBSresponse_(_Response_Type`) %>%
rename(pid_str=Unique_Record_ID) %>%
select(pid_str,NHS_Number,DOB_S,First_Name_S,Surname_S,Sex_S,PostCode) %>%
mutate_all(na_if,"")
}
# /////////////////
# specify wrap function to retain records which weren't successfully traced
no.trace.wrap.fun <- function(x){ # where input is lims_trace_return
no_trace <- x %>% filter(V1!="20" & V1!="30" & V1!="33" & V1!="40")
no_trace <- notrace.header.fun(no_trace)
no_trace <- clean.names.fun(no_trace)
no_trace <- trace.reformat.fun(no_trace)
return(no_trace)
}
# /////////////////
# specify wrap function to retain records which had unknown patient sex
unknown.sex.wrap.fun <- function(x){ # where input is lims_trace_return
no_trace <- x %>% filter(V13=="1" | V13=="3")
no_trace <- notrace.header.fun(no_trace)
no_trace <- clean.names.fun(no_trace)
no_trace <- trace.reformat.fun(no_trace)
return(no_trace)
}
# /////////////////
# specify wrap function to prepare and join trace return with LIMS linelist
traced.wrap.fun <- function(x){ # where input is lims_trace_return
lims_trace_return <- header.fun(x)
lims_trace_return <- clean.names.fun(lims_trace_return)
lims_trace_return <- trace.reformat.fun(lims_trace_return)
merged_df <- right_join(lims_trace_return,distinct_patients,by="pid_str") %>%
mutate(NHS_Number=as.numeric(NHS_Number)) %>%
mutate(nhsno = case_when(
!is.na(NHS_Number) ~ NHS_Number,
TRUE ~ nhsno),
sex = case_when(
Sex_S==2 ~ "F",
TRUE ~ sex
),
traced = case_when(
is.na(NHS_Number) ~ "N",
!is.na(NHS_Number) ~ "Y"
)
) %>% select(-c("NHS_Number","DOB_S","First_Name_S","Surname_S","Sex_S")) %>%
mutate(patient_postcode=PostCode) %>% select(-c("PostCode"))
return(merged_df)
}
# /////////////////
# specify function to generate age groupings (8 groups)
agegroup.fun <- function(x){
x <- x %>% mutate(agegroup=case_when(age <1 ~ 1,
(age >=1 & age <= 4) ~ 2,
(age >=5 & age <= 9) ~ 3,
(age >=10 & age <= 14) ~ 4,
(age >=15 & age <= 44) ~ 5,
(age >=45 & age <= 64) ~ 6,
(age >=65 & age <= 74) ~ 7,
(age >=75 & age <=123) ~ 8,
(is.na(age) ~ 0)))
x$agegroup <- factor(x$agegroup,
levels=c('0','1','2','3','4','5','6','7','8'),
labels = c("not known","<1","1 to 4","5 to 9","10 to 14","15 to 44","45 to 64","65 to 74",">= 75"))
out <- x
}
# /////////////////
# specify function to further prepare LIMS linelist data for linkage (ethnicity and IMD)
linelist.prep.fun <- function(x){
x <- x %>% mutate(dob = lubridate::ymd(dob)) %>% # generate date of birth variable in date format
mutate(spec_date = lubridate::ymd(specdate), # generate date variable in date format from existing date char var
# Year = as.numeric(format(Date, '%Y')),
age = as.numeric(difftime(specdate,dob),unit="weeks")/52.25, # generate age variable (years, numeric)
age = round(age, digits=0),
dob_specdate = case_when(dob==specdate ~ "yes",
dob!=specdate ~ "no"),
sex = if_else(sex=="M","Male",sex),
sex = if_else(sex=="F","Female",sex),
sex = if_else(sex=="U","Unknown",sex)) %>%
mutate(n=1)
x <- agegroup.fun(x) # apply agegroup function
out <- x
}
# /////////////------------ IMD ENRICHMENT ------------ /////////////////
# /////////////////
# specify function to link linelist to IMD using lab postcode and CCG LSOA intermediate lookups
imd.linkage.fun <- function(x, # input is allsites (LIMS linelist post-DBS linkage)
lab_lookup_query = "SELECT [Lab_Geography_Code] ,[Lab_Geography_Name_Current] ,[POSTCODE] as labpostcode FROM [SGSSDW].[dbo].[DIMENSION_LAB_GEOGRAPHY] where Logical_Delete_Flag <> ('1') AND POSTCODE <> ('NULL')",
mycon = dbConnect(odbc::odbc(), # connection to SGSSDW
.connection_string = "DRIVER=SQL Server;
DATABASE=SGSSDW;
Trusted_Connection=Yes;
SERVER=sgssdb.phe.gov.uk;"),
# pcd2_lsoa_lookup = read_dta(file='//COLHPAFIL003.HPA.org.uk/ProjectData/HCAI/Scientific resources/GIS/ONS lookups/NHSPD_MAY_2022_UK_FULL/Data/pcd_CCG_lsoa_lookup.dta'),
hes_qry = glue(
"SELECT [pcd2], [ctry] ,[nhser] ,[lsoa11] ,[icb], [oac11] FROM [LookupsShared].[dbo].[vONS_NSPL11_UK_2023.05]"
),
hes_qry2 = glue(
"SELECT [pcd2],[LSOA11CD],[IMD2019_Deciles_LSOA11_England],[IMD2019_Quintiles_LSOA11_England],[ctry] FROM [LookupsShared].[dbo].[vSocioDemog_LSOA11] INNER JOIN [vONS_NSPL_UK_202205] on [lsoa11] = [LSOA11CD] where [ctry] IN ('E92000001')"
),
rx_dsn = "LookupsShared",
data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
'database=',rx_dsn,';trusted_connection=true'),
hes_con = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect)){ # connection to HES
# /// Lab postcode lookup for merge
# perform extract
# mycon <- mycon
lab_lookup_out <- dbSendQuery(mycon, lab_lookup_query)
lab_lookup <- dbFetch(lab_lookup_out)
dbClearResult (lab_lookup_out)
odbc::dbDisconnect(mycon)
# NEED TO MANUALLY LOOK UP LABGEOCODE FOR GBS ECM SITES IN LINELIST AND USE THIS TO MERGE ON LOOKUP
# merge linelist with lab_lookup on labgeocode
lab_lookup <- lab_lookup %>% rename(labgeocode=Lab_Geography_Code)
# add in labgeocodes for ECM sites
x <- x %>% mutate(labgeocode = case_when(
site=="Royal Stoke" ~ "597955", #ROYAL STOKE UNIVERSITY HOSPITAL
site=="Bedford" ~ "235615", # LUTON MICROBIOLOGY LABORATORY
site=="Derriford" ~ "526765", # DERRIFORD HOSP. (PLYMOUTH)
site=="Freeman" ~ "056130", # FREEMAN HOSPITAL (NEWCASTLE UPON TYNE)
site=="Royal Blackburn" ~ "670861", # ROYAL BLACKBURN
site=="Epsom" ~ "375120", # EPSOM (EPSOM AND ST HELIER)
site=="Birmingham" ~ "610640") # BIRMINGHAM (BIRMINGHAM WOMEN AND CHILDRENS)
)
x <- merge(x,lab_lookup,by="labgeocode")
# /// PCD2 LSOA lookup for merge
# perform extract
pcd2_lsoa_qry <- dbSendQuery(hes_con, hes_qry)
pcd2_lsoa_lookup <- dbFetch(pcd2_lsoa_qry)
dbClearResult (pcd2_lsoa_qry)
pcd2_lsoa_lookup$pcd2 <- str_replace_all(string=pcd2_lsoa_lookup$pcd2, pattern=" ", repl="") # remove white spaces in postcode field
pcd2_lsoa_lookup$pcd2 <- str_replace_all(string=pcd2_lsoa_lookup$pcd2, pattern=" ", repl="") # remove white spaces in postcode field
pcd2_lsoa_lookup <- pcd2_lsoa_lookup %>% filter(ctry=="E92000001") %>% arrange(lsoa11)
# define derivedpostcode var in linelist and preferentially assign as patient postcode > labpostcode
x <- x %>% mutate(derivedpostcode=case_when(
grepl("^ZZ99",patient_postcode) | patient_postcode=="No Data Avaliable" ~ labpostcode, # no patient postcode in LIMS ECM data
(!is.na(patient_postcode) & !grepl("^ZZ99",patient_postcode)) ~ patient_postcode,
is.na(patient_postcode) ~ labpostcode)) %>%
rename(pcd2=derivedpostcode) %>% # rename derivedpostcode prior to merge
mutate(pcd2=str_replace_all(pcd2, " ", ""))
# ensure instances of patient_postcode=="", labpostcode used
x <- x %>% mutate(pcd2 = case_when(
patient_postcode=="" ~ labpostcode,
TRUE ~ pcd2
))
# merge with CCG LSOA lookup based on pcd2 (derived postcode)
x <- merge(x,pcd2_lsoa_lookup,by="pcd2",all=T)
x <- x %>% filter(!is.na(pid_str))
# /// generate dynamic LSOA IMD code lookup
# perform extract
lsoa_IMD_qry <- dbSendQuery(hes_con, hes_qry2)
lsoa_IMD_lookup <- dbFetch(lsoa_IMD_qry)
dbClearResult (lsoa_IMD_qry)
odbc::dbDisconnect(hes_con)
lsoa_IMD_lookup$pcd2 <- str_replace_all(string=lsoa_IMD_lookup$pcd2, pattern=" ", repl="") # remove white spaces in postcode field
lsoa_IMD_lookup$pcd2 <- str_replace_all(string=lsoa_IMD_lookup$pcd2, pattern=" ", repl="") # remove white spaces in postcode field
lsoa_IMD_lookup <- lsoa_IMD_lookup %>% filter(ctry=="E92000001")
# merge with LSOA IMD lookup based on pcd2 (derived postcode)
x <- merge(x,lsoa_IMD_lookup,by="pcd2",all=T)
x <- x %>% filter(!is.na(pid_str))
x <- x[!duplicated(colnames(x))]
# reassign as linelist
return(x)
}
# /////////////------------ ETHNICITY ENRICHMENT ------------ /////////////////
# /////////////////
# specify function to prepare LIMS linelist data for ethnicity enrichment
datalink.prep.fun <- function(x){
# check names of date of birth and nhs number fields.
xnew <- x %>% mutate(dateofbirth = as.Date(dob, format = "%Y-%m-%d"))
xnew$Patient_NHS_No <- as.character(xnew$nhsno)
out <- xnew
}
# /////////////////
# specify function to link LIMS linelist to HES ethnicity fields by creating a temporary table in SQL
hes.ethnlink.fun <- function(x, # where main input is LIMS linelist
rx_dsn = "Y080_UID_PID",
data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
'database=',rx_dsn,';trusted_connection=true'),
data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect), # create DataLake connection
PII_data_name = paste0("JR_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")), # PII to upload to DataLake: table name needs to be unique and not previously on the Database
target_table = paste0("JR_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
hes_qry = glue(
"SELECT *
INTO [Y080_UID_PID].[dbo].[{target_table}]
FROM (
SELECT DISTINCT
uploadeddata.*
, ethntp.TOKEN_PERSON_ID
, ethntp.NEW_ETHNOS -- Ethnicity; code
, ethntp.NEW_ETHNOS_Ethnic_subgroup
, ethntp.NEW_ETHNOS_Ethnic_group
FROM [Y080_UID_PID].[dbo].[{PII_data_name}] uploadeddata
LEFT JOIN [Y080_UID_PID].[dbo].[vY080_UID_HESOP_PID] apcpid ON apcpid.NEWNHSNO = uploadeddata.Patient_NHS_No AND CONVERT(DATE, CAST(apcpid.DOB_DV AS DATE),3) = uploadeddata.dateofbirth -- linked on NHS Number & DoB
LEFT JOIN [Y089_DLakeLinkedRepo_PID].[dbo].[x_HES_Analysis_Pseudo_vY089_HES_Ethnos_TPID] ethntp ON apcpid.TOKEN_PERSON_ID = ethntp.TOKEN_PERSON_ID
) AS hes"
)){
x <- datalink.prep.fun(x) # prepare linelist data for HES linkage
odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo") # Check names of the tables in the database
dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
tableid <- Id(schema = "dbo", table = PII_data_name)
dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
linked_ethn <- dbSendQuery(data_lake,hes_qry)
linked_ethn <- dbGetQuery(data_lake,glue("select * from {target_table}"))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
table_name <- target_table
# output table generated from query
ethnicity_linked_data <- dbReadTable(data_lake,table_name) # Extracting the ethnicity linked data from the DataLake
ethnicity_linked_data <- distinct(ethnicity_linked_data)
# get rid of temporary tables
dbSendQuery(data_lake, glue("drop table {target_table}"))
dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
# disconnect from DL
dbDisconnect(data_lake)
write.csv(ethnicity_linked_data, "Data_carriage/Linelist/ethnlinked_intermediate.csv", row.names=F)
# reassign to data df
x <- as_tibble(ethnicity_linked_data)
# clean ethnicity linked linelist
x <- ethlinked.data.clean.fun(x)
# format IMD quintile variable as factor with labels
x <- imd.fac.label.fun(x)
return(x)
}
# /////////////////
# specify function to link LIMS linelist to ECDS ethnicity fields by creating a temporary table in SQL
ecds.ethnlink.fun <- function(x, # where main input is LIMS linelist
rx_dsn = "Y080_UID_PID",
data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
'database=',rx_dsn,';trusted_connection=true'),
data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect), # create DataLake connection
PII_data_name = paste0("JR_ecds_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")), # PII to upload to DataLake: table name needs to be unique and not previously on the Database
target_table = paste0("JR_ecds_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
hes_qry = glue(
"SELECT *
INTO [Y080_UID_PID].[dbo].[{target_table}] -- new table of linked data, make sure it is a new name
FROM (
SELECT DISTINCT espamr.*
-- , ethntp.[PatientUsualAddressImdDecile]
-- , ethntp.[PatientUsualAddressImdDecileDescription]
-- , ethntp.[PatientUsualAddressRuralUrbanIndicator]
, ethntp.[PatientEthnicCategoryCode]
FROM [Y080_UID_PID].[dbo].[{PII_data_name}] espamr
LEFT JOIN [Y080_UID_PID].[dbo].[vY092_ECDS_SUS_PID] ethntp ON ethntp.NhsNumber = espamr.Patient_NHS_No AND CONVERT(DATE, CAST(ethntp.PatientBirthDate AS DATE),3) = espamr.dateofbirth
) AS hes"
)){
x <- datalink.prep.fun(x) # prepare linelist data for HES linkage
odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo") # Check names of the tables in the database
dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
tableid <- Id(schema = "dbo", table = PII_data_name)
dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
linked_ecds_ethn <- dbSendQuery(data_lake,hes_qry)
linked_ecds_ethn <- dbGetQuery(data_lake,glue("select * from {target_table}"))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
table_name <- target_table
# output table generated from query
ecds_ethnicity_linked_data <- dbReadTable(data_lake,table_name) # Extracting the ethnicity linked data from the DataLake
ecds_ethnicity_linked_data <- distinct(ecds_ethnicity_linked_data)
# get rid of temporary tables
dbSendQuery(data_lake, glue("drop table {target_table}"))
dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
# disconnect from DL
dbDisconnect(data_lake)
# reassign to data df
x <- as_tibble(ecds_ethnicity_linked_data)
x <- x %>% rename(ethn_code=PatientEthnicCategoryCode)
return(x)
}
# /////////////////
# specify function to clean ethnicity linked LIMS linelist prior to regrouping ethnicity
ethlinked.data.clean.fun <- function(x){ # where main input is ethnicity-enriched LIMS linelist
# include pid_str and HES token ID in separate lookup
x <- x %>% select(any_of(c("pid_str","year","site","Lab_Geography_Name_Current","specdate","dob_specdate","spectype","species","forename","surname","nhsno","Patient_NHS_No","dob","age","agegroup","dateofbirth","sex","IMD2019_Quintiles_LSOA11_England","Ethnicity_group","NEW_ETHNOS_Ethnic_group","NEW_ETHNOS_Ethnic_subgroup","Fact_of_Death","Date_of_Death"))) %>%
rename_at(vars(starts_with("IMD2019_Quintiles_LSOA11_England")), ~ "imd_quintile") %>% # NEXT: apply rename_at to factofdeath and dateofdeath
# rename_at(vars(starts_with("Fact_of_Death")), ~ "factofdeath") %>% # optional: may not have death field in linelist (less likely for women of child bearing age)
# rename_at(vars(starts_with("Date_of_Death")), ~ "dateofdeath") %>%
rename_at(vars(starts_with("NEW_ETHNOS_Ethnic_subgroup")), ~ "Ethnic_group") %>%
# mutate(across((contains("factofdeath") | contains("Fact_of_Death")), .fns = temp_fun, .names = "factofdeath")) %>%
# mutate(factofdeath=case_when(factofdeath=="D" ~ 1,
# is.na(factofdeath) ~ 0)) %>%
mutate(note = case_when(
dateofbirth=="NA" | is.na(dateofbirth) | nhsno=="NA" | is.na(nhsno) ~ "insufficient PII for ethn trace",
!is.na(nhsno) | !is.na(dateofbirth) ~ "PII available for ethn trace",
!is.na(nhsno) & nhsno!="NA" & !is.na(dateofbirth) & dateofbirth!="NA" ~ "ethnicity not available")
)
}
# /////////////////
# specify function to generate binary (white vs non-white) ethnicity categorical variable for model
ethn.binary.fac.fun <- function(x){
x$hes_ethnicity <- fct_collapse(x$hes_ethnicity,
"white" = c("White"),
"Nonwhite" = c("Black","Asian","Other","Mixed"))
return(x)
}
# /////////////////
# specify function to assign IMD quintile field in linelist as factor with corresponding labels
imd.fac.label.fun <- function(x){
x$imd_quintile <- factor(x$imd_quintile,
levels=c('1','2','3','4','5'),
labels = c("20% most deprived","20% to 40%","40% to 60%","60% to 80%","20% least deprived"))
out <- x
}
linelist.ethcat5.fun <- function(x){
ethn.vec <- sort(unique(x$NEW_ETHNOS_Ethnic_group))
# [1] "99 Not known"
# [2] "Any other ethnic group"
# [3] "Asian / Asian British"
# [4] "Black / African / Caribbean / Black British"
# [5] "Mixed / Multiple ethnic groups"
# [6] "White"
# [7] "Z Not stated"
x <- x %>% mutate(hes_ethnicity="")
# for unknown group and not linked
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% c(ethn.vec[1], ethn.vec[7]) ~ "Unknown",
is.na(NEW_ETHNOS_Ethnic_group) ~ "Not linked",
TRUE ~ hes_ethnicity
))
# for Black, Black British, Caribbean or African group
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% ethn.vec[4] ~ "Black",
TRUE ~ hes_ethnicity
))
# for Mixed
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% ethn.vec[5] ~ "Mixed",
TRUE ~ hes_ethnicity
))
# for Other
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% ethn.vec[2] ~ "Other",
TRUE ~ hes_ethnicity
))
# for Asian, Asian British
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% ethn.vec[3] ~ "Asian",
TRUE ~ hes_ethnicity
))
# for White
x <- x %>%
mutate(hes_ethnicity = case_when(
NEW_ETHNOS_Ethnic_group %in% ethn.vec[6] ~ "White",
TRUE ~ hes_ethnicity
)) %>%
select(-c("Ethnic_group","NEW_ETHNOS_Ethnic_group")) %>% mutate(z=1)
x <- if ("Ethnicity_group" %in% names(x)) {
x %>% rename(lim_ethnicity=Ethnicity_group) } else { # drop reduntant Ethnicity assignment columns
x <- x
}
# add in count variable prior to grouping by ethnicity category
}
# /////////////------------ HES APC LINKAGE FOR PREGNANCY INDICATORS ------------ /////////////////
hesapc.link.fun <- function(x, # where main input is LIMS linelist
rx_dsn = "Y080_UID_PID",
data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
'database=',rx_dsn,';trusted_connection=true'),
data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect), # create DataLake connection
PII_data_name = paste0("JR_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")), # PII to upload to DataLake: table name needs to be unique and not previously on the Database
target_table = paste0("JR_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
hes_qry = glue(
"-- Step 1: Create target table if it doesn't exist
CREATE TABLE [Y080_UID_PID].[dbo].[{target_table}] (
Patient_NHS_No VARCHAR(50),
dateofbirth DATE,
specdate DATE,
FYEAR INT,
EPIKEY VARCHAR(50),
DOBBABY_1_DV DATE,
ADMIDATE DATE,
ADMIDATE_DV DATE,
DISDATE DATE,
DISDATE_DV DATE,
EPITYPE VARCHAR(50),
OPERTN_01 VARCHAR(50),
OPDATE_01 DATE,
BIRSTAT_1 VARCHAR(50),
DELMETH_D VARCHAR(50),
GESTAT_1 VARCHAR(50),
DIAG_01 VARCHAR(50),
DIAG_02 VARCHAR(50),
DIAG_03 VARCHAR(50),
DIAG_04 VARCHAR(50),
DIAG_05 VARCHAR(50),
PROCODE3 VARCHAR(50)
);
-- Step 2: Insert data into target table using CTEs
;WITH filtered_hesapc AS (
SELECT FYEAR, EPIKEY, PROCODE3
FROM [HES_APC].[dbo].[vtHES_APC]
WHERE FYEAR IN (2122, 2223, 2324) AND PROCODE3 IN ('RTG', 'RD8', 'RFW', 'R1H', 'RXR', 'RJR', 'RQ3', 'RWY', 'RXF', 'RC9', 'R1K', 'RVR', 'RTD', 'RM3', 'RTK', 'RK9', 'RBT')
),
filtered_admis AS (
SELECT *
FROM [HES_APC].[dbo].[vHES_APC_Flat]
WHERE FYEAR IN (2122, 2223, 2324)
),
filtered_oprtn AS (
SELECT *
FROM [HES_APC].[dbo].[vHES_APC_OPERTN_Flat]
WHERE FYEAR IN (2122, 2223, 2324)
),
filtered_mat AS (
SELECT *
FROM [HES_APC].[dbo].[vtHES_APC_MAT]
WHERE FYEAR IN (2122, 2223, 2324)
),
filtered_diag AS (
SELECT *
FROM [HES_APC].[dbo].[vHES_APC_DIAG_Flat]
WHERE FYEAR IN (2122, 2223, 2324)
),
filtered_apcpid AS (
SELECT *
FROM [Y080_UID_PID].[dbo].[vY080_UID_HESAPC_PID]
)
INSERT INTO [Y080_UID_PID].[dbo].[{target_table}]
SELECT
uploadeddata.Patient_NHS_No,
uploadeddata.dateofbirth,
uploadeddata.specdate,
apcpid.FYEAR,
apcpid.EPIKEY,
apcpid.DOBBABY_1_DV,
admis.ADMIDATE,
admis.ADMIDATE_DV,
admis.DISDATE,
admis.DISDATE_DV,
admis.EPITYPE,
oprtn.OPERTN_01,
oprtn.OPDATE_01,
mat.BIRSTAT_1,
mat.DELMETH_D,
mat.GESTAT_1,
diag.DIAG_01,
diag.DIAG_02,
diag.DIAG_03,
diag.DIAG_04,
diag.DIAG_05,
hesapc.PROCODE3
FROM [Y080_UID_PID].[dbo].[{PII_data_name}] AS uploadeddata
LEFT JOIN
filtered_apcpid AS apcpid ON apcpid.NEWNHSNO = uploadeddata.Patient_NHS_No
AND CONVERT(DATE, apcpid.DOB_DV, 3) = uploadeddata.dateofbirth
INNER JOIN
filtered_oprtn AS oprtn ON apcpid.EPIKEY = oprtn.EPIKEY AND apcpid.FYEAR = oprtn.FYEAR
INNER JOIN
filtered_mat AS mat ON apcpid.EPIKEY = mat.EPIKEY AND apcpid.FYEAR = mat.FYEAR
INNER JOIN
filtered_admis AS admis ON mat.EPIKEY = admis.EPIKEY AND mat.FYEAR = admis.FYEAR
INNER JOIN
filtered_diag AS diag ON admis.EPIKEY = diag.EPIKEY AND diag.FYEAR = admis.FYEAR
INNER JOIN
filtered_hesapc AS hesapc ON diag.EPIKEY = hesapc.EPIKEY AND hesapc.FYEAR = diag.FYEAR;")){
x <- datalink.prep.fun(x) # prepare linelist data for HES linkage
odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo") # Check names of the tables in the database
dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
tableid <- Id(schema = "dbo", table = PII_data_name)
dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
linked_data <- dbSendQuery(data_lake,hes_qry)
linked_data <- dbGetQuery(data_lake,glue("select * from {target_table}"))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
table_name <- target_table
# output table generated from query
hesapc_linked_data <- dbReadTable(data_lake,table_name) # Extracting the ethnicity linked data from the DataLake
hesapc_linked_data <- distinct(hesapc_linked_data)
# get rid of temporary tables
dbSendQuery(data_lake, glue("drop table {target_table}"))
dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
# disconnect from DL
dbDisconnect(data_lake)
# reassign to data df
x <- as_tibble(hesapc_linked_data)
setwd(wd)
# export for further processing (ethnic group assignment according to CHIME methodology) in Stata
write.csv(x, "Data_carriage/Linelist/static_basedata_hesapclinked.csv", row.names=F)
return(x)
}
Add a new chunk by clicking the Insert Chunk button on the
toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and
output will be saved alongside it (click the Preview button or
press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the
editor. Consequently, unlike Knit, Preview does not
run any R code chunks. Instead, the output of the chunk when it was last
run in the editor is displayed.
---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}


# /////////////------------ LIMS DATA FORMATTING ------------ /////////////////

# /////////////////
# specify function to standardise format of LIMS data for analysis
lims.stnd.format.fun <- function(x,
                                 field_names = c("specimen number",                         ## specify all possible field names for each of the essential indicators
                                                 "crn (hospital) number","patient hospital number",                    
                                                 "nhs number","patient nhs number",                                   
                                                 "forename","patient forename",                                                                       
                                                 "surname","patient surname",                                                           
                                                 "date of birth","patient date of birth","patient dob",         
                                                 "sex","patient sex",        
                                                 "ethnic origin desc","ethnicity",                      
                                                 "post code","patient postcode",     
                                                 "date of specimen","specimen date", 
                                                 "specimen type desc","specimen type",                          
                                                 "organism desc","result text","organism comment","organism species name",            ## field describing organism isolated, including GBS negative and no organism                                                    
                                                 "antibiotic desc","antimicrobial",                                                   
                                                 "result","susceptibility test result")){                                                     
  
  names(x) <- tolower(gsub("\\.", "", names(x)))                                       # ensure all field names lower case
  
  x <- x %>% select(any_of(field_names))                                               # select essential indicators
  
  names(x) <- dplyr::case_when((names(x) == "specimen number") ~ "specno",                                       ## standardise field names for each of the essential indicators
                               (names(x) == "crn (hospital) number" | names(x) == "patient hospital number") ~ "hospno",
                               (names(x) == "nhs number" | names(x) == "patient nhs number") ~ "nhsno",
                               (names(x) == "forename" | names(x) == "patient forename") ~ "forename",
                               (names(x) == "surname" | names(x) == "patient surname") ~ "surname",
                               (names(x) == "date of birth" | names(x) == "patient date of birth" | names(x) == "patient dob") ~ "dob",
                               (names(x) == "sex" | names(x) == "patient sex") ~ "sex",
                               (names(x) == "ethnic origin desc" | names(x) == "ethnicity") ~ "ethnicity",
                               (names(x) == "post code" | names(x) == "patient postcode") ~ "patient_postcode",
                               (names(x) == "date of specimen" | names(x) == "specimen date") ~ "specdate",
                               (names(x) == "specimen type desc" | names(x) == "specimen type") ~ "spectype",
                               (names(x) == "organism desc" | names(x) == "result text" | names(x) == "organism comment" | names(x) == "organism species name") ~ "species",
                               (names(x) == "antibiotic desc" | names(x) == "antimicrobial") ~ "abx",
                               (names(x) == "result" | names(x) == "susceptibility test result") ~ "ast_result")
  
  x <- x %>% 
    mutate(species=case_when(
      grepl("Beta Haemolytic Streptococcus Group B",species) | 
        grepl("^Streptococcus agalactiae",species) | 
        species == "Streptococcus Group B" | 
        species == "S.agalactiae(Group B strep)" | 
        species == "Group B Haemolytic Strep"  | 
        species == "Strep.agalactiae (Group B)" | 
        species == "Group B Streptococcus isolated" | 
        species == "Group B Streptococcus ISOLATED" ~ "Group B Streptococcus",
      species == "Group B Strep NOT isolated" | 
        species== "Streptococcus group B NOT isolated" | 
        species=="Group B Streptococcus Not isolated" | 
        species== "No organism detected" | 
        species=="Negative after 1 day" | 
        species=="Organism Species Name" |
        grepl("^Candida",species) ~ "GBS not isolated",
      TRUE ~ species),
      ethnicity=case_when(
      grepl("xa0Ethnic", ethnicity) ~ "Any other ethnic group", 
      TRUE ~ ethnicity)
)
        x <- x[, !duplicated(colnames(x))]                                                                 ## remove duplicate cols
  
        # add in antimicrobial cols if not already in data (for GBS neg)
        if(!"abx" %in% colnames(x)){
          x$abx <- NA
        }
        if(!"ast_result" %in% colnames(x)){
          x$ast_result <- NA
        }
          
  x <- x %>% mutate(abx = case_when(
    abx=="" ~ NA,
    TRUE ~ abx),
    ast_result = case_when(
      ast_result=="" ~ NA,
      TRUE ~ ast_result)) %>% 
    mutate(ast_result=toupper(ast_result),
           module="LIMS")
  
  ## converting specimen date and dob to correct date format
  x <- x %>%
    mutate(
      dob = case_when(
        inherits(dob, "POSIXct") ~ as.Date(dob),  # POSIXct to Date
        inherits(dob, "IDate") ~ as.Date(as.character(dob)),  # IDate to Date
        TRUE ~ as.Date(dob, format = "%d/%m/%Y")  # Character string "DD/MM/YYYY" to Date
      )
    ) %>% 
    mutate(
      specdate = case_when(
        inherits(specdate, "POSIXct") ~ as.Date(specdate),  # POSIXct to Date
        inherits(specdate, "IDate") ~ as.Date(as.character(specdate)),  # IDate to Date
        TRUE ~ as.Date(specdate, format = "%d/%m/%Y")  # Character string "DD/MM/YYYY" to Date
      )
    )                                    ## converting specimen date to correct date format
  
  return(x)       # function output
  
}


# specify species column formatting function
species_format_fun <- function(x){
  x <- x %>% 
    mutate(species=case_when(
      grepl("Beta Haemolytic Streptococcus Group B",species) | 
        grepl("^Streptococcus agalactiae",species) | 
        species == "Streptococcus Group B" | 
        species == "S.agalactiae(Group B strep)" | 
        species == "Group B Haemolytic Strep"  | 
        species == "Strep.agalactiae (Group B)" | 
        species == "Group B Streptococcus isolated" | 
        species == "Group B Streptococcus ISOLATED" |
        species== "beta Haemolytic Strep, group B" ~ "Group B Streptococcus",
      species == "Group B Strep NOT isolated" | 
        species== "Streptococcus group B NOT isolated" | 
        species=="Group B Streptococcus Not isolated" | 
        species== "No organism detected" |
        species=="No organism recorded" |
        species=="Negative after 1 day" | 
        species=="Organism Species Name" |
        grepl("^Candida",species) ~ "GBS not isolated",
      TRUE ~ species)
    )
}




# /////////////------------ GBS CARRIAGE ESTIMATION ------------ /////////////////


# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.byethn.fun <- function(x){
  
  outdf <- x %>% mutate(z=1) 
  # %>% filter(hes_ethnicity!="Unknown" & hes_ethnicity!="Not linked" & hes_ethnicity!="_fromLIMS") 

  
  ethn_summary <- outdf %>% group_by(hes_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byethn <- outdf %>% group_by(hes_ethnicity) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byeth <- outdf %>% group_by(hes_ethnicity) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byethn <- merge(gbspos_byethn,total_byeth,by="hes_ethnicity") %>% mutate(carriage_bythn=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byethn,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by="hes_ethnicity") 
  carriage_summary <- carriage_summary %>% 
    rowwise() %>%
    mutate(
      conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) %>%
    ungroup()
    
  return(carriage_summary)
  
}


# /////////////////
# specify function to reshape and calculate GBS carriage by site and ethnicity (% total patients in sampling timeframe)
gbs.carr.bysite.byethn.fun <- function(x){
  
  outdf <- x %>% mutate(z=1) 
  # %>% filter(hes_ethnicity!="Unknown" & hes_ethnicity!="Not linked" & hes_ethnicity!="_fromLIMS") 
  
  
  ethn_summary <- outdf %>% group_by(site,hes_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byethn <- outdf %>% group_by(site,hes_ethnicity) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byeth <- outdf %>% group_by(site,hes_ethnicity) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byethn <- merge(gbspos_byethn,total_byeth,by= c("site","hes_ethnicity")) %>% mutate(carriage_bythn=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byethn,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by=c("site","hes_ethnicity")) 
  carriage_summary <- carriage_summary %>% 
    rowwise() %>%
    mutate(
      conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) %>%
    ungroup()
  
  return(carriage_summary)
  
}



# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe) among Asian ethnic subgroups
# Indian, Pakistani and Bangladeshi. 
gbs.carr.bysubethn.fun <- function(x){
  
  outdf <- x %>% mutate(z=1) %>% filter(Ethnic_group=="Indian (Asian or Asian British)" | 
                                          Ethnic_group=="Bangladeshi (Asian or Asian British)" |
                                          Ethnic_group=="Pakistani (Asian or Asian British)")
  
  ethn_summary <- outdf %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byethn <- outdf %>% group_by(Ethnic_group) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byeth <- outdf %>% group_by(Ethnic_group) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byethn <- merge(gbspos_byethn,total_byeth,by="Ethnic_group") %>% mutate(carriage_bythn=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byethn,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by="Ethnic_group") 
  # %>% mutate(across(where(is.numeric), round, 2))
  
  return(carriage_summary)
  
}


# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe) for all 16 ethnicity categories in HES
gbs.carr.byallsubethn.fun <- function(x){
  
  outdf <- x %>% mutate(z=1) 
  # %>% filter(Ethnic_group!="NA" & Ethnic_group!="99 Not known" & Ethnic_group!="Z Not stated")
  
  ethn_summary <- outdf %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byethn <- outdf %>% group_by(Ethnic_group) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byeth <- outdf %>% group_by(Ethnic_group) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byethn <- merge(gbspos_byethn,total_byeth,by="Ethnic_group") %>% mutate(carriage_bythn=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byethn,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by="Ethnic_group") 
  # %>% mutate(across(where(is.numeric), round, 2))
  
  return(carriage_summary)
  
}


# /////////////////
# specify function to reshape and calculate GBS carriage by IMD (% total patients in sampling timeframe)
gbs.carr.byimd.fun <- function(x){
  
  outdf <- x %>% mutate(z=1) 
  # pivot_wider(                       # currently pivots wider as part of obai process
  #   names_from = abx,
  #   values_from = ast_result) 
  
  imd_summary <- outdf %>% group_by(imd_quintile) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by IMD quintile
  gbspos_byimd <- outdf %>% group_by(imd_quintile) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by IMD
  
  # total ECM by IMD quintile
  total_byimd <- outdf %>% group_by(imd_quintile) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by IMD 
  
  # carriage by ethnicity
  carriage_byimd <- merge(gbspos_byimd,total_byimd,by="imd_quintile") %>% mutate(carriage_byimd=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byimd,gbs_carriage)
  carriage_summary <- merge(imd_summary,carriage_summary, by="imd_quintile") %>% mutate(imd_quintile = case_when(
    is.na(imd_quintile) ~ "Not linked",
    TRUE ~ as.character(imd_quintile)))
  
  carriage_summary <- carriage_summary %>% 
    rowwise() %>%
    mutate(
      conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) %>%
    ungroup()
  
  return(carriage_summary)
  
}


# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.byage.fun <- function(x){
  
  outdf <- x %>% mutate(z=1,
                        agegroup = cut(age, 
                                       breaks = seq(0, 100, by = 5),  # Specify the breaks for the age bands
                                       right = FALSE,  # Include the left endpoint, exclude the right
                                       labels = paste(seq(0, 95, by = 5), seq(4, 99, by = 5), sep = "-")),
                        agegroup = case_when(
                          agegroup %in% c("40-44","45-49") ~ "40-49",
                          TRUE ~ agegroup
                        )) 
  # pivot_wider(                       # currently pivots wider as part of obai process
  #   names_from = abx,
  #   values_from = ast_result) 
  
  age_summary <- outdf %>% group_by(agegroup) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byage <- outdf %>% group_by(agegroup) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byage <- outdf %>% group_by(agegroup) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byage <- merge(gbspos_byage,total_byage,by="agegroup") %>% mutate(carriage_byage=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byage,gbs_carriage)
  carriage_summary <- merge(age_summary,carriage_summary, by="agegroup") 
  
  carriage_summary <- carriage_summary %>% 
    rowwise() %>%
    mutate(
      conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) %>%
    ungroup()
  
  return(carriage_summary)
  
}



# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients in sampling timeframe)
gbs.carr.bysite.fun <- function(x){
  
  outdf <- x %>% mutate(z=1)
  
  # pivot_wider(                       # currently pivots wider as part of obai process
  #   names_from = abx,
  #   values_from = ast_result) 
  
  site_summary <- outdf %>% group_by(site) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_bysite <- outdf %>% group_by(site) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_bysite <- outdf %>% group_by(site) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_bysite <- merge(gbspos_bysite,total_bysite,by="site") %>% mutate(carriage_bysite=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_bysite,gbs_carriage)
  carriage_summary <- merge(site_summary,carriage_summary, by="site") 
  
  carriage_summary <- carriage_summary %>% 
    rowwise() %>%
    mutate(
      conf = list(binom.confint(gbspos, totalecm, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) %>%
    ungroup()
  
  return(carriage_summary)
  
}



# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients) by year
gbs.carr.byyear.fun <- function(x){
  
  outdf <- x %>% mutate(z=1)
  
  ethn_summary <- outdf %>% group_by(year) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byyear <- outdf %>% group_by(year) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byyear <- outdf %>% group_by(year) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byyear <- merge(gbspos_byyear,total_byyear,by="year") %>% mutate(carriage_byyear=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byyear,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by="year") 
  # %>% mutate(across(where(is.numeric), round, 2))
  
  return(carriage_summary)
  
}


# /////////////////
# specify function to reshape and calculate GBS carriage (% total patients) by month
gbs.carr.byym.fun <- function(x){
  
  outdf <- x %>% mutate(z=1)
  
  ethn_summary <- outdf %>% group_by(ym) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  
  # GBS pos by ethnicity
  gbspos_byym <- outdf %>% group_by(ym) %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # GBS pos by ethnicity
  
  # total ECM by ethnicity
  total_byym <- outdf %>% group_by(ym) %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM by ethnicity
  
  # carriage by ethnicity
  carriage_byym <- merge(gbspos_byym,total_byym,by="ym") %>% mutate(carriage_byym=gbspos/totalecm*100)    # carriage estimate
  
  # all GBS pos
  gbspos_all <- outdf %>% mutate(gbspos=ifelse(species=="Group B Streptococcus",1,0)) %>% summarize(gbspos=sum(gbspos))   # all GBS pos 
  
  # total 
  total <- outdf %>% mutate(total=1) %>% summarize(totalecm=sum(total))   # total ECM 
  
  # carriage
  gbs_carriage <- gbspos_all/total*100    # carriage estimate
  gbs_carriage <- gbs_carriage %>% rename("total_carriage" = gbspos)
  
  carriage_all <- gbspos_all/length(unique(outdf$nhsno))*100
  
  carriage_summary <- cbind(carriage_byym,gbs_carriage)
  carriage_summary <- merge(ethn_summary,carriage_summary, by="ym") 
  # %>% mutate(across(where(is.numeric), round, 2))
  
  return(carriage_summary)
  
}


# /////////////------------ LIMS LINELIST PROCESSING AND AMR RESULT GENERATION ------------ /////////////////



# /////////////////
# specify umbrella QC function 
qc.umbrella.fun <- function(x,summary=list()){
  x$sn <- seq_len(nrow(x))
  x$qc <- sgss.qc.fun(x$hospno, x$specno, x$forename, x$surname)
  x$qc[which(!x$qc %in% c(0, NA))] <- 1
  summary$qc_samples <- table(x$qc, x$module)
  cat(paste0("Removed ",sum(x$qc) , " quality control samples\n\n"))
  x <- x[x$qc==0,]
}


# /////////////////
# specify function to QC data (from obai::sgss_qc_records fun)
sgss.qc.fun <- function (hospno, specno, forename, surname) 
{
  hospno <- ifelse(is.na(hospno), "", toupper(as.character(hospno)))
  specno <- ifelse(is.na(specno), "", toupper(as.character(specno)))
  forename <- ifelse(is.na(forename), "", toupper(as.character(forename)))
  surname <- ifelse(is.na(surname), "", toupper(as.character(surname)))
  hospno <- stringr::str_trim(hospno)
  hospno <- stringr::str_replace_all(hospno, "^0*0", "")
  qc <- ifelse(stringr::str_detect(hospno, "SENS|NEQAS|NEQSA|^QC|QUALITY|IQA|DIST|SPEC|^TBT|LIQAS|TEST-1") | 
                 stringr::str_detect(specno, "SENS|NEQAS|NEQSA|QC|QUALITY|^IQA|^DIST|SPEC") | 
                 stringr::str_detect(surname, "^SENS|NEQAS|NEQSA|QC|IQA|QUALITY|ANTIMICROBIAL SUSCEPTABILITY|ANTIFUNGAL SUSCEPTIBILITY|INTERNAL QC|FEBUARY|IQAMIC2024") | 
                 stringr::str_detect(forename, "^SENS|NEQAS|NEQSA|IQA|FAECES|QUALITY|ANTIMICROBIAL SUSCEPTABILITY|ANTIFUNGAL SUSCEPTIBILITY|ANTIMICROBIAL|BLOOD CULTURE|DIAGNOSTICS"), 
               1, 0)
  qc <- ifelse(forename %in% toupper(month.name),1,0)
  
  return(qc)
}


# /////////////////
# specify customised obai sgss_infections function for LIMS data deduplication and AMR results generation
obai.dedup.fun <- function(allsites,
                           linkage_strategy = "default",
                           episode_length=274,   # equivalent of 9 months to capture second pregnancies within time frame of ECM testing at a given site
                           recurrence_length = episode_length,
                           episode_type = "fixed",
                           dedup_module_preference = NULL,
                           abx=TRUE,
                           abx_preference = "severity_all"){
  
  allsites$mrk <- diyar::combi(
    allsites$species,
    allsites$nhsno,
    allsites$hospno,
    allsites$dob,
    allsites$specdate,
    allsites$sex,
    allsites$site,
    allsites$surname,
    allsites$forename,
    allsites$specno)
  
  dedup_level = c("pids", "species")
  abx=TRUE
  
  if("LIMS" %in% dedup_level) allsites$mrk <- diyar::combi(allsites$mrk, allsites$module)
  if("module" %in% dedup_level) allsites$mrk <- diyar::combi(allsites$mrk, allsites$module)
  if(!isFALSE(abx)) allsites$mrk <- diyar::combi(allsites$mrk, allsites$abx, allsites$ast_result)
  
  summary=list()
  
  same_day_dups <- duplicated(allsites$mrk)
  summary$same_day_dups <- length(same_day_dups[same_day_dups])
  
  allsites <- allsites[!same_day_dups,]
  allsites$mrk <- NULL
  cat(summary$same_day_dups,"same-day depulicates removed\n\n")
  
  
  cat("Creating a patient identifier: `",
      linkage_strategy, "` method\n")
  # Make configurable
  # Generating a patient identifier
  wr_nhs <- lapply(0:9, function(x){
    paste0(rep(x, 10), collapse = "")
  })
  
  df <- allsites
  
  if(linkage_strategy == "default"){
    df$soundex <- stringdist::phonetic(df$surname)
    # Inclusion criteria
    wr_nhs <- as.character(wr_nhs)
    df$cri_1 <- df$cri_2 <- df$cri_3 <- NA
    lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) | nchar(df$nhsno) != 10))
    # CR1 - Valid NHS no
    df$cri_1[lgk] <- df$nhsno[lgk]
    lgk <- which(!(df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) | df$soundex %in% c("", NA)))
    # CR2 - Valid HOS no and surname Soundex
    df$cri_2[lgk] <- diyar::combi(df$hospno[lgk], df$soundex[lgk])
    lgk <- which(!(df$specno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) | df$soundex %in% c("", NA) | df$sex %in% c("U","Unknown", NA) | df$dob %in% c("", NA)))
    # CR3 - Valid Specimen number, Soundex NHS and sex
    df$cri_3[lgk] <- diyar::combi(df$specno[lgk], df$soundex[lgk], df$sex[lgk])
    # AND (Valid DDMM or DDYY or MMYY)
    df$cri_pt1 <- df$cri_pt2 <- df$cri_pt3 <- NA
    lgk <- which(!(df$dob %in% c("", NA)))
    df$cri_pt1[lgk] = diyar::combi(substr(df$dob[lgk],1,4), substr(df$dob[lgk],6,7))
    df$cri_pt2[lgk] = diyar::combi(substr(df$dob[lgk],1,4), substr(df$dob[lgk],9,10))
    df$cri_pt3[lgk] = diyar::combi(substr(df$dob[lgk],9,10), substr(df$dob[lgk],6,7))
    linked_s_criteria <- list(cr3 = diyar::sub_criteria(df$cri_pt1, df$cri_pt2, df$cri_pt3))
    
    # Order; CR1 to CR3
    criteria <- list(df$cri_1, df$cri_2, df$cri_3)
    df$soundex <- NULL
  }else if(linkage_strategy == "uid_gold"){
    # Inclusion criteria
    wr_nhs <- as.character(wr_nhs)
    df$cri_1 <- df$cri_2 <- df$cri_3 <- NA
    lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
                     df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) |
                     df$dob %in% c("", "1900-01-01", NA) |
                     df$surname %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA) |
                     df$hosno %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", NA)))
    # CR1 - Valid NHS no, DOB, forename and surname
    df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk], df$forename[lgk], df$surname[lgk])
    lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
                     df$dob %in% c("", "1900-01-01", NA) |
                     df$sex %in% c("","NO PATIENT ID","UNKNOWN","NO REF GIVEN", "nk", "U","Unknown", NA)))
    # CR2 - Valid NHS no, DOB and sex
    df$cri_2[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk], df$sex[lgk])
    lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
                     df$dob %in% c("", "1900-01-01", NA)))
    # CR3 - Valid NHS no and DOB
    df$cri_3[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
    
    # Order; CR1 to CR3
    criteria <- list(df$cri_1, df$cri_2, df$cri_3)
    linked_s_criteria <- NULL
    
  }else if(linkage_strategy == "basic"){
    # Inclusion criteria
    wr_nhs <- as.character(wr_nhs)
    df$cri_1 <- NA
    lgk <- which(!(df$nhsno %in% c(".","",wr_nhs,"0","1234567890","9876543210",NA) |
                     df$dob %in% c("", "1900-01-01", NA)))
    # CR1 - Valid NHS no and DOB
    df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
    # Order; CR1
    criteria <- list(df$cri_1)
    linked_s_criteria <- NULL
  }else if(linkage_strategy == "sgss_audit"){
    # 9-stage ordered record linkage to identify patients
    # Place holders for 9 matching criteria and attributes for sub_criteria
    df$cri_1 <- df$cri_2 <- df$cri_3 <-
      df$cri_4 <- df$cri_5 <- df$cri_6 <-
      df$cri_7 <- df$cri_8 <- df$cri_9 <-
      df$cri_dob_pt1 <- df$cri_dob_pt2 <- df$cri_dob_pt3 <- NA
    
    # Place holders for inclusion criteria of each attribute
    v_specno <- v_fname <- v_sname <- v_nhs <- v_dob <- v_hos <- v_sex <- rep(FALSE, nrow(df))
    
    wr_nhs <- lapply(0:9, function(x){
      paste0(rep(x,10), collapse = "")
    })
    
    df$soundex <- stringdist::phonetic(df$surname)
    df$forename_initial <- toupper(substr(df$forename,1, 1))
    v_nhs[!df$nhsno %in% c(".","","9999999999","0","123456789","9876543210", NA, wr_nhs)] <- TRUE
    v_dob[!df$dob %in% c("1900-01-01","", NA)] <- TRUE
    v_hos[!toupper(df$hosno) %in% c("UNKNOWN", "NO PATIENT ID", "","NOT SPECIFIED", NA)] <- TRUE
    v_sex[!toupper(df$sex) %in% c("UNKNOWN","","NOT SPECIFIED",  NA)] <- TRUE
    v_sname[!toupper(df$surname) %in% c("", NA)] <- TRUE
    v_fname[!toupper(df$forename) %in% c("", NA)] <- TRUE
    v_specno[!toupper(df$specno) %in% c("UNKNOWN", "NO PATIENT ID", "","NOT SPECIFIED", NA)] <- TRUE
    
    # Stage 1
    lgk <- which(v_nhs & v_dob)
    df$cri_1[lgk] <- diyar::combi(df$nhsno[lgk], df$dob[lgk])
    
    # Stage 2
    lgk <- which(v_nhs)
    df$cri_2[lgk] <- df$nhsno[lgk]
    
    # Stage 3
    lgk <- which(v_hos & v_dob & v_sname)
    df$cri_3[lgk] <- diyar::combi(df$hosno[lgk],df$dob[lgk], df$soundex[lgk])
    # Stage 4
    lgk <- which(v_hos)
    df$cri_4[lgk] <- df$hosno[lgk]
    
    # Stage 5
    lgk <- which(v_specno & v_sex & v_dob & v_fname & v_sname)
    df$cri_5[lgk] <- diyar::combi(df$specno[lgk], df$lab_cd[lgk], df$sex[lgk],
                                  df$forename_initial[lgk], df$dob[lgk], df$soundex[lgk])
    # Stage 6
    lgk <- which(v_specno & v_sex)
    df$cri_6[lgk] <- diyar::combi(df$specno[lgk], df$lab_cd[lgk], df$sex[lgk])
    # Stage 7
    lgk <- which(v_specno & v_dob)
    df$cri_7[lgk] <- diyar::combi(df$specno[lgk], df$dob[lgk])
    # Stage 8
    # df$cri_8 <- df$cri_7
    
    # Stage 9
    lgk <- which(v_specno)
    df$cri_9[lgk] <- df$specno[lgk]
    lgk <- which(v_dob)
    df$cri_dob_pt1[lgk] <- diyar::combi(lubridate::year(df$dob[lgk]), lubridate::month(df$dob[lgk]))
    df$cri_dob_pt2[lgk] <- diyar::combi(lubridate::year(df$dob[lgk]), lubridate::day(df$dob[lgk]))
    df$cri_dob_pt3[lgk] <- diyar::combi(lubridate::day(df$dob[lgk]), lubridate::month(df$dob[lgk]))
    
    # Sub criteria for stage 6. Also used in stage 9
    s_cri6 <- diyar::sub_criteria(df$forename_initial, df$soundex)
    # Sub criteria for stage 9
    s_cri9 <- diyar::sub_criteria(
      s_cri6,
      diyar::sub_criteria(df$cri_dob_pt1, df$cri_dob_pt2, df$cri_dob_pt3),
      operator = "or")
    
    # Order; CR1 to CR9
    criteria <- list(df$cri_1, df$cri_2, df$cri_3, df$cri_4,
                     df$cri_5, df$cri_6, df$cri_7,
                     # df$cri_8,
                     df$cri_9)
    linked_s_criteria <- list("cr6" = s_cri6, "cr9" = s_cri9)
    df$soundex <- NULL
  }
  
  
  df <- df[names(df)[!grepl("cri_", names(df))]]
  
  # Unlinked records are considered separate entities.
  # This is an issue when querying AMR data as each unlinked test is considered a separate patient.
  # To prevent this, the closest approximation to a patient's sample is used as the final criteria
  criteria <- c(criteria,
                list(diyar::combi(df$nhsno, df$hospno, df$dob, df$surname, df$forename, df$sex, df$specno))
  )
  df$pids <- diyar::links(criteria = criteria,
                          sub_criteria = linked_s_criteria,
                          display = "progress")
  
  cat(paste0("Categorising episode: ",
             episode_length,"-day difference ",
             episode_type, " episodes\n"))
  df$dt <- lubridate::ymd(df$specdate)
  
  
  df$pid_str <- diyar::combi(as.list(df[dedup_level]))
  
  
  if(!is.null(dedup_module_preference)){
    mdls <- c("amr", "cdr")
    dedup_module_preference <- factor(df$sgss_module, levels = c(mdls[mdls == dedup_module_preference],
                                                                 mdls[mdls != dedup_module_preference]))
  }
  
  df$epids <- diyar::episodes(date = df$dt,
                              case_length = episode_length,
                              recurrence_length = recurrence_length,
                              episode_type = episode_type,
                              strata = df$pid_str,
                              group_stats = TRUE,
                              display = "progress",
                              custom_sort = dedup_module_preference)
  
  df$dt <- df$qc <- NULL
  
  # Most severe abx result per episode
  
  # convert instances of "P" and "W" to "R" in ast_result field for indicible clindamycin resistance test results 
  df <- df %>% mutate(ast_result=case_when(
    ast_result == "P" | ast_result == "W" ~ "R",
    TRUE ~ ast_result
  ))
  
  if(!isFALSE(abx)){
    cat("Determining the most severe abx result per episode\n")
    df$amr_severity <- dplyr::case_when(
      df$ast_result == "R" ~ 1,
      df$ast_result == "I" ~ 2,
      df$ast_result == "S" ~ 3,
      df$ast_result == "N" ~ 4,
      TRUE ~ 5
    )
    
    if(c("severity_all") %in% abx_preference){
      df$epi_amr <- jurithy::bys_val(df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
    }else if (c("severity_cdr") %in% abx_preference){
      tp_md <- factor(tolower(df$sgss_module), levels = c("cdr", "amr"))
      df$epi_amr <- jurithy::bys_val(tp_md, df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
      rm(tp_md)
    }else if (c("severity_amr") %in% abx_preference){
      tp_md <- factor(tolower(df$sgss_module), levels = c("amr", "cdr"))
      df$epi_amr <- jurithy::bys_val(tp_md, df$amr_severity, df$sn, by= paste(as.numeric(df$epids), df$abx), val=df$amr_severity)
      rm(tp_md)
    }
    
    df$epi_amr <- c("R","I","S","N","")[df$epi_amr]
    
    abx_recoded <- df$amr!=df$epi_amr
    summary$abx_recoded <- length(abx_recoded[abx_recoded])
    
    abx_results <- dplyr::select(df, .data$epids, .data$abx, e = .data$epi_amr, r = .data$ast_result) %>%
      dplyr::mutate(epids = as.numeric(.data$epids)) %>%
      dplyr::distinct() %>%
      tidyr::pivot_wider(names_from = .data$abx,
                         values_from = c(.data$e, .data$r),
                         values_fn = list(e = ~ paste0(unique(.), collapse = ";"),
                                          r = ~ paste0(unique(.), collapse = ";")),
                         values_fill = list(e ="", r = ""))
    df$amr_severity <- NULL
    
  }else{
    summary$abx_recoded <- NULL
    abx_results <- NULL
  }
  
  df$patients <- as.numeric(!duplicated(df$pids@.Data))
  df$episodes <- as.numeric(!duplicated(df$epids@.Data))
  
  deduped_df <- df %>% filter(patients==1)
  
  deduplication <- dplyr::select(janitor::tabyl(df, .data$species,  .data$patients),  "species", patients = "1")
  deduplication <- dplyr::bind_cols(deduplication,  dplyr::select(janitor::tabyl(df, .data$species,  .data$episodes), episodes = "1"))
  
  summary$deduplication <- deduplication
  
  df$patients <- df$episodes <- df$sn <- df$pid_str <- NULL
  output <- list(
    extracts = allsites,
    processed = df,
    abx_results = abx_results,
    summary = summary)
  
  
  # processed_data <- as.data.frame(output[["processed"]])
  processed_data <- deduped_df
  abx_data <- as.data.frame(output[["abx_results"]])
  
  processed_data$epids <- as.character(processed_data$epids)
  processed_data$epids2 <- sapply(processed_data$epids, function(x) {
    sub("^[^0-9]*0*([1-9][0-9]*)", "\\1", strsplit(x, " ")[[1]][1])
  })
  
  # merge linelist with abx results on epids
  abx_results$epids <- as.character(abx_results$epids)
  deduped_df <- left_join(processed_data,abx_results,by="epids") %>% select(-c("sn","pids","epi_amr","patients","abx","ast_result","episodes","epids2"))
  
  return(deduped_df)
  
}


# /////////////------------ ETHNICITY GROUPINGS ------------ /////////////////

# /////////////////
# specify function to regroup LIMS ethnicity categories to espaur groupings and then to 5 categories for age standardisation 
lims.ethcat.fun <- function(x){

  #create char vec of unique Ethnic groups in ethnicity-linked linelist data
  ethn.vec <- sort(unique(x$ethnicity))   # in alphabetical order for consistency when indexing
  
  # vector positions for each ethnicity report in LIMS
  
  # Unknown 
  # [1] ""  
  # [39]"Not Asked"
  # [40] "Not Given on Form"                                    
  # [41]"Not given/declined"                                   
  # [42]"Not specified"                                       
  # [43] "Not Specified"                                        
  # [44]"Not stated" 
  # [45]"Other - not stated" 
  # [54]"Unable or unwilling to answer"                       
  # [55] "UNKNOWN"                                                                               
  # 
  # Black, Black British, Caribbean or African
  # [2]"African or African-Caribbean"                         
  # [3]"Any other Black background"
  # [19] "Black - any other black background"                   
  # [20]"Black - Caribbean"                                    
  # [21]"Black African"                                       
  # [22] "Black Caribbean"                                      
  # [23]"Black or Black British - African "                    
  # [24]"Black or Black British - Any other Black background "
  # [25] "Black/Black British-Caribbean"                        
  # [26]"Black/Black British African"                          
  # [27]"Black/Black British Caribbean"   
  # 
  # Mixed                                  
  # [29]"Mixed - any other mixed background"                   
  # [30]"Mixed - Any other mixed background "                 
  # [31] "Mixed - White and Asian "                             
  # [32]"Mixed - White and Black African "                     
  # [33]"Mixed Any other mixed background"                    
  # [34] "Mixed White & Black Caribbean"                        
  # [35]"Mixed White and Asian"                                
  # [36]"Mixed White and Black African"                       
  # [37] "Mixed White and Black Caribbean"                      
  # 
  # Other
  # [4] "Any other ethnic group" 
  # [46] "Other Ethnic Group"                                   
  # [47]"Other Non-European"
  # [49] "Other Ethnic Groups - Any other ethnic group " 
  # [53]"Southern & Other European" 
  # [38]"Northern European"                                    
  # 
  # Asian, Asian British
  # [7] "Asian any other Asian background"                     
  # [8]"Asian Bangladeshi"                                    
  # [9]"Asian Indian"                                        
  # [10] "Asian or Asian British - Any other Asian background"  
  # [11]"Asian or Asian British - Bangladeshi "                
  # [12]"Asian or Asian British - Indian "                    
  # [13] "Asian or Asian British - Pakistani "                  
  # [14]"Asian Pakistani"                                      
  # [15]"Asian/Asian British-Pakistani"                       
  # [16] "Asian/Asian British any other"                        
  # [17]"Asian/Asian British Bangladeshi"                      
  # [18]"Asian/Asian British Indian"                          
  # [28] "Chinese" 
  # [50]"Other Ethnic Groups - Chinese "                       
  # [51]"South Asian"                                         
  # [52] "South East Asian"                                     
  # 
  # 
  # 
  # White
  # [5]"Any other white background"                           
  # [6]"Any other White background"  
  # [48]"Other white background"  
  # [56]"White"                                                
  # [57]"WHITE"                                               
  # [58] "White - Any other White background "                  
  # [59]"White - British"                                      
  # [60]"White - Irish"                                       
  # [61] "White British"                                        
  # [62]"White Irish"  
  
  
  x <- x %>% mutate(Ethnicity_group="") 
  
  # for unknown group
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% c(ethn.vec[1], ethn.vec[54], ethn.vec[55]) ~ "Unknown",
      ethnicity %in% ethn.vec[39:45] ~ "Unknown",
      is.na(ethnicity) ~ "Unknown",
      TRUE ~ Ethnicity_group
    ))
  
  # for Black, Black British, Caribbean or African group
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% c(ethn.vec[2], ethn.vec[3]) ~ "Black",
      ethnicity %in% ethn.vec[19:27] ~ "Black",
      TRUE ~ Ethnicity_group
    ))
  
  # for Mixed
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% ethn.vec[29:37] ~ "Mixed",
      TRUE ~ Ethnicity_group
    ))
  
  # for Other
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% c(ethn.vec[4],ethn.vec[46],ethn.vec[47],ethn.vec[49],ethn.vec[53],ethn.vec[38]) ~ "Other",
      TRUE ~ Ethnicity_group
    ))
  
  # for Asian, Asian British
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% ethn.vec[7:9] ~ "Asian",
      ethnicity %in% ethn.vec[10:18] ~ "Asian",
      ethnicity %in% ethn.vec[28] ~ "Asian",
      ethnicity %in% ethn.vec[50:52] ~ "Asian",
      TRUE ~ Ethnicity_group
    ))
  
  # for White
  x <- x %>%
    mutate(Ethnicity_group = case_when(
      ethnicity %in% c(ethn.vec[5], ethn.vec[6], ethn.vec[48]) ~ "White",
      ethnicity %in% ethn.vec[56:62] ~ "White",
      TRUE ~ Ethnicity_group
    )) %>% select(-c("ethnicity")) %>%                                                               # drop reduntant Ethnicity assignment columns
    mutate(z=1)  
  
  # add in count variable prior to grouping by ethnicity category
  
}



# /////////////------------ DBS TRACING ------------ /////////////////


# /////////////////
# specify function to perform initial data quality check on LIMS linelist and to generate susbet of data which missing one or more key fields for tracing
initial.lims.dq.fun <- function(x){
  # nhsno_na <- sum(is.na(x$nhsno))
  # dob_na <- sum(is.na(x$dob))
  # err_date <- as.Date(c("1900-01-01"))
  # dob_err <- sum(x$dob==err_date)
  
  subset <- x %>% filter(is.na(dob) |
                           dob==as.Date(c("1900-01-01")) |
                           is.na(nhsno) |
                           is.na(surname) |
                           sex=="U" |
                           patient_postcode=="Z999")
  
  subset <- subset
}


# /////////////////
# specify function to identify and retain QC sample reports from LIMS linelist, as part of DBS tracing
eqa.record.fun <- function(x){
  
  out <- x %>% filter(dob==as.Date(c("1900-01-01")) |
                        surname=="LQ BLOOD CULTURE" | 
                        surname=="IQAMIC2024" |
                        forename=="EQA" |
                        forename=="FOXTROTFIVEEIGHT" |
                        forename=="NOT USED" |
                        forename=="UNKNOWN" |
                        forename=="PORCINE TRACHEA(NEO)" |
                        surname=="5995753 CTOX" |
                        patient_postcode=="Z999")
  
  out <- dbs.tracing.fields.fun(out)
}


# /////////////////
# specify nested function to prepare LIMS linelist data field formatting for DBS tracing 
dbs.tracing.fields.fun <- function(x){
  
  lims_for_trace <- x %>% arrange(pid_str) %>%  # arrange by patient ID
    rename(Identifier=pid_str,
           Date_of_Birth=dob,
           NHS_number=nhsno,
           Surname=surname,
           Forename=forename,
           Sex=sex) %>% 
    mutate(null1="",
           null2="",
           null3="",
           null4="",
           null5="",
           Sex=(if_else(Sex=="M","1",Sex)),     # 1 = male, 2 = female, NA = unknown
           Sex=(if_else(Sex=="F","2",Sex)),
           Sex=(if_else(Sex=="U","",Sex)),
           Sex=as.numeric(Sex),
           Date_of_Birth=as.Date(Date_of_Birth, "%Y%m%d"),
           Date_of_Birth=as.character(str_replace_all(string=Date_of_Birth, pattern="-", replacement = "") ),
           Identifier=as.numeric(Identifier)) %>% 
    select(null1,Identifier,Date_of_Birth,null2,null3,NHS_number,Surname,null4,Forename,null5,Sex) %>% arrange(Identifier)
  # filter_at(.vars = vars(Date_of_Birth,NHS_number), .vars_predicate = any_vars(!is.na(.)))  # remove instances of episode lacking postcode & NHS
  
  
  lims_for_trace$Forename <- toupper(gsub("\\.", "", lims_for_trace$Forename))
  lims_for_trace$Surname <- toupper(gsub("\\.", "", lims_for_trace$Surname))
  
  return(lims_for_trace)
  
}


# /////////////////
# specify wrap-around function to apply umbrella and nested functions for DBS tracing data prep
dbs.trace.prep.fun <- function(x,
                               mainfile=mainfile,
                               excludefile=excludefile){  # where main input is GroupCG_linelist
  
  excluded_records <- initial.lims.dq.fun(x)                                        # generate subset of data where one or more key fields missing for trace - send these for tracing if minimum necessary PII 
  
  # generate main df for tracing
  lims_for_trace <- lims.fortrace.fun(x,
                                  excluded_records)
  
  # generate excluded records df for tracing
  excluded_records_trace <- excl.records.trace.fun(x,
                                                   excluded_records)
  
  # identify probable QC reports and exclude these from datasets to trace
  eqa_records <- eqa.record.fun(x)
  
  # exclude probable QC sample reports from data to be sent for tracing
  lims_for_trace <- lims_for_trace %>% filter(!Identifier %in% eqa_records$Identifier)                           
  excluded_records_trace <- excluded_records_trace %>% filter(!Identifier %in% eqa_records$Identifier)

  
  # export to csv for sending to DBS team
  write.csv(lims_for_trace, file = mainfile, row.names = F)
  write.csv(excluded_records_trace, file = excludefile, row.names = F)
}


#////////////
#--main umbrella function to prepare linelist data for DBS tracing
lims.fortrace.fun <- function(x, # where x is GroupCG_linelist
                            excluded_records){ 
  
  lims_for_trace <- x %>% filter(!pid_str %in% excluded_records$pid_str)                            # exclude above records from data to be sent for tracing
  
  lims_for_trace <- dbs.tracing.fields.fun(lims_for_trace) }                                        # prepare data formatting for tracing


# /////////////////
# specify function to prepare excluded data for DBS tracing
excl.records.trace.fun <- function(x, # where x is LIMS linelist
                                   excluded_records){
  
  excluded_records_trace <- x %>% filter((pid_str %in% excluded_records$pid_str))             # prepare excluded records for tracing for PII enrichment (these may still be traced although higher probability of failure)
  
  excluded_records_trace <- dbs.tracing.fields.fun(excluded_records_trace)                    # prepare data formatting for tracing
}



# /////////////////
# specify function to format trace return colnames
header.fun <- function(x,   # where x is trace response data
                       y=import(here("Data_carriage","DBS_tracing","trace_response_headers.xlsx"),
                                format = "xlsx",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)){  # where y is response headers 
  
  
  
  # # assign header colnames
  # colnames(y) <- as.character(y[1,])
  
  # retain only successful traces and drop additional column in trace response data so that ncol is 60
  x <- x %>% filter(V1=="20" | V1=="30" | V1=="33" | V1=="40") %>% select(-any_of(c("V61")))                  # retaining those cases that have been successfully traced
  
  # define char vec of trace response headers
  headers <- colnames(y)      
  
  # confirm length of headers is same as number of columns in trace_response 
  print(length(headers))
  
  # define char vec of redundant col names (V1, V2 etc)
  oldnames <- colnames(x)
  
  # rename columns in trace response data based on headers
  x <- x %>% 
    rename_at(vars(oldnames), ~ headers)
}


# /////////////////
# specify function to format trace return colnames for subset of return where trace unsuccessful
 notrace.header.fun <- function(x,   # where x is trace response data
                       y=import(here("Data_carriage","DBS_tracing","trace_response_headers.xlsx"),
                                format = "xlsx",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)){  # where y is response headers 
  
  
  
  # # assign header colnames
  # colnames(y) <- as.character(y[1,])
  
  # drop additional column in trace response data so that ncol is 60
  x <- x %>% select(-any_of(c("V61")))             
  
  # define char vec of trace response headers
  headers <- colnames(y)      
  
  # confirm length of headers is same as number of columns in trace_response 
  print(length(headers))
  
  # define char vec of redundant col names (V1, V2 etc)
  oldnames <- colnames(x)
  
  # rename columns in trace response data based on headers
  x <- x %>% 
    rename_at(vars(oldnames), ~ headers)
}


# /////////////////
# specify function to clean var names for traced return
clean.names.fun <- function(x){
  # clean up col names (replace instances of [...] with spaces)
  names(x) <- str_replace(names(x), "\\[", "")
  names(x) <- str_replace(names(x), "\\]", "")
  strips <- "^\\s+|\\s+$"
  names(x) <- gsub(strips,"",names(x))
  names(x) <- gsub("[[:space:]]","_",names(x))
  x <- x
}


# /////////////////
# specify function to correctly format traced data prior to merge with original linelist
trace.reformat.fun <- function(x){
  x <- x %>% mutate(Date_of_Death=as.character(Date_of_Death)) %>% 
    # mutate(DOB_S=as.character(DOB_S)) %>% 
    mutate(DOB_S=as.Date(as.character(DOB_S), format = "%Y%m%d")) %>%  # takes DBS trace date format (e.g., 19891222) and converts to date e.g., 1989-12-22
    rename(traced=`INSERT_INTO_DBSresponse_(_Response_Type`) %>% 
    rename(pid_str=Unique_Record_ID) %>% 
    select(pid_str,NHS_Number,DOB_S,First_Name_S,Surname_S,Sex_S,PostCode) %>%
    mutate_all(na_if,"")
}

# /////////////////
# specify wrap function to retain records which weren't successfully traced
no.trace.wrap.fun <- function(x){  # where input is lims_trace_return
  no_trace <- x %>% filter(V1!="20" & V1!="30" & V1!="33" & V1!="40")
  no_trace <- notrace.header.fun(no_trace)
  no_trace <- clean.names.fun(no_trace)
  no_trace <- trace.reformat.fun(no_trace)
  return(no_trace)
}

# /////////////////
# specify wrap function to retain records which had unknown patient sex
unknown.sex.wrap.fun <- function(x){  # where input is lims_trace_return
  no_trace <- x %>% filter(V13=="1" | V13=="3")
  no_trace <- notrace.header.fun(no_trace)
  no_trace <- clean.names.fun(no_trace)
  no_trace <- trace.reformat.fun(no_trace)
  return(no_trace)
}

# /////////////////
# specify wrap function to prepare and join trace return with LIMS linelist
traced.wrap.fun <- function(x){ # where input is lims_trace_return
  lims_trace_return <- header.fun(x)
  lims_trace_return <- clean.names.fun(lims_trace_return)
  lims_trace_return <- trace.reformat.fun(lims_trace_return)
  merged_df <- right_join(lims_trace_return,distinct_patients,by="pid_str") %>% 
    mutate(NHS_Number=as.numeric(NHS_Number)) %>% 
    mutate(nhsno = case_when(
    !is.na(NHS_Number) ~ NHS_Number,
    TRUE ~ nhsno),
    sex = case_when(
      Sex_S==2 ~ "F",
      TRUE ~ sex
    ),
    traced = case_when(
      is.na(NHS_Number) ~ "N",
      !is.na(NHS_Number) ~ "Y"
    )
  ) %>% select(-c("NHS_Number","DOB_S","First_Name_S","Surname_S","Sex_S")) %>%
    mutate(patient_postcode=PostCode) %>% select(-c("PostCode"))
  return(merged_df)
}



# /////////////////
# specify function to generate age groupings (8 groups)
agegroup.fun <- function(x){
  
  x <- x %>% mutate(agegroup=case_when(age <1 ~ 1,
                                       (age >=1 & age <= 4) ~ 2,
                                       (age >=5 & age <= 9) ~ 3,
                                       (age >=10 & age <= 14) ~ 4,
                                       (age >=15 & age <= 44) ~ 5,
                                       (age >=45 & age <= 64) ~ 6,
                                       (age >=65 & age <= 74) ~ 7,
                                       (age >=75 & age <=123) ~ 8,
                                       (is.na(age) ~ 0)))
  
  x$agegroup <- factor(x$agegroup, 
                       levels=c('0','1','2','3','4','5','6','7','8'),
                       labels = c("not known","<1","1 to 4","5 to 9","10 to 14","15 to 44","45 to 64","65 to 74",">= 75"))
  out <- x
}


# /////////////////
# specify function to further prepare LIMS linelist data for linkage (ethnicity and IMD)
linelist.prep.fun <- function(x){
  
  x <- x %>% mutate(dob = lubridate::ymd(dob)) %>%                                            # generate date of birth variable in date format
    mutate(spec_date = lubridate::ymd(specdate),                                             # generate date variable in date format from existing date char var
           # Year = as.numeric(format(Date, '%Y')), 
           age = as.numeric(difftime(specdate,dob),unit="weeks")/52.25,                      # generate age variable (years, numeric)
           age = round(age, digits=0),
           dob_specdate = case_when(dob==specdate ~ "yes",
                                    dob!=specdate ~ "no"),
           sex = if_else(sex=="M","Male",sex),
           sex = if_else(sex=="F","Female",sex),
           sex = if_else(sex=="U","Unknown",sex)) %>% 
    mutate(n=1)
  
  x <- agegroup.fun(x)                                                                        # apply agegroup function 
  
  out <- x
  
}



# /////////////------------ IMD ENRICHMENT ------------ /////////////////


# /////////////////
# specify function to link linelist to IMD using lab postcode and CCG LSOA intermediate lookups
imd.linkage.fun <- function(x, # input is allsites (LIMS linelist post-DBS linkage)
                            lab_lookup_query = "SELECT [Lab_Geography_Code] ,[Lab_Geography_Name_Current] ,[POSTCODE] as labpostcode FROM [SGSSDW].[dbo].[DIMENSION_LAB_GEOGRAPHY] where Logical_Delete_Flag <> ('1') AND POSTCODE <> ('NULL')",
                            mycon = dbConnect(odbc::odbc(),                                                # connection to SGSSDW
                                              .connection_string = "DRIVER=SQL Server;
                   DATABASE=SGSSDW;
                   Trusted_Connection=Yes;
                   SERVER=sgssdb.phe.gov.uk;"),
                            # pcd2_lsoa_lookup = read_dta(file='//COLHPAFIL003.HPA.org.uk/ProjectData/HCAI/Scientific resources/GIS/ONS lookups/NHSPD_MAY_2022_UK_FULL/Data/pcd_CCG_lsoa_lookup.dta'),
                            hes_qry = glue(
                              "SELECT [pcd2], [ctry] ,[nhser] ,[lsoa11] ,[icb], [oac11] FROM [LookupsShared].[dbo].[vONS_NSPL11_UK_2023.05]"
                            ),
                            hes_qry2 = glue(
                              "SELECT [pcd2],[LSOA11CD],[IMD2019_Deciles_LSOA11_England],[IMD2019_Quintiles_LSOA11_England],[ctry] FROM [LookupsShared].[dbo].[vSocioDemog_LSOA11] INNER JOIN [vONS_NSPL_UK_202205] on [lsoa11] = [LSOA11CD] where [ctry] IN ('E92000001')"
                            ),
                            rx_dsn = "LookupsShared",
                            
                            data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
                                                       'database=',rx_dsn,';trusted_connection=true'),
                            hes_con = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect)){    # connection to HES
  
  # /// Lab postcode lookup for merge
  # perform extract
  # mycon <- mycon
  
  lab_lookup_out <- dbSendQuery(mycon, lab_lookup_query)
  
  lab_lookup <- dbFetch(lab_lookup_out)
  dbClearResult (lab_lookup_out)
  odbc::dbDisconnect(mycon)
  
  # NEED TO MANUALLY LOOK UP LABGEOCODE FOR GBS ECM SITES IN LINELIST AND USE THIS TO MERGE ON LOOKUP
  
  # merge linelist with lab_lookup on labgeocode
  lab_lookup <- lab_lookup %>% rename(labgeocode=Lab_Geography_Code)
  
  # add in labgeocodes for ECM sites
  x <- x %>% mutate(labgeocode = case_when(
    site=="Royal Stoke" ~ "597955",  #ROYAL STOKE UNIVERSITY HOSPITAL
    site=="Bedford" ~ "235615",  # LUTON MICROBIOLOGY LABORATORY
    site=="Derriford" ~ "526765", # DERRIFORD HOSP. (PLYMOUTH)
    site=="Freeman" ~ "056130",  # FREEMAN HOSPITAL (NEWCASTLE UPON TYNE)
    site=="Royal Blackburn" ~ "670861", # ROYAL BLACKBURN
    site=="Epsom" ~ "375120", # EPSOM (EPSOM AND ST HELIER)
    site=="Birmingham" ~ "610640")   # BIRMINGHAM (BIRMINGHAM WOMEN AND CHILDRENS)
  ) 
  
  x <- merge(x,lab_lookup,by="labgeocode")
  
  # /// PCD2 LSOA lookup for merge 
  # perform extract
  
  pcd2_lsoa_qry <- dbSendQuery(hes_con, hes_qry)
  
  pcd2_lsoa_lookup <- dbFetch(pcd2_lsoa_qry)
  dbClearResult (pcd2_lsoa_qry)
  
  
  pcd2_lsoa_lookup$pcd2 <- str_replace_all(string=pcd2_lsoa_lookup$pcd2, pattern="  ", repl="")   # remove white spaces in postcode field
  pcd2_lsoa_lookup$pcd2 <- str_replace_all(string=pcd2_lsoa_lookup$pcd2, pattern=" ", repl="")    # remove white spaces in postcode field
  
  pcd2_lsoa_lookup <- pcd2_lsoa_lookup %>% filter(ctry=="E92000001") %>% arrange(lsoa11)
  
  # define derivedpostcode var in linelist and preferentially assign as patient postcode > labpostcode
  x <- x %>% mutate(derivedpostcode=case_when(
    grepl("^ZZ99",patient_postcode) | patient_postcode=="No Data Avaliable" ~ labpostcode,                                   # no patient postcode in LIMS ECM data
    (!is.na(patient_postcode) & !grepl("^ZZ99",patient_postcode)) ~ patient_postcode,
    is.na(patient_postcode) ~ labpostcode)) %>%
    rename(pcd2=derivedpostcode) %>%         # rename derivedpostcode prior to merge
    mutate(pcd2=str_replace_all(pcd2, " ", ""))
  
  # ensure instances of patient_postcode=="", labpostcode used
  x <- x %>% mutate(pcd2 = case_when(
    patient_postcode=="" ~ labpostcode,
    TRUE ~ pcd2
  ))

  
  # merge with CCG LSOA lookup based on pcd2 (derived postcode)
  x <- merge(x,pcd2_lsoa_lookup,by="pcd2",all=T)
  
  x <- x %>% filter(!is.na(pid_str))
  
  # /// generate dynamic LSOA IMD code lookup
  
  # perform extract
  lsoa_IMD_qry <- dbSendQuery(hes_con, hes_qry2)
  
  lsoa_IMD_lookup <- dbFetch(lsoa_IMD_qry)
  dbClearResult (lsoa_IMD_qry)
  odbc::dbDisconnect(hes_con)
  
  
  lsoa_IMD_lookup$pcd2 <- str_replace_all(string=lsoa_IMD_lookup$pcd2, pattern="  ", repl="")   # remove white spaces in postcode field
  lsoa_IMD_lookup$pcd2 <- str_replace_all(string=lsoa_IMD_lookup$pcd2, pattern=" ", repl="")    # remove white spaces in postcode field
  
  lsoa_IMD_lookup <- lsoa_IMD_lookup %>% filter(ctry=="E92000001") 
  
  # merge with LSOA IMD lookup based on pcd2 (derived postcode)
  x <- merge(x,lsoa_IMD_lookup,by="pcd2",all=T)
  x <- x %>% filter(!is.na(pid_str))
  x <- x[!duplicated(colnames(x))]
  
  # reassign as linelist
  return(x)
  
}


# /////////////------------ ETHNICITY ENRICHMENT ------------ /////////////////


# /////////////////
# specify function to prepare LIMS linelist data for ethnicity enrichment
datalink.prep.fun <- function(x){
  # check names of date of birth and nhs number fields.
  xnew <- x %>% mutate(dateofbirth =  as.Date(dob, format = "%Y-%m-%d"))
  xnew$Patient_NHS_No <- as.character(xnew$nhsno)
  out <- xnew
}


# /////////////////
# specify function to link LIMS linelist to HES ethnicity fields by creating a temporary table in SQL 
hes.ethnlink.fun <- function(x,  # where main input is LIMS linelist
                             rx_dsn = "Y080_UID_PID",
                             data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
                                                        'database=',rx_dsn,';trusted_connection=true'),
                             data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect),        # create DataLake connection 
                             PII_data_name = paste0("JR_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),        # PII to upload to DataLake: table name needs to be unique and not previously on the Database
                             target_table = paste0("JR_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
                             hes_qry = glue(
                               "SELECT *
INTO [Y080_UID_PID].[dbo].[{target_table}]  

FROM (

SELECT DISTINCT 
       uploadeddata.* 
	 , ethntp.TOKEN_PERSON_ID
	 , ethntp.NEW_ETHNOS -- Ethnicity; code
	 , ethntp.NEW_ETHNOS_Ethnic_subgroup 
	 , ethntp.NEW_ETHNOS_Ethnic_group

FROM [Y080_UID_PID].[dbo].[{PII_data_name}] uploadeddata

LEFT JOIN [Y080_UID_PID].[dbo].[vY080_UID_HESOP_PID] apcpid ON apcpid.NEWNHSNO = uploadeddata.Patient_NHS_No AND CONVERT(DATE, CAST(apcpid.DOB_DV AS DATE),3) = uploadeddata.dateofbirth -- linked on NHS Number & DoB

LEFT JOIN [Y089_DLakeLinkedRepo_PID].[dbo].[x_HES_Analysis_Pseudo_vY089_HES_Ethnos_TPID] ethntp ON apcpid.TOKEN_PERSON_ID = ethntp.TOKEN_PERSON_ID

) AS hes"
                             )){
  
  x <- datalink.prep.fun(x)    # prepare linelist data for HES linkage
  
  odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo")                                         # Check names of the tables in the database
  
  dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
  
  tableid <- Id(schema = "dbo", table = PII_data_name)
  
  dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
  
  linked_ethn <- dbSendQuery(data_lake,hes_qry)
  linked_ethn <- dbGetQuery(data_lake,glue("select * from {target_table}"))
  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  
  table_name <- target_table   
  
  # output table generated from query
  ethnicity_linked_data <- dbReadTable(data_lake,table_name)                                                        # Extracting the ethnicity linked data from the DataLake
  ethnicity_linked_data <- distinct(ethnicity_linked_data)
  
  
  # get rid of temporary tables 
  dbSendQuery(data_lake, glue("drop table {target_table}"))
  dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
  
  # disconnect from DL
  dbDisconnect(data_lake)
  
  write.csv(ethnicity_linked_data, "Data_carriage/Linelist/ethnlinked_intermediate.csv", row.names=F)
  
  # reassign to data df
  x <- as_tibble(ethnicity_linked_data)
  
  # clean ethnicity linked linelist
  x <- ethlinked.data.clean.fun(x)
  
  # format IMD quintile variable as factor with labels
  x <- imd.fac.label.fun(x)
  
  return(x)
  
}


# /////////////////
# specify function to link LIMS linelist to ECDS ethnicity fields by creating a temporary table in SQL 
ecds.ethnlink.fun <- function(x,  # where main input is LIMS linelist
                              rx_dsn = "Y080_UID_PID",
                              data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
                                                         'database=',rx_dsn,';trusted_connection=true'),
                              data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect),        # create DataLake connection 
                              PII_data_name = paste0("JR_ecds_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),        # PII to upload to DataLake: table name needs to be unique and not previously on the Database
                              target_table = paste0("JR_ecds_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
                              hes_qry = glue(
                                "SELECT *
  INTO [Y080_UID_PID].[dbo].[{target_table}] -- new table of linked data, make sure it is a new name

FROM (
  
  SELECT DISTINCT espamr.*
    -- , ethntp.[PatientUsualAddressImdDecile]
  -- , ethntp.[PatientUsualAddressImdDecileDescription]
  -- , ethntp.[PatientUsualAddressRuralUrbanIndicator]
  , ethntp.[PatientEthnicCategoryCode]
  
  
  FROM [Y080_UID_PID].[dbo].[{PII_data_name}] espamr
  
  
  LEFT JOIN [Y080_UID_PID].[dbo].[vY092_ECDS_SUS_PID] ethntp ON ethntp.NhsNumber = espamr.Patient_NHS_No AND CONVERT(DATE, CAST(ethntp.PatientBirthDate AS DATE),3) = espamr.dateofbirth  
  
) AS hes"
                              )){
  
  x <- datalink.prep.fun(x)    # prepare linelist data for HES linkage
  
  odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo")                                         # Check names of the tables in the database
  
  dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
  
  tableid <- Id(schema = "dbo", table = PII_data_name)
  
  dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
  
  linked_ecds_ethn <- dbSendQuery(data_lake,hes_qry)
  linked_ecds_ethn <- dbGetQuery(data_lake,glue("select * from {target_table}"))
  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  
  table_name <- target_table   
  
  # output table generated from query
  ecds_ethnicity_linked_data <- dbReadTable(data_lake,table_name)                                                        # Extracting the ethnicity linked data from the DataLake
  ecds_ethnicity_linked_data <- distinct(ecds_ethnicity_linked_data)
  
  
  # get rid of temporary tables 
  dbSendQuery(data_lake, glue("drop table {target_table}"))
  dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
  
  # disconnect from DL
  dbDisconnect(data_lake)
  
  # reassign to data df
  x <- as_tibble(ecds_ethnicity_linked_data)
  
  x <- x %>% rename(ethn_code=PatientEthnicCategoryCode)
  
  return(x)
  
}


# /////////////////
# specify function to clean ethnicity linked LIMS linelist prior to regrouping ethnicity
ethlinked.data.clean.fun <- function(x){      # where main input is ethnicity-enriched LIMS linelist
  
  # include pid_str and HES token ID in separate lookup
  
  x <- x %>% select(any_of(c("pid_str","year","site","Lab_Geography_Name_Current","specdate","dob_specdate","spectype","species","forename","surname","nhsno","Patient_NHS_No","dob","age","agegroup","dateofbirth","sex","IMD2019_Quintiles_LSOA11_England","Ethnicity_group","NEW_ETHNOS_Ethnic_group","NEW_ETHNOS_Ethnic_subgroup","Fact_of_Death","Date_of_Death"))) %>%
    rename_at(vars(starts_with("IMD2019_Quintiles_LSOA11_England")), ~ "imd_quintile") %>%       # NEXT: apply rename_at to factofdeath and dateofdeath
    # rename_at(vars(starts_with("Fact_of_Death")), ~ "factofdeath") %>%                          # optional: may not have death field in linelist (less likely for women of child bearing age)
    # rename_at(vars(starts_with("Date_of_Death")), ~ "dateofdeath") %>% 
    rename_at(vars(starts_with("NEW_ETHNOS_Ethnic_subgroup")), ~ "Ethnic_group") %>% 
    # mutate(across((contains("factofdeath") | contains("Fact_of_Death")), .fns = temp_fun, .names = "factofdeath")) %>% 
    # mutate(factofdeath=case_when(factofdeath=="D" ~ 1,
    #                              is.na(factofdeath) ~ 0)) %>%
    mutate(note = case_when(
      dateofbirth=="NA" | is.na(dateofbirth) | nhsno=="NA" | is.na(nhsno) ~ "insufficient PII for ethn trace",
      !is.na(nhsno) | !is.na(dateofbirth) ~ "PII available for ethn trace",
      !is.na(nhsno) & nhsno!="NA" & !is.na(dateofbirth) & dateofbirth!="NA" ~ "ethnicity not available")
    )
}



# /////////////////
# specify function to generate binary (white vs non-white) ethnicity categorical variable for model
ethn.binary.fac.fun <- function(x){
  
  x$hes_ethnicity <- fct_collapse(x$hes_ethnicity,
                                  "white" = c("White"),
                                  "Nonwhite" = c("Black","Asian","Other","Mixed"))
  return(x)
} 


# /////////////////
# specify function to assign IMD quintile field in linelist as factor with corresponding labels
imd.fac.label.fun <- function(x){
  x$imd_quintile <- factor(x$imd_quintile, 
                           levels=c('1','2','3','4','5'),
                           labels = c("20% most deprived","20% to 40%","40% to 60%","60% to 80%","20% least deprived"))
  out <- x
}


linelist.ethcat5.fun <- function(x){
  
  ethn.vec <- sort(unique(x$NEW_ETHNOS_Ethnic_group)) 
  
  # [1] "99 Not known"                                
  # [2] "Any other ethnic group"                      
  # [3] "Asian / Asian British"                       
  # [4] "Black / African / Caribbean / Black British"
  # [5] "Mixed / Multiple ethnic groups"              
  # [6] "White"                                      
  # [7] "Z Not stated"   
  
  x <- x %>% mutate(hes_ethnicity="") 
  
  # for unknown group and not linked
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% c(ethn.vec[1], ethn.vec[7]) ~ "Unknown",
      is.na(NEW_ETHNOS_Ethnic_group) ~ "Not linked",
      TRUE ~ hes_ethnicity
    ))
  
  # for Black, Black British, Caribbean or African group
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% ethn.vec[4] ~ "Black",
      TRUE ~ hes_ethnicity
    ))
  
  # for Mixed
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% ethn.vec[5] ~ "Mixed",
      TRUE ~ hes_ethnicity
    ))
  
  # for Other
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% ethn.vec[2] ~ "Other",
      TRUE ~ hes_ethnicity
    ))
  
  # for Asian, Asian British
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% ethn.vec[3] ~ "Asian",
      TRUE ~ hes_ethnicity
    ))
  
  # for White
  x <- x %>%
    mutate(hes_ethnicity = case_when(
      NEW_ETHNOS_Ethnic_group %in% ethn.vec[6] ~ "White",
      TRUE ~ hes_ethnicity
    )) %>%
    select(-c("Ethnic_group","NEW_ETHNOS_Ethnic_group")) %>% mutate(z=1)
  
  x <- if ("Ethnicity_group" %in% names(x)) {           
    
    x %>% rename(lim_ethnicity=Ethnicity_group) } else {    # drop reduntant Ethnicity assignment columns

      x <- x
      
    }
  
  # add in count variable prior to grouping by ethnicity category
  
}




# /////////////------------ HES APC LINKAGE FOR PREGNANCY INDICATORS ------------ /////////////////

hesapc.link.fun <- function(x,  # where main input is LIMS linelist
                            rx_dsn = "Y080_UID_PID",
                            data_lake_connect = paste0('driver={SQL Server};server=SQLClusColLK19\\Lake19;',
                                                       'database=',rx_dsn,';trusted_connection=true'),
                            data_lake = odbc::dbConnect(odbc::odbc(),.connection_string=data_lake_connect),        # create DataLake connection 
                            PII_data_name = paste0("JR_tbl_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),        # PII to upload to DataLake: table name needs to be unique and not previously on the Database
                            target_table = paste0("JR_upld_",format(Sys.Date(), "%y%m%d"),"_",format(Sys.time(), "%H_%M")),
                            hes_qry = glue(
                              "-- Step 1: Create target table if it doesn't exist
CREATE TABLE [Y080_UID_PID].[dbo].[{target_table}] (
    Patient_NHS_No VARCHAR(50),
    dateofbirth DATE,
    specdate DATE,
    FYEAR INT,
    EPIKEY VARCHAR(50),
    DOBBABY_1_DV DATE,
    ADMIDATE DATE,
    ADMIDATE_DV DATE,
    DISDATE DATE,
    DISDATE_DV DATE,
    EPITYPE VARCHAR(50),
    OPERTN_01 VARCHAR(50),
    OPDATE_01 DATE,
    BIRSTAT_1 VARCHAR(50),
    DELMETH_D VARCHAR(50),
    GESTAT_1 VARCHAR(50),
    DIAG_01 VARCHAR(50),
    DIAG_02 VARCHAR(50),
    DIAG_03 VARCHAR(50),
    DIAG_04 VARCHAR(50),
    DIAG_05 VARCHAR(50),
    PROCODE3 VARCHAR(50)
);

-- Step 2: Insert data into target table using CTEs
;WITH filtered_hesapc AS (
    SELECT FYEAR, EPIKEY, PROCODE3
    FROM [HES_APC].[dbo].[vtHES_APC]
    WHERE FYEAR IN (2122, 2223, 2324) AND PROCODE3 IN ('RTG', 'RD8', 'RFW', 'R1H', 'RXR', 'RJR', 'RQ3', 'RWY', 'RXF', 'RC9', 'R1K', 'RVR', 'RTD', 'RM3', 'RTK', 'RK9', 'RBT')
),    
filtered_admis AS (
    SELECT *
    FROM [HES_APC].[dbo].[vHES_APC_Flat]
    WHERE FYEAR IN (2122, 2223, 2324) 
),
filtered_oprtn AS (
    SELECT *
    FROM [HES_APC].[dbo].[vHES_APC_OPERTN_Flat]
    WHERE FYEAR IN (2122, 2223, 2324) 
),
filtered_mat AS (
    SELECT *
    FROM [HES_APC].[dbo].[vtHES_APC_MAT]
    WHERE FYEAR IN (2122, 2223, 2324)
),
filtered_diag AS (
    SELECT *
    FROM [HES_APC].[dbo].[vHES_APC_DIAG_Flat]
    WHERE FYEAR IN (2122, 2223, 2324) 
),
filtered_apcpid AS (
    SELECT *
    FROM [Y080_UID_PID].[dbo].[vY080_UID_HESAPC_PID]
)
INSERT INTO [Y080_UID_PID].[dbo].[{target_table}]
SELECT 
    uploadeddata.Patient_NHS_No, 
    uploadeddata.dateofbirth, 
    uploadeddata.specdate, 
    apcpid.FYEAR, 
    apcpid.EPIKEY, 
    apcpid.DOBBABY_1_DV, 
    admis.ADMIDATE, 
    admis.ADMIDATE_DV, 
    admis.DISDATE, 
    admis.DISDATE_DV, 
    admis.EPITYPE, 
    oprtn.OPERTN_01, 
    oprtn.OPDATE_01, 
    mat.BIRSTAT_1, 
    mat.DELMETH_D, 
    mat.GESTAT_1,
    diag.DIAG_01,
    diag.DIAG_02,
    diag.DIAG_03,
    diag.DIAG_04,
    diag.DIAG_05,
    hesapc.PROCODE3
FROM [Y080_UID_PID].[dbo].[{PII_data_name}] AS uploadeddata
LEFT JOIN 
    filtered_apcpid AS apcpid ON apcpid.NEWNHSNO = uploadeddata.Patient_NHS_No 
    AND CONVERT(DATE, apcpid.DOB_DV, 3) = uploadeddata.dateofbirth
INNER JOIN 
    filtered_oprtn AS oprtn ON apcpid.EPIKEY = oprtn.EPIKEY AND apcpid.FYEAR = oprtn.FYEAR
INNER JOIN 
    filtered_mat AS mat ON apcpid.EPIKEY = mat.EPIKEY AND apcpid.FYEAR = mat.FYEAR 
INNER JOIN 
    filtered_admis AS admis ON mat.EPIKEY = admis.EPIKEY AND mat.FYEAR = admis.FYEAR
INNER JOIN 
    filtered_diag AS diag ON admis.EPIKEY = diag.EPIKEY AND diag.FYEAR = admis.FYEAR
INNER JOIN
    filtered_hesapc AS hesapc ON diag.EPIKEY = hesapc.EPIKEY AND hesapc.FYEAR = diag.FYEAR;")){
      
      x <- datalink.prep.fun(x)    # prepare linelist data for HES linkage
      
      odbc::dbListTables(conn=data_lake, catalog_name=rx_dsn, schema_name="dbo")                                         # Check names of the tables in the database
      
      dbCreateTable(conn=data_lake, name = PII_data_name, fields = x, overwrite = T)
      
      tableid <- Id(schema = "dbo", table = PII_data_name)
      
      dbWriteTable(data_lake, tableid, x, append = TRUE, row.names = F)
      
      linked_data <- dbSendQuery(data_lake,hes_qry)
      linked_data <- dbGetQuery(data_lake,glue("select * from {target_table}"))
      
      
      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
      
      table_name <- target_table   
      
      # output table generated from query
      hesapc_linked_data <- dbReadTable(data_lake,table_name)                                                        # Extracting the ethnicity linked data from the DataLake
      hesapc_linked_data <- distinct(hesapc_linked_data)
      
      
      # get rid of temporary tables 
      dbSendQuery(data_lake, glue("drop table {target_table}"))
      dbSendQuery(data_lake, glue("drop table {PII_data_name}"))
      
      # disconnect from DL
      dbDisconnect(data_lake)
      
      # reassign to data df
      x <- as_tibble(hesapc_linked_data)
      
      setwd(wd)
      # export for further processing (ethnic group assignment according to CHIME methodology) in Stata
      write.csv(x, "Data_carriage/Linelist/static_basedata_hesapclinked.csv", row.names=F)
      
      return(x)
      
    }

```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
