This is an R Markdown
Notebook outlining steps for LIMS data processing and GBS carriage
estimation from ECM-based testing data from the GBS3 trial
Set main working directory
Install and load packages
Load functions
setwd(wd)
source("R scripts_carriage/functions_GBS_carriage.R")
Import LIMS data from ECM-based GBS testing (datasets provided by
micro labs) Pre-check - open csv and check that in correct template
format - describe the raw data
# setwd(wd)
#////////////////////
# for Royal Stoke
testpos <- import(here("Data_carriage","ECM_carriage","Edited","ECM_pos_06_02_2023to01_04_2024_Royal_stoke.xlsx"),
format = "xlsx",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
testneg <- import(here("Data_carriage","ECM_carriage","Edited","ECM_neg_06_02_2023to01_04_2024_Royal_stoke.xlsx"),
format = "xlsx",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
testpos2 <- import(here("Data_carriage","ECM_carriage","Edited","ECM_pos_30_10_2022to05_02_2023_Royal_stoke.xls"),
format = "xls",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
testneg2 <- import(here("Data_carriage","ECM_carriage","Edited","ECM_neg_30_10_2022to05_02_2023_Royal_stoke.xls"),
format = "xls",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
colnames(testpos) %in% colnames(testneg) # last 3 cols distinct in testneg
testpos <- lims.stnd.format.fun(testpos)
testpos2 <- lims.stnd.format.fun(testpos2)
testneg <- lims.stnd.format.fun(testneg)
testneg2 <- lims.stnd.format.fun(testneg2)
alltest <- rbind(testpos,testpos2,testneg,testneg2)
alltest1 <- alltest %>% mutate(site="Royal Stoke")
rm(testpos,testpos2,testneg,testneg2)
#////////////////////
# for Bedford
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Bedford_GBS3_CleanedData.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
alltest$`Patient NHS Number` <- as.numeric(alltest$`Patient NHS Number`)
alltest2 <- lims.stnd.format.fun(alltest)
alltest2 <- alltest2 %>% mutate(site="Bedford")
#////////////////////
# for Derriford
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Derriford_GBS3_CleanedData.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
alltest$`Patient NHS Number` <- as.numeric(alltest$`Patient NHS Number`)
alltest3 <- lims.stnd.format.fun(alltest)
alltest3 <- alltest3 %>% mutate(site="Derriford")
#////////////////////
# for Freeman
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Freeman_GBS3_CleanedData.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
alltest <- alltest %>%
mutate(
`Patient NHS Number` = as.numeric(`Patient NHS Number`),
`Specimen Date` = as.Date(`Specimen Date`, format = "%d/%m/%Y"),
`Patient Date of Birth` = as.Date(`Patient Date of Birth`, format = "%d/%m/%Y")
)
# CONTINUE HERE - specimen date format different for Freeman vs other sites. Resolve here before applying df$dt <- lubridate::ymd(df$specdate) step in obai process
# handle format for strings in Ethnicity column which have symbols
alltest$Ethnicity <- iconv(alltest$Ethnicity, from = "ISO-8859-1", to = "UTF-8")
alltest4 <- lims.stnd.format.fun(alltest)
alltest4 <- alltest4 %>% mutate(site="Freeman")
#////////////////////
# for Epsom and St Helier (St George's)
alltest5 <- read.csv("\\\\IISNISCOL01\\GBSStudy\\Processed\\Epsom_StHelier_GBS3_UID.csv")
alltest5 <- alltest5 %>% mutate(site="Epsom") %>% relocate(site, .after = last_col()) %>% mutate(species=case_when(
is.na(species) ~ "GBS not isolated",
(species!="Beta Haemolytic Streptococcus Group B" & species!="Group B Streptococcus") ~ "GBS not isolated",
(species=="Beta Haemolytic Streptococcus Group B" | species=="Group B Streptococcus") ~ "Group B Streptococcus",
TRUE ~ species
)) %>% rename(ethnicity=Ethnicity_group) %>% relocate(ethnicity, .after = sex) %>%
mutate(
dob = as.Date(dob, format = "%d/%m/%Y"), # Character string "DD/MM/YYYY" to Date
specdate = as.Date(specdate, format = "%d/%m/%Y")
) %>% select(-c("pt_id"))
#////////////////////
# for Royal Blackburn
alltest6 <- import(here("Data_carriage","ECM_carriage","Edited","UKHSA GB3 trial_Royal_Blackburn.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
alltest6 <- lims.stnd.format.fun(alltest6)
alltest6 <- alltest6 %>% mutate(site="Royal Blackburn",
dob=as.Date(dob, format = "%d/%m/%Y")) %>%
mutate(species=case_when(
is.na(species) ~ "GBS not isolated",
(species!="beta Haemolytic Strep, group B" & species!="Group B Streptococcus") ~ "GBS not isolated",
(species=="beta Haemolytic Strep, group B" | species=="Group B Streptococcus") ~ "Group B Streptococcus",
TRUE ~ species
))# need to apply additional date conversion as not handled in flexible reformatting function for dates in char string "19/06/1991" in Royal Blackburn LIMS data
#//////////////////
# combine into single df
allsites <- rbind(alltest1,alltest2,alltest3,alltest4,alltest5,alltest6)
allsites$nhsno <- as.numeric(allsites$nhsno)
rm(alltest,alltest1,alltest2,alltest3,alltest4,alltest5,alltest6)
# apply ethnicity categorization function to group into 5 main ethnicities
allsites <- lims.ethcat.fun(allsites)
# convert instances of sex=="FALSE" into "U"
allsites <- allsites %>% mutate(sex = case_when(
sex=="FALSE" ~ "U",
TRUE ~ sex
))
# NOTE: CHECK at this stage that dob and specdate fields have consistent date formatting across sites
Process LIMS linelist using adapted version of obai
sgss_infections
# Remove quality control samples
allsites <- qc.umbrella.fun(allsites)
# Proceed to dedpup LIMS linelist to patient-level with AMR results
allsites <- obai.dedup.fun(allsites)
## perform initial checks
# total episodes (ECM test result within 9 month window)
length(unique(allsites$epids)) # 12,916
# total patients
length(unique(allsites$pid_str))# 12,916
# total unique NHS number
length(unique(allsites$nhsno)) # 12,445
# total records where NHS number missing
sum(is.na(allsites$nhsno)) # 472
# isolate duplicates in nhsno field
duplicates_only <- allsites %>%
filter(nhsno %in% nhsno[duplicated(nhsno)]) %>% arrange(forename)
nrow(duplicates_only) # 472 - all of which nhsno missing
# identify true (likely) duplicates using patient name fields
true_dups <- duplicates_only %>% select(specno:site) %>% mutate(name_concat=paste(forename,"_",surname),
name_concat=gsub(" ","",name_concat)) %>%
filter(name_concat %in% name_concat[duplicated(name_concat)]) %>% arrange(name_concat)
length(unique(true_dups$name_concat))
# 7 true duplicates (across name fields), of which 1 ("FEBUARY_IQAMIC2024") a QC sample and 1 pertains to 2 samples a year apart for a patient
# remove remaining QC sample rows
allsites <- allsites %>% filter(surname!="IQAMIC2024")
# //////////// key checkpoint 1 //////////////
## save as static base dataset
write.csv(allsites, "Data_carriage/Linelist/static_basedata.csv", row.names=F) # REPLACE WITH SAVE TO SERVER
Prepare processed LIMS data for DBS tracing
## load static base linelist
allsites <- read.csv("Data_carriage/Linelist/static_basedata.csv")
allsites$specdate <- as.Date(allsites$specdate, format = "%Y-%m-%d")
allsites$dob <- as.Date(allsites$dob, format = "%Y-%m-%d")
allsites$pid_str <- as.character(allsites$pid_str)
dbsprep <- allsites %>% mutate(year = format(specdate, "%Y")) %>% select(year, everything()) %>% select(1:match("epids",names(allsites)))
# generate .csv files for sending to DBS trace team (generates two files, one with sufficient PII and one where some/all PII missing. Send BOTH for tracing)
dbs.trace.prep.fun(dbsprep,
mainfile="Data_carriage/DBS_tracing/LIMS_linelist_for_tracing.csv",
excludefile="Data_carriage/DBS_tracing/excluded_records_for_tracing.csv")
# NOTE: always open .cvs files for tracing and check that columns formatted correctly, prior to sending to DBS team
# # state NHS number completion before DBS tracing: percent nhsno missing
sum_na_nhs <- sum(is.na(dbsprep$nhsno))/nrow(dbsprep)*100 # 3.6%
na_nhs <- dbsprep %>% filter(is.na(nhsno)) # generate df of records where NHS number missing
Integrate PII-enriched DBS trace return into linelist and retain
records which didn’t have successful trace first time around for second
attempt
## import and format trace return csv files
# These are saved initially in 'response holding bay' subfolder of 'DBS_tracing' with original DBS file names that have long numeric suffixes. Rename, save in main 'DBS_tracing' folder (overwriting previous versions from prior runs of process which should first be moved to Archive) and import:
setwd(wd)
lims_trace_return <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_response.csv", header = F) # update input file name as needed
lims_excl_trace_return <- read.csv("Data_carriage/DBS_tracing/excluded_records_response.csv", header = F)
# confirm total number of records returned
sum(nrow(lims_trace_return)+nrow(lims_excl_trace_return)) # 12,917
lims_trace_return <- rbind(lims_trace_return,lims_excl_trace_return) %>% filter(!grepl("^0011",V1) & !grepl("^9911",V1))
# retain records which NOT successfully traced for subsequent QC and re-tracing
no_trace <- no.trace.wrap.fun(lims_trace_return)
# retain records where sex unknown (may overlap with no_trace generated above)
sex_unknown <- unknown.sex.wrap.fun(lims_trace_return)
# keep ONLY records which successfully traced in first round of tracing and merge with LIMS linelist
allsites <- traced.wrap.fun(lims_trace_return) # 12,913
# state NHS number completion after DBS tracing
sum(is.na(allsites$nhsno))
# 260 NHS number missing
Prepare subsets of data for second round of DBS tracing
dbsprep2 <- allsites %>% mutate(year = as.numeric(format(specdate, '%Y'))) %>% select(year, everything()) %>% select(1:match("epids", names(allsites))) %>% filter(pid_str %in% no_trace$pid_str | pid_str %in% sex_unknown$pid_str) %>%
mutate(forename = sub(" .*", "", forename)) %>% # reformat name fields in no_trace to resend for tracing (where middle names not compatible with DBS trace algorithm)
mutate(sex = case_when( # take all records where sex=="U" and presumptively recode to "F" to resend for tracing (if trace returns sex == 2 then confident that pertains to females)
sex == "U" ~ "F",
TRUE ~ sex
))
# generate .csv files for sending to DBS trace team for second trace (generates two files, one with sufficient PII and one where some/all PII missing. Send BOTH for tracing)
dbs.trace.prep.fun(dbsprep2,
mainfile="Data_carriage/DBS_tracing/LIMS_linelist_for_tracing_rnd2.csv",
excludefile="Data_carriage/DBS_tracing/excluded_records_for_tracing_rnd2.csv")
Import second set of DBS tracing results
#///////////////////////////
#TEMPORARY CODE CHUNK
# review content of latest files sent for DBS tracing (Jan 2025)
test <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_for_tracing_rnd2.csv", header = T)
test2 <- read.csv("Data_carriage/DBS_tracing/excluded_records_for_tracing_rnd2.csv", header = T)
test <- rbind(test,test2)
sites <- allsites %>% filter(nhsno %in% unique(test$NHS_number))
unique(sites$site)
# has data for all 6 sites
# percent of total records in linelist which in second trace request files
perc <- nrow(sites)/nrow(allsites)*100
# 19.3%
# current percent records missing nhsno, prior to linking back in second trace return
perc <- (sum(is.na(allsites$nhsno))/nrow(allsites))*100
print(perc)
# 3.6%
#///////////////////////////
# import and format trace return csv files
setwd(wd)
lims_trace_return2 <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_response2.csv", header = F) # update input file name as needed
lims_excl_trace_return2 <- read.csv("Data_carriage/DBS_tracing/excluded_records_response2.csv", header = F)
lims_trace_return2 <- rbind(lims_trace_return2,lims_excl_trace_return2) # combine into single df once checked that non-overlapping patient IDs
# retain records which NOT successfully traced for subsequent QC and re-tracing
no_trace2 <- no.trace.wrap.fun(lims_trace_return2)
# keep ONLY records which successfully traced in second round of tracing and merge with LIMS linelist
allsites$pid_str <- as.integer(allsites$pid_str) # ensure pid_str column in both datasets used for merge in same format
allsites <- traced.wrap.fun(lims_trace_return2) # merge these records back into main linelist
# current percent records missing nhsno, after linking back in second trace return
perc <- (sum(is.na(allsites$nhsno))/nrow(allsites))*100
print(perc)
# 2.01%
# //////////// key checkpoint 2 //////////////
## save as static base dataset **after tracing**
write.csv(allsites, "Data_carriage/Linelist/static_basedata_traced.csv", row.names=F)
Preliminary analysis results prep and presentation
linelist <- read.csv("Data_carriage/Linelist/static_basedata_traced.csv")
linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")
linelist <- linelist %>%
mutate(year = as.numeric(format(specdate, '%Y'))) %>%
select(year, everything()) %>% select(-any_of(c("V1"))) %>%
filter(!grepl("IQAM", surname)) %>% filter(sex!="M") %>%
mutate(ym = format(specdate, "%Y/%m")) %>%
mutate(ym_date = as.Date(paste0(ym, "/01"), format = "%Y/%m/%d"))
# /////////////
# 1) monthly ECM test totals by site
# Summarize totals by year-month and site
monthly_test <- linelist %>% group_by(ym_date,site) %>% summarize(total_tested=sum(z), .groups= 'drop')
# Create a complete set of year-month and site combinations
month_complete <- monthly_test %>% complete(ym_date,site)
# Join the summarized data, allowing for NA in missing combinations
month_complete <- month_complete %>%
left_join(monthly_test, by = c("ym_date", "site")) %>%
mutate(total_tested = ifelse(is.na(total_tested.y), NA, total_tested.y)) %>%
select(ym_date, site, total_tested) %>% mutate(total_tested = case_when(
total_tested<=30 ~ NA,
TRUE ~ total_tested)
)
# plot of monthly test numbers by site
plot <- ggplot(month_complete, aes(x = ym_date, y = total_tested, color = site, group = site)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") + # Format x-axis
labs(title = "Total ECM tests by Year-Month and Site",
x = "Year-Month", y = "Total Value", color = "Site") +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 14, hjust= 0.5))
# save plot
ggsave("Charts_carriage/ym_ecm_test_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# now looking 1 site at a time, comparing monthly test numbers in linelist vs original ECM test numbers from GBS3
# Summarize totals across whole period for each site
test_bysite <- linelist %>% group_by(site) %>% summarize(total_tested=sum(z), .groups= 'drop')
# Create a bar graph of totals by site
plot <- ggplot(test_bysite, aes(x = site, y = total_tested)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "Total ECM tests by Site", x = "Site", y = "Totals") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 14), # Increase size of x-axis labels
axis.text.y = element_text(size = 14), # Increase size of y-axis labels
axis.title.x = element_text(size = 16), # Increase size of x-axis title
axis.title.y = element_text(size = 16) # Increase size of y-axis title
)
# save plot
ggsave("Charts_carriage/all_ecm_test_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# /////////////
# 2) age distributions by site and for all sites combined
agedis <- linelist %>% group_by(site) %>%
summarise(
median_age=median(age, na.rm=T),
IQR_age=IQR(age, na.rm=T),
age_min=min(age, na.rm=T),
age_max=max(age, na.rm=T),
Q1 = quantile(age, 0.25, na.rm=T),
Q3 = quantile(age, 0.75, na.rm=T)
)
# Create a plot showing median and IQRs
plot <- ggplot(agedis, aes(x = site, y = median_age)) +
geom_point(size = 3, color = "blue") + # Plot the median age as points
geom_errorbar(aes(ymin = Q1, ymax = Q3), width = 0.2, color = "red") + # Add IQR as error bars
labs(title = "Median Age and IQRs by Category", x = "Category", y = "Median Age") +
theme_minimal()
# save plot
ggsave("Charts_carriage/median_age_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# save table
write.csv(agedis, "Data_carriage/ECM_carriage/Stat_summaries/Age_distribution/age_dis.csv")
Apply geo - IMD lookup to link to IMD
# apply IMD linkage function to linelist without AMR data (to be linked back in later)
linelist <- imd.linkage.fun(linelist)
write.csv(linelist, paste0(wd,"/Data_carriage/Linelist/static_basedata_imdlinked_raw.csv"), row.names=F)
linelist <- read.csv("Data_carriage/Linelist/static_basedata_imdlinked_raw.csv")
linelist <- linelist %>%
mutate(
specdate = as.Date(specdate, format = "%Y-%m-%d"),
dob = as.Date(dob, format = "%Y-%m-%d"))
HES linkage for ethnicity enrichment
linelist <- linelist.prep.fun(linelist)
linelist <- hes.ethnlink.fun(linelist)
# //////////// key checkpoint 3a //////////////
## save as static base dataset **after HES CHIME ethnicity enrichment*
write.csv(linelist, "Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv", row.names=F)
linelist <- read.csv("Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv")
linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")
# quantify ethnicity categories (numbers and % total in linelist, including not linked - NA)
hes_eth_allcat <- linelist %>% mutate(z=1) %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# % not linked = 2.5% (denom = all records)
# to document % not linked from CHIME method, as well as of those linked, the % which not known or not stated
hes_linked_only <- linelist %>% filter(!is.na(Ethnic_group)) %>% mutate(z=1) %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# % not known = 0.67
# % not states = 1.63 (denom = all HES-linked records)
# if HES returned unknown, not linked, not stated then pass to ECDS method to fill in gaps. Log that ECDS data source
for_ecds <- linelist %>% filter(Ethnic_group == "99 Not Known" | Ethnic_group == "Z Not stated" | is.na(Ethnic_group)) %>% mutate(nhsno_status=case_when(is.na(nhsno) ~ "missing",
!is.na(nhsno) ~ "known"))
# 528 rows (4.1% of linelist)
# tabulate HES ethnicity assignment by NHS number status (known or missing)
tab <- for_ecds %>% mutate(z=1) %>% group_by(nhsno_status,Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# among known nhsno, 205 (76.2%) were HES linked but assigned as "ethnicity not stated"; all records missing nhsno were not linked to HES
write.csv(tab, "Data_carriage/ECM_carriage/Stat_summaries/hes_noethn_by_nhsno_status.csv", row.names=F)
ecds_linked <- linelist.prep.fun(for_ecds)
ecds_linked <- ecds.ethnlink.fun(ecds_linked)
# //////////// key checkpoint 3b //////////////
## save subset of ECDS-linked linelist
write.csv(ecds_linked, "Data_carriage/Linelist/ecds_linked_subset.csv", row.names=F)
ecds_linked <- read.csv("Data_carriage/Linelist/ecds_linked_subset.csv")
ecds_linked <- ecds_linked %>% rename(ethn_code=PatientEthnicCategoryCode) %>%
mutate(ecds_ethnicity=case_when(ethn_code=="A" ~ "British (White)",
ethn_code=="N" ~ "African (Black or Black British)",
ethn_code=="L" ~ "Any other Asian background",
ethn_code=="G" ~ "Any other Mixed background",
ethn_code=="C" ~ "Any other White background",
ethn_code=="S" ~ "Any other ethnic group",
ethn_code=="K" ~ "Bangladeshi (Asian or Asian British)",
ethn_code=="M" ~ "Caribbean (Black or Black British)",
ethn_code=="R" ~ "Chinese (other ethnic group)",
ethn_code=="H" ~ "Indian (Asian or Asian British)",
ethn_code=="J" ~ "Pakistani (Asian or Asian British)",
ethn_code=="F" ~ "White and Asian (Mixed)",
ethn_code=="E" ~ "White and Black African (Mixed)",
ethn_code=="D" ~ "White and Black Caribbean (Mixed)",
ethn_code=="Z" ~ "Z Not Stated",
ethn_code=="B" ~ "White Irish",
ethn_code=="9" ~ "Not Known",
ethn_code=="P" ~ "Any other Black background (Black or Black British)"))
# identify and flag duplicate records resulting from ECDS linkage i.e., where ECDS returned 2 or more ethnicity assignments for a given patient
ecds_linked$dup_flag <- duplicated(ecds_linked$pid_str) | duplicated(ecds_linked$pid_str, fromLast = TRUE)
dups <- ecds_linked %>% filter(dup_flag==T) %>% select(pid_str,specdate,Ethnic_group,ecds_ethnicity,dup_flag) %>% arrange(pid_str,ecds_ethnicity) # explore duplicates and assign a rule set for retaining records
length(unique(dups$pid_str)) # 14 pid_str with duplicate rows
pid_str_dups <- unique(dups$pid_str)
# if 'Z not Stated' and 'Not Known' retain either
# if 'Z not Stated'/'Not Known' and a named ethnicity group, retain ethnicity group
# if NA and 'Z not Stated'/'Not Known' or a named ethnicity group, retain latter (non-NA entry)
# if 'Any other Black background (Black or Black British)' and 'Any other ethnic group' retain the higher specificity ethnicity assignment (in this case Black or Black British > Any other ethnicity group)
#////// ecds_ethnicity hierarchy /////////
# max specificity:
max_spec <- c("British (White)",
"African (Black or Black British)",
"Bangladeshi (Asian or Asian British)",
"Caribbean (Black or Black British)",
"Chinese (other ethnic group)",
"Indian (Asian or Asian British)",
"Pakistani (Asian or Asian British)",
"White and Asian (Mixed)",
"White and Black African (Mixed)",
"White and Black Caribbean (Mixed)",
"White Irish")
# high specificity
high_spec <- c("Any other Black background (Black or Black British)",
"Any other Asian background",
"Any other Mixed background",
"Any other White background")
# medium specificity:
med_spec <- c("Any other ethnic group","Z Not Stated")
# low specificity:
low_spec <- c("Not Known")
# no specificity/missing
no_spec <- NA
# define hierarchy for category or cat
category_hierarchy <- c("Max" = 1, "High" = 2, "Medium" = 3, "Low" = 4, "No" = 5)
# assign each row as per above hierarchy
dups <- dups %>% mutate(cat=case_when(ecds_ethnicity %in% max_spec ~ "Max",
ecds_ethnicity %in% high_spec ~ "High",
ecds_ethnicity %in% med_spec ~ "Medium",
ecds_ethnicity %in% low_spec ~ "Low",
ecds_ethnicity %in% no_spec ~ "No"),
cat_rank = category_hierarchy[cat]) %>%
arrange(pid_str, cat_rank) %>%
distinct(pid_str, .keep_all = T)
# now apply to ecds_linked df
ecds_linked <- ecds_linked %>% mutate(cat=case_when(ecds_ethnicity %in% max_spec ~ "Max",
ecds_ethnicity %in% high_spec ~ "High",
ecds_ethnicity %in% med_spec ~ "Medium",
ecds_ethnicity %in% low_spec ~ "Low",
ecds_ethnicity %in% no_spec ~ "No"),
cat_rank = category_hierarchy[cat]) %>%
group_by(pid_str) %>%
mutate(is_selected = if_else(dup_flag, cat_rank == min(cat_rank[dup_flag]), TRUE)) %>%
ungroup() %>%
filter(is_selected) %>% select(-c("dup_flag","cat","cat_rank","is_selected"))
# check that non-duplicate rows now in data for pid_str vals flagged above
pid_str_dups %in% ecds_linked$pid_str # TRUE in all instances
# check that pid_str retained correspond to those identified as having duplicate entries above
unique(dups$pid_str) %in% pid_str_dups # TRUE at all positions in vector
# tabulate ECDS ethnicity assignment by NHS number status (known or missing)
tab <- ecds_linked %>% mutate(nhsno_status=case_when(is.na(nhsno) ~ "missing",
!is.na(nhsno) ~ "known"),
heslink_status=case_when(is.na(Ethnic_group) ~ "not linked",
!is.na(Ethnic_group) ~ "linked")
) %>% mutate(z=1) %>% group_by(nhsno_status,heslink_status,ecds_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
write.csv(tab, "Data_carriage/ECM_carriage/Stat_summaries/hes_noethn_ecds_cat.csv", row.names=F)
# among known nhsno, 218 (77%) were HES linked but returned "99 Not Known" or "Z Not stated". Only 8 of these 218 records had an ethnicity assignment in ECDS. For those records not HES linked, ECDS enrichment returned 6 records with known ethnicity. Overall, ECDS enrichment provided an ethnicity for 4.9% records that had an NHS number
ecds_linked <- ecds_linked %>% select(pid_str,ecds_ethnicity) %>% filter(!is.na(ecds_ethnicity)) # CONTINUE HERE - THERE ARE DUPLICATES OF pid_str
testmerge <- merge(linelist,ecds_linked, by = "pid_str", all=T) # merge back deduplicated, ecds-linked subset of records into linelist to retain new ecds_ethnicity col
testmerge <- testmerge %>% mutate(Ethnic_group=case_when((ecds_ethnicity!=Ethnic_group & # assign ECDS ethnicity where not HES linked or where ethnicity not known in HES but returned in ECDS linkage
ecds_ethnicity!="Not Known" &
ecds_ethnicity!="Z Not Stated") |
is.na(Ethnic_group) ~ ecds_ethnicity,
TRUE ~ Ethnic_group)) %>% select(-c("ecds_ethnicity")) %>%
mutate(Ethnicity_group=case_when(Ethnicity_group=="Asian" ~ "Any other Asian background", # re code LIMS ethnicity categories to align with HES categorization (defaulting to Any other...)
Ethnicity_group=="White" ~ "Any other White background",
Ethnicity_group=="Black" ~ "Any other Black background",
Ethnicity_group=="Mixed" ~ "Any other Mixed background",
Ethnicity_group=="" ~ "Z Not stated")) %>%
mutate(Ethnic_group=case_when((Ethnic_group=="Z Not stated" | Ethnic_group=="99 Not Known" | Ethnic_group=="Z not stated" | is.na(Ethnic_group)) & Ethnicity_group!="" & Ethnicity_group!="Unknown" ~ Ethnicity_group,
TRUE ~ Ethnic_group)) # assign LIMS ethnicity where recorded, if not HES linked or ethnicity not stated in HES, and not returned in ECDS linkage (last resort ethnicity assignment)
# DEDUPLICATE HERE AFTER ETHNICITY LINKAGE
# //////////// key checkpoint 4 //////////////
## save as static base dataset **after full ethnicity assignment and categorisation**
write.csv(linelist, "Data_carriage/Linelist/static_basedata_ethnlinked_final.csv", row.names=F)
linelist <- read.csv("Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv")
linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")
# CONTINUE HERE
linelist <- linelist.ethcat5.fun(linelist)
# use ethnicity field in LIMS if hes ethnicity not returned for a patient
linelist <- linelist %>% mutate(hes_ethnicity = case_when(
(hes_ethnicity=="Not linked" & lim_ethnicity!="Unknown") ~ paste0(lim_ethnicity,"_fromLIMS"),
TRUE ~ hes_ethnicity
))
# # SAVE ETH LINKED LINELIST NOW
# write.csv(linelist, "Data_carriage/linelist/ethnlinked_linelist2.csv")
# record the data source for ethnicity by site
# recording attrition by site (where ethnicity missing too, as this can't be used to estimate carriage by ethnicity)
# - raw data (from site) NHS number completion
# enhanced data NHS number completion
# for automated report
# compare data quality by site - are some biasing the results (sensitivity analysis)
# IMPORT MAIN ETHNICITY ENRICHED LINELIST HERE (ethnicity in 5 categories)
ethnlinked_linelist=import(here("Data_carriage","Linelist","ethnlinked_linelist2.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
linelist <- ethnlinked_linelist
linelist <- linelist %>% mutate(species=case_when(species=="beta Haemolytic Strep, group B" ~ "Group B Streptococcus",
TRUE ~ species)) # mop up of any variants of GBS pos test field
# success of ethnicity ascertainment from CHIME linkage (and supplemented by ECDS), overall and by site, and then % of ethn filled in from local LIMS data, also overall and by site
Summarise ethnicity ascertainment and estimate carriage by
ethnicity
# filter for first 4 sites data analysed
four_site <- c("Bedford","Derriford","Freeman","Royal Stoke")
linelist <- linelist %>% filter(site %in% four_site)
gbscarr_ethn <- gbs.carr.byethn.fun(linelist)
write.csv(gbscarr_ethn, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_ethn.csv")
# prep eth categories for plot
# Create a bar graph of gbs carriage by ethn (where denom doesn't include ethnicity unknown/link to HES not successful)
gbscarr_forplot <- gbscarr_ethn %>%
select(-c("carriage_bythn")) %>%
mutate(ethn_forplot = case_when(
hes_ethnicity=="Asian_fromLIMS" ~ "Asian",
hes_ethnicity=="Black_fromLIMS" ~ "Black",
hes_ethnicity=="Mixed_fromLIMS" ~ "Mixed",
hes_ethnicity=="White_fromLIMS" ~ "White",
hes_ethnicity=="Other_fromLIMS" ~ "Other",
TRUE ~ hes_ethnicity
)) %>%
select(-c("hes_ethnicity")) %>%
group_by(ethn_forplot) %>%
summarize(across(everything(), sum)) %>%
mutate(gbc_carr = gbspos/totalecm*100) %>%
filter(ethn_forplot!="Not linked" & ethn_forplot!="Unknown") %>%
arrange(gbc_carr) %>% mutate(ethn_forplot=factor(ethn_forplot, levels = c("White","Black","Asian","Other","Mixed")))
plot <- ggplot(gbscarr_forplot, aes(x = ethn_forplot, y = gbc_carr)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
geom_errorbar(
aes(ymin = lower_ci, ymax = upper_ci),
width = 0.2,
color = "black"
) +
geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
labs(title = "GBS carriage by ethnicity", x = "Ethnicity", y = "GBS carriage (%)") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 12), # Increase size of x-axis labels
axis.text.y = element_text(size = 12), # Increase size of y-axis labels
axis.title.x = element_text(size = 14,
margin = margin(b = 20)), # Increase size of x-axis title
axis.title.y = element_text(size = 14, # Increase size of y-axis title
margin = margin(r = 10)),
plot.title = element_text(size = 14, hjust= 0.5)
)
# save plot
ggsave("Charts_carriage/gbscarr_ethn.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
Estimate carriage by ethnicity for Asian subgroupings Indian,
Pakistani and Bangladeshi, using linelist prior to 5 category grouping
of ethnicity field
# import linelist with full set of HES ethnicity categories
linelist_ethnsub=import(here("Data_carriage","Linelist","ethnlinked_linelist.csv"),
format = "csv",
fill = T,
comment.char="",
na = c(""),
skip = 0,
full.names = T)
nrow(linelist_ethnsub)
unique(linelist_ethnsub$Ethnic_group)
gbscarr_subethn <- gbs.carr.bysubethn.fun(linelist_ethnsub)
# now generate GBS carriage summaries for all 16 HES ethnicity categories
gbscarr_allsubeth <- gbs.carr.byallsubethn.fun(linelist_ethnsub)
write.csv(gbscarr_allsubeth, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_allsubeth.csv")
Estimate carriage by deprivation
gbscarr_imd <- gbs.carr.byimd.fun(linelist)
gbscarr_forplot <- gbscarr_imd %>% filter(imd_quintile!="Not linked" & imd_quintile!="NA" & !is.na(imd_quintile)) %>%
mutate(imd_quintile=factor(imd_quintile, levels = c("20% least deprived", "20% to 40%", "40% to 60%", "60% to 80%", "20% most deprived"), ordered = TRUE))
write.csv(gbscarr_imd, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_imd.csv")
# Create a bar graph of gbs carriage by IMD quintile
plot <- ggplot(gbscarr_forplot, aes(x = imd_quintile, y = carriage_byimd)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
geom_errorbar(
aes(ymin = lower_ci, ymax = upper_ci),
width = 0.2,
color = "black"
) +
geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
labs(title = "GBS carriage by IMD quintile", x = "IMD quintile", y = "GBS carriage (%)") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 12), # Increase size of x-axis labels
axis.text.y = element_text(size = 12), # Increase size of y-axis labels
axis.title.x = element_text(size = 14,
margin = margin(b = 20)), # Increase size of x-axis title
axis.title.y = element_text(size = 14, # Increase size of y-axis title
margin = margin(r = 10)),
plot.title = element_text(size = 14, hjust= 0.5)
)
# save plot
ggsave("Charts_carriage/gbs_carr_imd.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
Estimate carriage by age group
gbscarr_age <- gbs.carr.byage.fun(linelist)
write.csv(gbscarr_age, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_age.csv")
# repeat but restrict to each ethnicity in turn
ll_black <- linelist %>% filter(hes_ethnicity=="Black" | hes_ethnicity=="Black_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_black)
ll_asian <- linelist %>% filter(hes_ethnicity=="Asian" | hes_ethnicity=="Asian_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_asian)
ll_mixed <- linelist %>% filter(hes_ethnicity=="Mixed" | hes_ethnicity=="Mixed_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_mixed)
ll_other <- linelist %>% filter(hes_ethnicity=="Other" | hes_ethnicity=="Other_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_other)
ll_white <- linelist %>% filter(hes_ethnicity=="White" | hes_ethnicity=="White_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_white)
# plot % carriage by age group in bar chart
gbscarr_forplot <- gbscarr_age %>% filter(!agegroup %in% c("10-14","50-54","55-59","60-64") & !is.na(agegroup))
# Create a bar graph of gbs carriage by age group
plot <- ggplot(gbscarr_forplot, aes(x = agegroup, y = carriage_byage)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
geom_errorbar(
aes(ymin = lower_ci, ymax = upper_ci),
width = 0.2,
color = "black"
) +
geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
labs(title = "GBS carriage by age group", x = "Age group", y = "GBS carriage (%)") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 12), # Increase size of x-axis labels
axis.text.y = element_text(size = 12), # Increase size of y-axis labels
axis.title.x = element_text(size = 14,
margin = margin(b = 20)), # Increase size of x-axis title
axis.title.y = element_text(size = 14, # Increase size of y-axis title
margin = margin(r = 10)),
plot.title = element_text(size = 14, hjust= 0.5)
)
# save plot
ggsave("Charts_carriage/gbs_carr_agegroup.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# Create a bar graph of totals by age group
gbstest_age <- gbscarr_age %>% select(-c("carriage_byage","total_carriage","lower_ci","upper_ci")) %>% filter(!is.na(agegroup)) %>% mutate(agegroup = case_when(
agegroup %in% c("40-44","45-49") ~ "40-49",
agegroup %in% c("50-54","60-64") ~ "50+",
TRUE ~ agegroup
)) %>% group_by(agegroup) %>% summarize(across(everything(), sum))
plot <- ggplot(gbstest_age, aes(x = agegroup, y = pct)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "% total ECM tests by age group", x = "Age group", y = "% total") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 12), # Increase size of x-axis labels
axis.text.y = element_text(size = 12), # Increase size of y-axis labels
axis.title.x = element_text(size = 14,
margin = margin(b = 20)), # Increase size of x-axis title
axis.title.y = element_text(size = 14, # Increase size of y-axis title
margin = margin(r = 10)),
plot.title = element_text(size = 14, hjust= 0.5)
)
# # save plot
ggsave("Charts_carriage/totals_agegroup.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# saving age dist by eth plots
# save plot
ggsave("Charts_carriage/agegroup_black.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_asian.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_mixed.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_other.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_white.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
Estimate carriage by site
gbscarr_site <- gbs.carr.bysite.fun(linelist)
# Create a bar graph of gbs carriage by site
plot <- ggplot(gbscarr_site, aes(x = site, y = carriage_bysite)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
geom_errorbar(
aes(ymin = lower_ci, ymax = upper_ci),
width = 0.2,
color = "black"
) +
geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
labs(title = "GBS carriage (%) by ECM site", x = "Site", y = "%") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line = element_line(color = "black", size = 0.5),
axis.text.x = element_text(size = 12), # Increase size of x-axis labels
axis.text.y = element_text(size = 12), # Increase size of y-axis labels
axis.title.x = element_text(size = 14,
margin = margin(b = 20)), # Increase size of x-axis title
axis.title.y = element_text(size = 14, # Increase size of y-axis title
margin = margin(r = 10)),
plot.title = element_text(size = 14, hjust= 0.5)
)
# save plot
ggsave("Charts_carriage/gbs_carr_sites.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
# conf int for overall GBS carr estimate
total_carr <- gbscarr_site %>% select(c("total","gbspos")) %>% summarize(across(everything(), sum)) %>%
rowwise() %>%
mutate(
conf = list(binom.confint(gbspos, total, methods = "exact")),
lower_ci = conf[[1, "lower"]] * 100,
upper_ci = conf[[1, "upper"]] * 100
) %>%
select(-conf)
Estimate carriage by month of ECM implementation, for all sites
combined
gbscarr_month <- gbs.carr.byym.fun(linelist)
write.csv(gbscarr_month, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_byym.csv")
gbscarr_month$month_year <- ymd(paste0(gbscarr_month$month_year, "-01")) # Append day to create a complete date
# Create a bar graph of gbs carriage by age group
plot <- ggplot(gbscarr_month, aes(x = ym, y = carriage_byym)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "GBS carriage (%) by month and year (all 4 sites)", x = "month and year", y = "%") +
theme_minimal() +
theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1)) +
scale_x_date(date_labels = "%Y-%m", date_breaks = "1 month") + # Format x-axis
theme(
axis.text.x = element_text(size = 14), # Increase size of x-axis labels
axis.text.y = element_text(size = 14), # Increase size of y-axis labels
axis.title.x = element_text(size = 16), # Increase size of x-axis title
axis.title.y = element_text(size = 16) # Increase size of y-axis title
)
# save plot
ggsave("Charts_carriage/gbscarr_bymonth.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
Estimate carriage by year of ECM implementation, for all sites
combined
gbs_byyear <- gbs.carr.byyear.fun(linelist)
write.csv(gbs_byyear, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_byyear.csv")
Produce summaries for flow chart for data linkage support request
# total unique PIDs per site
# Get unique site names
unique_sites <- unique(linelist$site)
# Use lapply to calculate counts for each site
site_counts <- lapply(unique_sites, function(site) sum(linelist$site == site))
# Combine results into a named vector or data frame for clarity
names(site_counts) <- unique_sites
site_counts <- unlist(site_counts)
# Print the results
print(site_counts)
length(unique(linelist$pid_str))
# date range by site
range(linelist$year, na.rm=T)
range(linelist$specdate, na.rm=T)
Logistic regression - association between maternal ethnicity and GBS
carriage, controlling for age and deprivation
# ///////////////////
# model with ethnicity as five categories
model_data <- linelist %>% filter(!is.na(hes_ethnicity)) %>% mutate(gbs_car = case_when(species=="Group B Streptococcus" ~ 1,
species=="GBS not isolated" | species=="No organism recorded" ~ 0),
hes_ethnicity = case_when((hes_ethnicity=="Asian" | hes_ethnicity=="Asian_fromLIMS") ~ "Asian",
(hes_ethnicity=="Black" | hes_ethnicity=="Black_fromLIMS") ~ "Black",
(hes_ethnicity=="Mixed" | hes_ethnicity=="Mixed_fromLIMS") ~ "Mixed",
(hes_ethnicity=="Other" | hes_ethnicity=="Other_fromLIMS") ~ "Other",
(hes_ethnicity=="White" | hes_ethnicity=="White_fromLIMS") ~ "White"),
hes_ethnicity = as.factor(hes_ethnicity)) %>% filter(agegroup!="not known") %>% filter(!is.na(age)) %>% mutate(z=1,
agegroup = cut(age,
breaks = seq(0, 100, by = 5), # Specify the breaks for the age bands
right = FALSE, # Include the left endpoint, exclude the right
labels = paste(seq(0, 95, by = 5), seq(4, 99, by = 5), sep = "-")),
agegroup = fct_collapse(agegroup,
"0-14" = c("0-4","5-9","10-14"),
"40+" = c("40-44","40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84","85-89","90-94","95-99"))) %>% select(year,agegroup,imd_quintile,hes_ethnicity,gbs_car) %>% filter(!is.na(hes_ethnicity))
# %>% filter(!is.na(imd_quintile))
model_data <- imd.fac.label.fun(model_data)
model_data$imd_quintile <- as.factor(model_data$imd_quintile)
model_data$hes_ethnicity <- relevel(model_data$hes_ethnicity, ref = "White")
model_data$imd_quintile <- relevel(model_data$imd_quintile, ref = "20% least deprived")
model_data$agegroup <- relevel(model_data$agegroup, ref = "20-24")
model_data$gbs_car <- as.factor(model_data$gbs_car)
model_data <- droplevels(model_data)
successes <- sum(as.numeric(model_data$gbs_car == 1))
trials <- nrow(model_data)
result <- prop.test(successes, trials, conf.level = 0.95)
model <- glm(gbs_car ~ hes_ethnicity + agegroup + imd_quintile,
data = model_data,
family = binomial(link = "logit"))
sapply(model_data, function(x) if(is.factor(x)) levels(x))
# Summary of the model to see coefficients and statistics
summary(model)
# Exponentiate the coefficients to get odds ratios
exp(coef(model))
model_out <- tidy(model, exponentiate = TRUE, conf.int = TRUE)
write.csv(model_out, "Data_carriage/Model/multilog.csv", row.names=F)
# ///////////////////////
# rerun model with ethnicity as binary (white vs any non-white ethnicity)
model_data_binary <- ethn.binary.fac.fun(model_data)
model <- glm(gbs_car ~ hes_ethnicity + agegroup + imd_quintile + year,
data = model_data_binary,
family = binomial(link = "logit"))
sapply(model_data, function(x) if(is.factor(x)) levels(x))
# Summary of the model to see coefficients and statistics
summary(model)
# Exponentiate the coefficients to get odds ratios
exp(coef(model))
model_out <- tidy(model, exponentiate = TRUE, conf.int = TRUE)
write.csv(model_out, "Data_carriage/Model/multilog_ethn_binary.csv", row.names=F)
Add a new chunk by clicking the Insert Chunk button on the
toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and
output will be saved alongside it (click the Preview button or
press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the
editor. Consequently, unlike Knit, Preview does not
run any R code chunks. Instead, the output of the chunk when it was last
run in the editor is displayed.
---
title: "GBS carriage estimation process"
output:
  html_notebook: default
  html_document:
    df_print: paged
  pdf_document: default
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook outlining steps for LIMS data processing and GBS carriage estimation from ECM-based testing data from the GBS3 trial

Set main working directory

```{r set wd, include=F}

wd <- "//FILECOL17.phe.gov.uk/ProjectData/HCAI/Workstream 4 Folder/streptococcal infections/Group B strep/GBS3/GBS maternal carriage study/GBS carriage estimation"

```

Install and load packages

```{r packages, include=F}

setwd(wd)
source("R scripts_carriage/packages_GBS_carriage.R")

# password prompt is to access PHE indicator methods functions

```

Load functions

```{r functions, include=TRUE}

setwd(wd)
source("R scripts_carriage/functions_GBS_carriage.R")

```

Import LIMS data from ECM-based GBS testing (datasets provided by micro labs)
Pre-check - open csv and check that in correct template format - describe the raw data

```{r lims data import, include=TRUE}

# setwd(wd)

#////////////////////
# for Royal Stoke

testpos <- import(here("Data_carriage","ECM_carriage","Edited","ECM_pos_06_02_2023to01_04_2024_Royal_stoke.xlsx"),
                                format = "xlsx",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

testneg <- import(here("Data_carriage","ECM_carriage","Edited","ECM_neg_06_02_2023to01_04_2024_Royal_stoke.xlsx"),
                                format = "xlsx",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

testpos2 <- import(here("Data_carriage","ECM_carriage","Edited","ECM_pos_30_10_2022to05_02_2023_Royal_stoke.xls"),
                                format = "xls",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

testneg2 <- import(here("Data_carriage","ECM_carriage","Edited","ECM_neg_30_10_2022to05_02_2023_Royal_stoke.xls"),
                                format = "xls",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)


colnames(testpos) %in% colnames(testneg) # last 3 cols distinct in testneg

testpos <- lims.stnd.format.fun(testpos)
testpos2 <- lims.stnd.format.fun(testpos2)

testneg <- lims.stnd.format.fun(testneg)
testneg2 <- lims.stnd.format.fun(testneg2)

alltest <- rbind(testpos,testpos2,testneg,testneg2)
alltest1 <- alltest %>% mutate(site="Royal Stoke")
rm(testpos,testpos2,testneg,testneg2)


#////////////////////
# for Bedford
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Bedford_GBS3_CleanedData.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)
alltest$`Patient NHS Number` <- as.numeric(alltest$`Patient NHS Number`)

alltest2 <- lims.stnd.format.fun(alltest)
alltest2 <- alltest2 %>% mutate(site="Bedford")


#////////////////////
# for Derriford
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Derriford_GBS3_CleanedData.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)
alltest$`Patient NHS Number` <- as.numeric(alltest$`Patient NHS Number`)


alltest3 <- lims.stnd.format.fun(alltest)
alltest3 <- alltest3 %>% mutate(site="Derriford")


#////////////////////
# for Freeman
alltest <- import(here("Data_carriage","ECM_carriage","Edited","Freeman_GBS3_CleanedData.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)
alltest <- alltest %>%
  mutate(
    `Patient NHS Number` = as.numeric(`Patient NHS Number`),
    `Specimen Date` = as.Date(`Specimen Date`, format = "%d/%m/%Y"),
    `Patient Date of Birth` = as.Date(`Patient Date of Birth`, format = "%d/%m/%Y")
  )

# CONTINUE HERE - specimen date format different for Freeman vs other sites. Resolve here before applying df$dt <- lubridate::ymd(df$specdate) step in obai process

# handle format for strings in Ethnicity column which have symbols
alltest$Ethnicity <- iconv(alltest$Ethnicity, from = "ISO-8859-1", to = "UTF-8")


alltest4 <- lims.stnd.format.fun(alltest)
alltest4 <- alltest4 %>% mutate(site="Freeman")


#////////////////////
# for Epsom and St Helier (St George's)
alltest5 <- read.csv("\\\\IISNISCOL01\\GBSStudy\\Processed\\Epsom_StHelier_GBS3_UID.csv")
alltest5 <- alltest5 %>% mutate(site="Epsom") %>% relocate(site, .after = last_col()) %>% mutate(species=case_when(
  is.na(species) ~ "GBS not isolated",
  (species!="Beta Haemolytic Streptococcus Group B" & species!="Group B Streptococcus") ~ "GBS not isolated",
  (species=="Beta Haemolytic Streptococcus Group B" | species=="Group B Streptococcus") ~ "Group B Streptococcus",
  TRUE ~ species
)) %>% rename(ethnicity=Ethnicity_group) %>% relocate(ethnicity, .after = sex) %>%
    mutate(
      dob = as.Date(dob, format = "%d/%m/%Y"),  # Character string "DD/MM/YYYY" to Date
      specdate = as.Date(specdate, format = "%d/%m/%Y")
    ) %>% select(-c("pt_id"))



#////////////////////
# for Royal Blackburn
alltest6 <- import(here("Data_carriage","ECM_carriage","Edited","UKHSA GB3 trial_Royal_Blackburn.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

alltest6 <- lims.stnd.format.fun(alltest6)
alltest6 <- alltest6 %>% mutate(site="Royal Blackburn",
                                dob=as.Date(dob, format = "%d/%m/%Y")) %>% 
  mutate(species=case_when(
  is.na(species) ~ "GBS not isolated",
  (species!="beta Haemolytic Strep, group B" & species!="Group B Streptococcus") ~ "GBS not isolated",
  (species=="beta Haemolytic Strep, group B" | species=="Group B Streptococcus") ~ "Group B Streptococcus",
  TRUE ~ species
))# need to apply additional date conversion as not handled in flexible reformatting function for dates in char string "19/06/1991" in Royal Blackburn LIMS data

#//////////////////
# combine into single df
allsites <- rbind(alltest1,alltest2,alltest3,alltest4,alltest5,alltest6)
allsites$nhsno <- as.numeric(allsites$nhsno)
rm(alltest,alltest1,alltest2,alltest3,alltest4,alltest5,alltest6)

# apply ethnicity categorization function to group into 5 main ethnicities
allsites <- lims.ethcat.fun(allsites)

# convert instances of sex=="FALSE" into "U"
allsites <- allsites %>% mutate(sex = case_when(
  sex=="FALSE" ~ "U",
  TRUE ~ sex
))


# NOTE: CHECK at this stage that dob and specdate fields have consistent date formatting across sites

```

Process LIMS linelist using adapted version of obai sgss_infections 

```{r lims data processing, include=TRUE}

# Remove quality control samples
allsites <- qc.umbrella.fun(allsites)

# Proceed to dedpup LIMS linelist to patient-level with AMR results
allsites <- obai.dedup.fun(allsites)

## perform initial checks

# total episodes (ECM test result within 9 month window)
  length(unique(allsites$epids))  # 12,916
# total patients
  length(unique(allsites$pid_str))# 12,916
# total unique NHS number 
  length(unique(allsites$nhsno))  # 12,445
# total records where NHS number missing
  sum(is.na(allsites$nhsno))      # 472
  
  # isolate duplicates in nhsno field
  duplicates_only <- allsites %>%
  filter(nhsno %in% nhsno[duplicated(nhsno)]) %>% arrange(forename)
  nrow(duplicates_only)           # 472 - all of which nhsno missing
  
  # identify true (likely) duplicates using patient name fields
  true_dups <- duplicates_only %>% select(specno:site) %>% mutate(name_concat=paste(forename,"_",surname),
                                                                  name_concat=gsub(" ","",name_concat)) %>% 
     filter(name_concat %in% name_concat[duplicated(name_concat)]) %>% arrange(name_concat)
  length(unique(true_dups$name_concat))  
                                  # 7 true duplicates (across name fields), of which 1 ("FEBUARY_IQAMIC2024") a QC sample and 1 pertains to 2 samples a year apart for a patient
  
  # remove remaining QC sample rows
  allsites <- allsites %>% filter(surname!="IQAMIC2024")

  # //////////// key checkpoint 1 //////////////
  ## save as static base dataset 
write.csv(allsites, "Data_carriage/Linelist/static_basedata.csv", row.names=F)  # REPLACE WITH SAVE TO SERVER
  
```

Prepare processed LIMS data for DBS tracing

```{r dbs trace prep, include=TRUE}

## load static base linelist
allsites <- read.csv("Data_carriage/Linelist/static_basedata.csv")
allsites$specdate <- as.Date(allsites$specdate, format = "%Y-%m-%d")
allsites$dob <- as.Date(allsites$dob, format = "%Y-%m-%d")
allsites$pid_str <- as.character(allsites$pid_str)

dbsprep <- allsites %>% mutate(year = format(specdate, "%Y")) %>% select(year, everything()) %>% select(1:match("epids",names(allsites)))


# generate .csv files for sending to DBS trace team (generates two files, one with sufficient PII and one where some/all PII missing. Send BOTH for tracing)
dbs.trace.prep.fun(dbsprep, 
                   mainfile="Data_carriage/DBS_tracing/LIMS_linelist_for_tracing.csv", 
                   excludefile="Data_carriage/DBS_tracing/excluded_records_for_tracing.csv")
# NOTE: always open .cvs files for tracing and check that columns formatted correctly, prior to sending to DBS team

# # state NHS number completion before DBS tracing: percent nhsno missing
sum_na_nhs <- sum(is.na(dbsprep$nhsno))/nrow(dbsprep)*100   # 3.6%
na_nhs <- dbsprep %>% filter(is.na(nhsno))                  # generate df of records where NHS number missing

```

Integrate PII-enriched DBS trace return into linelist and retain records which didn't have successful trace first time around for second attempt

```{r dbs trace return, include=TRUE}

## import and format trace return csv files

# These are saved initially in 'response holding bay' subfolder of 'DBS_tracing' with original DBS file names that have long numeric suffixes. Rename, save in main 'DBS_tracing' folder (overwriting previous versions from prior runs of process which should first be moved to Archive) and import:

setwd(wd)
lims_trace_return <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_response.csv", header = F)                 # update input file name as needed
lims_excl_trace_return <- read.csv("Data_carriage/DBS_tracing/excluded_records_response.csv", header = F)

# confirm total number of records returned
sum(nrow(lims_trace_return)+nrow(lims_excl_trace_return))    # 12,917
    
lims_trace_return <- rbind(lims_trace_return,lims_excl_trace_return) %>% filter(!grepl("^0011",V1) & !grepl("^9911",V1))

# retain records which NOT successfully traced for subsequent QC and re-tracing
no_trace <- no.trace.wrap.fun(lims_trace_return)

# retain records where sex unknown (may overlap with no_trace generated above)
sex_unknown <- unknown.sex.wrap.fun(lims_trace_return)

# keep ONLY records which successfully traced in first round of tracing and merge with LIMS linelist 
allsites <- traced.wrap.fun(lims_trace_return)               # 12,913

# state NHS number completion after DBS tracing
sum(is.na(allsites$nhsno))
# 260 NHS number missing

```

Prepare subsets of data for second round of DBS tracing

```{r second dbs trace prep, include=TRUE}

dbsprep2 <- allsites %>% mutate(year = as.numeric(format(specdate, '%Y'))) %>% select(year, everything()) %>% select(1:match("epids", names(allsites))) %>% filter(pid_str %in% no_trace$pid_str | pid_str %in% sex_unknown$pid_str) %>% 
  mutate(forename = sub(" .*", "", forename)) %>%   # reformat name fields in no_trace to resend for tracing (where middle names not compatible with DBS trace algorithm)
  mutate(sex = case_when(                           # take all records where sex=="U" and presumptively recode to "F" to resend for tracing (if trace returns sex == 2 then confident that pertains to females)
    sex == "U" ~ "F",
    TRUE ~ sex
  ))

# generate .csv files for sending to DBS trace team for second trace (generates two files, one with sufficient PII and one where some/all PII missing. Send BOTH for tracing)
dbs.trace.prep.fun(dbsprep2, 
                   mainfile="Data_carriage/DBS_tracing/LIMS_linelist_for_tracing_rnd2.csv", 
                   excludefile="Data_carriage/DBS_tracing/excluded_records_for_tracing_rnd2.csv")   

```

Import second set of DBS tracing results

```{r second dbs trace result, include=TRUE}

#///////////////////////////
#TEMPORARY CODE CHUNK
  # review content of latest files sent for DBS tracing (Jan 2025)
test <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_for_tracing_rnd2.csv", header = T)
test2 <- read.csv("Data_carriage/DBS_tracing/excluded_records_for_tracing_rnd2.csv", header = T)
test <- rbind(test,test2)
sites <- allsites %>% filter(nhsno %in% unique(test$NHS_number))
unique(sites$site)
# has data for all 6 sites
# percent of total records in linelist which in second trace request files
perc <- nrow(sites)/nrow(allsites)*100
# 19.3%
# current percent records missing nhsno, prior to linking back in second trace return
perc <- (sum(is.na(allsites$nhsno))/nrow(allsites))*100
print(perc)
# 3.6%
#///////////////////////////

# import and format trace return csv files
setwd(wd)

lims_trace_return2 <- read.csv("Data_carriage/DBS_tracing/LIMS_linelist_response2.csv", header = F)                 # update input file name as needed
lims_excl_trace_return2 <- read.csv("Data_carriage/DBS_tracing/excluded_records_response2.csv", header = F)  

lims_trace_return2 <- rbind(lims_trace_return2,lims_excl_trace_return2)    # combine into single df once checked that non-overlapping patient IDs

# retain records which NOT successfully traced for subsequent QC and re-tracing
no_trace2 <- no.trace.wrap.fun(lims_trace_return2)

# keep ONLY records which successfully traced in second round of tracing and merge with LIMS linelist
allsites$pid_str <- as.integer(allsites$pid_str)  # ensure pid_str column in both datasets used for merge in same format
allsites <- traced.wrap.fun(lims_trace_return2)    # merge these records back into main linelist

# current percent records missing nhsno, after linking back in second trace return
perc <- (sum(is.na(allsites$nhsno))/nrow(allsites))*100
print(perc)
# 2.01%

  # //////////// key checkpoint 2 //////////////
  ## save as static base dataset **after tracing**
write.csv(allsites, "Data_carriage/Linelist/static_basedata_traced.csv", row.names=F)


```

Preliminary analysis results prep and presentation

```{r prelim results prep, include=TRUE}

linelist <- read.csv("Data_carriage/Linelist/static_basedata_traced.csv")
  linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
  linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")
  

linelist <- linelist %>%
  mutate(year = as.numeric(format(specdate, '%Y'))) %>%
  select(year, everything()) %>% select(-any_of(c("V1"))) %>% 
  filter(!grepl("IQAM", surname)) %>% filter(sex!="M") %>%
  mutate(ym = format(specdate, "%Y/%m")) %>% 
  mutate(ym_date = as.Date(paste0(ym, "/01"), format = "%Y/%m/%d"))


# /////////////
# 1) monthly ECM test totals by site

# Summarize totals by year-month and site
monthly_test <- linelist %>% group_by(ym_date,site) %>% summarize(total_tested=sum(z), .groups= 'drop') 

# Create a complete set of year-month and site combinations
month_complete <- monthly_test %>% complete(ym_date,site)

# Join the summarized data, allowing for NA in missing combinations
month_complete <- month_complete %>%
  left_join(monthly_test, by = c("ym_date", "site")) %>%
  mutate(total_tested = ifelse(is.na(total_tested.y), NA, total_tested.y)) %>%
  select(ym_date, site, total_tested) %>% mutate(total_tested = case_when(
    total_tested<=30 ~ NA,
    TRUE ~ total_tested)
  )

# plot of monthly test numbers by site 
plot <- ggplot(month_complete, aes(x = ym_date, y = total_tested, color = site, group = site)) +
 geom_line(size = 1) +
  geom_point(size = 2) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +  # Format x-axis
  labs(title = "Total ECM tests by Year-Month and Site", 
       x = "Year-Month", y = "Total Value", color = "Site") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.line = element_line(color = "black", size = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 14, hjust= 0.5))

# save plot
ggsave("Charts_carriage/ym_ecm_test_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)


# now looking 1 site at a time, comparing monthly test numbers in linelist vs original ECM test numbers from GBS3


# Summarize totals across whole period for each site
test_bysite <- linelist %>% group_by(site) %>% summarize(total_tested=sum(z), .groups= 'drop')

# Create a bar graph of totals by site
plot <- ggplot(test_bysite, aes(x = site, y = total_tested)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(title = "Total ECM tests by Site", x = "Site", y = "Totals") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 14),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 14),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 16),  # Increase size of x-axis title
    axis.title.y = element_text(size = 16)   # Increase size of y-axis title
  )

# save plot
ggsave("Charts_carriage/all_ecm_test_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)


# /////////////
# 2) age distributions by site and for all sites combined
agedis <- linelist %>% group_by(site) %>% 
  summarise(
    median_age=median(age, na.rm=T),
    IQR_age=IQR(age, na.rm=T),
    age_min=min(age, na.rm=T),
    age_max=max(age, na.rm=T),
    Q1 = quantile(age, 0.25, na.rm=T),
    Q3 = quantile(age, 0.75, na.rm=T)
  )

# Create a plot showing median and IQRs
plot <- ggplot(agedis, aes(x = site, y = median_age)) +
  geom_point(size = 3, color = "blue") +  # Plot the median age as points
  geom_errorbar(aes(ymin = Q1, ymax = Q3), width = 0.2, color = "red") +  # Add IQR as error bars
  labs(title = "Median Age and IQRs by Category", x = "Category", y = "Median Age") +
  theme_minimal()

# save plot
ggsave("Charts_carriage/median_age_bysite.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

# save table
write.csv(agedis, "Data_carriage/ECM_carriage/Stat_summaries/Age_distribution/age_dis.csv")

```

Apply geo - IMD lookup to link to IMD

```{r imd linkage, include=TRUE}

# apply IMD linkage function to linelist without AMR data (to be linked back in later)
linelist <- imd.linkage.fun(linelist)

  write.csv(linelist, paste0(wd,"/Data_carriage/Linelist/static_basedata_imdlinked_raw.csv"), row.names=F)

  linelist <- read.csv("Data_carriage/Linelist/static_basedata_imdlinked_raw.csv")
  linelist <- linelist %>%
  mutate(
    specdate = as.Date(specdate, format = "%Y-%m-%d"),
    dob = as.Date(dob, format = "%Y-%m-%d"))

```

HES linkage for ethnicity enrichment

```{r hes ethn linkage, include=TRUE}

linelist <- linelist.prep.fun(linelist)
linelist <- hes.ethnlink.fun(linelist)

# //////////// key checkpoint 3a //////////////
  ## save as static base dataset **after HES CHIME ethnicity enrichment*
  write.csv(linelist, "Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv", row.names=F)
  
  linelist <- read.csv("Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv")
  linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
  linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")
  
  # quantify ethnicity categories (numbers and % total in linelist, including not linked - NA)
  hes_eth_allcat <- linelist %>% mutate(z=1) %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  # % not linked = 2.5% (denom = all records)
  
  # to document % not linked from CHIME method, as well as of those linked, the % which not known or not stated
  hes_linked_only <- linelist %>% filter(!is.na(Ethnic_group)) %>% mutate(z=1) %>% group_by(Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
  # % not known = 0.67
  # % not states = 1.63 (denom = all HES-linked records)
  
# if HES returned unknown, not linked, not stated then pass to ECDS method to fill in gaps. Log that ECDS data source
for_ecds <- linelist %>% filter(Ethnic_group == "99 Not Known" | Ethnic_group == "Z Not stated" | is.na(Ethnic_group)) %>% mutate(nhsno_status=case_when(is.na(nhsno) ~ "missing",
                                                                                                                                                          !is.na(nhsno) ~ "known"))
# 528 rows (4.1% of linelist)

# tabulate HES ethnicity assignment by NHS number status (known or missing)
tab <- for_ecds %>% mutate(z=1) %>% group_by(nhsno_status,Ethnic_group) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)
# among known nhsno, 205 (76.2%) were HES linked but assigned as "ethnicity not stated"; all records missing nhsno were not linked to HES
write.csv(tab, "Data_carriage/ECM_carriage/Stat_summaries/hes_noethn_by_nhsno_status.csv", row.names=F)

ecds_linked <- linelist.prep.fun(for_ecds)
ecds_linked <- ecds.ethnlink.fun(ecds_linked)

# //////////// key checkpoint 3b //////////////
  ## save subset of ECDS-linked linelist
write.csv(ecds_linked, "Data_carriage/Linelist/ecds_linked_subset.csv", row.names=F)
ecds_linked <- read.csv("Data_carriage/Linelist/ecds_linked_subset.csv")

ecds_linked <- ecds_linked %>% rename(ethn_code=PatientEthnicCategoryCode) %>%
  mutate(ecds_ethnicity=case_when(ethn_code=="A" ~ "British (White)",
                                 ethn_code=="N" ~ "African (Black or Black British)", 
                                 ethn_code=="L" ~ "Any other Asian background", 
                                 ethn_code=="G" ~ "Any other Mixed background", 
                                 ethn_code=="C" ~ "Any other White background", 
                                 ethn_code=="S" ~ "Any other ethnic group", 
                                 ethn_code=="K" ~ "Bangladeshi (Asian or Asian British)", 
                                 ethn_code=="M" ~ "Caribbean (Black or Black British)", 
                                 ethn_code=="R" ~ "Chinese (other ethnic group)", 
                                 ethn_code=="H" ~ "Indian (Asian or Asian British)",
                                 ethn_code=="J" ~ "Pakistani (Asian or Asian British)",
                                 ethn_code=="F" ~ "White and Asian (Mixed)",
                                 ethn_code=="E" ~ "White and Black African (Mixed)",
                                 ethn_code=="D" ~ "White and Black Caribbean (Mixed)",
                                 ethn_code=="Z" ~ "Z Not Stated",
                                 ethn_code=="B" ~ "White Irish",
                                 ethn_code=="9" ~ "Not Known",
                                 ethn_code=="P" ~ "Any other Black background (Black or Black British)")) 

# identify and flag duplicate records resulting from ECDS linkage i.e., where ECDS returned 2 or more ethnicity assignments for a given patient
ecds_linked$dup_flag <- duplicated(ecds_linked$pid_str) |  duplicated(ecds_linked$pid_str, fromLast = TRUE)
dups <- ecds_linked %>% filter(dup_flag==T) %>% select(pid_str,specdate,Ethnic_group,ecds_ethnicity,dup_flag) %>% arrange(pid_str,ecds_ethnicity) # explore duplicates and assign a rule set for retaining records
length(unique(dups$pid_str))  # 14 pid_str with duplicate rows
pid_str_dups <- unique(dups$pid_str)

# if 'Z not Stated' and 'Not Known' retain either
# if 'Z not Stated'/'Not Known' and a named ethnicity group, retain ethnicity group
# if NA and 'Z not Stated'/'Not Known' or a named ethnicity group, retain latter (non-NA entry)
# if 'Any other Black background (Black or Black British)' and 'Any other ethnic group' retain the higher specificity ethnicity assignment (in this case Black or Black British > Any other ethnicity group)

#////// ecds_ethnicity hierarchy /////////
# max specificity:
max_spec <- c("British (White)",
  "African (Black or Black British)",
  "Bangladeshi (Asian or Asian British)",
  "Caribbean (Black or Black British)",
  "Chinese (other ethnic group)",
  "Indian (Asian or Asian British)",
  "Pakistani (Asian or Asian British)",
  "White and Asian (Mixed)",
  "White and Black African (Mixed)",
  "White and Black Caribbean (Mixed)",
  "White Irish")

# high specificity
high_spec <- c("Any other Black background (Black or Black British)",
              "Any other Asian background", 
              "Any other Mixed background",
              "Any other White background")

# medium specificity:
med_spec <- c("Any other ethnic group","Z Not Stated")

# low specificity:
low_spec <- c("Not Known")

# no specificity/missing
no_spec <- NA

# define hierarchy for category or cat
category_hierarchy <- c("Max" = 1, "High" = 2, "Medium" = 3, "Low" = 4, "No" = 5)

# assign each row as per above hierarchy
dups <- dups %>% mutate(cat=case_when(ecds_ethnicity %in% max_spec  ~ "Max",
                                      ecds_ethnicity %in% high_spec ~ "High",
                                      ecds_ethnicity %in% med_spec ~ "Medium",
                                      ecds_ethnicity %in% low_spec ~ "Low",
                                      ecds_ethnicity %in% no_spec ~ "No"),
                        cat_rank = category_hierarchy[cat]) %>% 
  arrange(pid_str, cat_rank) %>% 
  distinct(pid_str, .keep_all = T)

# now apply to ecds_linked df 
ecds_linked <- ecds_linked %>% mutate(cat=case_when(ecds_ethnicity %in% max_spec  ~ "Max",
                                                    ecds_ethnicity %in% high_spec ~ "High",
                                                    ecds_ethnicity %in% med_spec ~ "Medium",
                                                    ecds_ethnicity %in% low_spec ~ "Low",
                                                    ecds_ethnicity %in% no_spec ~ "No"),
                                      cat_rank = category_hierarchy[cat]) %>%
  group_by(pid_str) %>%
  mutate(is_selected = if_else(dup_flag, cat_rank == min(cat_rank[dup_flag]), TRUE)) %>%
  ungroup() %>%
  filter(is_selected) %>% select(-c("dup_flag","cat","cat_rank","is_selected"))

# check that non-duplicate rows now in data for pid_str vals flagged above
pid_str_dups %in% ecds_linked$pid_str  # TRUE in all instances

# check that pid_str retained correspond to those identified as having duplicate entries above
unique(dups$pid_str) %in% pid_str_dups # TRUE at all positions in vector

# tabulate ECDS ethnicity assignment by NHS number status (known or missing)
tab <- ecds_linked %>% mutate(nhsno_status=case_when(is.na(nhsno) ~ "missing",
                                                     !is.na(nhsno) ~ "known"),
                              heslink_status=case_when(is.na(Ethnic_group) ~ "not linked",
                                                             !is.na(Ethnic_group) ~ "linked")
                              ) %>% mutate(z=1) %>% group_by(nhsno_status,heslink_status,ecds_ethnicity) %>% summarize(total=sum(z)) %>% mutate(pct=total/sum(total)*100)

write.csv(tab, "Data_carriage/ECM_carriage/Stat_summaries/hes_noethn_ecds_cat.csv", row.names=F)
# among known nhsno, 218 (77%) were HES linked but returned "99 Not Known" or "Z Not stated". Only 8 of these 218 records had an ethnicity assignment in ECDS. For those records not HES linked, ECDS enrichment returned 6 records with known ethnicity. Overall, ECDS enrichment provided an ethnicity for 4.9% records that had an NHS number

ecds_linked <- ecds_linked %>% select(pid_str,ecds_ethnicity) %>% filter(!is.na(ecds_ethnicity)) # CONTINUE HERE - THERE ARE DUPLICATES OF pid_str

testmerge <- merge(linelist,ecds_linked, by = "pid_str", all=T)                                 # merge back deduplicated, ecds-linked subset of records into linelist to retain new ecds_ethnicity col
  
testmerge <- testmerge %>% mutate(Ethnic_group=case_when((ecds_ethnicity!=Ethnic_group &        # assign ECDS ethnicity where not HES linked or where ethnicity not known in HES but returned in ECDS linkage
                                  ecds_ethnicity!="Not Known" & 
                                  ecds_ethnicity!="Z Not Stated") |
                                  is.na(Ethnic_group) ~ ecds_ethnicity, 
                                TRUE ~ Ethnic_group)) %>% select(-c("ecds_ethnicity")) %>% 
  mutate(Ethnicity_group=case_when(Ethnicity_group=="Asian" ~ "Any other Asian background",      # re code LIMS ethnicity categories to align with HES categorization (defaulting to Any other...)
                                   Ethnicity_group=="White" ~ "Any other White background",
                                   Ethnicity_group=="Black" ~ "Any other Black background",
                                   Ethnicity_group=="Mixed" ~ "Any other Mixed background",
                                   Ethnicity_group=="" ~ "Z Not stated")) %>% 
  mutate(Ethnic_group=case_when((Ethnic_group=="Z Not stated" | Ethnic_group=="99 Not Known" | Ethnic_group=="Z not stated" | is.na(Ethnic_group)) & Ethnicity_group!="" & Ethnicity_group!="Unknown" ~ Ethnicity_group,    
         TRUE ~ Ethnic_group))                                                                  # assign LIMS ethnicity where recorded, if not HES linked or ethnicity not stated in HES, and not returned in ECDS linkage (last resort ethnicity assignment)

# DEDUPLICATE HERE AFTER ETHNICITY LINKAGE 

# //////////// key checkpoint 4 //////////////
  ## save as static base dataset **after full ethnicity assignment and categorisation**
  write.csv(linelist, "Data_carriage/Linelist/static_basedata_ethnlinked_final.csv", row.names=F)
  
  linelist <- read.csv("Data_carriage/Linelist/static_basedata_ethnlinked_raw.csv")
  linelist$specdate <- as.Date(linelist$specdate, format = "%Y-%m-%d")
  linelist$dob <- as.Date(linelist$dob, format = "%Y-%m-%d")

  # CONTINUE HERE  
linelist <- linelist.ethcat5.fun(linelist)

# use ethnicity field in LIMS if hes ethnicity not returned for a patient
linelist <- linelist %>% mutate(hes_ethnicity = case_when(
  (hes_ethnicity=="Not linked" & lim_ethnicity!="Unknown") ~ paste0(lim_ethnicity,"_fromLIMS"),
  TRUE ~ hes_ethnicity
))

# # SAVE ETH LINKED LINELIST NOW
# write.csv(linelist, "Data_carriage/linelist/ethnlinked_linelist2.csv")

# record the data source for ethnicity by site 
# recording attrition by site (where ethnicity missing too, as this can't be used to estimate carriage by ethnicity)
# - raw data (from site) NHS number completion
# enhanced data NHS number completion
# for automated report 
# compare data quality by site - are some biasing the results (sensitivity analysis)

# IMPORT MAIN ETHNICITY ENRICHED LINELIST HERE (ethnicity in 5 categories)
ethnlinked_linelist=import(here("Data_carriage","Linelist","ethnlinked_linelist2.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

linelist <- ethnlinked_linelist
linelist <- linelist %>% mutate(species=case_when(species=="beta Haemolytic Strep, group B" ~ "Group B Streptococcus",
                                                    TRUE ~ species))  # mop up of any variants of GBS pos test field

 
# success of ethnicity ascertainment from CHIME linkage (and supplemented by ECDS), overall and by site, and then % of ethn filled in from local LIMS data, also overall and by site

```

Summarise ethnicity ascertainment and estimate carriage by ethnicity 

```{r carriage by ethn, include=TRUE}

# filter for first 4 sites data analysed
four_site <- c("Bedford","Derriford","Freeman","Royal Stoke")

linelist <- linelist %>% filter(site %in% four_site)
gbscarr_ethn <- gbs.carr.byethn.fun(linelist)

write.csv(gbscarr_ethn, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_ethn.csv")

# prep eth categories for plot


# Create a bar graph of gbs carriage by ethn (where denom doesn't include ethnicity unknown/link to HES not successful)
gbscarr_forplot <- gbscarr_ethn %>% 
  select(-c("carriage_bythn")) %>% 
  mutate(ethn_forplot = case_when(
  hes_ethnicity=="Asian_fromLIMS" ~ "Asian",
  hes_ethnicity=="Black_fromLIMS" ~ "Black",
  hes_ethnicity=="Mixed_fromLIMS" ~ "Mixed",
  hes_ethnicity=="White_fromLIMS" ~ "White",
  hes_ethnicity=="Other_fromLIMS" ~ "Other",
  TRUE ~ hes_ethnicity
)) %>% 
  select(-c("hes_ethnicity")) %>% 
  group_by(ethn_forplot) %>% 
  summarize(across(everything(), sum)) %>% 
  mutate(gbc_carr = gbspos/totalecm*100) %>%
  filter(ethn_forplot!="Not linked" & ethn_forplot!="Unknown") %>% 
  arrange(gbc_carr) %>% mutate(ethn_forplot=factor(ethn_forplot, levels = c("White","Black","Asian","Other","Mixed"))) 

plot <- ggplot(gbscarr_forplot, aes(x = ethn_forplot, y = gbc_carr)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  geom_errorbar(
    aes(ymin = lower_ci, ymax = upper_ci),
    width = 0.2,
    color = "black"
  ) +
  geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
  labs(title = "GBS carriage by ethnicity", x = "Ethnicity", y = "GBS carriage (%)") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", size = 0.5),
    axis.text.x = element_text(size = 12),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 12),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 14,
                                margin = margin(b = 20)),  # Increase size of x-axis title
    axis.title.y = element_text(size = 14,   # Increase size of y-axis title
                                margin = margin(r = 10)),   
    plot.title = element_text(size = 14, hjust= 0.5)
  )


# save plot
ggsave("Charts_carriage/gbscarr_ethn.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

```

Estimate carriage by ethnicity for Asian subgroupings Indian, Pakistani and Bangladeshi, using linelist prior to 5 category grouping of ethnicity field

```{r ethn asian subcat, include=TRUE}

# import linelist with full set of HES ethnicity categories
linelist_ethnsub=import(here("Data_carriage","Linelist","ethnlinked_linelist.csv"),
                                format = "csv",
                                fill = T,
                                comment.char="",
                                na = c(""),
                                skip = 0,
                                full.names = T)

nrow(linelist_ethnsub)

unique(linelist_ethnsub$Ethnic_group)

gbscarr_subethn <- gbs.carr.bysubethn.fun(linelist_ethnsub)

# now generate GBS carriage summaries for all 16 HES ethnicity categories

gbscarr_allsubeth <- gbs.carr.byallsubethn.fun(linelist_ethnsub)
write.csv(gbscarr_allsubeth, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_allsubeth.csv")

```

Estimate carriage by deprivation

```{r carriage by imd, include=TRUE}

gbscarr_imd <- gbs.carr.byimd.fun(linelist)

gbscarr_forplot <- gbscarr_imd %>% filter(imd_quintile!="Not linked" & imd_quintile!="NA" & !is.na(imd_quintile)) %>% 
  mutate(imd_quintile=factor(imd_quintile, levels = c("20% least deprived", "20% to 40%", "40% to 60%", "60% to 80%", "20% most deprived"), ordered = TRUE))

write.csv(gbscarr_imd, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_imd.csv")

# Create a bar graph of gbs carriage by IMD quintile
plot <- ggplot(gbscarr_forplot, aes(x = imd_quintile, y = carriage_byimd)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
   geom_errorbar(
    aes(ymin = lower_ci, ymax = upper_ci),
    width = 0.2,
    color = "black"
  ) +
  geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
  labs(title = "GBS carriage by IMD quintile", x = "IMD quintile", y = "GBS carriage (%)") +
   theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", size = 0.5),
    axis.text.x = element_text(size = 12),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 12),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 14,
                                margin = margin(b = 20)),  # Increase size of x-axis title
    axis.title.y = element_text(size = 14,   # Increase size of y-axis title
                                margin = margin(r = 10)),   
    plot.title = element_text(size = 14, hjust= 0.5)
  )

 
# save plot
ggsave("Charts_carriage/gbs_carr_imd.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)


```

Estimate carriage by age group

```{r carriage by age, include=TRUE}

gbscarr_age <- gbs.carr.byage.fun(linelist)

write.csv(gbscarr_age, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_age.csv")

# repeat but restrict to each ethnicity in turn
ll_black <- linelist %>% filter(hes_ethnicity=="Black" | hes_ethnicity=="Black_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_black)

ll_asian <- linelist %>% filter(hes_ethnicity=="Asian" | hes_ethnicity=="Asian_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_asian)


ll_mixed <- linelist %>% filter(hes_ethnicity=="Mixed" | hes_ethnicity=="Mixed_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_mixed)


ll_other <- linelist %>% filter(hes_ethnicity=="Other" | hes_ethnicity=="Other_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_other)

ll_white <- linelist %>% filter(hes_ethnicity=="White" | hes_ethnicity=="White_fromLIMS")
gbscarr_age <- gbs.carr.byage.fun(ll_white)
# plot % carriage by age group in bar chart

gbscarr_forplot <- gbscarr_age %>% filter(!agegroup %in% c("10-14","50-54","55-59","60-64") & !is.na(agegroup))

# Create a bar graph of gbs carriage by age group
plot <- ggplot(gbscarr_forplot, aes(x = agegroup, y = carriage_byage)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
    geom_errorbar(
    aes(ymin = lower_ci, ymax = upper_ci),
    width = 0.2,
    color = "black"
  ) +
  geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
  labs(title = "GBS carriage by age group", x = "Age group", y = "GBS carriage (%)") +
   theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", size = 0.5),
    axis.text.x = element_text(size = 12),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 12),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 14,
                                margin = margin(b = 20)),  # Increase size of x-axis title
    axis.title.y = element_text(size = 14,   # Increase size of y-axis title
                                margin = margin(r = 10)),   
    plot.title = element_text(size = 14, hjust= 0.5)
  )

 
# save plot
ggsave("Charts_carriage/gbs_carr_agegroup.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

# Create a bar graph of totals by age group

gbstest_age <- gbscarr_age %>% select(-c("carriage_byage","total_carriage","lower_ci","upper_ci")) %>% filter(!is.na(agegroup)) %>% mutate(agegroup = case_when(
                          agegroup %in% c("40-44","45-49") ~ "40-49",
                          agegroup %in% c("50-54","60-64") ~ "50+",
                          TRUE ~ agegroup
                        )) %>% group_by(agegroup) %>% summarize(across(everything(), sum))

plot <- ggplot(gbstest_age, aes(x = agegroup, y = pct)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(title = "% total ECM tests by age group", x = "Age group", y = "% total") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", size = 0.5),
    axis.text.x = element_text(size = 12),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 12),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 14,
                                margin = margin(b = 20)),  # Increase size of x-axis title
    axis.title.y = element_text(size = 14,   # Increase size of y-axis title
                                margin = margin(r = 10)),   
    plot.title = element_text(size = 14, hjust= 0.5)
  )

# # save plot
ggsave("Charts_carriage/totals_agegroup.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

# saving age dist by eth plots
# save plot
ggsave("Charts_carriage/agegroup_black.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_asian.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_mixed.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_other.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)
ggsave("Charts_carriage/agegroup_white.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

```

Estimate carriage by site

```{r carriage by site, include=T}

gbscarr_site <- gbs.carr.bysite.fun(linelist) 


# Create a bar graph of gbs carriage by site
plot <- ggplot(gbscarr_site, aes(x = site, y = carriage_bysite)) +
   geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  geom_errorbar(
    aes(ymin = lower_ci, ymax = upper_ci),
    width = 0.2,
    color = "black"
  ) +
  geom_hline(yintercept = unique(gbscarr_forplot$total_carriage), linetype = "dashed", color = "red", size = 1) +
  labs(title = "GBS carriage (%) by ECM site", x = "Site", y = "%") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", size = 0.5),
    axis.text.x = element_text(size = 12),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 12),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 14,
                                margin = margin(b = 20)),  # Increase size of x-axis title
    axis.title.y = element_text(size = 14,   # Increase size of y-axis title
                                margin = margin(r = 10)),   
    plot.title = element_text(size = 14, hjust= 0.5)
  )

# save plot
ggsave("Charts_carriage/gbs_carr_sites.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

# conf int for overall GBS carr estimate
total_carr <- gbscarr_site %>% select(c("total","gbspos")) %>% summarize(across(everything(), sum)) %>% 
  rowwise() %>% 
    mutate(
      conf = list(binom.confint(gbspos, total, methods = "exact")),
      lower_ci = conf[[1, "lower"]] * 100,
      upper_ci = conf[[1, "upper"]] * 100
    ) %>%
    select(-conf) 

```

Estimate carriage by month of ECM implementation, for all sites combined

```{r carriage by month, include=TRUE}

gbscarr_month <- gbs.carr.byym.fun(linelist)
write.csv(gbscarr_month, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_byym.csv")

 gbscarr_month$month_year <- ymd(paste0(gbscarr_month$month_year, "-01"))  # Append day to create a complete date

# Create a bar graph of gbs carriage by age group
plot <- ggplot(gbscarr_month, aes(x = ym, y = carriage_byym)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(title = "GBS carriage (%) by month and year (all 4 sites)", x = "month and year", y = "%") +
  theme_minimal() +
      theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1)) +
      scale_x_date(date_labels = "%Y-%m", date_breaks = "1 month") +  # Format x-axis
  theme(
    axis.text.x = element_text(size = 14),  # Increase size of x-axis labels
    axis.text.y = element_text(size = 14),  # Increase size of y-axis labels
    axis.title.x = element_text(size = 16),  # Increase size of x-axis title
    axis.title.y = element_text(size = 16)   # Increase size of y-axis title
  )

# save plot
ggsave("Charts_carriage/gbscarr_bymonth.jpeg", plot = plot, device = "jpeg", width = 8, height = 6, dpi = 300)

```

Estimate carriage by year of ECM implementation, for all sites combined

```{r carriage by year, include=TRUE}

gbs_byyear <- gbs.carr.byyear.fun(linelist)
write.csv(gbs_byyear, "Data_carriage/ECM_carriage/Carriage_estimates/gbscarr_byyear.csv")

```

Produce summaries for flow chart for data linkage support request

```{r flow chart summaries}

# total unique PIDs per site
# Get unique site names
unique_sites <- unique(linelist$site)

# Use lapply to calculate counts for each site
site_counts <- lapply(unique_sites, function(site) sum(linelist$site == site))

# Combine results into a named vector or data frame for clarity
names(site_counts) <- unique_sites
site_counts <- unlist(site_counts)

# Print the results
print(site_counts)

length(unique(linelist$pid_str))

# date range by site
range(linelist$year, na.rm=T)
range(linelist$specdate, na.rm=T)

```

Logistic regression - association between maternal ethnicity and GBS carriage, controlling for age and deprivation

```{r logit reg, include=T}

# ///////////////////
# model with ethnicity as five categories
model_data <- linelist %>% filter(!is.na(hes_ethnicity)) %>% mutate(gbs_car = case_when(species=="Group B Streptococcus" ~ 1,
                                                      species=="GBS not isolated" | species=="No organism recorded" ~ 0),
                                  hes_ethnicity = case_when((hes_ethnicity=="Asian" | hes_ethnicity=="Asian_fromLIMS") ~ "Asian",
                                                             (hes_ethnicity=="Black" | hes_ethnicity=="Black_fromLIMS") ~ "Black",
                                                             (hes_ethnicity=="Mixed" | hes_ethnicity=="Mixed_fromLIMS") ~ "Mixed",
                                                             (hes_ethnicity=="Other" | hes_ethnicity=="Other_fromLIMS") ~ "Other",
                                                             (hes_ethnicity=="White" | hes_ethnicity=="White_fromLIMS") ~ "White"),
                                  hes_ethnicity = as.factor(hes_ethnicity)) %>% filter(agegroup!="not known") %>% filter(!is.na(age)) %>% mutate(z=1,
                        agegroup = cut(age, 
                                       breaks = seq(0, 100, by = 5),  # Specify the breaks for the age bands
                                       right = FALSE,  # Include the left endpoint, exclude the right
                                       labels = paste(seq(0, 95, by = 5), seq(4, 99, by = 5), sep = "-")),
                        agegroup = fct_collapse(agegroup,
                                                "0-14" = c("0-4","5-9","10-14"),
                                                "40+" = c("40-44","40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84","85-89","90-94","95-99"))) %>% select(year,agegroup,imd_quintile,hes_ethnicity,gbs_car) %>% filter(!is.na(hes_ethnicity)) 
# %>% filter(!is.na(imd_quintile))
model_data <- imd.fac.label.fun(model_data)
                          
model_data$imd_quintile <- as.factor(model_data$imd_quintile)
model_data$hes_ethnicity <- relevel(model_data$hes_ethnicity, ref = "White")
model_data$imd_quintile <- relevel(model_data$imd_quintile, ref = "20% least deprived")
model_data$agegroup <- relevel(model_data$agegroup, ref = "20-24")
model_data$gbs_car <- as.factor(model_data$gbs_car) 
model_data <- droplevels(model_data)

successes <- sum(as.numeric(model_data$gbs_car == 1))
trials <- nrow(model_data)
result <- prop.test(successes, trials, conf.level = 0.95)

                                       
model <- glm(gbs_car ~ hes_ethnicity + agegroup + imd_quintile,
             data = model_data,
             family = binomial(link = "logit"))

sapply(model_data, function(x) if(is.factor(x)) levels(x))

# Summary of the model to see coefficients and statistics
  summary(model)
  
  # Exponentiate the coefficients to get odds ratios
  exp(coef(model))
  
  model_out <- tidy(model, exponentiate = TRUE, conf.int = TRUE)
  
write.csv(model_out, "Data_carriage/Model/multilog.csv", row.names=F)


# ///////////////////////
# rerun model with ethnicity as binary (white vs any non-white ethnicity)

model_data_binary <- ethn.binary.fac.fun(model_data)

model <- glm(gbs_car ~ hes_ethnicity + agegroup + imd_quintile + year,
             data = model_data_binary,
             family = binomial(link = "logit"))

sapply(model_data, function(x) if(is.factor(x)) levels(x))

# Summary of the model to see coefficients and statistics
  summary(model)
  
  # Exponentiate the coefficients to get odds ratios
  exp(coef(model))
  
  model_out <- tidy(model, exponentiate = TRUE, conf.int = TRUE)
  
    
write.csv(model_out, "Data_carriage/Model/multilog_ethn_binary.csv", row.names=F)

```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.