library(tidyverse)
library(readxl)
library(ggplot2)
library(DT)

Data Read in

The single cause of TB death

single <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/Single Cause of Death, 1999-2020.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
single = single[,2:7] %>% 
  slice(-c(nrow(single), nrow(single)-1)) %>% 
# not too sure about the suppressed and unreliable or not applicable
  mutate(across(where(is.character), ~case_when(
    . == "Suppressed" ~ "5",
    . == "Unreliable" ~ "0.008",
    TRUE ~ as.character(.)
  ))) %>% 
   
  mutate(across(where(is.character), ~na_if(., "Applicable"))) %>%
  mutate(across(where(is.character), ~na_if(., "Not"))) 

colnames(single) <- c("age_group", "death","population", "rate", "low95" , "up95")

single = single %>% 
  mutate(across(c(death, population, rate, low95, up95), 
                ~ as.numeric(as.character(.)))) %>% 
  mutate(rate = ifelse(age_group %in% c("1", "5-9", "10-14"), death / population, rate)) 

sin85_plus <- single %>%
  filter(age_group %in% c("85-89", "90-94", "95-99", "100+")) %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE))


sin85_plus$age_group <- "85+"

single <- bind_rows(single, sin85_plus)

# Remove the old rows for the age groups that have been combined
single <- single %>%
  filter(!age_group %in% c("85-89", "90-94", "95-99", "100+", "NS"))


datatable(
data = single,
caption = "Table 1: Single cause TB mortality",
filter = "top",
style = "bootstrap4"
)

The multiple cause of TB death

multiple <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/Multiple Cause of Death, 1999-2020.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
multiple = multiple[,2:7] %>% 
    slice(-c(nrow(multiple), nrow(multiple)-1)) %>% 
# not too sure about the suppressed and unreliable or not applicable
  mutate(across(where(is.character), ~case_when(
    . == "Suppressed" ~ "5",
    . == "Unreliable" ~ "0.008",
    TRUE ~ as.character(.)
  ))) %>% 
   
  mutate(across(where(is.character), ~na_if(., "Applicable"))) %>%
  mutate(across(where(is.character), ~na_if(., "Not"))) 

colnames(multiple) <- c("age_group", "death","population", "rate", "low95" , "up95")

multiple = multiple %>% 
  mutate(across(c(death, population, rate, low95, up95), 
                ~ as.numeric(as.character(.)))) %>% 
  mutate(rate = ifelse(age_group %in% c("1", "5-9", "10-14"), death / population, rate)) 

mul85_plus <- multiple %>%
  filter(age_group %in% c("85-89", "90-94", "95-99", "100+")) %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE))


mul85_plus$age_group <- "85+"

multiple <- bind_rows(multiple, mul85_plus)

# Remove the old rows for the age groups that have been combined
multiple <- multiple %>%
  filter(!age_group %in% c("85-89", "90-94", "95-99", "100+", "NS"))


datatable(
data = multiple,
caption = "Table 2: Multiple cause TB mortality",
filter = "top",
style = "bootstrap4"
)

TB cases

cases <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/OTIS TB Data 1993-2022.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
cases = cases[,2:4] %>% 
  slice(-c(nrow(cases), nrow(cases)-1))
colnames(cases) <- c("age_group", "case","perc_of_total")
datatable(
data = cases,
caption = "Table 2: TB cases",
filter = "top",
style = "bootstrap4"
)

It seems the age_group is not matched, try to aligned with the TB cases

age_group_map <- c("1" = "1","1-4" = "1-4", "5-9" = "5-14", "10-14" = "5-14", 
                   "15-19" = "15-24", "20-24" = "15-24", "25-29" = "25-34", 
                   "30-34" = "25-34", "35-39" = "35-44", "40-44" = "35-44", 
                   "45-49" = "45-54", "50-54" = "45-54", "55-59" = "55-64", 
                   "60-64" = "55-64", "65-69" = "65-74", "70-74" = "65-74", 
                   "75-79" = "75-84", "80-84" = "75-84", "85+" = "85+")

single$new_age_group <- age_group_map[single$age_group]

# Group by the new age group and summarise the cases
single_grouped <- single %>%
  group_by(new_age_group) %>%
  summarise(death = sum(death, na.rm = TRUE), 
            population = sum(population, na.rm = TRUE),.groups = 'drop') %>% 
  mutate(rate= death/population*100000)

single$death[single$new_age_group == "5-14"] <- 11

multiple$new_age_group <- age_group_map[multiple$age_group]
multiple$death[multiple$new_age_group == "5-14"] <- 15
# Group by the new age group and summarise the cases
multiple_grouped <- multiple %>%
  group_by(new_age_group) %>%
  summarise(death = sum(death, na.rm = TRUE), 
            population = sum(population, na.rm = TRUE),.groups = 'drop') %>% 
  mutate(rate= death/population*100000)

Base case mortality rate

bmor1 <- read_excel('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/base life table 2019.xlsx', sheet = 1) %>% 
  janitor::clean_names()
bmor2 <- read_excel('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/base life table 2019.xlsx', sheet = 2) %>% 
  janitor::clean_names()
# calculate the number of death
bmor2$death <- c(diff(bmor2$total) * -1, NA)
# calculate the mortality rate

bmor2_plus <- bmor2 %>%
  filter(age %in% c("85", "90", "95", "100")) %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE))


bmor2_plus$age <- 86

bmor2<- bind_rows(bmor2, bmor2_plus)

# Remove the old rows for the age groups that have been combined
bmor2 <- bmor2 %>%
  filter(!age %in% c("85", "90", "95", "100"))


age_group_map1 <- c("0" = "1","1" = "1-4", "5" = "5-14", "10" = "5-14", 
                   "15" = "15-24", "20" = "15-24", "25" = "25-34", 
                   "30" = "25-34", "35" = "35-44", "40" = "35-44", 
                   "45" = "45-54", "50" = "45-54", "55" = "55-64", 
                   "60" = "55-64", "65" = "65-74", "70" = "65-74", 
                   "75" = "75-84", "80" = "75-84", "86" = "85+")

bmor2$new_age_group <- age_group_map1[as.character(bmor2$age)]

# Group by the new age group and summarise the cases
bmor2_grouped <- bmor2%>%
  group_by(new_age_group) %>%
  summarise(death = sum(death, na.rm = TRUE), 
            total = sum(total, na.rm = TRUE),.groups = 'drop') %>% 
  mutate(morrate= death/total)

datatable(
data = cases,
caption = "Table 4: Base mortality rate",
filter = "top",
style = "bootstrap4"
)

Model fitting

Single cause

Fitted with TB mortality rate

full = single_grouped %>% 
  rename(sdeath = death,
                spopulation = population,
                srate= rate) %>% 
  left_join(multiple_grouped, by = c("new_age_group" = "new_age_group")) %>% 
  left_join(cases, by = c("new_age_group" = "age_group")) %>% 
  left_join(bmor2_grouped, by = c("new_age_group" = "new_age_group")) %>% 
    filter(new_age_group!="1") %>% 
    mutate(new_age_group= as.factor(new_age_group),
        case= as.numeric(case))

  
  full$new_age_group <- factor(full$new_age_group, levels = c( "1-4", "5-14", "15-24", "25-34", 
  "35-44", "45-54", "55-64", "65-74", 
  "75-84", "85+"
)) 
  
  full = full %>% arrange(new_age_group)

datatable(
data = full,
caption = "Table 5:  TB mortality full table",
filter = "top",
style = "bootstrap4"
)
singlm = glm(sdeath ~ new_age_group + offset(log(case* morrate)), family="poisson", data= full)

summary(singlm)
## 
## Call:
## glm(formula = sdeath ~ new_age_group + offset(log(case * morrate)), 
##     family = "poisson", data = full)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.8409     0.3015   6.106 1.02e-09 ***
## new_age_group5-14    0.3182     0.4369   0.728 0.466440    
## new_age_group15-24  -0.8583     0.3343  -2.568 0.010241 *  
## new_age_group25-34  -1.1587     0.3155  -3.673 0.000240 ***
## new_age_group35-44  -0.9113     0.3089  -2.950 0.003178 ** 
## new_age_group45-54  -0.8506     0.3046  -2.792 0.005235 ** 
## new_age_group55-64  -1.0976     0.3032  -3.620 0.000294 ***
## new_age_group65-74  -1.4798     0.3031  -4.883 1.05e-06 ***
## new_age_group75-84  -2.0287     0.3029  -6.698 2.11e-11 ***
## new_age_group85+    -2.3571     0.3031  -7.776 7.47e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1.5364e+03  on 9  degrees of freedom
## Residual deviance: 2.0739e-13  on 0  degrees of freedom
## AIC: 91.32
## 
## Number of Fisher Scoring iterations: 3

The fitted vs observe

fitted_values <- fitted(singlm)

plot(full$sdeath, type = 'p', main = "Original vs. Fitted Data (single cause)", xlab = "Age Group", ylab = "TB mortality rate", xlim = c(1, length(levels(full$new_age_group))), xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')  


axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)

Log scale

plot(log(full$sdeath), type = 'p', main = "Original vs. Fitted Data (single cause)", xlab = "Age Group", ylab = "TB mortality rate(log scale)",  xaxt = 'n')
points(log(fitted_values), type = 'l', col = 'red')  
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)

legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)

Risk Ratio

exp_coefs <- exp(coef(singlm))*full$morrate


plot(log(exp_coefs), type = 'b', main = "TB mortality rate ratio (single cause)", xlab = "Age Group", ylab = "TB mortality rate ratio (log scale)",   xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)

Multiple cause

Fitted with multiple cause TB mortality rate

multglm = glm(death.x ~ new_age_group + offset(log(case* morrate)), family="poisson", data= full)

summary(multglm)
## 
## Call:
## glm(formula = death.x ~ new_age_group + offset(log(case * morrate)), 
##     family = "poisson", data = full)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          2.1511     0.2582   8.331  < 2e-16 ***
## new_age_group5-14    1.1067     0.3162   3.500 0.000466 ***
## new_age_group15-24  -0.7630     0.2838  -2.688 0.007184 ** 
## new_age_group25-34  -0.9546     0.2680  -3.562 0.000368 ***
## new_age_group35-44  -0.6874     0.2633  -2.611 0.009033 ** 
## new_age_group45-54  -0.5781     0.2602  -2.221 0.026331 *  
## new_age_group55-64  -0.7675     0.2592  -2.961 0.003069 ** 
## new_age_group65-74  -1.1472     0.2591  -4.427 9.57e-06 ***
## new_age_group75-84  -1.7313     0.2591  -6.683 2.34e-11 ***
## new_age_group85+    -2.0861     0.2592  -8.047 8.51e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2.9063e+03  on 9  degrees of freedom
## Residual deviance: 4.4453e-13  on 0  degrees of freedom
## AIC: 97.22
## 
## Number of Fisher Scoring iterations: 3

The observe vs fitted

fitted_mult <- fitted(multglm)

plot(full$death.x, type = 'p', main = "Original vs. Fitted Data(multi cause)", xlab = "Age Group", ylab = "TB mortality rate",  xaxt = 'n')
points(fitted_mult, type = 'l', col = 'red')  

axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)

Log scale

plot(log(full$death.x), type = 'p', main = "Original vs. Fitted Data (multi cause)", xlab = "Age Group", ylab = "TB mortality rate(log scale)",  xaxt = 'n')
points(log(fitted_mult), type = 'l', col = 'red')  

axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)

Risk ratio

mult_exp_coefs <- exp(coef(multglm))*full$morrate

plot(log(mult_exp_coefs), type = 'b', main = "TB mortality rate ratio(multi cause)", xlab = "Age Group", ylab = "TB mortality rate ratio(log scale)",  xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)

The mean of single and multiple

mean_coef= (mult_exp_coefs+exp_coefs)/2
plot(log(mean_coef), type = 'b', main = "TB mortality rate ratio(multi cause)", xlab = "Age Group", ylab = "TB mortality rate ratio(log scale)",  xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)

Binomial model

The way of fitted in binomial:

As a factor: ‘success’ is interpreted as the factor not having the first level (and hence usually of having the second level). — this one certainly not working

As a numerical vector with values between 0 and 1, interpreted as the proportion of successful cases (with the total number of cases given by the weights). –will try

As a two-column integer matrix: the first column gives the number of successes and the second the number of failures. –will try

singe cause

First try the numerical vector + offset as rate

full$sratecase= full$sdeath/full$spopulation
full$mratecase= full$death.x/full$spopulation
full = full %>% 
  filter(new_age_group!= "85+")

bio1sin = glm(sratecase ~ new_age_group, family = binomial(link = "cloglog"), data = full, weights = spopulation)
summary(bio1sin)
## 
## Call:
## glm(formula = sratecase ~ new_age_group, family = binomial(link = "cloglog"), 
##     data = full, weights = spopulation)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -16.4926     0.3015 -54.700  < 2e-16 ***
## new_age_group5-14   -1.0387     0.4369  -2.377   0.0174 *  
## new_age_group15-24   0.4717     0.3343   1.411   0.1582    
## new_age_group25-34   1.3500     0.3155   4.279 1.87e-05 ***
## new_age_group35-44   2.0639     0.3089   6.681 2.37e-11 ***
## new_age_group45-54   2.8738     0.3046   9.433  < 2e-16 ***
## new_age_group55-64   3.5794     0.3032  11.806  < 2e-16 ***
## new_age_group65-74   4.0620     0.3031  13.403  < 2e-16 ***
## new_age_group75-84   4.8351     0.3029  15.964  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  6.6983e+03  on 8  degrees of freedom
## Residual deviance: -3.7923e-11  on 0  degrees of freedom
## AIC: 80.546
## 
## Number of Fisher Scoring iterations: 4
fitted_values <- fitted(bio1sin)
plot(full$sratecase, type = 'p', main = "Original vs. Fitted Data(rate)", xlab = "Age Group", ylab = "TB mortality rate", xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')  

axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)

Now try the 2 column matrix

full$ssurvive = full$spopulation-full$sdeath
bio2sin <- glm(cbind(sdeath, ssurvive) ~ new_age_group, family = binomial(link = "cloglog"), data = full)
summary(bio2sin)
## 
## Call:
## glm(formula = cbind(sdeath, ssurvive) ~ new_age_group, family = binomial(link = "cloglog"), 
##     data = full)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -16.4926     0.3015 -54.700  < 2e-16 ***
## new_age_group5-14   -1.0387     0.4369  -2.377   0.0174 *  
## new_age_group15-24   0.4717     0.3343   1.411   0.1582    
## new_age_group25-34   1.3500     0.3155   4.279 1.87e-05 ***
## new_age_group35-44   2.0639     0.3089   6.681 2.37e-11 ***
## new_age_group45-54   2.8738     0.3046   9.433  < 2e-16 ***
## new_age_group55-64   3.5794     0.3032  11.806  < 2e-16 ***
## new_age_group65-74   4.0620     0.3031  13.403  < 2e-16 ***
## new_age_group75-84   4.8351     0.3029  15.964  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  6.6983e+03  on 8  degrees of freedom
## Residual deviance: -3.7923e-11  on 0  degrees of freedom
## AIC: 80.546
## 
## Number of Fisher Scoring iterations: 4
fitted_values <- fitted(bio2sin)
plot(full$sratecase, type = 'p', main = "Original vs. Fitted Data(rate)", xlab = "Age Group", ylab = "TB mortality rate", xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')  

axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)