library(tidyverse)
library(readxl)
library(ggplot2)
library(DT)
single <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/Single Cause of Death, 1999-2020.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
single = single[,2:7] %>%
slice(-c(nrow(single), nrow(single)-1)) %>%
# not too sure about the suppressed and unreliable or not applicable
mutate(across(where(is.character), ~case_when(
. == "Suppressed" ~ "5",
. == "Unreliable" ~ "0.008",
TRUE ~ as.character(.)
))) %>%
mutate(across(where(is.character), ~na_if(., "Applicable"))) %>%
mutate(across(where(is.character), ~na_if(., "Not")))
colnames(single) <- c("age_group", "death","population", "rate", "low95" , "up95")
single = single %>%
mutate(across(c(death, population, rate, low95, up95),
~ as.numeric(as.character(.)))) %>%
mutate(rate = ifelse(age_group %in% c("1", "5-9", "10-14"), death / population, rate))
sin85_plus <- single %>%
filter(age_group %in% c("85-89", "90-94", "95-99", "100+")) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
sin85_plus$age_group <- "85+"
single <- bind_rows(single, sin85_plus)
# Remove the old rows for the age groups that have been combined
single <- single %>%
filter(!age_group %in% c("85-89", "90-94", "95-99", "100+", "NS"))
datatable(
data = single,
caption = "Table 1: Single cause TB mortality",
filter = "top",
style = "bootstrap4"
)
multiple <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/Multiple Cause of Death, 1999-2020.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
multiple = multiple[,2:7] %>%
slice(-c(nrow(multiple), nrow(multiple)-1)) %>%
# not too sure about the suppressed and unreliable or not applicable
mutate(across(where(is.character), ~case_when(
. == "Suppressed" ~ "5",
. == "Unreliable" ~ "0.008",
TRUE ~ as.character(.)
))) %>%
mutate(across(where(is.character), ~na_if(., "Applicable"))) %>%
mutate(across(where(is.character), ~na_if(., "Not")))
colnames(multiple) <- c("age_group", "death","population", "rate", "low95" , "up95")
multiple = multiple %>%
mutate(across(c(death, population, rate, low95, up95),
~ as.numeric(as.character(.)))) %>%
mutate(rate = ifelse(age_group %in% c("1", "5-9", "10-14"), death / population, rate))
mul85_plus <- multiple %>%
filter(age_group %in% c("85-89", "90-94", "95-99", "100+")) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
mul85_plus$age_group <- "85+"
multiple <- bind_rows(multiple, mul85_plus)
# Remove the old rows for the age groups that have been combined
multiple <- multiple %>%
filter(!age_group %in% c("85-89", "90-94", "95-99", "100+", "NS"))
datatable(
data = multiple,
caption = "Table 2: Multiple cause TB mortality",
filter = "top",
style = "bootstrap4"
)
cases <- read.table('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/OTIS TB Data 1993-2022.txt', header = TRUE, sep = "", na.strings = "NA", fill = TRUE,stringsAsFactors = FALSE)
cases = cases[,2:4] %>%
slice(-c(nrow(cases), nrow(cases)-1))
colnames(cases) <- c("age_group", "case","perc_of_total")
datatable(
data = cases,
caption = "Table 2: TB cases",
filter = "top",
style = "bootstrap4"
)
It seems the age_group is not matched, try to aligned with the TB cases
age_group_map <- c("1" = "1","1-4" = "1-4", "5-9" = "5-14", "10-14" = "5-14",
"15-19" = "15-24", "20-24" = "15-24", "25-29" = "25-34",
"30-34" = "25-34", "35-39" = "35-44", "40-44" = "35-44",
"45-49" = "45-54", "50-54" = "45-54", "55-59" = "55-64",
"60-64" = "55-64", "65-69" = "65-74", "70-74" = "65-74",
"75-79" = "75-84", "80-84" = "75-84", "85+" = "85+")
single$new_age_group <- age_group_map[single$age_group]
# Group by the new age group and summarise the cases
single_grouped <- single %>%
group_by(new_age_group) %>%
summarise(death = sum(death, na.rm = TRUE),
population = sum(population, na.rm = TRUE),.groups = 'drop') %>%
mutate(rate= death/population*100000)
single$death[single$new_age_group == "5-14"] <- 11
multiple$new_age_group <- age_group_map[multiple$age_group]
multiple$death[multiple$new_age_group == "5-14"] <- 15
# Group by the new age group and summarise the cases
multiple_grouped <- multiple %>%
group_by(new_age_group) %>%
summarise(death = sum(death, na.rm = TRUE),
population = sum(population, na.rm = TRUE),.groups = 'drop') %>%
mutate(rate= death/population*100000)
bmor1 <- read_excel('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/base life table 2019.xlsx', sheet = 1) %>%
janitor::clean_names()
bmor2 <- read_excel('/Users/hec442/Library/CloudStorage/OneDrive-HarvardUniversity/harvard on computer/projects/TB calculator/TB code/TB-calculator/data/base life table 2019.xlsx', sheet = 2) %>%
janitor::clean_names()
# calculate the number of death
bmor2$death <- c(diff(bmor2$total) * -1, NA)
# calculate the mortality rate
bmor2_plus <- bmor2 %>%
filter(age %in% c("85", "90", "95", "100")) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE))
bmor2_plus$age <- 86
bmor2<- bind_rows(bmor2, bmor2_plus)
# Remove the old rows for the age groups that have been combined
bmor2 <- bmor2 %>%
filter(!age %in% c("85", "90", "95", "100"))
age_group_map1 <- c("0" = "1","1" = "1-4", "5" = "5-14", "10" = "5-14",
"15" = "15-24", "20" = "15-24", "25" = "25-34",
"30" = "25-34", "35" = "35-44", "40" = "35-44",
"45" = "45-54", "50" = "45-54", "55" = "55-64",
"60" = "55-64", "65" = "65-74", "70" = "65-74",
"75" = "75-84", "80" = "75-84", "86" = "85+")
bmor2$new_age_group <- age_group_map1[as.character(bmor2$age)]
# Group by the new age group and summarise the cases
bmor2_grouped <- bmor2%>%
group_by(new_age_group) %>%
summarise(death = sum(death, na.rm = TRUE),
total = sum(total, na.rm = TRUE),.groups = 'drop') %>%
mutate(morrate= death/total)
datatable(
data = cases,
caption = "Table 4: Base mortality rate",
filter = "top",
style = "bootstrap4"
)
full = single_grouped %>%
rename(sdeath = death,
spopulation = population,
srate= rate) %>%
left_join(multiple_grouped, by = c("new_age_group" = "new_age_group")) %>%
left_join(cases, by = c("new_age_group" = "age_group")) %>%
left_join(bmor2_grouped, by = c("new_age_group" = "new_age_group")) %>%
filter(new_age_group!="1") %>%
mutate(new_age_group= as.factor(new_age_group),
case= as.numeric(case))
full$new_age_group <- factor(full$new_age_group, levels = c( "1-4", "5-14", "15-24", "25-34",
"35-44", "45-54", "55-64", "65-74",
"75-84", "85+"
))
full = full %>% arrange(new_age_group)
datatable(
data = full,
caption = "Table 5: TB mortality full table",
filter = "top",
style = "bootstrap4"
)
singlm = glm(sdeath ~ new_age_group + offset(log(case* morrate)), family="poisson", data= full)
summary(singlm)
##
## Call:
## glm(formula = sdeath ~ new_age_group + offset(log(case * morrate)),
## family = "poisson", data = full)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8409 0.3015 6.106 1.02e-09 ***
## new_age_group5-14 0.3182 0.4369 0.728 0.466440
## new_age_group15-24 -0.8583 0.3343 -2.568 0.010241 *
## new_age_group25-34 -1.1587 0.3155 -3.673 0.000240 ***
## new_age_group35-44 -0.9113 0.3089 -2.950 0.003178 **
## new_age_group45-54 -0.8506 0.3046 -2.792 0.005235 **
## new_age_group55-64 -1.0976 0.3032 -3.620 0.000294 ***
## new_age_group65-74 -1.4798 0.3031 -4.883 1.05e-06 ***
## new_age_group75-84 -2.0287 0.3029 -6.698 2.11e-11 ***
## new_age_group85+ -2.3571 0.3031 -7.776 7.47e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1.5364e+03 on 9 degrees of freedom
## Residual deviance: 2.0739e-13 on 0 degrees of freedom
## AIC: 91.32
##
## Number of Fisher Scoring iterations: 3
The fitted vs observe
fitted_values <- fitted(singlm)
plot(full$sdeath, type = 'p', main = "Original vs. Fitted Data (single cause)", xlab = "Age Group", ylab = "TB mortality rate", xlim = c(1, length(levels(full$new_age_group))), xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)
Log scale
plot(log(full$sdeath), type = 'p', main = "Original vs. Fitted Data (single cause)", xlab = "Age Group", ylab = "TB mortality rate(log scale)", xaxt = 'n')
points(log(fitted_values), type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)
Risk Ratio
exp_coefs <- exp(coef(singlm))*full$morrate
plot(log(exp_coefs), type = 'b', main = "TB mortality rate ratio (single cause)", xlab = "Age Group", ylab = "TB mortality rate ratio (log scale)", xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
multglm = glm(death.x ~ new_age_group + offset(log(case* morrate)), family="poisson", data= full)
summary(multglm)
##
## Call:
## glm(formula = death.x ~ new_age_group + offset(log(case * morrate)),
## family = "poisson", data = full)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1511 0.2582 8.331 < 2e-16 ***
## new_age_group5-14 1.1067 0.3162 3.500 0.000466 ***
## new_age_group15-24 -0.7630 0.2838 -2.688 0.007184 **
## new_age_group25-34 -0.9546 0.2680 -3.562 0.000368 ***
## new_age_group35-44 -0.6874 0.2633 -2.611 0.009033 **
## new_age_group45-54 -0.5781 0.2602 -2.221 0.026331 *
## new_age_group55-64 -0.7675 0.2592 -2.961 0.003069 **
## new_age_group65-74 -1.1472 0.2591 -4.427 9.57e-06 ***
## new_age_group75-84 -1.7313 0.2591 -6.683 2.34e-11 ***
## new_age_group85+ -2.0861 0.2592 -8.047 8.51e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2.9063e+03 on 9 degrees of freedom
## Residual deviance: 4.4453e-13 on 0 degrees of freedom
## AIC: 97.22
##
## Number of Fisher Scoring iterations: 3
The observe vs fitted
fitted_mult <- fitted(multglm)
plot(full$death.x, type = 'p', main = "Original vs. Fitted Data(multi cause)", xlab = "Age Group", ylab = "TB mortality rate", xaxt = 'n')
points(fitted_mult, type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)
Log scale
plot(log(full$death.x), type = 'p', main = "Original vs. Fitted Data (multi cause)", xlab = "Age Group", ylab = "TB mortality rate(log scale)", xaxt = 'n')
points(log(fitted_mult), type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)
Risk ratio
mult_exp_coefs <- exp(coef(multglm))*full$morrate
plot(log(mult_exp_coefs), type = 'b', main = "TB mortality rate ratio(multi cause)", xlab = "Age Group", ylab = "TB mortality rate ratio(log scale)", xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
mean_coef= (mult_exp_coefs+exp_coefs)/2
plot(log(mean_coef), type = 'b', main = "TB mortality rate ratio(multi cause)", xlab = "Age Group", ylab = "TB mortality rate ratio(log scale)", xaxt = 'n')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
The way of fitted in binomial:
As a factor: ‘success’ is interpreted as the factor not having the first level (and hence usually of having the second level). — this one certainly not working
As a numerical vector with values between 0 and 1, interpreted as the proportion of successful cases (with the total number of cases given by the weights). –will try
As a two-column integer matrix: the first column gives the number of successes and the second the number of failures. –will try
First try the numerical vector + offset as rate
full$sratecase= full$sdeath/full$spopulation
full$mratecase= full$death.x/full$spopulation
full = full %>%
filter(new_age_group!= "85+")
bio1sin = glm(sratecase ~ new_age_group, family = binomial(link = "cloglog"), data = full, weights = spopulation)
summary(bio1sin)
##
## Call:
## glm(formula = sratecase ~ new_age_group, family = binomial(link = "cloglog"),
## data = full, weights = spopulation)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.4926 0.3015 -54.700 < 2e-16 ***
## new_age_group5-14 -1.0387 0.4369 -2.377 0.0174 *
## new_age_group15-24 0.4717 0.3343 1.411 0.1582
## new_age_group25-34 1.3500 0.3155 4.279 1.87e-05 ***
## new_age_group35-44 2.0639 0.3089 6.681 2.37e-11 ***
## new_age_group45-54 2.8738 0.3046 9.433 < 2e-16 ***
## new_age_group55-64 3.5794 0.3032 11.806 < 2e-16 ***
## new_age_group65-74 4.0620 0.3031 13.403 < 2e-16 ***
## new_age_group75-84 4.8351 0.3029 15.964 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.6983e+03 on 8 degrees of freedom
## Residual deviance: -3.7923e-11 on 0 degrees of freedom
## AIC: 80.546
##
## Number of Fisher Scoring iterations: 4
fitted_values <- fitted(bio1sin)
plot(full$sratecase, type = 'p', main = "Original vs. Fitted Data(rate)", xlab = "Age Group", ylab = "TB mortality rate", xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)
Now try the 2 column matrix
full$ssurvive = full$spopulation-full$sdeath
bio2sin <- glm(cbind(sdeath, ssurvive) ~ new_age_group, family = binomial(link = "cloglog"), data = full)
summary(bio2sin)
##
## Call:
## glm(formula = cbind(sdeath, ssurvive) ~ new_age_group, family = binomial(link = "cloglog"),
## data = full)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.4926 0.3015 -54.700 < 2e-16 ***
## new_age_group5-14 -1.0387 0.4369 -2.377 0.0174 *
## new_age_group15-24 0.4717 0.3343 1.411 0.1582
## new_age_group25-34 1.3500 0.3155 4.279 1.87e-05 ***
## new_age_group35-44 2.0639 0.3089 6.681 2.37e-11 ***
## new_age_group45-54 2.8738 0.3046 9.433 < 2e-16 ***
## new_age_group55-64 3.5794 0.3032 11.806 < 2e-16 ***
## new_age_group65-74 4.0620 0.3031 13.403 < 2e-16 ***
## new_age_group75-84 4.8351 0.3029 15.964 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.6983e+03 on 8 degrees of freedom
## Residual deviance: -3.7923e-11 on 0 degrees of freedom
## AIC: 80.546
##
## Number of Fisher Scoring iterations: 4
fitted_values <- fitted(bio2sin)
plot(full$sratecase, type = 'p', main = "Original vs. Fitted Data(rate)", xlab = "Age Group", ylab = "TB mortality rate", xaxt = 'n')
points(fitted_values, type = 'l', col = 'red')
axis(1, at = seq_along(levels(full$new_age_group)), labels = levels(full$new_age_group), las = 1)
legend("bottomright", legend=c("Original", "Fitted"), col=c("black", "red"), lty=1:1)