# load data
cleaned_data_3 <- read_excel("cleaned_data_3.xlsx")
# check missing observations of each variable
cleaned_data_3 %>%
sapply(function(x)sum(is.na(x)))
## record_id MTB_load smartt_id
## 0 2 0
## bl_age pretx_sex ptretx_bmi
## 0 0 0
## ses_education_level living_alone_37eb74_v2_v2 smoker_5c21df_v2_v2
## 31 6 6
## alcohol_83d0af_v2_v2 tretx_dm prettx_prevtbtx
## 8 0 0
## late_culture_conversion time_culture_pos pretx_hiv
## 0 0 0
## hiv_status_control hb qol_usual_activity
## 4 4 1
## edu_level ses_income_before INH_fill
## 11 32 3
## FQs_fill resistance_pattern
## 4 4
# code all categories as 1,2,3...
cleaned_data_4 <- cleaned_data_3 %>%
mutate(MTB_load = case_when(
MTB_load == "very low" ~ 1,
MTB_load == "low" ~ 2,
MTB_load == "medium" ~ 3,
MTB_load == "high" ~ 4,
TRUE ~ NA_real_
),
resistance_pattern = case_when(
resistance_pattern == "monoDR" ~ 1,
resistance_pattern == "MDR" ~ 2,
resistance_pattern == "(pre)XDR" ~ 3,
TRUE ~ NA_real_
))
table(cleaned_data_4$MTB_load, cleaned_data_4$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 1 10 11 0
## 2 20 8 0
## 3 16 23 0
## 4 26 53 0
## <NA> 1 1 0
round(prop.table(table(cleaned_data_3$MTB_load, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## high 0.154 0.314 0.000
## low 0.118 0.047 0.000
## medium 0.095 0.136 0.000
## very low 0.059 0.065 0.000
## <NA> 0.006 0.006 0.000
table(cleaned_data_4$alcohol_83d0af_v2_v2, cleaned_data_4$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 1 26 40 0
## 2 22 24 0
## 3 16 19 0
## 4 6 8 0
## <NA> 3 5 0
round(prop.table(table(cleaned_data_3$MTB_load, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## high 0.154 0.314 0.000
## low 0.118 0.047 0.000
## medium 0.095 0.136 0.000
## very low 0.059 0.065 0.000
## <NA> 0.006 0.006 0.000
1 None, 2 Light (once a month), 3 Moderate (once a week), 4 Heavy (daily)
table(cleaned_data_3$ses_education_level, cleaned_data_3$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 1 7 11 0
## 2 38 49 0
## 3 16 17 0
## <NA> 12 19 0
round(prop.table(table(cleaned_data_3$ses_education_level, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## 1 0.041 0.065 0.000
## 2 0.225 0.290 0.000
## 3 0.095 0.101 0.000
## <NA> 0.071 0.112 0.000
1 No secondary education, 2 Some secondary education, 3 Matric or higher
table(cleaned_data_3$edu_level, cleaned_data_3$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 1 2 2 0
## 2 19 15 0
## 3 47 69 0
## 4 2 2 0
## <NA> 3 8 0
round(prop.table(table(cleaned_data_3$edu_level, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## 1 0.012 0.012 0.000
## 2 0.112 0.089 0.000
## 3 0.278 0.408 0.000
## 4 0.012 0.012 0.000
## <NA> 0.018 0.047 0.000
1: no schooling, 2: primary, 3: secondary, 4: tertiary
table(cleaned_data_3$hiv_status_control, cleaned_data_3$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 0 28 32 0
## 1 12 32 0
## 2 16 15 0
## 3 15 15 0
## <NA> 2 2 0
round(prop.table(table(cleaned_data_3$hiv_status_control, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## 0 0.166 0.189 0.000
## 1 0.071 0.189 0.000
## 2 0.095 0.089 0.000
## 3 0.089 0.089 0.000
## <NA> 0.012 0.012 0.000
0 HIV negative, 1 HIV positive, on ART and viral load controlled (Viral load < = 1000), 2 HIV positive, on ART with no viral load control (Viral load >1000), 3 HIV positive, but not on ART
table(cleaned_data_3$resistance_pattern, cleaned_data_3$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## (pre)XDR 6 13 0
## MDR 23 33 0
## monoDR 41 49 0
## <NA> 3 1 0
round(prop.table(table(cleaned_data_3$resistance_pattern, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## (pre)XDR 0.036 0.077 0.000
## MDR 0.136 0.195 0.000
## monoDR 0.243 0.290 0.000
## <NA> 0.018 0.006 0.000
table(cleaned_data_3$qol_usual_activity, cleaned_data_3$late_culture_conversion, useNA = "always")
##
## 0 1 <NA>
## 1 52 48 0
## 2 12 29 0
## 3 6 11 0
## 4 3 5 0
## 5 0 2 0
## <NA> 0 1 0
round(prop.table(table(cleaned_data_3$qol_usual_activity, cleaned_data_3$late_culture_conversion, useNA = "always")),3)
##
## 0 1 <NA>
## 1 0.308 0.284 0.000
## 2 0.071 0.172 0.000
## 3 0.036 0.065 0.000
## 4 0.018 0.030 0.000
## 5 0.000 0.012 0.000
## <NA> 0.000 0.006 0.000
1: I have no problems doing my usual activities, 2: I have slight problems doing my usual activities, 3: I have moderate problems doing my usual activities, 4: I have severe problems doing my usual activities, 5: I am unable to do my usual activities first dataset
# create data for imputation
data_1_pre_impute <- cleaned_data_4 %>%
select(-record_id, -smartt_id, -edu_level, -hiv_status_control, -INH_fill, -FQs_fill )
# percentage of rows containing at least 1 NA
1- nrow(na.omit(data_1_pre_impute))/ nrow(data_1_pre_impute)
## [1] 0.2899408
# => 29% of rows with any NAs
data_1_pre_impute %>% sapply(function(x)sum(is.na(x)))
## MTB_load bl_age pretx_sex
## 2 0 0
## ptretx_bmi ses_education_level living_alone_37eb74_v2_v2
## 0 31 6
## smoker_5c21df_v2_v2 alcohol_83d0af_v2_v2 tretx_dm
## 6 8 0
## prettx_prevtbtx late_culture_conversion time_culture_pos
## 0 0 0
## pretx_hiv hb qol_usual_activity
## 0 4 1
## ses_income_before resistance_pattern
## 32 4
#str(data_1_pre_impute)
#summary(data_1_pre_impute)
Change class for variables before imputation
data_1_pre_impute<- data_1_pre_impute %>%
mutate(
MTB_load = as.factor(MTB_load),
pretx_sex = as.factor(pretx_sex),
ses_education_level = as.factor(ses_education_level),
living_alone_37eb74_v2_v2 = as.factor(living_alone_37eb74_v2_v2),
smoker_5c21df_v2_v2 = as.factor(smoker_5c21df_v2_v2),
alcohol_83d0af_v2_v2 = as.factor(alcohol_83d0af_v2_v2),
tretx_dm = as.factor(tretx_dm),
prettx_prevtbtx = as.factor(prettx_prevtbtx),
late_culture_conversion = as.factor(late_culture_conversion),
pretx_hiv = as.factor(pretx_hiv),
resistance_pattern = as.factor(resistance_pattern),
qol_usual_activity = as.factor(qol_usual_activity)
)
# Check the structure to ensure the changes
str(data_1_pre_impute)
## tibble [169 × 17] (S3: tbl_df/tbl/data.frame)
## $ MTB_load : Factor w/ 4 levels "1","2","3","4": 1 4 1 4 4 3 3 4 4 3 ...
## $ bl_age : num [1:169] 61.4 32.6 59.1 33.1 22 38.8 30.4 44 42.6 68.8 ...
## $ pretx_sex : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 1 2 1 ...
## $ ptretx_bmi : num [1:169] 18.8 18.2 22.8 16.1 16.2 ...
## $ ses_education_level : Factor w/ 3 levels "1","2","3": 1 2 3 NA 2 2 2 NA 2 2 ...
## $ living_alone_37eb74_v2_v2: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ smoker_5c21df_v2_v2 : Factor w/ 2 levels "0","1": 2 2 1 2 1 1 2 1 2 1 ...
## $ alcohol_83d0af_v2_v2 : Factor w/ 4 levels "1","2","3","4": 2 1 1 2 1 2 1 1 1 1 ...
## $ tretx_dm : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ prettx_prevtbtx : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 1 1 1 ...
## $ late_culture_conversion : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 1 2 ...
## $ time_culture_pos : num [1:169] 11 6 15 18 13 21 15 13 21 20 ...
## $ pretx_hiv : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 2 1 1 ...
## $ hb : num [1:169] 9.4 13.5 12.2 7 12 9.2 13.3 11.3 12 10.4 ...
## $ qol_usual_activity : Factor w/ 5 levels "1","2","3","4",..: 1 2 1 4 1 2 2 2 1 4 ...
## $ ses_income_before : num [1:169] 5000 3500 6000 NA 1200 0 600 NA 1800 1850 ...
## $ resistance_pattern : Factor w/ 3 levels "1","2","3": 1 1 2 1 1 2 1 2 1 1 ...
summarise data
#percentage are calculated after excluding NAs
tab1 <- CreateTableOne(data = data_1_pre_impute)
tab1_df <- print(tab1, showAllLevels = T)
##
## level Overall
## n 169
## MTB_load (%) 1 21 (12.6)
## 2 28 (16.8)
## 3 39 (23.4)
## 4 79 (47.3)
## bl_age (mean (SD)) 41.55 (12.77)
## pretx_sex (%) 1 46 (27.2)
## 2 123 (72.8)
## ptretx_bmi (mean (SD)) 18.70 (4.10)
## ses_education_level (%) 1 18 (13.0)
## 2 87 (63.0)
## 3 33 (23.9)
## living_alone_37eb74_v2_v2 (%) 0 144 (88.3)
## 1 19 (11.7)
## smoker_5c21df_v2_v2 (%) 0 78 (47.9)
## 1 85 (52.1)
## alcohol_83d0af_v2_v2 (%) 1 66 (41.0)
## 2 46 (28.6)
## 3 35 (21.7)
## 4 14 ( 8.7)
## tretx_dm (%) 0 157 (92.9)
## 1 12 ( 7.1)
## prettx_prevtbtx (%) 0 97 (57.4)
## 1 72 (42.6)
## late_culture_conversion (%) 0 73 (43.2)
## 1 96 (56.8)
## time_culture_pos (mean (SD)) 16.69 (11.54)
## pretx_hiv (%) 0 60 (35.5)
## 1 109 (64.5)
## hb (mean (SD)) 11.22 (2.38)
## qol_usual_activity (%) 1 100 (59.5)
## 2 41 (24.4)
## 3 17 (10.1)
## 4 8 ( 4.8)
## 5 2 ( 1.2)
## ses_income_before (mean (SD)) 3835.73 (5802.57)
## resistance_pattern (%) 1 90 (54.5)
## 2 56 (33.9)
## 3 19 (11.5)
#summarize, with NAs included
summary(tab1)
##
## ### Summary of continuous variables ###
##
## strata: Overall
## n miss p.miss mean sd median p25 p75 min max skew
## bl_age 169 0 0 42 13 40 32 49 18 75 0.4
## ptretx_bmi 169 0 0 19 4 18 16 20 12 38 1.7
## time_culture_pos 169 0 0 17 12 13 9 21 2 42 1.2
## hb 169 4 2 11 2 11 10 13 4 18 -0.1
## ses_income_before 169 32 19 3836 5803 2600 1350 4500 0 60000 7.1
## kurt
## bl_age -0.37
## ptretx_bmi 4.51
## time_culture_pos 0.33
## hb 0.03
## ses_income_before 65.43
##
## =======================================================================================
##
## ### Summary of categorical variables ###
##
## strata: Overall
## var n miss p.miss level freq percent cum.percent
## MTB_load 169 2 1.2 1 21 12.6 12.6
## 2 28 16.8 29.3
## 3 39 23.4 52.7
## 4 79 47.3 100.0
##
## pretx_sex 169 0 0.0 1 46 27.2 27.2
## 2 123 72.8 100.0
##
## ses_education_level 169 31 18.3 1 18 13.0 13.0
## 2 87 63.0 76.1
## 3 33 23.9 100.0
##
## living_alone_37eb74_v2_v2 169 6 3.6 0 144 88.3 88.3
## 1 19 11.7 100.0
##
## smoker_5c21df_v2_v2 169 6 3.6 0 78 47.9 47.9
## 1 85 52.1 100.0
##
## alcohol_83d0af_v2_v2 169 8 4.7 1 66 41.0 41.0
## 2 46 28.6 69.6
## 3 35 21.7 91.3
## 4 14 8.7 100.0
##
## tretx_dm 169 0 0.0 0 157 92.9 92.9
## 1 12 7.1 100.0
##
## prettx_prevtbtx 169 0 0.0 0 97 57.4 57.4
## 1 72 42.6 100.0
##
## late_culture_conversion 169 0 0.0 0 73 43.2 43.2
## 1 96 56.8 100.0
##
## pretx_hiv 169 0 0.0 0 60 35.5 35.5
## 1 109 64.5 100.0
##
## qol_usual_activity 169 1 0.6 1 100 59.5 59.5
## 2 41 24.4 83.9
## 3 17 10.1 94.0
## 4 8 4.8 98.8
## 5 2 1.2 100.0
##
## resistance_pattern 169 4 2.4 1 90 54.5 54.5
## 2 56 33.9 88.5
## 3 19 11.5 100.0
##
md.pattern(data_1_pre_impute, plot = T)
## bl_age pretx_sex ptretx_bmi tretx_dm prettx_prevtbtx
## 120 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 23 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## late_culture_conversion time_culture_pos pretx_hiv qol_usual_activity
## 120 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 23 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 3 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 0
## 0 0 0 1
## MTB_load hb resistance_pattern living_alone_37eb74_v2_v2
## 120 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 23 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 0
## 3 1 1 1 0
## 4 1 1 0 1
## 1 1 0 1 1
## 3 1 0 1 1
## 1 0 1 1 1
## 1 0 1 1 1
## 1 1 1 1 1
## 2 4 4 6
## smoker_5c21df_v2_v2 alcohol_83d0af_v2_v2 ses_education_level
## 120 1 1 1
## 1 1 1 1
## 1 1 1 0
## 23 1 1 0
## 4 1 0 1
## 1 0 1 1
## 1 0 1 0
## 1 0 0 1
## 3 1 1 1
## 3 0 0 0
## 4 1 1 1
## 1 1 1 1
## 3 1 1 0
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 6 8 31
## ses_income_before
## 120 1 0
## 1 0 1
## 1 1 1
## 23 0 2
## 4 1 1
## 1 1 1
## 1 0 3
## 1 1 2
## 3 1 1
## 3 0 5
## 4 1 1
## 1 1 1
## 3 0 3
## 1 1 1
## 1 0 2
## 1 1 1
## 32 94
test for MCAR
mcar_test(data_1_pre_impute)
p-value =0.107 > 0.05, fail to reject H0 -> no evidence against MCAR
ini_d1 <- mice(data_1_pre_impute, m=30, maxit=0, seed = 111, print=FALSE)
ini_d1$method
## MTB_load bl_age pretx_sex
## "polyreg" "" ""
## ptretx_bmi ses_education_level living_alone_37eb74_v2_v2
## "" "polyreg" "logreg"
## smoker_5c21df_v2_v2 alcohol_83d0af_v2_v2 tretx_dm
## "logreg" "polyreg" ""
## prettx_prevtbtx late_culture_conversion time_culture_pos
## "" "" ""
## pretx_hiv hb qol_usual_activity
## "" "pmm" "polyreg"
## ses_income_before resistance_pattern
## "pmm" "polyreg"
hist(data_1_pre_impute$hb)
shapiro.test(data_1_pre_impute$hb)
##
## Shapiro-Wilk normality test
##
## data: data_1_pre_impute$hb
## W = 0.99617, p-value = 0.9513
# => p-value >0.05 ->Hb normally distributed
qqnorm(data_1_pre_impute$hb)
qqline(data_1_pre_impute$hb, col = "red")
#=> for hb, use norm in stead of pmm for more efficiency
#summary(data_1_pre_impute)
hist(data_1_pre_impute$bl_age)
shapiro.test(data_1_pre_impute$bl_age)
##
## Shapiro-Wilk normality test
##
## data: data_1_pre_impute$bl_age
## W = 0.97727, p-value = 0.00709
qqnorm(data_1_pre_impute$bl_age)
qqline(data_1_pre_impute$bl_age, col = "red")
For sparse categorical data, it may be better to use method pmm instead of logreg, polr or polyreg. Method logreg.boot is a version of logreg that uses the bootstrap to emulate sampling variance. [qol_usual_activity, alcohol_83d0af_v2_v2, resistance_pattern, ses_education_level]
meth <- ini_d1$method
meth["qol_usual_activity"] <- "pmm"
meth["alcohol_83d0af_v2_v2"] <- "pmm"
meth["resistance_pattern"] <- "pmm"
meth["ses_education_level"] <- "pmm"
meth["living_alone_37eb74_v2_v2"] <- "pmm"
meth
## MTB_load bl_age pretx_sex
## "polyreg" "" ""
## ptretx_bmi ses_education_level living_alone_37eb74_v2_v2
## "" "pmm" "pmm"
## smoker_5c21df_v2_v2 alcohol_83d0af_v2_v2 tretx_dm
## "logreg" "pmm" ""
## prettx_prevtbtx late_culture_conversion time_culture_pos
## "" "" ""
## pretx_hiv hb qol_usual_activity
## "" "pmm" "pmm"
## ses_income_before resistance_pattern
## "pmm" "pmm"
pred_d1 <- ini_d1$predictorMatrix
# Set time_culture_pos as an auxiliary variable for MTB_load
pred_d1["MTB_load", "time_culture_pos"] <- 1
# time_culture_pos is not used for imputing other variables
other_vars <- setdiff(rownames(pred_d1), "MTB_load")
pred_d1[other_vars, "time_culture_pos"] <- 0
# as number of missing data ~30% -> use m= 30 (number of imputations)
imp1_d2 <- mice(data_1_pre_impute,
pred=pred_d1,method = meth, m=30, maxit = 20, print =FALSE, seed= 111)
# check types of imputation for each variable
imp1_d2$method
## MTB_load bl_age pretx_sex
## "polyreg" "" ""
## ptretx_bmi ses_education_level living_alone_37eb74_v2_v2
## "" "pmm" "pmm"
## smoker_5c21df_v2_v2 alcohol_83d0af_v2_v2 tretx_dm
## "logreg" "pmm" ""
## prettx_prevtbtx late_culture_conversion time_culture_pos
## "" "" ""
## pretx_hiv hb qol_usual_activity
## "" "pmm" "pmm"
## ses_income_before resistance_pattern
## "pmm" "pmm"
Check convergence
plot(imp1_d2)
In imputation it is often more informative to focus on distributional discrepancy, the difference between the observed and imputed data
The idea is that good imputations have a distribution similar to the observed data. In other words, the imputations could have been real values had they been observed. Except under MCAR, the distributions do not need to be identical, since strong MAR mechanisms may induce systematic differences between the two distributions. However, any dramatic differences between the imputed and observed data should certainly alert us to the possibility that something is wrong.
stripplot(imp1_d2, cex = c(1, 1.5))
Kernel density estimates for the marginal distributions of the observed
data (blue) and the m=30 densities per variable calculated from the
imputed data (thin red lines).
densityplot(imp1_d2, layout=c(3,1))
Interpretation is more difficult if there are discrepancies. Such discrepancies may be caused by a bad imputation model, by a missing data mechanism that is not MCAR or by a combination of both
bwplot(imp1_d2, layout = c(3, 1))
with(complete(imp1_d2, "long", include = TRUE), {
boxplot(hb ~ .imp, main = "Hb across imputations")})
bwplot(imp1_d2, hb~.imp)
A more refined diagnostic tool that aims to compare the distributions of observed and imputed data conditional on the missingness probability. The idea is that under MAR the conditional distributions should be similar if the assumed model for creating multiple imputations has a good fit. These statements first model the probability of each record being incomplete as a function of all variables in each imputed dataset. The probabilities (propensities) are then averaged over the imputed datasets to obtain stability.
Assess proportion btw imputed and observed (categorical)
imp1_d2_long_include <- complete(imp1_d2,"long", include = T) %>% # change to long data
mutate(imputed=.imp>0,
imputed= factor(imputed,
levels=c(F,T),
labels=c("Observed", "Imputed")))
imp1_d2_long_not_include <- complete(imp1_d2,"long", include = F)
# #MTB load
prop.table(table(imp1_d2_long_include$MTB_load,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 1 0.1257485 0.1305720
## 2 0.1676647 0.1684418
## 3 0.2335329 0.2319527
## 4 0.4730539 0.4690335
# alcohol
prop.table(table(imp1_d2_long_include$alcohol_83d0af_v2_v2,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 1 0.40993789 0.41005917
## 2 0.28571429 0.28639053
## 3 0.21739130 0.21637081
## 4 0.08695652 0.08717949
#smoke
prop.table(table(imp1_d2_long_include$smoker_5c21df_v2_v2,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 0 0.4785276 0.4816568
## 1 0.5214724 0.5183432
# living alone
prop.table(table(imp1_d2_long_include$living_alone_37eb74_v2_v2,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 0 0.8834356 0.8836292
## 1 0.1165644 0.1163708
# resistance pattern
prop.table(table(imp1_d2_long_include$resistance_pattern,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 1 0.5454545 0.5422091
## 2 0.3393939 0.3422091
## 3 0.1151515 0.1155819
#education
prop.table(table(imp1_d2_long_include$ses_education_level,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 1 0.1304348 0.1368836
## 2 0.6304348 0.6159763
## 3 0.2391304 0.2471400
# qol
prop.table(table(imp1_d2_long_include$qol_usual_activity,
imp1_d2_long_include$imputed),
margin = 2)
##
## Observed Imputed
## 1 0.59523810 0.59368836
## 2 0.24404762 0.24418146
## 3 0.10119048 0.10197239
## 4 0.04761905 0.04812623
## 5 0.01190476 0.01203156
tertile_2 <- quantile(imp1_d2_long_not_include$ses_income_before, probs = c(1/3, 2/3))
tertile_2
## 33.33333% 66.66667%
## 1800 3780
Categorize variable
imp1_d2_long_include_cat_3 <- imp1_d2_long_include %>%
mutate(
# Age binary
bl_age = factor(ifelse(bl_age >= 45, 1, 0), levels = c(0, 1)),
# BMI: Underweight vs Non-underweight
ptretx_bmi = cut(ptretx_bmi,
breaks = c(-Inf, 18.5, 24.9, 29.9, Inf),
labels = c(1, 0, 0, 0), # 1: underweight, 0: normal/overweight/obese
right = FALSE),
ptretx_bmi = factor(ptretx_bmi, levels = c(0, 1)),
# Income tertiles
ses_income_before = cut(ses_income_before,
breaks = c(-Inf, tertile_2[1], tertile_2[2], Inf),
labels = c(0, 1, 2),
right = TRUE),
ses_income_before = factor(ses_income_before, levels = c(0, 1, 2)),
# Hemoglobin: No anemia / Mild / Moderate-Severe
hb = case_when(
pretx_sex == 2 & hb >= 13.0 ~ 0,
pretx_sex == 2 & hb >= 11.0 & hb < 13.0 ~ 1,
pretx_sex == 2 & hb < 11.0 ~ 2,
pretx_sex == 1 & hb >= 12.0 ~ 0,
pretx_sex == 1 & hb >= 11.0 & hb < 12.0 ~ 1,
pretx_sex == 1 & hb < 11.0 ~ 2,
TRUE ~ NA_real_
),
hb = factor(hb, levels = c(0, 1, 2)),
# MTB load: low/very low, medium, high
MTB_load = factor(case_when(
MTB_load %in% c(1, 2) ~ 0,
MTB_load == 3 ~ 1,
MTB_load == 4 ~ 2
), levels = c(0, 1, 2)),
# QoL: usual activity collapse
qol_usual_activity = factor(case_when(
qol_usual_activity == 1 ~ 0,
qol_usual_activity %in% c(2, 3, 4, 5) ~ 1
), levels = c(0, 1)),
# Resistance pattern: MonoDR vs MDR/(pre)XDR
resistance_pattern = factor(case_when(
resistance_pattern == 1 ~ 0,
resistance_pattern %in% c(2, 3) ~ 1
), levels = c(0, 1))
)
summarise data
tab1_imputed_3 <- CreateTableOne(data = imp1_d2_long_include_cat_3 %>% filter(.imp==1),strata = "late_culture_conversion")
#print(tab1_imputed_3, showAllLevels = T)
change into mids
imp1_d2_long_include_cat_3_red <- imp1_d2_long_include_cat_3 %>% select(-imputed)
mids_3rd <- as.mids(imp1_d2_long_include_cat_3_red)
# fit null model
null.model_3 <- with(mids_3rd, glm(late_culture_conversion ~ 1, family = binomial))
# Hemoglobin (HB)
uni.hb_3 <- with(mids_3rd, glm(late_culture_conversion ~ hb, family = binomial))
summary(pool(uni.hb_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.hb_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 3.350545 2 88127.74 166 0.0350697 0.02450735
anova(null.model_3, uni.hb_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 3.266454 2 163.6243 166 0.04064291 0.02428414
# MTB Load
uni.mtb_load_3 <- with(mids_3rd, glm(late_culture_conversion ~ MTB_load, family = binomial))
summary(pool(uni.mtb_load_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.mtb_load_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 4.793988 2 837819 166 0.008279601 0.007814183
anova(null.model_3, uni.mtb_load_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 4.634039 2 163.9703 166 0.01102258 0.007774104
# Sex
uni.sex_3 <- with(mids_3rd, glm(late_culture_conversion ~ pretx_sex, family = binomial))
summary(pool(uni.sex_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.sex_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.1550975 1 Inf 167 0.6937109 0
anova(null.model_3, uni.sex_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.1553875 1 165.0353 167 0.693948 0
# HIV Status
uni.hiv_3 <- with(mids_3rd, glm(late_culture_conversion ~ pretx_hiv, family = binomial))
summary(pool(uni.hiv_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.hiv_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.4558578 1 Inf 167 0.4995662 0
anova(null.model_3, uni.hiv_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.456295 1 165.0353 167 0.500306 0
# Usual Activity
uni.usual.activity_3 <- with(mids_3rd, glm(late_culture_conversion ~ qol_usual_activity, family = binomial))
summary(pool(uni.usual.activity_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.usual.activity_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 7.598904 1 1514347 167 0.005840449 0.003798323
anova(null.model_3, uni.usual.activity_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 7.317632 1 165.0003 167 0.007544944 0.003801323
# Income
uni.income_3 <- with(mids_3rd, glm(late_culture_conversion ~ ses_income_before, family = binomial))
summary(pool(uni.income_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.income_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.2609915 2 2261.865 166 0.7703107 0.1766327
anova(null.model_3, uni.income_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.2633689 2 151.7944 166 0.7688088 0.1763916
# Education Level
uni.edu_3 <- with(mids_3rd, glm(late_culture_conversion ~ ses_education_level, family = binomial))
summary(pool(uni.edu_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.edu_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.3402885 2 2745.209 166 0.711595 0.15764
anova(null.model_3, uni.edu_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.33511 2 153.8999 166 0.71578 0.1563223
# Living Alone
uni.living.alone_3 <- with(mids_3rd, glm(late_culture_conversion ~ living_alone_37eb74_v2_v2, family = binomial))
summary(pool(uni.living.alone_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.living.alone_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.01969885 1 15238.84 167 0.8883833 0.03930755
anova(null.model_3, uni.living.alone_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.01983083 1 163.1681 167 0.8881843 0.03781709
# Smoking
uni.smoke_3 <- with(mids_3rd, glm(late_culture_conversion ~ smoker_5c21df_v2_v2, family = binomial))
summary(pool(uni.smoke_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.smoke_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.0329065 1 17116.11 167 0.8560546 0.03700065
anova(null.model_3, uni.smoke_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.03292855 1 163.2414 167 0.8562299 0.0369968
# Alcohol
uni.alcohol_3 <- with(mids_3rd, glm(late_culture_conversion ~ alcohol_83d0af_v2_v2, family = binomial))
summary(pool(uni.alcohol_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.alcohol_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.286865 3 38922.48 165 0.8349248 0.04730366
anova(null.model_3, uni.alcohol_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.2857165 3 162.1748 165 0.8356651 0.04608936
# Prior TB Treatment
uni.prior_tb_treatment_3 <- with(mids_3rd, glm(late_culture_conversion ~ prettx_prevtbtx, family = binomial))
summary(pool(uni.prior_tb_treatment_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.prior_tb_treatment_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 2.582187 1 Inf 167 0.1080722 0
anova(null.model_3, uni.prior_tb_treatment_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 2.549779 1 165.0353 167 0.1122227 0
# BMI
uni.bmi_3 <- with(mids_3rd, glm(late_culture_conversion ~ ptretx_bmi, family = binomial))
summary(pool(uni.bmi_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.bmi_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 5.835283 1 Inf 167 0.01570788 0
anova(null.model_3, uni.bmi_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 5.754318 1 165.0353 167 0.01756311 0
# Age
uni.age_3 <- with(mids_3rd, glm(late_culture_conversion ~ bl_age, family = binomial))
summary(pool(uni.age_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.age_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 1.773374 1 Inf 167 0.1829651 0
anova(null.model_3, uni.age_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 1.747691 1 165.0353 167 0.1879967 0
# Resistance Pattern
uni.resistance_3 <- with(mids_3rd, glm(late_culture_conversion ~ resistance_pattern, family = binomial))
summary(pool(uni.resistance_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.resistance_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.6283847 1 18763.2 167 0.4279585 0.03527602
anova(null.model_3, uni.resistance_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.6269418 1 163.3922 167 0.4296273 0.03525935
# Diabetes Mellitus (DM)
uni.dm_3 <- with(mids_3rd, glm(late_culture_conversion ~ tretx_dm, family = binomial))
summary(pool(uni.dm_3), exponentiate = TRUE, conf.int = TRUE)[c("term", "estimate", "2.5 %", "97.5 %")]
anova(null.model_3, uni.dm_3, method = "D3")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.01233493 1 2.347706e+28 167 0.9115666 -3.038183e-14
anova(null.model_3, uni.dm_3, method = "D1")
## test statistic df1 df2 dfcom p.value riv
## 2 ~~ 1 0.0122971 1 165.0353 167 0.9118365 0
There are 73 events, -> maximum 7 predictors for multivariable model after univariable analysis, 7 variables: MTB load, usual activity/ qol, prior TB treatment, malnutrition/ bmi, age, anemia/ hb, (resistance) are chosen => standard multiple regression model can be used (p<0.25)
Stepwise backward eliminations are conducted with 30 imputed dataset, models comparision based on AIC. Variables appeared in at least half of models will be chosen. Stepwise AIC across multiple imputations requires running stepAIC() on each imputed dataset individually and then pooling or summarizing the results. This method allows you to identify which variables are robust across the imputations.
scope <- list(upper = ~ MTB_load + hb + ptretx_bmi + qol_usual_activity + bl_age + prettx_prevtbtx + resistance_pattern,
lower = ~1)
expr <- expression(f1 <- glm(late_culture_conversion ~ MTB_load + hb + ptretx_bmi + qol_usual_activity + bl_age + prettx_prevtbtx + resistance_pattern,
family = binomial),
f2 <- step(f1, scope = scope, direction = "backward",trace =0))
# Apply the model to the imputed data
fit_3 <- with(mids_3rd, expr)
formulas_3 <- lapply(fit_3$analyses, formula)
terms_3 <- lapply(formulas_3, terms)
votes_3 <- unlist(lapply(terms_3, labels))
table(votes_3)
## votes_3
## bl_age hb MTB_load prettx_prevtbtx
## 30 23 30 30
## ptretx_bmi qol_usual_activity
## 30 30
hb, MTB_load, prettx_prevtbtx, ptretx_bmi, qol_usual_activity appear in all models. bl_age in 29 models, resistance_pattern in 1 models
fit_pool_3_after_majority <- with(mids_3rd, glm(late_culture_conversion~ MTB_load+ ptretx_bmi + qol_usual_activity + bl_age + hb+ prettx_prevtbtx, family = binomial))
fit_pool_3_not_age <- with(mids_3rd, glm(late_culture_conversion~ MTB_load+ ptretx_bmi + qol_usual_activity + hb+ prettx_prevtbtx, family = binomial))
D1(fit_pool_3_after_majority, fit_pool_3_not_age)
## test statistic df1 df2 dfcom p.value riv
## 1 ~~ 2 2.64999 1 158.0257 160 0.1055427 0.001742126
D3(fit_pool_3_after_majority, fit_pool_3_not_age)
## test statistic df1 df2 dfcom p.value riv
## 1 ~~ 2 2.713474 1 7063538 160 0.09950414 0.00175486
p-value = 0.119 -> keep age
vif_model_3rd <- vif(fit_pool_3_after_majority$analyses[[1]])
# Display the VIF/GVIF values
vif_model_3rd
## MTB_load1 MTB_load2 ptretx_bmi1 qol_usual_activity1
## 1.407067 1.387633 1.045873 1.085309
## bl_age1 hb1 hb2 prettx_prevtbtx1
## 1.061347 1.543770 1.539190 1.061703
VIF for continuous or binary predictors. GVIF for categorical predictors with more than two levels, along with its power-transformed value GVIF^(1/(2p)). Interpretation of GVIF The interpretation of GVIF^(1/(2p)) is similar to VIF: Values close to 1 suggest low multicollinearity. Values between 1 and 5 suggest moderate multicollinearity. Values above 5 or 10 may indicate high multicollinearity.
The predictive accuracy of the final model was checked using discrimination (AUC) and calibration (calibration graphs and Hosmer–Lemeshow goodness of fit test) parameters. The Hosmer–Lemeshow goodness of fit test with a p-value greater than 0.05 indicates good calibration, which means that the probability of late culture conversion estimated by the model is similar to the observed probability. An AUC of 0.5 indicates no discrimination ability, while an AUC of 1 indicates perfect discrimination.
AUC across 30 imputed dataset
# Get the predicted probabilities for each patient in all imputed dataset
predicted_probs_list_3rd <- lapply(fit_pool_3_after_majority$analyses, function(model) {
predict(model, type = "response")
})
range(predicted_probs_list_3rd)
## [1] 0.1053515 0.9201646
# Plot only the ROC curves (no extra output)
plot.roc(roc_list[[1]], col = "blue", main = "ROC Curves",
print.auc = FALSE, legacy.axes = TRUE)
for (i in 2:30) {
plot.roc(roc_list[[i]], col = rainbow(30)[i], add = TRUE, print.auc = FALSE)
}
# Add diagonal line
abline(a = 1, b = -1, lty = 2, col = "gray")
# Calculate the Youden Index and the corresponding threshold for each ROC curve
youden_results <- lapply(roc_list, function(roc_obj) {
# Get the optimal threshold that maximizes the Youden Index (sensitivity + specificity - 1)
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity", "youden"),
transpose = FALSE)
})
# Extract thresholds and Youden Indexes
thresholds <- sapply(youden_results, function(res) res$threshold)
youden_indexes <- sapply(youden_results, function(res) res$youden)
# Print the thresholds and Youden Indexes
data.frame(Threshold = thresholds, YoudenIndex = youden_indexes)
Pooled calibration + AUC: pool_performance Pooling performance measures (AUC): aggregates performance measures from multiple imputed datasets using a method that accounts for the variability between imputations
stacked_long_3rd <- complete(mids_3rd, "long", include = F)
pool_performance_3 <- pool_performance(stacked_long_3rd,
nimp = 30,
impvar = ".imp",
formula = late_culture_conversion~ bl_age+ hb+ MTB_load+ prettx_prevtbtx +ptretx_bmi+ qol_usual_activity,
cal.plot=TRUE, plot.method="mean",
groups_cal=7, model_type="binomial")
pool_performance_3$ROC_pooled
## 95% Low C-statistic 95% Up
## C-statistic (logit) 0.6483 0.7322 0.8022
pool_performance_3$R2_pooled
## [1] 0.2056982
pool_performance_3$HLtest_pooled
## F_value P(>F) df1 df2
## [1,] 0.2628084 0.9333774 5 1117.869
pool_performance_3$Brier_Scaled_pooled
## [1] 0.1599911
R² of 0.2067 suggests that about 20.67% of the variance in the outcome is explained by the model The Brier score measures overall performance—it is simply calculated as the mean squared difference between predicted probabilities and actual outcomes. The value of the Brier score estimate can range between 0 and 1. A lower Brier score value indicates a better performing model.
Hosmo Lemeshow test: to assess goodness-of-fit for logistic models It evaluates whether the observed event rates match the expected event rates across deciles of predicted probabilities.
p-value >0.05 suggests that there is no significant difference between the observed and expected frequencies in your data across different deciles of predicted probabilities. In other words, your model fits the data well according to the Hosmer-Lemeshow test.
Overlay calibration
cali_plot <- pool_performance(data = stacked_long_3rd, formula = late_culture_conversion~ bl_age+ hb+ MTB_load+ prettx_prevtbtx +ptretx_bmi+ qol_usual_activity,
nimp=30, impvar=".imp",
cal.plot=TRUE, plot.method="overlay",
groups_cal=7, model_type="binomial")
The model was internally validated using a bootstrap technique to estimate the degree of over-optimism of the final model when applied to a similar population. Internal validation was performed on the regression coefficient with a 95% confidence interval (CI) and the AUC of the model using 2,000 random bootstrap samples. The AUC difference between the bootstrap and the original full sample measured the optimism of the predictive model.
Ideally, we should first bootstrap and then impute. However, this strategy might be computationally difficult. Instead, we can first impute, then bootstrap, obtain optimism corrected performance measures from each imputed dataset, and finally pool these
Method cv_MI uses imputation within each cross-validation fold definition. By repeating this in several imputation runs, multiply imputed datasets are generated. Method cv_MI_RR uses multiple imputation within the cross-validation definition. MI_cv_naive, applies cross-validation within each imputed dataset. MI_boot draws for each bootstrap step the same cases in all imputed datasets. With boot_MI first bootstrap samples are drawn from the original dataset with missing values and than multiple imputation is applied. For multiple imputation the mice function from the mice package is used. It is recommended to use a minumum of 100 imputation runs for method cv_MI or 100 bootstrap samples for method boot_MI or MI_boot
Orig (original datasets), Apparent (models applied in bootstrap samples), Test (bootstrap models are applied in original datasets), Optimism (difference between apparent and test) and Corrected (original corrected for optimism).
#Pooled model wald test
pool_D1_model_3rd <- psfmi_lr(data = stacked_long_3rd,
nimp = 30, # number of imputation
impvar = ".imp", # The variable that identifies imputation number
formula = late_culture_conversion ~ MTB_load + hb + ptretx_bmi + qol_usual_activity + bl_age + prettx_prevtbtx,
method = "D1", # Rubin’s rules (D1 method)
p.crit = 1) # Keep all predictors (no variable selection)
internal_validate_3 <- suppressMessages(
suppressWarnings(
psfmi_validate(
pobj = pool_D1_model_3rd,
val_method = "MI_boot",
miceImp = TRUE,
int_val = TRUE,
nboot = 200, # number of bootstrap resamples
plot.method = "mean",
cal.plot = TRUE,
groups_cal = 7
)
)
)
#internal_validate_3$intercept_test
#internal_validate_3$res_boot$R2_app
auc_values <- internal_validate_3$res_boot$ROC_app
quantile(auc_values, 0.025)
## 2.5%
## 0.6847675
quantile(auc_values, 0.975)
## 97.5%
## 0.8261175
A simplified risk score was constructed based on the hierarchy of the regression coefficients in the final model (each coefficient was divided by the smallest coefficient (bl_age =0.60) and rounded to the nearest integer). The score’s predictive performance (AUC) was assessed and compared with that of the original model. Sustained culture conversion risk corresponding to each score was calculated
fit_pool_3_final <- with(mids_3rd, glm(late_culture_conversion~ bl_age+ hb+ MTB_load+ prettx_prevtbtx +ptretx_bmi+ qol_usual_activity, family = binomial))
tbl_regression(fit_pool_3_final,exponentiate = TRUE)
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| bl_age | |||
| 0 | — | — | |
| 1 | 1.82 | 0.88, 3.77 | 0.11 |
| hb | |||
| 0 | — | — | |
| 1 | 1.97 | 0.79, 4.89 | 0.14 |
| 2 | 2.31 | 0.98, 5.44 | 0.056 |
| MTB_load | |||
| 0 | — | — | |
| 1 | 1.82 | 0.71, 4.63 | 0.2 |
| 2 | 2.55 | 1.15, 5.65 | 0.021 |
| prettx_prevtbtx | |||
| 0 | — | — | |
| 1 | 1.98 | 0.98, 3.99 | 0.057 |
| ptretx_bmi | |||
| 0 | — | — | |
| 1 | 1.89 | 0.94, 3.78 | 0.072 |
| qol_usual_activity | |||
| 0 | — | — | |
| 1 | 1.98 | 0.97, 4.07 | 0.062 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
# MTB_low is smallest -> weight of 1
# age1 (weight of 1)
0.5627385/ 0.5511816
## [1] 1.020967
# hb1 (weight of 1)
0.7064516/ 0.5511816
## [1] 1.281704
# MTB load 2 (weight of 2)
0.9132189/ 0.5511816
## [1] 1.656839
# prettx_prevtbtx (weight of 1)
0.6715238/0.5511816
## [1] 1.218335
# bmi (weight of 1)
0.6424174/0.5511816
## [1] 1.165528
# qol (weight of 1)
0.7490449/0.5511816
## [1] 1.35898
Total score = 1* age>= 45 + 1 * anemia_mild + 1* MTB_load_medium + 2 * MTB_load_high + 1* pre_TB_treatment + 1* underweight + 1* qol_some_problems (baseline: age < 45, MTB_load: low/very low, bmi: non-underweight, hb: non-anemia, qol: no problem, pre_TB_treatment: no)
Total score = 1 * bl_age1 + 1 * hb1 + 1* MTB_load1 + 2* MTB_load2 + 1* prettx_prevtbtx1 + 1* ptretx_bmi0 + 1* qol_usual_activity1 min = 0, max = 7
imp1_d2_long_include_cat_3_red_score <- imp1_d2_long_include_cat_3_red %>%
mutate(MTB_load_RC1 = if_else(MTB_load==1, 1, 0),
MTB_load_RC2 = if_else(MTB_load==2, 1, 0),
TotalScore = 1 * as.numeric(as.character(bl_age)) +
1 * as.numeric(as.character(hb)) +
1 * MTB_load_RC1 +
2 * MTB_load_RC2 +
1 * as.numeric(as.character(prettx_prevtbtx)) +
1 * as.numeric(as.character(ptretx_bmi)) +
1 * as.numeric(as.character(qol_usual_activity)))
imp1_d2_long_not_include_cat_3_red_score <- imp1_d2_long_include_cat_3_red_score %>%
filter(.imp!=0)
Calculate risk scores for stacked 30 imputed datasets (equal contribution of imputed dataset)
imp1_d2_long_not_include_cat_3_red_score %>%
mutate(TotalScore = case_when(
TotalScore %in% 0:1 ~ "0-1", # Collapse scores 0 and 1 into "0-1"
TRUE ~ as.character(TotalScore) # Keep scores 2-7 unchanged
)) %>%
group_by(TotalScore) %>%
summarise(
total_patients = n(), # Total patients with this score
n_late_culture_conversion = sum(late_culture_conversion == 1), # Late culture conversion cases
Proportion = round(n_late_culture_conversion / total_patients, 3) # Proportion
) %>%
arrange(factor(TotalScore, levels = c("0-1", "2", "3", "4", "5", "6", "7"))) # Ensure correct order
imp1_d2_long_not_include_cat_3_red_score %>%
filter(.imp==2) %>%
filter(TotalScore %in% 0:7) %>%
group_by(TotalScore) %>% # Group by TotalScore
summarise(
n_total = n(), # Total patients with this score
n_late_conversion = sum(late_culture_conversion == 1), # Patients with late culture conversion
proportion_late_conversion = n_late_conversion / n_total # Proportion
) %>%
arrange(TotalScore)
pool_performance_score <- pool_performance(imp1_d2_long_include_cat_3_red_score,
nimp = 30,
impvar = ".imp",
formula = late_culture_conversion~ TotalScore,
cal.plot=TRUE, plot.method="mean",
groups_cal=5, model_type="binomial")
pool_performance(imp1_d2_long_include_cat_3_red_score,
nimp = 30,
impvar = ".imp",
formula = late_culture_conversion~ TotalScore,
cal.plot=TRUE, plot.method="overlay",
groups_cal=5, model_type="binomial")
## $ROC_pooled
## 95% Low C-statistic 95% Up
## C-statistic (logit) 0.6382 0.7206 0.7905
##
## $coef_pooled
## (Intercept) TotalScore
## -1.7757380 0.5078212
##
## $R2_pooled
## [1] 0.1953933
##
## $Brier_Scaled_pooled
## [1] 0.1495607
##
## $nimp
## [1] 30
##
## $HLtest_pooled
## F_value P(>F) df1 df2
## [1,] 0.4651624 0.7065946 3 37521.59
##
## $model_type
## [1] "binomial"
group_cal: the number of risk groups for calibration
pool_performance_score$ROC_pooled
## 95% Low C-statistic 95% Up
## C-statistic (logit) 0.6382 0.7206 0.7905
pool_performance_score$coef_pooled
## (Intercept) TotalScore
## -1.7757380 0.5078212
pool_performance_score$HLtest_pooled
## F_value P(>F) df1 df2
## [1,] 0.4651624 0.7065946 3 37521.59
AUC draw for risk score
par(pty = "s")
# Plot the first ROC curve
plot.roc(roc_list[[1]],
col = "blue",
print.auc = FALSE,
legacy.axes = TRUE,
main = "ROC curve of simplified risk score",
cex.main=0.9)
# Add other ROC curves
for (i in 2:30) {
plot.roc(roc_list[[i]], col = rainbow(10)[i], add = TRUE, print.auc = FALSE)
}
# Add diagonal line
abline(a = 1, b = -1, lty = 2, col = "gray")
# Add AUC text
text(x = 0.3, y = 0.3,
labels = "AUC (95% CI) = 0.72 (0.64–0.79)",
cex = 0.7, font = 1)
Threshold that maximizes the sum of sensitivity and specificity — i.e., the point on the ROC curve farthest from the diagonal
mids_3rd_score <- as.mids(imp1_d2_long_include_cat_3_red_score)
# Get predicted probabilities per imputation
pred_matrix <- sapply(1:30, function(i) {
dat_i <- complete(mids_3rd_score, i)
model <- glm(late_culture_conversion ~ TotalScore,
data = dat_i, family = binomial)
predict(model, type = "response") # vector of predicted probabilities
})
# Get the average prediction per individual (pooled prediction)
pooled_preds <- rowMeans(pred_matrix)
# Use the observed outcome from any imputed dataset (e.g., first one)
true_outcome <- complete(mids_3rd_score, 1)$late_culture_conversion
# Calculate ROC with pooled predicted probabilities
pooled_roc <- roc(true_outcome, pooled_preds, direction = "<")
## Setting levels: control = 0, case = 1
# Get optimal threshold using Youden Index
youden_result <- coords(pooled_roc,
x = "best",
best.method = "youden",
ret = c("threshold", "sensitivity", "specificity", "youden"),
transpose = FALSE)
print(youden_result)
## threshold sensitivity specificity youden
## 1 0.5365736 0.7916667 0.5479452 1.339612