In this assignment, we answer our research question with help of multiple imputation for the missing values in our dataset.
data_complete <- readRDS("g3_complete_data.rds")
data_incomplete <- readRDS("g3_incomplete_data.rds")
#adding depression score:
data_complete1 <- data_complete %>% mutate(depression_score = dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9)
data_incomplete1 <- data_incomplete %>% mutate(depression_score = dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9)
The missing data are multiply imputed:
# default multiple imputation with 10 imps, 20 its:
imp <- mice(data_incomplete1, m = 10, maxit = 20, seed = 1234567, print = FALSE)
# checking the loggedEvents; due to depression_score:
imp$loggedEvents %>% head()
## it im dep meth out
## 1 2 3 height pmm depression_score
## 2 2 3 bmi pmm depression_score
## 3 2 3 bp_sys2 pmm depression_score
## 4 2 3 bp_dia2 pmm depression_score
## 5 2 6 drink_regularly logreg depression_score
## 6 2 8 weight pmm depression_score
# looking at the plots from our initial model:
plot(imp)
densityplot(imp)
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = my_vrb, group = .imp)) +
## ggplot2::geom_density()
##
## See amices.org/ggmice for more info.
stripplot(imp)
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = .imp, y = my_vrb)) +
## ggplot2::geom_jitter()
##
## See amices.org/ggmice for more info.
# auto-correlation should ideally be 0, but deviates from that for some variables:
conv <- convergence(imp)
ggplot(conv, aes(x = .it, y = ac)) +
geom_hline(yintercept = 0, color = "grey", linetype = "dashed") +
geom_line() +
facet_wrap(~vrb, scales = "free") +
theme_classic() +
labs(x = "Iteration", y = "Auto-correlation")
# potential scale reduction factors are high; ideally they should be 1 for each variable:
ggplot(conv, aes(x = .it, y = psrf)) +
geom_hline(yintercept = 1, color = "grey", linetype = "dashed") +
geom_line() +
facet_wrap(~vrb, scales = "free") +
theme_classic() +
labs(x = "Iteration", y = "Potential scale reduction factor")
We have quantified and displayed the non-convergence. Now we try to solve the issue.
Since we added a column for the depression score, which is the sum of all depression scores in a row, we are going to do passive imputation for this variable. To implement passive imputation, we adjust two features of the mice() setup: the method vector: we use the method vector to define the deterministic relations, and the predictor matrix: we adjust the predictor matrix to keep a transformed variable from being used as a predictor of its raw version.
# currently all missing data is treated with pmm:
(method <- imp$method)
## id sex age ethnicity
## "" "" "" ""
## education marital household_size household_income
## "" "" "" ""
## weight height bmi pulse
## "pmm" "pmm" "pmm" ""
## bp_sys1 bp_dia1 bp_sys2 bp_dia2
## "" "" "pmm" "pmm"
## time_sed drink_regularly days_drinking dep1
## "" "logreg" "" ""
## dep2 dep3 dep4 dep5
## "pmm" "pmm" "" "pmm"
## dep6 dep7 dep8 dep9
## "pmm" "" "pmm" "pmm"
## depression_score
## "pmm"
# since our data is not MCAR, pmm is mostly fine:
out <- mcar_test(data_incomplete)
out$statistic # 1448.785
## [1] 1448.785
out$p.value # 1.968048e-10
## [1] 1.968048e-10
# we change the method to implement passive imputation depression_score :
method["depression_score"]<- "~ I(dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9)"
method
## id
## ""
## sex
## ""
## age
## ""
## ethnicity
## ""
## education
## ""
## marital
## ""
## household_size
## ""
## household_income
## ""
## weight
## "pmm"
## height
## "pmm"
## bmi
## "pmm"
## pulse
## ""
## bp_sys1
## ""
## bp_dia1
## ""
## bp_sys2
## "pmm"
## bp_dia2
## "pmm"
## time_sed
## ""
## drink_regularly
## "logreg"
## days_drinking
## ""
## dep1
## ""
## dep2
## "pmm"
## dep3
## "pmm"
## dep4
## ""
## dep5
## "pmm"
## dep6
## "pmm"
## dep7
## ""
## dep8
## "pmm"
## dep9
## "pmm"
## depression_score
## "~ I(dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9)"
# adjusting the prediction matrix to avoid circularity:
(pm <- imp$predictorMatrix)
## id sex age ethnicity education marital household_size
## id 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## age 1 1 0 1 1 1 1
## ethnicity 1 1 1 0 1 1 1
## education 1 1 1 1 0 1 1
## marital 1 1 1 1 1 0 1
## household_size 1 1 1 1 1 1 0
## household_income 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## household_income weight height bmi pulse bp_sys1 bp_dia1
## id 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1
## household_income 0 1 1 1 1 1 1
## weight 1 0 1 1 1 1 1
## height 1 1 0 1 1 1 1
## bmi 1 1 1 0 1 1 1
## pulse 1 1 1 1 0 1 1
## bp_sys1 1 1 1 1 1 0 1
## bp_dia1 1 1 1 1 1 1 0
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## bp_sys2 bp_dia2 time_sed drink_regularly days_drinking dep1
## id 1 1 1 1 1 1
## sex 1 1 1 1 1 1
## age 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1
## education 1 1 1 1 1 1
## marital 1 1 1 1 1 1
## household_size 1 1 1 1 1 1
## household_income 1 1 1 1 1 1
## weight 1 1 1 1 1 1
## height 1 1 1 1 1 1
## bmi 1 1 1 1 1 1
## pulse 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1
## bp_sys2 0 1 1 1 1 1
## bp_dia2 1 0 1 1 1 1
## time_sed 1 1 0 1 1 1
## drink_regularly 1 1 1 0 1 1
## days_drinking 1 1 1 1 0 1
## dep1 1 1 1 1 1 0
## dep2 1 1 1 1 1 1
## dep3 1 1 1 1 1 1
## dep4 1 1 1 1 1 1
## dep5 1 1 1 1 1 1
## dep6 1 1 1 1 1 1
## dep7 1 1 1 1 1 1
## dep8 1 1 1 1 1 1
## dep9 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1
## dep2 dep3 dep4 dep5 dep6 dep7 dep8 dep9 depression_score
## id 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1 1 1
## household_income 1 1 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1 1 1
## dep2 0 1 1 1 1 1 1 1 1
## dep3 1 0 1 1 1 1 1 1 1
## dep4 1 1 0 1 1 1 1 1 1
## dep5 1 1 1 0 1 1 1 1 1
## dep6 1 1 1 1 0 1 1 1 1
## dep7 1 1 1 1 1 0 1 1 1
## dep8 1 1 1 1 1 1 0 1 1
## dep9 1 1 1 1 1 1 1 0 1
## depression_score 1 1 1 1 1 1 1 1 0
pm <- as.data.frame(pm) #converted to df to change the cells
pm[20:28, 29] <- 0 #changing the values to 0
pm <- as.matrix(pm) #conversion back to matrix
pm
## id sex age ethnicity education marital household_size
## id 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## age 1 1 0 1 1 1 1
## ethnicity 1 1 1 0 1 1 1
## education 1 1 1 1 0 1 1
## marital 1 1 1 1 1 0 1
## household_size 1 1 1 1 1 1 0
## household_income 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## household_income weight height bmi pulse bp_sys1 bp_dia1
## id 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1
## household_income 0 1 1 1 1 1 1
## weight 1 0 1 1 1 1 1
## height 1 1 0 1 1 1 1
## bmi 1 1 1 0 1 1 1
## pulse 1 1 1 1 0 1 1
## bp_sys1 1 1 1 1 1 0 1
## bp_dia1 1 1 1 1 1 1 0
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## bp_sys2 bp_dia2 time_sed drink_regularly days_drinking dep1
## id 1 1 1 1 1 1
## sex 1 1 1 1 1 1
## age 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1
## education 1 1 1 1 1 1
## marital 1 1 1 1 1 1
## household_size 1 1 1 1 1 1
## household_income 1 1 1 1 1 1
## weight 1 1 1 1 1 1
## height 1 1 1 1 1 1
## bmi 1 1 1 1 1 1
## pulse 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1
## bp_sys2 0 1 1 1 1 1
## bp_dia2 1 0 1 1 1 1
## time_sed 1 1 0 1 1 1
## drink_regularly 1 1 1 0 1 1
## days_drinking 1 1 1 1 0 1
## dep1 1 1 1 1 1 0
## dep2 1 1 1 1 1 1
## dep3 1 1 1 1 1 1
## dep4 1 1 1 1 1 1
## dep5 1 1 1 1 1 1
## dep6 1 1 1 1 1 1
## dep7 1 1 1 1 1 1
## dep8 1 1 1 1 1 1
## dep9 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1
## dep2 dep3 dep4 dep5 dep6 dep7 dep8 dep9 depression_score
## id 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1 1 1
## household_income 1 1 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1 1 0
## dep2 0 1 1 1 1 1 1 1 0
## dep3 1 0 1 1 1 1 1 1 0
## dep4 1 1 0 1 1 1 1 1 0
## dep5 1 1 1 0 1 1 1 1 0
## dep6 1 1 1 1 0 1 1 1 0
## dep7 1 1 1 1 1 0 1 1 0
## dep8 1 1 1 1 1 1 0 1 0
## dep9 1 1 1 1 1 1 1 0 0
## depression_score 1 1 1 1 1 1 1 1 0
imp1 <- mice(data_incomplete1,
m = 10,
method = method,
predictorMatrix = pm,
maxit = 20,
seed = 1234567,
print = FALSE)
# looking at the plots from our passive imputation model:
plot(imp1)
densityplot(imp1)
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = my_vrb, group = .imp)) +
## ggplot2::geom_density()
##
## See amices.org/ggmice for more info.
stripplot(imp1)
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = .imp, y = my_vrb)) +
## ggplot2::geom_jitter()
##
## See amices.org/ggmice for more info.
#BMI, height, weight could be better. The rest looks good. We add one more pmi for BMI:
method["bmi"] <- "~ I(weight / (height / 100)^2)"
method
## id
## ""
## sex
## ""
## age
## ""
## ethnicity
## ""
## education
## ""
## marital
## ""
## household_size
## ""
## household_income
## ""
## weight
## "pmm"
## height
## "pmm"
## bmi
## "~ I(weight / (height / 100)^2)"
## pulse
## ""
## bp_sys1
## ""
## bp_dia1
## ""
## bp_sys2
## "pmm"
## bp_dia2
## "pmm"
## time_sed
## ""
## drink_regularly
## "logreg"
## days_drinking
## ""
## dep1
## ""
## dep2
## "pmm"
## dep3
## "pmm"
## dep4
## ""
## dep5
## "pmm"
## dep6
## "pmm"
## dep7
## ""
## dep8
## "pmm"
## dep9
## "pmm"
## depression_score
## "~ I(dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9)"
# adjusting the prediction matrix to avoid circularity:
(pm1 <- imp1$predictorMatrix)
## id sex age ethnicity education marital household_size
## id 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## age 1 1 0 1 1 1 1
## ethnicity 1 1 1 0 1 1 1
## education 1 1 1 1 0 1 1
## marital 1 1 1 1 1 0 1
## household_size 1 1 1 1 1 1 0
## household_income 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## household_income weight height bmi pulse bp_sys1 bp_dia1
## id 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1
## household_income 0 1 1 1 1 1 1
## weight 1 0 1 1 1 1 1
## height 1 1 0 1 1 1 1
## bmi 1 1 1 0 1 1 1
## pulse 1 1 1 1 0 1 1
## bp_sys1 1 1 1 1 1 0 1
## bp_dia1 1 1 1 1 1 1 0
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## bp_sys2 bp_dia2 time_sed drink_regularly days_drinking dep1
## id 1 1 1 1 1 1
## sex 1 1 1 1 1 1
## age 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1
## education 1 1 1 1 1 1
## marital 1 1 1 1 1 1
## household_size 1 1 1 1 1 1
## household_income 1 1 1 1 1 1
## weight 1 1 1 1 1 1
## height 1 1 1 1 1 1
## bmi 1 1 1 1 1 1
## pulse 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1
## bp_sys2 0 1 1 1 1 1
## bp_dia2 1 0 1 1 1 1
## time_sed 1 1 0 1 1 1
## drink_regularly 1 1 1 0 1 1
## days_drinking 1 1 1 1 0 1
## dep1 1 1 1 1 1 0
## dep2 1 1 1 1 1 1
## dep3 1 1 1 1 1 1
## dep4 1 1 1 1 1 1
## dep5 1 1 1 1 1 1
## dep6 1 1 1 1 1 1
## dep7 1 1 1 1 1 1
## dep8 1 1 1 1 1 1
## dep9 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1
## dep2 dep3 dep4 dep5 dep6 dep7 dep8 dep9 depression_score
## id 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1 1 1
## household_income 1 1 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1 1 0
## dep2 0 1 1 1 1 1 1 1 0
## dep3 1 0 1 1 1 1 1 1 0
## dep4 1 1 0 1 1 1 1 1 0
## dep5 1 1 1 0 1 1 1 1 0
## dep6 1 1 1 1 0 1 1 1 0
## dep7 1 1 1 1 1 0 1 1 0
## dep8 1 1 1 1 1 1 0 1 0
## dep9 1 1 1 1 1 1 1 0 0
## depression_score 1 1 1 1 1 1 1 1 0
pm1 <- as.data.frame(pm1) #converted to df to change the cells
pm1[9:10, 11] <- 0 #changing the values to 0
pm1 <- as.matrix(pm1) #conversion back to matrix
pm1
## id sex age ethnicity education marital household_size
## id 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## age 1 1 0 1 1 1 1
## ethnicity 1 1 1 0 1 1 1
## education 1 1 1 1 0 1 1
## marital 1 1 1 1 1 0 1
## household_size 1 1 1 1 1 1 0
## household_income 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## household_income weight height bmi pulse bp_sys1 bp_dia1
## id 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1
## household_income 0 1 1 1 1 1 1
## weight 1 0 1 0 1 1 1
## height 1 1 0 0 1 1 1
## bmi 1 1 1 0 1 1 1
## pulse 1 1 1 1 0 1 1
## bp_sys1 1 1 1 1 1 0 1
## bp_dia1 1 1 1 1 1 1 0
## bp_sys2 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1
## dep2 1 1 1 1 1 1 1
## dep3 1 1 1 1 1 1 1
## dep4 1 1 1 1 1 1 1
## dep5 1 1 1 1 1 1 1
## dep6 1 1 1 1 1 1 1
## dep7 1 1 1 1 1 1 1
## dep8 1 1 1 1 1 1 1
## dep9 1 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1 1
## bp_sys2 bp_dia2 time_sed drink_regularly days_drinking dep1
## id 1 1 1 1 1 1
## sex 1 1 1 1 1 1
## age 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1
## education 1 1 1 1 1 1
## marital 1 1 1 1 1 1
## household_size 1 1 1 1 1 1
## household_income 1 1 1 1 1 1
## weight 1 1 1 1 1 1
## height 1 1 1 1 1 1
## bmi 1 1 1 1 1 1
## pulse 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1
## bp_sys2 0 1 1 1 1 1
## bp_dia2 1 0 1 1 1 1
## time_sed 1 1 0 1 1 1
## drink_regularly 1 1 1 0 1 1
## days_drinking 1 1 1 1 0 1
## dep1 1 1 1 1 1 0
## dep2 1 1 1 1 1 1
## dep3 1 1 1 1 1 1
## dep4 1 1 1 1 1 1
## dep5 1 1 1 1 1 1
## dep6 1 1 1 1 1 1
## dep7 1 1 1 1 1 1
## dep8 1 1 1 1 1 1
## dep9 1 1 1 1 1 1
## depression_score 1 1 1 1 1 1
## dep2 dep3 dep4 dep5 dep6 dep7 dep8 dep9 depression_score
## id 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1
## ethnicity 1 1 1 1 1 1 1 1 1
## education 1 1 1 1 1 1 1 1 1
## marital 1 1 1 1 1 1 1 1 1
## household_size 1 1 1 1 1 1 1 1 1
## household_income 1 1 1 1 1 1 1 1 1
## weight 1 1 1 1 1 1 1 1 1
## height 1 1 1 1 1 1 1 1 1
## bmi 1 1 1 1 1 1 1 1 1
## pulse 1 1 1 1 1 1 1 1 1
## bp_sys1 1 1 1 1 1 1 1 1 1
## bp_dia1 1 1 1 1 1 1 1 1 1
## bp_sys2 1 1 1 1 1 1 1 1 1
## bp_dia2 1 1 1 1 1 1 1 1 1
## time_sed 1 1 1 1 1 1 1 1 1
## drink_regularly 1 1 1 1 1 1 1 1 1
## days_drinking 1 1 1 1 1 1 1 1 1
## dep1 1 1 1 1 1 1 1 1 0
## dep2 0 1 1 1 1 1 1 1 0
## dep3 1 0 1 1 1 1 1 1 0
## dep4 1 1 0 1 1 1 1 1 0
## dep5 1 1 1 0 1 1 1 1 0
## dep6 1 1 1 1 0 1 1 1 0
## dep7 1 1 1 1 1 0 1 1 0
## dep8 1 1 1 1 1 1 0 1 0
## dep9 1 1 1 1 1 1 1 0 0
## depression_score 1 1 1 1 1 1 1 1 0
imp2 <- mice(data_incomplete1,
m = 10,
method = method,
predictorMatrix = pm1,
maxit = 20,
seed = 1234567,
print = FALSE)
# Checking if the deterministic definition of depression score is maintained in the imputed data:
xyplot(imp1, depression_score ~ I(dep1 + dep2 + dep3 + dep4 + dep5 + dep6 + dep7 + dep8 + dep9), ylab="Imputed depression score", xlab="Calculated depression score")
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot 2 variables named 'my_x' and 'my_y' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = my_x, y = my_y)) +
## ggplot2::geom_point()
##
## See amices.org/ggmice for more info.
# Checking if the deterministic definition of BMI is maintained in the imputed data:
xyplot(imp2, bmi ~ I(weight / (height / 100)^2), ylab="Imputed BMI", xlab="Calculated BMI")
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot 2 variables named 'my_x' and 'my_y' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = my_x, y = my_y)) +
## ggplot2::geom_point()
##
## See amices.org/ggmice for more info.
# looking at the plots from our model with 2 passive imputations:
plot(imp2) #convergence is better now: also good for bmi, height, weight
densityplot(imp2) #plausibility
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = my_vrb, group = .imp)) +
## ggplot2::geom_density()
##
## See amices.org/ggmice for more info.
stripplot(imp2) #plausibility
## Hint: Did you know, an equivalent figure can be created with `ggmice()`?
## For example, to plot a variable named 'my_vrb' from a mids object called 'my_mids', run:
##
## ggmice(my_mids, ggplot2::aes(x = .imp, y = my_vrb)) +
## ggplot2::geom_jitter()
##
## See amices.org/ggmice for more info.
# checking the default prediction matrix to find correlating variables: BMI (and height + weight) correlate with depression_score (and two blood pressure variables and household income correlate with BMI). Since it does not include a correlation between age and depression_score, as we have in our research question, we stick to keeping all variables, except the ones removed to avoid circularity from passive imputation. In other words, we stick with the predictor matrix in imp2 due to its fit with our research question.
(pred <- quickpred(data_incomplete1))
## id sex age ethnicity education marital household_size
## id 0 0 0 0 0 0 0
## sex 0 0 0 0 0 0 0
## age 0 0 0 0 0 0 0
## ethnicity 0 0 0 0 0 0 0
## education 0 0 0 0 0 0 0
## marital 0 0 0 0 0 0 0
## household_size 0 0 0 0 0 0 0
## household_income 0 0 0 0 0 0 0
## weight 0 1 1 1 1 0 1
## height 0 1 0 1 1 0 1
## bmi 0 1 1 0 0 0 0
## pulse 0 0 0 0 0 0 0
## bp_sys1 0 0 0 0 0 0 0
## bp_dia1 0 0 0 0 0 0 0
## bp_sys2 0 1 1 0 1 0 1
## bp_dia2 0 1 1 0 0 0 0
## time_sed 0 0 0 0 0 0 0
## drink_regularly 0 1 0 0 0 0 0
## days_drinking 0 0 0 0 0 0 0
## dep1 0 0 0 0 0 0 0
## dep2 0 1 0 0 0 0 0
## dep3 0 1 0 0 1 0 0
## dep4 0 0 0 0 0 0 0
## dep5 0 1 1 0 1 1 0
## dep6 0 1 0 0 0 0 0
## dep7 0 0 0 0 0 0 0
## dep8 0 0 0 0 1 1 0
## dep9 0 0 0 0 1 0 1
## depression_score 0 1 0 0 1 1 0
## household_income weight height bmi pulse bp_sys1 bp_dia1
## id 0 0 0 0 0 0 0
## sex 0 0 0 0 0 0 0
## age 0 0 0 0 0 0 0
## ethnicity 0 0 0 0 0 0 0
## education 0 0 0 0 0 0 0
## marital 0 0 0 0 0 0 0
## household_size 0 0 0 0 0 0 0
## household_income 0 0 0 0 0 0 0
## weight 0 0 1 1 1 1 1
## height 1 1 0 0 0 0 1
## bmi 0 1 1 0 1 1 1
## pulse 0 0 0 0 0 0 0
## bp_sys1 0 0 0 0 0 0 0
## bp_dia1 0 0 0 0 0 0 0
## bp_sys2 0 1 0 1 1 1 1
## bp_dia2 0 1 1 1 1 1 1
## time_sed 0 0 0 0 0 0 0
## drink_regularly 1 0 1 0 0 0 0
## days_drinking 0 0 0 0 0 0 0
## dep1 0 0 0 0 0 0 0
## dep2 1 0 1 1 1 0 0
## dep3 1 1 1 1 1 0 0
## dep4 0 0 0 0 0 0 0
## dep5 1 1 1 1 1 0 0
## dep6 1 0 1 1 0 0 0
## dep7 0 0 0 0 0 0 0
## dep8 1 0 0 0 1 0 0
## dep9 0 0 0 0 0 0 0
## depression_score 1 1 1 1 1 0 0
## bp_sys2 bp_dia2 time_sed drink_regularly days_drinking dep1
## id 0 0 0 0 0 0
## sex 0 0 0 0 0 0
## age 0 0 0 0 0 0
## ethnicity 0 0 0 0 0 0
## education 0 0 0 0 0 0
## marital 0 0 0 0 0 0
## household_size 0 0 0 0 0 0
## household_income 0 0 0 0 0 0
## weight 1 1 1 0 0 0
## height 0 1 1 1 1 0
## bmi 1 1 1 0 0 1
## pulse 0 0 0 0 0 0
## bp_sys1 0 0 0 0 0 0
## bp_dia1 0 0 0 0 0 0
## bp_sys2 0 1 0 0 1 0
## bp_dia2 1 0 0 0 1 0
## time_sed 0 0 0 0 0 0
## drink_regularly 0 0 0 0 1 1
## days_drinking 0 0 0 0 0 0
## dep1 0 0 0 0 0 0
## dep2 0 0 0 0 1 1
## dep3 0 0 0 0 1 1
## dep4 0 0 0 0 0 0
## dep5 0 0 0 0 1 1
## dep6 0 0 0 0 1 1
## dep7 0 0 0 0 0 0
## dep8 0 0 0 0 0 1
## dep9 0 0 1 0 0 1
## depression_score 0 0 0 0 1 1
## dep2 dep3 dep4 dep5 dep6 dep7 dep8 dep9 depression_score
## id 0 0 0 0 0 0 0 0 0
## sex 0 0 0 0 0 0 0 0 0
## age 0 0 0 0 0 0 0 0 0
## ethnicity 0 0 0 0 0 0 0 0 0
## education 0 0 0 0 0 0 0 0 0
## marital 0 0 0 0 0 0 0 0 0
## household_size 0 0 0 0 0 0 0 0 0
## household_income 0 0 0 0 0 0 0 0 0
## weight 1 1 0 1 0 0 0 0 1
## height 0 0 0 1 0 1 0 0 1
## bmi 1 1 1 1 0 1 0 0 1
## pulse 0 0 0 0 0 0 0 0 0
## bp_sys1 0 0 0 0 0 0 0 0 0
## bp_dia1 0 0 0 0 0 0 0 0 0
## bp_sys2 0 0 0 0 1 0 0 0 0
## bp_dia2 0 0 0 0 1 0 0 0 0
## time_sed 0 0 0 0 0 0 0 0 0
## drink_regularly 0 0 0 0 0 0 0 0 0
## days_drinking 0 0 0 0 0 0 0 0 0
## dep1 0 0 0 0 0 0 0 0 0
## dep2 0 1 1 1 1 1 1 1 1
## dep3 1 0 1 1 1 1 1 1 1
## dep4 0 0 0 0 0 0 0 0 0
## dep5 1 1 1 0 1 1 1 1 1
## dep6 1 1 1 1 0 1 1 1 1
## dep7 0 0 0 0 0 0 0 0 0
## dep8 1 1 1 1 1 1 0 1 1
## dep9 1 1 1 1 1 1 1 0 1
## depression_score 1 1 1 1 1 1 1 1 0
With our chosen multiple imputation model, we will now do analysis for our research question and pool the results.
# analysis for each imp to see if age is a predictor for depression score:
fit <- with(imp2, lm(depression_score ~ age))
# results:
fit
## call :
## with.mids(data = imp2, expr = lm(depression_score ~ age))
##
## call1 :
## mice(data = data_incomplete1, m = 10, method = method, predictorMatrix = pm1,
## maxit = 20, printFlag = FALSE, seed = 1234567)
##
## nmis :
## id sex age ethnicity
## 0 0 0 0
## education marital household_size household_income
## 0 0 0 0
## weight height bmi pulse
## 150 96 222 0
## bp_sys1 bp_dia1 bp_sys2 bp_dia2
## 0 0 100 100
## time_sed drink_regularly days_drinking dep1
## 0 50 0 0
## dep2 dep3 dep4 dep5
## 75 75 0 75
## dep6 dep7 dep8 dep9
## 75 0 40 81
## depression_score
## 177
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.3210 -0.0154
##
##
## [[2]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.33318 -0.01783
##
##
## [[3]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.14071 -0.01319
##
##
## [[4]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.18988 -0.01393
##
##
## [[5]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.22247 -0.01408
##
##
## [[6]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.21945 -0.01402
##
##
## [[7]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.23324 -0.01284
##
##
## [[8]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.27384 -0.01596
##
##
## [[9]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.14297 -0.01315
##
##
## [[10]]
##
## Call:
## lm(formula = depression_score ~ age)
##
## Coefficients:
## (Intercept) age
## 4.22587 -0.01533
fit$analyses[[1]] %>% summary()
##
## Call:
## lm(formula = depression_score ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.013 -3.351 -1.805 1.176 20.203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.32100 0.69700 6.199 1.19e-09 ***
## age -0.01540 0.01491 -1.033 0.302
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.788 on 498 degrees of freedom
## Multiple R-squared: 0.002137, Adjusted R-squared: 0.0001333
## F-statistic: 1.067 on 1 and 498 DF, p-value: 0.3022
fit$analyses[[2]] %>% coef()
## (Intercept) age
## 4.33318014 -0.01783068
# pooling the results and checking them:
poolFit <- pool(fit)
summary(poolFit)
## term estimate std.error statistic df p.value
## 1 (Intercept) 4.23026217 0.68210600 6.2017665 488.3516 1.190560e-09
## 2 age -0.01457293 0.01460972 -0.9974816 485.7817 3.190274e-01