Question
For this question, you will be using the dataset “Programming.csv”.
Some participants have missing data for some variables. Multiple
imputation (MI) is a method that is commonly used to handle missing
data. For this dataset, the missing values have been imputed five times
using Stata’s mi suite of commands.
When reporting missing data from a study, guidelines recommend
“indicating the number of participants with missing data for each
variable of interest”. One way to do this, is to present a table which
shows the proportion of participants with complete data and with missing
data for each variable included in your analysis. For this dataset, we
have three waves of data for the time varying variables. Before we
examine the data, provide code to check the integrity of the original
(not imputed) data. Next, provide code to produce a table, which shows
the number (#) and percent (%) of complete and missing data for each
variable from the originally observed data (i.e., _mi_m = 0) . For
variables that are collected over multiple waves, the # (%) should be
given separately for each wave of data collection. Your table should be
appropriately formatted and should be automatically generated and
exported as a publication ready table from R.
Solution
# load packages
library(dplyr)
library(tidyverse)
library(gt)
library(gtsummary)
# load the data
df <- read.csv('dataset/programming.csv', header = T)
head(df)
## pid wave sex age smoke disab_anytype std_PHYS std_MENT X_mi_id
## 1 100028 2003 Male 36 No NA NA 1
## 2 100028 2004 Male 36 Ever smoked No 52.34127 49.50153 2
## 3 100028 2005 Male 36 Ever smoked No 58.80231 46.86787 3
## 4 100053 2003 Female 47 Ever smoked No 53.48396 27.86780 4
## 5 100053 2004 Female 47 Ever smoked No 53.17352 52.32007 5
## 6 100053 2005 Female 47 Ever smoked No 51.48118 46.37448 6
## X_mi_miss X_mi_m
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# check the integrity of the data
df <- df[df$X_mi_m == 0,]
df %>%
tbl_summary()
| Characteristic |
N = 1,635 |
| pid |
110,192 (104,820, 114,456) |
| wave |
|
| 2003 |
545 (33%) |
| 2004 |
545 (33%) |
| 2005 |
545 (33%) |
| sex |
|
| Female |
864 (53%) |
| Male |
771 (47%) |
| age |
39 (32, 46) |
| smoke |
|
|
149 (9.1%) |
| Ever smoked |
700 (43%) |
| Never smoked |
786 (48%) |
| disab_anytype |
|
|
49 (3.0%) |
| No |
1,531 (94%) |
| Yes |
55 (3.4%) |
| std_PHYS |
55 (52, 58) |
| Unknown |
168 |
| std_MENT |
52 (46, 56) |
| Unknown |
168 |
| X_mi_id |
818 (410, 1,226) |
| X_mi_miss |
176 (11%) |
| X_mi_m |
0 (0%) |
# replace the empty cells with NA
f <- function(x){
ifelse(x == '', NA, x)
}
dfn <- apply(df, 2, f)
dfn <- as.data.frame(dfn)
dfn$pid <- as.factor(dfn$pid)
dfn$wave <- as.factor(dfn$wave)
dfn$sex <- as.factor(dfn$sex)
dfn$age <- as.numeric(dfn$age)
dfn$smoke <- as.factor(dfn$smoke)
dfn$disab_anytype <- as.factor(dfn$disab_anytype)
dfn$std_PHYS <- as.numeric(dfn$std_PHYS)
dfn$std_MENT <- as.numeric(dfn$std_MENT)
dfn$X_mi_id <- as.integer(dfn$X_mi_id)
dfn$X_mi_miss <- as.integer(dfn$X_mi_miss)
dfn$X_mi_m <- as.integer(dfn$X_mi_m)
# count number of complete cases for each wave
count_miss <- function(x) sum(is.na(x) | x=='<NA>')
count_comp <- function(x) sum(!is.na(x) & x!='<NA>')
sum_miss <- apply(dfn, 2, count_miss)
sum_comp <- apply(dfn, 2, count_comp)
perc_miss <- apply(dfn, 2, count_miss)/nrow(dfn) * 100
perc_comp <- apply(dfn, 2, count_comp)/nrow(dfn) * 100
s=rbind.data.frame(sum_miss, sum_comp, perc_miss, perc_comp)
ss = t(s)
rownames(ss) <- colnames(dfn)
colnames(ss) <- c('#missing','#complete','%missing', '%complete')
ss <- as.data.frame(ss)
ss <- cbind.data.frame(Variable=colnames(dfn),ss)
ss %>% gt(caption = 'Number of missing and complete cases in each variable')
Number of missing and complete cases in each variable
| Variable |
#missing |
#complete |
%missing |
%complete |
| pid |
0 |
1635 |
0.000000 |
100.00000 |
| wave |
0 |
1635 |
0.000000 |
100.00000 |
| sex |
0 |
1635 |
0.000000 |
100.00000 |
| age |
0 |
1635 |
0.000000 |
100.00000 |
| smoke |
149 |
1486 |
9.113150 |
90.88685 |
| disab_anytype |
49 |
1586 |
2.996942 |
97.00306 |
| std_PHYS |
168 |
1467 |
10.275229 |
89.72477 |
| std_MENT |
168 |
1467 |
10.275229 |
89.72477 |
| X_mi_id |
0 |
1635 |
0.000000 |
100.00000 |
| X_mi_miss |
0 |
1635 |
0.000000 |
100.00000 |
| X_mi_m |
0 |
1635 |
0.000000 |
100.00000 |
# count proportion of complete and missing for each wave
sn <- dfn %>% dplyr::group_by(wave) %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
summarise_each(funs(sum(!is.na(.))))
sn2 <- t.data.frame(sn[,-1])
colnames(sn2) <- unlist(sn[,1])
sn2 <- as.data.frame(sn2)
sm <- dfn %>% dplyr::group_by(wave) %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
summarise_each(funs(sum(is.na(.))))
sm2 <- t.data.frame(sm[,-1])
colnames(sm2) <- unlist(sm[,1])
sm2 <- as.data.frame(sm2)
# f <- function(x) paste(x,paste0("(",round(x/3270 * 100,2),"%)"))
f <- function(x) paste(x,paste0("(",round(x/545 * 100,2),"%)"))
age <- f(sn$age)
dis_n <- f(sn$disab_anytype)
dis_m <- f(sm$disab_anytype)
sex_n <- f(sn$sex)
smoking_n <- f(sn$smoke)
smoking_m <- f(sm$smoke)
mental_n <- f(sn$std_MENT)
mental_m <- f(sm$std_MENT)
physical_n <- f(sn$std_PHYS)
physical_m <- f(sm$std_PHYS)
rhs <- rbind(age,dis_n,dis_m,sex_n,smoking_n,smoking_m,mental_n, mental_m,physical_n,physical_m)
rhs <- as.data.frame(rhs)
s <- data.frame(
v1 = c('Age (years)', 'Disability Status','Disability Status','Sex',
'Smoking Status','Smoking Status','Mental Health Summary Score',
'Mental Health Summary Score','Physical Summary Score','Physical Summary Score'),
v2 = c('Not missing','Not missing','Missing','Not missing','Not missing',
'Missing','Not missing','Missing','Not missing','Missing')
)
final <- cbind.data.frame(s,rhs)
colnames(final)[3:5] <- c('year2003', 'year2004','year2005')
final %>% gt(caption = 'Summary of missings and not missing variables for each wave and each variable')
Summary of missings and not missing variables for each wave and each variable
| v1 |
v2 |
year2003 |
year2004 |
year2005 |
| Age (years) |
Not missing |
545 (100%) |
545 (100%) |
545 (100%) |
| Disability Status |
Not missing |
522 (95.78%) |
519 (95.23%) |
545 (100%) |
| Disability Status |
Missing |
23 (4.22%) |
26 (4.77%) |
0 (0%) |
| Sex |
Not missing |
545 (100%) |
545 (100%) |
545 (100%) |
| Smoking Status |
Not missing |
490 (89.91%) |
493 (90.46%) |
503 (92.29%) |
| Smoking Status |
Missing |
55 (10.09%) |
52 (9.54%) |
42 (7.71%) |
| Mental Health Summary Score |
Not missing |
480 (88.07%) |
488 (89.54%) |
499 (91.56%) |
| Mental Health Summary Score |
Missing |
65 (11.93%) |
57 (10.46%) |
46 (8.44%) |
| Physical Summary Score |
Not missing |
480 (88.07%) |
488 (89.54%) |
499 (91.56%) |
| Physical Summary Score |
Missing |
65 (11.93%) |
57 (10.46%) |
46 (8.44%) |