Question

For this question, you will be using the dataset “Programming.csv”. Some participants have missing data for some variables. Multiple imputation (MI) is a method that is commonly used to handle missing data. For this dataset, the missing values have been imputed five times using Stata’s mi suite of commands.

When reporting missing data from a study, guidelines recommend “indicating the number of participants with missing data for each variable of interest”. One way to do this, is to present a table which shows the proportion of participants with complete data and with missing data for each variable included in your analysis. For this dataset, we have three waves of data for the time varying variables. Before we examine the data, provide code to check the integrity of the original (not imputed) data. Next, provide code to produce a table, which shows the number (#) and percent (%) of complete and missing data for each variable from the originally observed data (i.e., _mi_m = 0) . For variables that are collected over multiple waves, the # (%) should be given separately for each wave of data collection. Your table should be appropriately formatted and should be automatically generated and exported as a publication ready table from R.

Solution

# load packages
library(dplyr)
library(tidyverse)
library(gt)
library(gtsummary)

# load the data
df <- read.csv('dataset/programming.csv', header = T)
head(df)
##      pid wave    sex age       smoke disab_anytype std_PHYS std_MENT X_mi_id
## 1 100028 2003   Male  36                        No       NA       NA       1
## 2 100028 2004   Male  36 Ever smoked            No 52.34127 49.50153       2
## 3 100028 2005   Male  36 Ever smoked            No 58.80231 46.86787       3
## 4 100053 2003 Female  47 Ever smoked            No 53.48396 27.86780       4
## 5 100053 2004 Female  47 Ever smoked            No 53.17352 52.32007       5
## 6 100053 2005 Female  47 Ever smoked            No 51.48118 46.37448       6
##   X_mi_miss X_mi_m
## 1         1      0
## 2         0      0
## 3         0      0
## 4         0      0
## 5         0      0
## 6         0      0
# check the integrity of the data
df <- df[df$X_mi_m == 0,]
df %>%
  tbl_summary()
Characteristic N = 1,6351
pid 110,192 (104,820, 114,456)
wave
2003 545 (33%)
2004 545 (33%)
2005 545 (33%)
sex
Female 864 (53%)
Male 771 (47%)
age 39 (32, 46)
smoke
149 (9.1%)
Ever smoked 700 (43%)
Never smoked 786 (48%)
disab_anytype
49 (3.0%)
No 1,531 (94%)
Yes 55 (3.4%)
std_PHYS 55 (52, 58)
Unknown 168
std_MENT 52 (46, 56)
Unknown 168
X_mi_id 818 (410, 1,226)
X_mi_miss 176 (11%)
X_mi_m 0 (0%)
1 Median (IQR); n (%)
# replace the empty cells with NA
f <- function(x){
  ifelse(x == '', NA, x)
}
dfn <- apply(df, 2, f)
dfn <- as.data.frame(dfn)
dfn$pid <- as.factor(dfn$pid)
dfn$wave <- as.factor(dfn$wave)
dfn$sex <- as.factor(dfn$sex)
dfn$age <- as.numeric(dfn$age)
dfn$smoke <- as.factor(dfn$smoke)
dfn$disab_anytype <- as.factor(dfn$disab_anytype)
dfn$std_PHYS <- as.numeric(dfn$std_PHYS)
dfn$std_MENT <- as.numeric(dfn$std_MENT)
dfn$X_mi_id <- as.integer(dfn$X_mi_id)
dfn$X_mi_miss <- as.integer(dfn$X_mi_miss)
dfn$X_mi_m <- as.integer(dfn$X_mi_m)
# count number of complete cases for each wave
count_miss <- function(x) sum(is.na(x) | x=='<NA>')
count_comp <- function(x) sum(!is.na(x) & x!='<NA>')
sum_miss <- apply(dfn, 2, count_miss)
sum_comp <- apply(dfn, 2, count_comp)
perc_miss <- apply(dfn, 2, count_miss)/nrow(dfn) * 100
perc_comp <- apply(dfn, 2, count_comp)/nrow(dfn) * 100
s=rbind.data.frame(sum_miss, sum_comp, perc_miss, perc_comp)
ss = t(s)
rownames(ss) <- colnames(dfn)
colnames(ss) <- c('#missing','#complete','%missing', '%complete')
ss <- as.data.frame(ss)
ss <- cbind.data.frame(Variable=colnames(dfn),ss)
ss %>% gt(caption  = 'Number of missing and complete cases in each variable')
Number of missing and complete cases in each variable
Variable #missing #complete %missing %complete
pid 0 1635 0.000000 100.00000
wave 0 1635 0.000000 100.00000
sex 0 1635 0.000000 100.00000
age 0 1635 0.000000 100.00000
smoke 149 1486 9.113150 90.88685
disab_anytype 49 1586 2.996942 97.00306
std_PHYS 168 1467 10.275229 89.72477
std_MENT 168 1467 10.275229 89.72477
X_mi_id 0 1635 0.000000 100.00000
X_mi_miss 0 1635 0.000000 100.00000
X_mi_m 0 1635 0.000000 100.00000
# count proportion of complete and missing for each wave
sn <- dfn %>% dplyr::group_by(wave)  %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
  summarise_each(funs(sum(!is.na(.))))
sn2 <- t.data.frame(sn[,-1])
colnames(sn2) <- unlist(sn[,1])
sn2 <- as.data.frame(sn2)
sm <- dfn %>% dplyr::group_by(wave)  %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
  summarise_each(funs(sum(is.na(.))))
sm2 <- t.data.frame(sm[,-1])
colnames(sm2) <- unlist(sm[,1])
sm2 <- as.data.frame(sm2)
# f <- function(x) paste(x,paste0("(",round(x/3270 * 100,2),"%)"))
f <- function(x) paste(x,paste0("(",round(x/545 * 100,2),"%)"))
age <- f(sn$age)
dis_n <- f(sn$disab_anytype)
dis_m <- f(sm$disab_anytype)
sex_n <- f(sn$sex)
smoking_n <- f(sn$smoke)
smoking_m <- f(sm$smoke)
mental_n <- f(sn$std_MENT)
mental_m <- f(sm$std_MENT)
physical_n <- f(sn$std_PHYS)
physical_m <- f(sm$std_PHYS)
rhs <- rbind(age,dis_n,dis_m,sex_n,smoking_n,smoking_m,mental_n, mental_m,physical_n,physical_m)
rhs <- as.data.frame(rhs)
s <- data.frame(
  v1 = c('Age (years)', 'Disability Status','Disability Status','Sex',                            
        'Smoking Status','Smoking Status','Mental Health Summary Score',                             
        'Mental Health Summary Score','Physical Summary Score','Physical Summary Score'),
  v2 = c('Not missing','Not missing','Missing','Not missing','Not missing',
         'Missing','Not missing','Missing','Not missing','Missing')
  )
final <- cbind.data.frame(s,rhs)
colnames(final)[3:5] <- c('year2003', 'year2004','year2005')
final %>% gt(caption = 'Summary of missings and not missing variables for each wave and each variable')
Summary of missings and not missing variables for each wave and each variable
v1 v2 year2003 year2004 year2005
Age (years) Not missing 545 (100%) 545 (100%) 545 (100%)
Disability Status Not missing 522 (95.78%) 519 (95.23%) 545 (100%)
Disability Status Missing 23 (4.22%) 26 (4.77%) 0 (0%)
Sex Not missing 545 (100%) 545 (100%) 545 (100%)
Smoking Status Not missing 490 (89.91%) 493 (90.46%) 503 (92.29%)
Smoking Status Missing 55 (10.09%) 52 (9.54%) 42 (7.71%)
Mental Health Summary Score Not missing 480 (88.07%) 488 (89.54%) 499 (91.56%)
Mental Health Summary Score Missing 65 (11.93%) 57 (10.46%) 46 (8.44%)
Physical Summary Score Not missing 480 (88.07%) 488 (89.54%) 499 (91.56%)
Physical Summary Score Missing 65 (11.93%) 57 (10.46%) 46 (8.44%)

follow me on RPubs for more!

www.rstudio.ir