Question

For this question, you will be using the dataset “Programming.csv”. Some participants have missing data for some variables. Multiple imputation (MI) is a method that is commonly used to handle missing data. For this dataset, the missing values have been imputed five times using Stata’s mi suite of commands.

When reporting missing data from a study, guidelines recommend “indicating the number of participants with missing data for each variable of interest”. One way to do this, is to present a table which shows the proportion of participants with complete data and with missing data for each variable included in your analysis. For this dataset, we have three waves of data for the time varying variables. Before we examine the data, provide code to check the integrity of the original (not imputed) data. Next, provide code to produce a table, which shows the number (#) and percent (%) of complete and missing data for each variable from the originally observed data (i.e., _mi_m = 0) . For variables that are collected over multiple waves, the # (%) should be given separately for each wave of data collection. Your table should be appropriately formatted and should be automatically generated and exported as a publication ready table from R.

Solution

# load packages
library(dplyr)
library(tidyverse)
library(gt)
library(gtsummary)

# load the data
df <- read.csv('dataset/programming.csv', header = T)
head(df)

##      pid wave    sex age       smoke disab_anytype std_PHYS std_MENT X_mi_id
## 1 100028 2003   Male  36                        No       NA       NA       1
## 2 100028 2004   Male  36 Ever smoked            No 52.34127 49.50153       2
## 3 100028 2005   Male  36 Ever smoked            No 58.80231 46.86787       3
## 4 100053 2003 Female  47 Ever smoked            No 53.48396 27.86780       4
## 5 100053 2004 Female  47 Ever smoked            No 53.17352 52.32007       5
## 6 100053 2005 Female  47 Ever smoked            No 51.48118 46.37448       6
##   X_mi_miss X_mi_m
## 1         1      0
## 2         0      0
## 3         0      0
## 4         0      0
## 5         0      0
## 6         0      0

# check the integrity of the data
df <- df[df$X_mi_m == 0,]
df %>%
  tbl_summary()

Characteristic	N = 1,635¹
pid	110,192 (104,820, 114,456)
wave
2003	545 (33%)
2004	545 (33%)
2005	545 (33%)
sex
Female	864 (53%)
Male	771 (47%)
age	39 (32, 46)
smoke
	149 (9.1%)
Ever smoked	700 (43%)
Never smoked	786 (48%)
disab_anytype
	49 (3.0%)
No	1,531 (94%)
Yes	55 (3.4%)
std_PHYS	55 (52, 58)
Unknown	168
std_MENT	52 (46, 56)
Unknown	168
X_mi_id	818 (410, 1,226)
X_mi_miss	176 (11%)
X_mi_m	0 (0%)
¹ Median (IQR); n (%)

# replace the empty cells with NA
f <- function(x){
  ifelse(x == '', NA, x)
}
dfn <- apply(df, 2, f)
dfn <- as.data.frame(dfn)
dfn$pid <- as.factor(dfn$pid)
dfn$wave <- as.factor(dfn$wave)
dfn$sex <- as.factor(dfn$sex)
dfn$age <- as.numeric(dfn$age)
dfn$smoke <- as.factor(dfn$smoke)
dfn$disab_anytype <- as.factor(dfn$disab_anytype)
dfn$std_PHYS <- as.numeric(dfn$std_PHYS)
dfn$std_MENT <- as.numeric(dfn$std_MENT)
dfn$X_mi_id <- as.integer(dfn$X_mi_id)
dfn$X_mi_miss <- as.integer(dfn$X_mi_miss)
dfn$X_mi_m <- as.integer(dfn$X_mi_m)
# count number of complete cases for each wave
count_miss <- function(x) sum(is.na(x) | x=='<NA>')
count_comp <- function(x) sum(!is.na(x) & x!='<NA>')
sum_miss <- apply(dfn, 2, count_miss)
sum_comp <- apply(dfn, 2, count_comp)
perc_miss <- apply(dfn, 2, count_miss)/nrow(dfn) * 100
perc_comp <- apply(dfn, 2, count_comp)/nrow(dfn) * 100
s=rbind.data.frame(sum_miss, sum_comp, perc_miss, perc_comp)
ss = t(s)
rownames(ss) <- colnames(dfn)
colnames(ss) <- c('#missing','#complete','%missing', '%complete')
ss <- as.data.frame(ss)
ss <- cbind.data.frame(Variable=colnames(dfn),ss)
ss %>% gt(caption  = 'Number of missing and complete cases in each variable')

Number of missing and complete cases in each variable
Variable	#missing	#complete	%missing	%complete
pid	0	1635	0.000000	100.00000
wave	0	1635	0.000000	100.00000
sex	0	1635	0.000000	100.00000
age	0	1635	0.000000	100.00000
smoke	149	1486	9.113150	90.88685
disab_anytype	49	1586	2.996942	97.00306
std_PHYS	168	1467	10.275229	89.72477
std_MENT	168	1467	10.275229	89.72477
X_mi_id	0	1635	0.000000	100.00000
X_mi_miss	0	1635	0.000000	100.00000
X_mi_m	0	1635	0.000000	100.00000

# count proportion of complete and missing for each wave
sn <- dfn %>% dplyr::group_by(wave)  %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
  summarise_each(funs(sum(!is.na(.))))
sn2 <- t.data.frame(sn[,-1])
colnames(sn2) <- unlist(sn[,1])
sn2 <- as.data.frame(sn2)
sm <- dfn %>% dplyr::group_by(wave)  %>% dplyr::select(c(age,disab_anytype,sex,smoke,std_MENT,std_PHYS)) %>%
  summarise_each(funs(sum(is.na(.))))
sm2 <- t.data.frame(sm[,-1])
colnames(sm2) <- unlist(sm[,1])
sm2 <- as.data.frame(sm2)
# f <- function(x) paste(x,paste0("(",round(x/3270 * 100,2),"%)"))
f <- function(x) paste(x,paste0("(",round(x/545 * 100,2),"%)"))
age <- f(sn$age)
dis_n <- f(sn$disab_anytype)
dis_m <- f(sm$disab_anytype)
sex_n <- f(sn$sex)
smoking_n <- f(sn$smoke)
smoking_m <- f(sm$smoke)
mental_n <- f(sn$std_MENT)
mental_m <- f(sm$std_MENT)
physical_n <- f(sn$std_PHYS)
physical_m <- f(sm$std_PHYS)
rhs <- rbind(age,dis_n,dis_m,sex_n,smoking_n,smoking_m,mental_n, mental_m,physical_n,physical_m)
rhs <- as.data.frame(rhs)
s <- data.frame(
  v1 = c('Age (years)', 'Disability Status','Disability Status','Sex',                            
        'Smoking Status','Smoking Status','Mental Health Summary Score',                             
        'Mental Health Summary Score','Physical Summary Score','Physical Summary Score'),
  v2 = c('Not missing','Not missing','Missing','Not missing','Not missing',
         'Missing','Not missing','Missing','Not missing','Missing')
  )
final <- cbind.data.frame(s,rhs)
colnames(final)[3:5] <- c('year2003', 'year2004','year2005')
final %>% gt(caption = 'Summary of missings and not missing variables for each wave and each variable')

Summary of missings and not missing variables for each wave and each variable
v1	v2	year2003	year2004	year2005
Age (years)	Not missing	545 (100%)	545 (100%)	545 (100%)
Disability Status	Not missing	522 (95.78%)	519 (95.23%)	545 (100%)
Disability Status	Missing	23 (4.22%)	26 (4.77%)	0 (0%)
Sex	Not missing	545 (100%)	545 (100%)	545 (100%)
Smoking Status	Not missing	490 (89.91%)	493 (90.46%)	503 (92.29%)
Smoking Status	Missing	55 (10.09%)	52 (9.54%)	42 (7.71%)
Mental Health Summary Score	Not missing	480 (88.07%)	488 (89.54%)	499 (91.56%)
Mental Health Summary Score	Missing	65 (11.93%)	57 (10.46%)	46 (8.44%)
Physical Summary Score	Not missing	480 (88.07%)	488 (89.54%)	499 (91.56%)
Physical Summary Score	Missing	65 (11.93%)	57 (10.46%)	46 (8.44%)

follow me on RPubs for more!

Assignment

Afshin Motavali

June, 2022

Question

Solution

follow me on RPubs for more!