#Load data for analysis#sub-setting and Re-codding variables for analysis purposes

brfss <- readRDS("brfss_177.rds")
# Cleaning the  variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  renam))
names(brfss)<-newnames


homewk3 <- brfss %>% 
  dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1) %>%  
  mutate( smokers=as.factor(ifelse(smoke100==1 & smokday2%in%c(1,2),"Current",
                 ifelse(smoke100==1 & smokday2==3, "Former", 
                ifelse(smoke100==2, "Never", NA))))) %>% 
  
mutate(bmi = car::Recode(bmi5cat,recodes="1='Underweight';2='Normal';3='Overweight'; 4='Obese';else=NA",as.factor=T ),
       educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
       racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ), 
       ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'" ,as.factor=T ), 
       Physicala = car::Recode(exerany2,recodes="='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ),
       sleepdurat=car::Recode(sleptim1, recodes="77=NA ;99= NA"),
        mentald=ifelse(menthlth<31, "Yes",
                       ifelse(menthlth==88, "No",NA))) %>% 
  filter( is.na(sleepdurat)==F)
 
#Data containing only variables used in the analysis
 Final_data <- homewk3 %>% 
 dplyr:: select(smokers,bmi,educa,ageg, sex,racegr3,sleepdurat)
rm(brfss,newnames,renam, homewk3);gc() 
##           used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 2286060 122.1    4158941  222.2   3821911  204.2
## Vcells 3987159  30.5  164146586 1252.4 201729723 1539.1
set.seed(1234)
samp<-sample(1:dim(Final_data)[1], size = 25000) #smaller sample for brevity
Final_data<-Final_data[samp,]

Pattern of Missigness

summary(Final_data[, c("ageg", "smokers",  "bmi",  "racegr3",  "educa", "sleepdurat")])
##     ageg         smokers               bmi           racegr3       educa      
##  18-24:1356   Current: 3596   Normal     :7353   Hispanic: 1734   <HS :  472  
##  25-30:2416   Former : 7141   Obese      :7080   NH-Black: 1737   >HS :17253  
##  35-44:2965   Never  :14103   Overweight :8555   NH-White:19617   HS  : 7195  
##  45-54:3952   NA's   :  160   Underweight: 379   Other   : 1461   NA's:   80  
##  55-64:5444                   NA's       :1633   NA's    :  451               
##  65+  :8867                                                                   
##    sleepdurat    
##  Min.   : 1.000  
##  1st Qu.: 6.000  
##  Median : 7.000  
##  Mean   : 7.091  
##  3rd Qu.: 8.000  
##  Max.   :24.000

The BMI variable (bmi) has the highest no of missing value, it accounts for 6.5% of the sample. The lowest number of missings is in the eduaction variable (educa), which only has 0.32% missing.

Regression Analysis 1

Regression analysis using the Original and the modal Imputed values

# Original Values 
fit11<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=Final_data) 
 fit1 <- fit11 %>% 
  tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
   add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
   fit11 %>% 
   tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
  
   add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>% 
  bold_labels()%>% 
  italicize_levels()%>% 
  as_gt() %>%
  gt::tab_options(
    table.font.size = "small",
    data_row.padding = gt::px(1)) %>%
 set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic Beta 95% CI1 p-value
BMI Categories <0.001
Normal
Obese -0.10 -0.14, -0.05
Overweight -0.05 -0.10, -0.01
Underweight 0.03 -0.12, 0.17
Education Level 0.002
<HS
>HS -0.12 -0.26, 0.02
HS -0.05 -0.20, 0.09
Race/Ethnicity <0.001
Hispanic
NH-Black -0.16 -0.26, -0.06
NH-White 0.02 -0.06, 0.09
Other -0.01 -0.11, 0.09
Smoker <0.001
Current
Former 0.20 0.14, 0.25
Never 0.20 0.14, 0.25
Age <0.001
18-24
25-30 -0.20 -0.30, -0.10
35-44 -0.17 -0.26, -0.08
45-54 -0.15 -0.24, -0.06
55-64 0.00 -0.08, 0.09
65+ 0.28 0.20, 0.37
AIC = 78,969; R² = 0.027

1 CI = Confidence Interval

#Imputed Values
Imp.datam <- Final_data %>% 
select(bmi.imp, educa.imp,racegr3.imp,smokers.imp,ageg,sleepdurat ) %>% 
  mutate(bmi=bmi.imp, educa=educa.imp, smokers=smokers.imp, racegr3=racegr3.imp)

fit22<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=Imp.datam) 
fit2 <- fit22%>% 
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
  
  add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
 fit22%>% 
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
  
  add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>% 
  bold_labels()%>% 
  italicize_levels()%>% 
  as_gt() %>%
  gt::tab_options(
    table.font.size = "small",
    data_row.padding = gt::px(1)) %>%
 set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
Characteristic Beta 95% CI1 p-value
BMI Categories <0.001
Normal
Obese -0.10 -0.15, -0.06
Overweight -0.04 -0.09, 0.00
Underweight 0.03 -0.12, 0.17
Education Level <0.001
<HS
>HS -0.24 -0.37, -0.11
HS -0.17 -0.30, -0.04
Race/Ethnicity <0.001
Hispanic
NH-Black -0.18 -0.28, -0.09
NH-White 0.00 -0.07, 0.07
Other -0.03 -0.12, 0.07
Smoker <0.001
Current
Former 0.20 0.14, 0.25
Never 0.20 0.15, 0.25
Age <0.001
18-24
25-30 -0.18 -0.27, -0.08
35-44 -0.15 -0.24, -0.06
45-54 -0.13 -0.22, -0.05
55-64 0.01 -0.08, 0.09
65+ 0.28 0.20, 0.37
AIC = 86,877; R² = 0.026

1 CI = Confidence Interval

Multiple Imputation

# sampling the data set for ease of the imputation process 
dat2<-Final_data
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)

#Imputing the missing values using Ten iteration 
imp<-mice(data = dat2[,c("ageg", "smokers",  "bmi",  "racegr3",  "educa", "sleepdurat")], m = 10)
## 
##  iter imp variable
##   1   1  smokers  bmi  racegr3  educa
##   1   2  smokers  bmi  racegr3  educa
##   1   3  smokers  bmi  racegr3  educa
##   1   4  smokers  bmi  racegr3  educa
##   1   5  smokers  bmi  racegr3  educa
##   1   6  smokers  bmi  racegr3  educa
##   1   7  smokers  bmi  racegr3  educa
##   1   8  smokers  bmi  racegr3  educa
##   1   9  smokers  bmi  racegr3  educa
##   1   10  smokers  bmi  racegr3  educa
##   2   1  smokers  bmi  racegr3  educa
##   2   2  smokers  bmi  racegr3  educa
##   2   3  smokers  bmi  racegr3  educa
##   2   4  smokers  bmi  racegr3  educa
##   2   5  smokers  bmi  racegr3  educa
##   2   6  smokers  bmi  racegr3  educa
##   2   7  smokers  bmi  racegr3  educa
##   2   8  smokers  bmi  racegr3  educa
##   2   9  smokers  bmi  racegr3  educa
##   2   10  smokers  bmi  racegr3  educa
##   3   1  smokers  bmi  racegr3  educa
##   3   2  smokers  bmi  racegr3  educa
##   3   3  smokers  bmi  racegr3  educa
##   3   4  smokers  bmi  racegr3  educa
##   3   5  smokers  bmi  racegr3  educa
##   3   6  smokers  bmi  racegr3  educa
##   3   7  smokers  bmi  racegr3  educa
##   3   8  smokers  bmi  racegr3  educa
##   3   9  smokers  bmi  racegr3  educa
##   3   10  smokers  bmi  racegr3  educa
##   4   1  smokers  bmi  racegr3  educa
##   4   2  smokers  bmi  racegr3  educa
##   4   3  smokers  bmi  racegr3  educa
##   4   4  smokers  bmi  racegr3  educa
##   4   5  smokers  bmi  racegr3  educa
##   4   6  smokers  bmi  racegr3  educa
##   4   7  smokers  bmi  racegr3  educa
##   4   8  smokers  bmi  racegr3  educa
##   4   9  smokers  bmi  racegr3  educa
##   4   10  smokers  bmi  racegr3  educa
##   5   1  smokers  bmi  racegr3  educa
##   5   2  smokers  bmi  racegr3  educa
##   5   3  smokers  bmi  racegr3  educa
##   5   4  smokers  bmi  racegr3  educa
##   5   5  smokers  bmi  racegr3  educa
##   5   6  smokers  bmi  racegr3  educa
##   5   7  smokers  bmi  racegr3  educa
##   5   8  smokers  bmi  racegr3  educa
##   5   9  smokers  bmi  racegr3  educa
##   5   10  smokers  bmi  racegr3  educa
# choosing the first imputation of the 10 iteration
dat.imp<-complete(imp, action = 7)

Regression Analysis 2

Regression analysis using the Original and the Multiple Imputed values

fit.impp<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=dat.imp) 

fit.imp <- fit.impp%>% 
 tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
  
   add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
fit.impp%>% 
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
                                      educa ~ "Education Level",
                                      racegr3~ "Race/Ethnicity" ,
                                     ageg~"Age",
                                     smokers~"Smoker"
                                   )) %>%
  
  add_global_p() %>% 
  add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>% 
  bold_labels()%>% 
  italicize_levels()%>% 
  as_gt() %>%
  gt::tab_options(
    table.font.size = "small",
    data_row.padding = gt::px(1)) %>%
 set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
Characteristic Beta 95% CI1 p-value
BMI Categories <0.001
Normal
Obese -0.10 -0.15, -0.06
Overweight -0.05 -0.10, -0.01
Underweight 0.00 -0.14, 0.14
Education Level <0.001
<HS
>HS -0.24 -0.37, -0.12
HS -0.17 -0.30, -0.04
Race/Ethnicity <0.001
Hispanic
NH-Black -0.18 -0.28, -0.09
NH-White 0.00 -0.07, 0.07
Other -0.03 -0.12, 0.07
Smoker <0.001
Current
Former 0.20 0.14, 0.26
Never 0.20 0.15, 0.25
Age <0.001
18-24
25-30 -0.18 -0.27, -0.08
35-44 -0.15 -0.24, -0.06
45-54 -0.13 -0.22, -0.05
55-64 0.01 -0.08, 0.09
65+ 0.28 0.20, 0.37
AIC = 86,876; R² = 0.026

1 CI = Confidence Interval

Fit_merge <-
  tbl_merge( #Table merge function
    tbls = list(fit1, fit2, fit.imp), # Selecting the tables to be included in the combined table output 
    # More objects can be added here
    tab_spanner = c("**Original**","**Modal Imputed**","**Multiple Imputed**") # Naming the models
  
  )%>%
  bold_labels()%>% 
  italicize_levels()%>% 
  as_gt() %>%
   gt::tab_header("Table Compares Model Fit Between Data Source With Missing Values, Modal Imputed Data, and Multiple Imputed data") %>% 
  gt::tab_options(
    table.font.size = "small",
    data_row.padding = gt::px(1)) %>% 
  gt::tab_source_note(
    str_glue("Original {fit1$list_output$source_note}; ",
             "Modal Imputed {fit2$list_output$source_note}; ",
            "Multiple Imputed {fit.imp$list_output$source_note}")
  ) %>%
  set_table_properties(width = 1, layout = "autofit")  
  
Fit_merge
Table Compares Model Fit Between Data Source With Missing Values, Modal Imputed Data, and Multiple Imputed data
Characteristic Original Modal Imputed Multiple Imputed
Beta 95% CI1 p-value Beta 95% CI1 p-value Beta 95% CI1 p-value
BMI Categories <0.001 <0.001 <0.001
Normal
Obese -0.10 -0.14, -0.05 -0.10 -0.15, -0.06 -0.10 -0.15, -0.06
Overweight -0.05 -0.10, -0.01 -0.04 -0.09, 0.00 -0.05 -0.10, -0.01
Underweight 0.03 -0.12, 0.17 0.03 -0.12, 0.17 0.00 -0.14, 0.14
Education Level 0.002 <0.001 <0.001
<HS
>HS -0.12 -0.26, 0.02 -0.24 -0.37, -0.11 -0.24 -0.37, -0.12
HS -0.05 -0.20, 0.09 -0.17 -0.30, -0.04 -0.17 -0.30, -0.04
Race/Ethnicity <0.001 <0.001 <0.001
Hispanic
NH-Black -0.16 -0.26, -0.06 -0.18 -0.28, -0.09 -0.18 -0.28, -0.09
NH-White 0.02 -0.06, 0.09 0.00 -0.07, 0.07 0.00 -0.07, 0.07
Other -0.01 -0.11, 0.09 -0.03 -0.12, 0.07 -0.03 -0.12, 0.07
Smoker <0.001 <0.001 <0.001
Current
Former 0.20 0.14, 0.25 0.20 0.14, 0.25 0.20 0.14, 0.26
Never 0.20 0.14, 0.25 0.20 0.15, 0.25 0.20 0.15, 0.25
Age <0.001 <0.001 <0.001
18-24
25-30 -0.20 -0.30, -0.10 -0.18 -0.27, -0.08 -0.18 -0.27, -0.08
35-44 -0.17 -0.26, -0.08 -0.15 -0.24, -0.06 -0.15 -0.24, -0.06
45-54 -0.15 -0.24, -0.06 -0.13 -0.22, -0.05 -0.13 -0.22, -0.05
55-64 0.00 -0.08, 0.09 0.01 -0.08, 0.09 0.01 -0.08, 0.09
65+ 0.28 0.20, 0.37 0.28 0.20, 0.37 0.28 0.20, 0.37
Original AIC = 78,969; R² = 0.027; Modal Imputed AIC = 86,877; R² = 0.026; Multiple Imputed AIC = 86,876; R² = 0.026

1 CI = Confidence Interval

The model fit from the table above looks similar with some variations in the Beta wthin the groups. Considering the ACI and R² values between the modal and multiple imputation data, the model fit from the multiple imputation data (Multiple imputed) has the best fit.