#Load data for analysis#sub-setting and Re-codding variables for analysis purposes
brfss <- readRDS("brfss_177.rds")
# Cleaning the variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = renam))
names(brfss)<-newnames
homewk3 <- brfss %>%
dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1) %>%
mutate( smokers=as.factor(ifelse(smoke100==1 & smokday2%in%c(1,2),"Current",
ifelse(smoke100==1 & smokday2==3, "Former",
ifelse(smoke100==2, "Never", NA))))) %>%
mutate(bmi = car::Recode(bmi5cat,recodes="1='Underweight';2='Normal';3='Overweight'; 4='Obese';else=NA",as.factor=T ),
educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ),
ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'" ,as.factor=T ),
Physicala = car::Recode(exerany2,recodes="='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ),
sleepdurat=car::Recode(sleptim1, recodes="77=NA ;99= NA"),
mentald=ifelse(menthlth<31, "Yes",
ifelse(menthlth==88, "No",NA))) %>%
filter( is.na(sleepdurat)==F)
#Data containing only variables used in the analysis
Final_data <- homewk3 %>%
dplyr:: select(smokers,bmi,educa,ageg, sex,racegr3,sleepdurat)
rm(brfss,newnames,renam, homewk3);gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2286060 122.1 4158941 222.2 3821911 204.2
## Vcells 3987159 30.5 164146586 1252.4 201729723 1539.1
set.seed(1234)
samp<-sample(1:dim(Final_data)[1], size = 25000) #smaller sample for brevity
Final_data<-Final_data[samp,]
summary(Final_data[, c("ageg", "smokers", "bmi", "racegr3", "educa", "sleepdurat")])
## ageg smokers bmi racegr3 educa
## 18-24:1356 Current: 3596 Normal :7353 Hispanic: 1734 <HS : 472
## 25-30:2416 Former : 7141 Obese :7080 NH-Black: 1737 >HS :17253
## 35-44:2965 Never :14103 Overweight :8555 NH-White:19617 HS : 7195
## 45-54:3952 NA's : 160 Underweight: 379 Other : 1461 NA's: 80
## 55-64:5444 NA's :1633 NA's : 451
## 65+ :8867
## sleepdurat
## Min. : 1.000
## 1st Qu.: 6.000
## Median : 7.000
## Mean : 7.091
## 3rd Qu.: 8.000
## Max. :24.000
The BMI variable (bmi) has the highest no of missing value, it accounts for 6.5% of the sample. The lowest number of missings is in the eduaction variable (educa), which only has 0.32% missing.
Performing Modal imputation for the categorical variables (smokers, bmi, racegr3, educa)
#find the most common value
mcv.bmi<-factor(names(which.max(table(Final_data$bmi))), levels=levels(Final_data$bmi))
mcv.bmi
## [1] Overweight
## Levels: Normal Obese Overweight Underweight
mcv.smokers<-factor(names(which.max(table(Final_data$smokers))), levels=levels(Final_data$smokers))
mcv.smokers
## [1] Never
## Levels: Current Former Never
mcv.racegr3<-factor(names(which.max(table(Final_data$racegr3))), levels=levels(Final_data$racegr3))
mcv.racegr3
## [1] NH-White
## Levels: Hispanic NH-Black NH-White Other
mcv.educa<-factor(names(which.max(table(Final_data$educa))), levels=levels(Final_data$educa))
mcv.educa
## [1] >HS
## Levels: <HS >HS HS
#impute the cases
Final_data$bmi.imp<-as.factor(ifelse(is.na(Final_data$bmi)==T, mcv.bmi, Final_data$bmi))
levels(Final_data$bmi.imp)<-levels(Final_data$bmi)
round(prop.table(table(Final_data$bmi)),3)
##
## Normal Obese Overweight Underweight
## 0.315 0.303 0.366 0.016
round(prop.table(table(Final_data$bmi.imp)),3)
##
## Normal Obese Overweight Underweight
## 0.294 0.283 0.408 0.015
Final_data$smokers.imp<-as.factor(ifelse(is.na(Final_data$smokers)==T, mcv.smokers, Final_data$smokers))
levels(Final_data$smokers.imp)<-levels(Final_data$smokers)
round(prop.table(table(Final_data$smokers)),3)
##
## Current Former Never
## 0.145 0.287 0.568
round(prop.table(table(Final_data$smokers.imp)),3)
##
## Current Former Never
## 0.144 0.286 0.571
Final_data$racegr3.imp<-as.factor(ifelse(is.na(Final_data$racegr3)==T, mcv.racegr3, Final_data$racegr3))
levels(Final_data$racegr3.imp)<-levels(Final_data$racegr3)
round(prop.table(table(Final_data$racegr3)),3)
##
## Hispanic NH-Black NH-White Other
## 0.071 0.071 0.799 0.060
round(prop.table(table(Final_data$racegr3.imp)),3)
##
## Hispanic NH-Black NH-White Other
## 0.069 0.069 0.803 0.058
Final_data$educa.imp<-as.factor(ifelse(is.na(Final_data$educa)==T,mcv.educa, Final_data$educa))
levels(Final_data$educa.imp)<-levels(Final_data$educa)
round(prop.table(table(Final_data$educa)),3)
##
## <HS >HS HS
## 0.019 0.692 0.289
round(prop.table(table(Final_data$educa.imp)),3)
##
## <HS >HS HS
## 0.019 0.693 0.288
Variables with more missing values has higher variation between the original values and the imputed values compared to variables with little amount of missing values. Education variable has the smallest variation when the original values is compared to the imputed values.
Regression analysis using the Original and the modal Imputed values
# Original Values
fit11<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=Final_data)
fit1 <- fit11 %>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
fit11 %>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>%
bold_labels()%>%
italicize_levels()%>%
as_gt() %>%
gt::tab_options(
table.font.size = "small",
data_row.padding = gt::px(1)) %>%
set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
BMI Categories | <0.001 | ||
Normal | — | — | |
Obese | -0.10 | -0.14, -0.05 | |
Overweight | -0.05 | -0.10, -0.01 | |
Underweight | 0.03 | -0.12, 0.17 | |
Education Level | 0.002 | ||
<HS | — | — | |
>HS | -0.12 | -0.26, 0.02 | |
HS | -0.05 | -0.20, 0.09 | |
Race/Ethnicity | <0.001 | ||
Hispanic | — | — | |
NH-Black | -0.16 | -0.26, -0.06 | |
NH-White | 0.02 | -0.06, 0.09 | |
Other | -0.01 | -0.11, 0.09 | |
Smoker | <0.001 | ||
Current | — | — | |
Former | 0.20 | 0.14, 0.25 | |
Never | 0.20 | 0.14, 0.25 | |
Age | <0.001 | ||
18-24 | — | — | |
25-30 | -0.20 | -0.30, -0.10 | |
35-44 | -0.17 | -0.26, -0.08 | |
45-54 | -0.15 | -0.24, -0.06 | |
55-64 | 0.00 | -0.08, 0.09 | |
65+ | 0.28 | 0.20, 0.37 | |
AIC = 78,969; R² = 0.027 | |||
1
CI = Confidence Interval
|
#Imputed Values
Imp.datam <- Final_data %>%
select(bmi.imp, educa.imp,racegr3.imp,smokers.imp,ageg,sleepdurat ) %>%
mutate(bmi=bmi.imp, educa=educa.imp, smokers=smokers.imp, racegr3=racegr3.imp)
fit22<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=Imp.datam)
fit2 <- fit22%>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
fit22%>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>%
bold_labels()%>%
italicize_levels()%>%
as_gt() %>%
gt::tab_options(
table.font.size = "small",
data_row.padding = gt::px(1)) %>%
set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
BMI Categories | <0.001 | ||
Normal | — | — | |
Obese | -0.10 | -0.15, -0.06 | |
Overweight | -0.04 | -0.09, 0.00 | |
Underweight | 0.03 | -0.12, 0.17 | |
Education Level | <0.001 | ||
<HS | — | — | |
>HS | -0.24 | -0.37, -0.11 | |
HS | -0.17 | -0.30, -0.04 | |
Race/Ethnicity | <0.001 | ||
Hispanic | — | — | |
NH-Black | -0.18 | -0.28, -0.09 | |
NH-White | 0.00 | -0.07, 0.07 | |
Other | -0.03 | -0.12, 0.07 | |
Smoker | <0.001 | ||
Current | — | — | |
Former | 0.20 | 0.14, 0.25 | |
Never | 0.20 | 0.15, 0.25 | |
Age | <0.001 | ||
18-24 | — | — | |
25-30 | -0.18 | -0.27, -0.08 | |
35-44 | -0.15 | -0.24, -0.06 | |
45-54 | -0.13 | -0.22, -0.05 | |
55-64 | 0.01 | -0.08, 0.09 | |
65+ | 0.28 | 0.20, 0.37 | |
AIC = 86,877; R² = 0.026 | |||
1
CI = Confidence Interval
|
# sampling the data set for ease of the imputation process
dat2<-Final_data
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
#Imputing the missing values using Ten iteration
imp<-mice(data = dat2[,c("ageg", "smokers", "bmi", "racegr3", "educa", "sleepdurat")], m = 10)
##
## iter imp variable
## 1 1 smokers bmi racegr3 educa
## 1 2 smokers bmi racegr3 educa
## 1 3 smokers bmi racegr3 educa
## 1 4 smokers bmi racegr3 educa
## 1 5 smokers bmi racegr3 educa
## 1 6 smokers bmi racegr3 educa
## 1 7 smokers bmi racegr3 educa
## 1 8 smokers bmi racegr3 educa
## 1 9 smokers bmi racegr3 educa
## 1 10 smokers bmi racegr3 educa
## 2 1 smokers bmi racegr3 educa
## 2 2 smokers bmi racegr3 educa
## 2 3 smokers bmi racegr3 educa
## 2 4 smokers bmi racegr3 educa
## 2 5 smokers bmi racegr3 educa
## 2 6 smokers bmi racegr3 educa
## 2 7 smokers bmi racegr3 educa
## 2 8 smokers bmi racegr3 educa
## 2 9 smokers bmi racegr3 educa
## 2 10 smokers bmi racegr3 educa
## 3 1 smokers bmi racegr3 educa
## 3 2 smokers bmi racegr3 educa
## 3 3 smokers bmi racegr3 educa
## 3 4 smokers bmi racegr3 educa
## 3 5 smokers bmi racegr3 educa
## 3 6 smokers bmi racegr3 educa
## 3 7 smokers bmi racegr3 educa
## 3 8 smokers bmi racegr3 educa
## 3 9 smokers bmi racegr3 educa
## 3 10 smokers bmi racegr3 educa
## 4 1 smokers bmi racegr3 educa
## 4 2 smokers bmi racegr3 educa
## 4 3 smokers bmi racegr3 educa
## 4 4 smokers bmi racegr3 educa
## 4 5 smokers bmi racegr3 educa
## 4 6 smokers bmi racegr3 educa
## 4 7 smokers bmi racegr3 educa
## 4 8 smokers bmi racegr3 educa
## 4 9 smokers bmi racegr3 educa
## 4 10 smokers bmi racegr3 educa
## 5 1 smokers bmi racegr3 educa
## 5 2 smokers bmi racegr3 educa
## 5 3 smokers bmi racegr3 educa
## 5 4 smokers bmi racegr3 educa
## 5 5 smokers bmi racegr3 educa
## 5 6 smokers bmi racegr3 educa
## 5 7 smokers bmi racegr3 educa
## 5 8 smokers bmi racegr3 educa
## 5 9 smokers bmi racegr3 educa
## 5 10 smokers bmi racegr3 educa
# choosing the first imputation of the 10 iteration
dat.imp<-complete(imp, action = 7)
Regression analysis using the Original and the Multiple Imputed values
fit.impp<-lm(sleepdurat~bmi+educa+racegr3+smokers+ageg, data=dat.imp)
fit.imp <- fit.impp%>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared))
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
fit.impp%>%
tbl_regression(exp=FALSE,label=list( bmi~"BMI Categories",
educa ~ "Education Level",
racegr3~ "Race/Ethnicity" ,
ageg~"Age",
smokers~"Smoker"
)) %>%
add_global_p() %>%
add_glance_source_note(label = list(df.residual ~ "Degrees of Freedom"),
fmt_fun = df.residual ~ style_number,
include = c( AIC,r.squared)
) %>%
bold_labels()%>%
italicize_levels()%>%
as_gt() %>%
gt::tab_options(
table.font.size = "small",
data_row.padding = gt::px(1)) %>%
set_table_properties(width = 1, layout = "autofit")
## add_global_p: Global p-values for variable(s) `add_global_p(include = c("bmi",
## "educa", "racegr3", "smokers", "ageg"))` were calculated with
## `car::Anova(x$model_obj, type = "III")`
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
BMI Categories | <0.001 | ||
Normal | — | — | |
Obese | -0.10 | -0.15, -0.06 | |
Overweight | -0.05 | -0.10, -0.01 | |
Underweight | 0.00 | -0.14, 0.14 | |
Education Level | <0.001 | ||
<HS | — | — | |
>HS | -0.24 | -0.37, -0.12 | |
HS | -0.17 | -0.30, -0.04 | |
Race/Ethnicity | <0.001 | ||
Hispanic | — | — | |
NH-Black | -0.18 | -0.28, -0.09 | |
NH-White | 0.00 | -0.07, 0.07 | |
Other | -0.03 | -0.12, 0.07 | |
Smoker | <0.001 | ||
Current | — | — | |
Former | 0.20 | 0.14, 0.26 | |
Never | 0.20 | 0.15, 0.25 | |
Age | <0.001 | ||
18-24 | — | — | |
25-30 | -0.18 | -0.27, -0.08 | |
35-44 | -0.15 | -0.24, -0.06 | |
45-54 | -0.13 | -0.22, -0.05 | |
55-64 | 0.01 | -0.08, 0.09 | |
65+ | 0.28 | 0.20, 0.37 | |
AIC = 86,876; R² = 0.026 | |||
1
CI = Confidence Interval
|
Fit_merge <-
tbl_merge( #Table merge function
tbls = list(fit1, fit2, fit.imp), # Selecting the tables to be included in the combined table output
# More objects can be added here
tab_spanner = c("**Original**","**Modal Imputed**","**Multiple Imputed**") # Naming the models
)%>%
bold_labels()%>%
italicize_levels()%>%
as_gt() %>%
gt::tab_header("Table Compares Model Fit Between Data Source With Missing Values, Modal Imputed Data, and Multiple Imputed data") %>%
gt::tab_options(
table.font.size = "small",
data_row.padding = gt::px(1)) %>%
gt::tab_source_note(
str_glue("Original {fit1$list_output$source_note}; ",
"Modal Imputed {fit2$list_output$source_note}; ",
"Multiple Imputed {fit.imp$list_output$source_note}")
) %>%
set_table_properties(width = 1, layout = "autofit")
Fit_merge
Table Compares Model Fit Between Data Source With Missing Values, Modal Imputed Data, and Multiple Imputed data | |||||||||
---|---|---|---|---|---|---|---|---|---|
Characteristic | Original | Modal Imputed | Multiple Imputed | ||||||
Beta | 95% CI1 | p-value | Beta | 95% CI1 | p-value | Beta | 95% CI1 | p-value | |
BMI Categories | <0.001 | <0.001 | <0.001 | ||||||
Normal | — | — | — | — | — | — | |||
Obese | -0.10 | -0.14, -0.05 | -0.10 | -0.15, -0.06 | -0.10 | -0.15, -0.06 | |||
Overweight | -0.05 | -0.10, -0.01 | -0.04 | -0.09, 0.00 | -0.05 | -0.10, -0.01 | |||
Underweight | 0.03 | -0.12, 0.17 | 0.03 | -0.12, 0.17 | 0.00 | -0.14, 0.14 | |||
Education Level | 0.002 | <0.001 | <0.001 | ||||||
<HS | — | — | — | — | — | — | |||
>HS | -0.12 | -0.26, 0.02 | -0.24 | -0.37, -0.11 | -0.24 | -0.37, -0.12 | |||
HS | -0.05 | -0.20, 0.09 | -0.17 | -0.30, -0.04 | -0.17 | -0.30, -0.04 | |||
Race/Ethnicity | <0.001 | <0.001 | <0.001 | ||||||
Hispanic | — | — | — | — | — | — | |||
NH-Black | -0.16 | -0.26, -0.06 | -0.18 | -0.28, -0.09 | -0.18 | -0.28, -0.09 | |||
NH-White | 0.02 | -0.06, 0.09 | 0.00 | -0.07, 0.07 | 0.00 | -0.07, 0.07 | |||
Other | -0.01 | -0.11, 0.09 | -0.03 | -0.12, 0.07 | -0.03 | -0.12, 0.07 | |||
Smoker | <0.001 | <0.001 | <0.001 | ||||||
Current | — | — | — | — | — | — | |||
Former | 0.20 | 0.14, 0.25 | 0.20 | 0.14, 0.25 | 0.20 | 0.14, 0.26 | |||
Never | 0.20 | 0.14, 0.25 | 0.20 | 0.15, 0.25 | 0.20 | 0.15, 0.25 | |||
Age | <0.001 | <0.001 | <0.001 | ||||||
18-24 | — | — | — | — | — | — | |||
25-30 | -0.20 | -0.30, -0.10 | -0.18 | -0.27, -0.08 | -0.18 | -0.27, -0.08 | |||
35-44 | -0.17 | -0.26, -0.08 | -0.15 | -0.24, -0.06 | -0.15 | -0.24, -0.06 | |||
45-54 | -0.15 | -0.24, -0.06 | -0.13 | -0.22, -0.05 | -0.13 | -0.22, -0.05 | |||
55-64 | 0.00 | -0.08, 0.09 | 0.01 | -0.08, 0.09 | 0.01 | -0.08, 0.09 | |||
65+ | 0.28 | 0.20, 0.37 | 0.28 | 0.20, 0.37 | 0.28 | 0.20, 0.37 | |||
Original AIC = 78,969; R² = 0.027; Modal Imputed AIC = 86,877; R² = 0.026; Multiple Imputed AIC = 86,876; R² = 0.026 | |||||||||
1
CI = Confidence Interval
|
The model fit from the table above looks similar with some variations in the Beta wthin the groups. Considering the ACI and R² values between the modal and multiple imputation data, the model fit from the multiple imputation data (Multiple imputed) has the best fit.