library(car)
library(mice)
## Warning: package 'mice' was built under R version 4.0.4
library(ggplot2)
library(dplyr)
library(haven)
#load data
load("C:/Users/okabe/OneDrive/Pictures/Stats 2/brfss_2017 (1).Rdata")
View(brfss_17)
set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000) #smaller sample for brevity
brfss_17<-brfss_17[samp,]
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA",
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")
#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")
Measure an outcome variable and at least 5 predictors Report the pattern of missingness among all of these variables
summary(brfss_17[, c( "white", "exercise", "ecigaret", "black", "badhealth", "inc")])
## white exercise ecigaret black
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :1.0000 Median :0.0000 Median :0.000
## Mean :0.7397 Mean :0.7427 Mean :0.1576 Mean :0.106
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
## NA's :523 NA's :1945 NA's :1166 NA's :523
## badhealth inc
## Min. :0.0000 Min. :1.00
## 1st Qu.:0.0000 1st Qu.:3.00
## Median :0.0000 Median :5.00
## Mean :0.1756 Mean :3.87
## 3rd Qu.:0.0000 3rd Qu.:5.00
## Max. :1.0000 Max. :5.00
## NA's :67 NA's :4202
Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?
summary(brfss_17$inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 3.00 5.00 3.87 5.00 5.00 4202
#what happens when we replace the missings with the mean?
brfss_17$inc.imp.mean<-ifelse (is.na(brfss_17$inc)==T, mean(brfss_17$inc,na.rm=T), brfss_17$inc)
brfss_17$exercise.imp.mean<-ifelse (is.na(brfss_17$exercise)==T, mean(brfss_17$exercise,na.rm=T), brfss_17$exercise)
brfss_17$black.imp.mean<-ifelse(is.na(brfss_17$black)==T, mean(brfss_17$black, na.rm=T),brfss_17$black)
brfss_17$ecigaret.imp.mean<-ifelse(is.na(brfss_17$ecigaret)==T, mean(brfss_17$ecigaret, na.rm=T), brfss_17$ecigaret)
brfss_17$badhealth.imp.mean<-ifelse(is.na(brfss_17$badhealth )==T, mean(brfss_17$badhealth, na.rm=T), brfss_17$badhealth)
brfss_17$white.imp.mean<-ifelse (is.na(brfss_17$white)==T, mean(brfss_17$white,na.rm=T), brfss_17$white)
##insurance
mean(brfss_17$inc, na.rm=T)
## [1] 3.869651
mean(brfss_17$inc.imp.mean) #no difference!
## [1] 3.869651
##exercise
mean(brfss_17$exercise, na.rm=T)
## [1] 0.7426589
mean(brfss_17$exercise.imp.mean)
## [1] 0.7426589
##white
mean(brfss_17$white, na.rm=T)
## [1] 0.7397148
mean(brfss_17$white.imp.mean)
## [1] 0.7397148
##black
mean(brfss_17$black, na.rm=T)
## [1] 0.105977
mean(brfss_17$black.imp.mean)
## [1] 0.105977
##ecigeret
mean(brfss_17$ecigaret, na.rm=T)
## [1] 0.15759
mean(brfss_17$ecigaret.imp.mean)
## [1] 0.15759
##badhealth
mean(brfss_17$badhealth, na.rm=T)
## [1] 0.1755906
mean(brfss_17$badhealth.imp.mean)
## [1] 0.1755906
fit<-lm(inc.imp.mean~exercise.imp.mean+white.imp.mean+black.imp.mean+ecigaret.imp.mean+badhealth.imp.mean,brfss_17)
summary(fit)
##
## Call:
## lm(formula = inc.imp.mean ~ exercise.imp.mean + white.imp.mean +
## black.imp.mean + ecigaret.imp.mean + badhealth.imp.mean,
## data = brfss_17)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2785 -0.5444 0.4942 0.7215 2.6401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.34270 0.02472 135.215 <2e-16 ***
## exercise.imp.mean 0.34759 0.01850 18.787 <2e-16 ***
## white.imp.mean 0.58818 0.02155 27.296 <2e-16 ***
## black.imp.mean 0.02805 0.03062 0.916 0.36
## ecigaret.imp.mean -0.18451 0.02138 -8.631 <2e-16 ***
## badhealth.imp.mean -0.79829 0.02050 -38.946 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.201 on 24994 degrees of freedom
## Multiple R-squared: 0.1319, Adjusted R-squared: 0.1317
## F-statistic: 759.4 on 5 and 24994 DF, p-value: < 2.2e-16
##while looking at mean values of the the original data and imputed data, we see little or no difference among the mean values
Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?
md.pattern(brfss_17[,c("white", "exercise", "ecigaret", "black", "badhealth", "inc")])
## badhealth white black ecigaret exercise inc
## 19173 1 1 1 1 1 1 0
## 3359 1 1 1 1 1 0 1
## 612 1 1 1 1 0 1 1
## 169 1 1 1 1 0 0 2
## 26 1 1 1 0 1 1 1
## 16 1 1 1 0 1 0 2
## 619 1 1 1 0 0 1 2
## 440 1 1 1 0 0 0 3
## 286 1 0 0 1 1 1 2
## 133 1 0 0 1 1 0 3
## 20 1 0 0 1 0 1 3
## 20 1 0 0 1 0 0 4
## 2 1 0 0 0 1 0 4
## 19 1 0 0 0 0 1 4
## 39 1 0 0 0 0 0 5
## 38 0 1 1 1 1 1 1
## 18 0 1 1 1 1 0 2
## 1 0 1 1 1 0 1 2
## 2 0 1 1 1 0 0 3
## 1 0 1 1 0 1 0 3
## 2 0 1 1 0 0 1 3
## 1 0 1 1 0 0 0 4
## 1 0 0 0 1 1 1 3
## 2 0 0 0 1 1 0 4
## 1 0 0 0 0 0 1 5
## 67 523 523 1166 1945 4202 8426
md.pairs(brfss_17[,c("white", "exercise", "ecigaret", "black", "badhealth", "inc")])
## $rr
## white exercise ecigaret black badhealth inc
## white 24477 22631 23372 24477 24414 20471
## exercise 22631 23055 23010 22631 22995 19524
## ecigaret 23372 23010 23834 23372 23772 20131
## black 24477 22631 23372 24477 24414 20471
## badhealth 24414 22995 23772 24414 24933 20755
## inc 20471 19524 20131 20471 20755 20798
##
## $rm
## white exercise ecigaret black badhealth inc
## white 0 1846 1105 0 63 4006
## exercise 424 0 45 424 60 3531
## ecigaret 462 824 0 462 62 3703
## black 0 1846 1105 0 63 4006
## badhealth 519 1938 1161 519 0 4178
## inc 327 1274 667 327 43 0
##
## $mr
## white exercise ecigaret black badhealth inc
## white 0 424 462 0 519 327
## exercise 1846 0 824 1846 1938 1274
## ecigaret 1105 45 0 1105 1161 667
## black 0 424 462 0 519 327
## badhealth 63 60 62 63 0 43
## inc 4006 3531 3703 4006 4178 0
##
## $mm
## white exercise ecigaret black badhealth inc
## white 523 99 61 523 4 196
## exercise 99 1945 1121 99 7 671
## ecigaret 61 1121 1166 61 5 499
## black 523 99 61 523 4 196
## badhealth 4 7 5 4 67 24
## inc 196 671 499 196 24 4202
##In here we see that there is a lot of missing data among my variables
dat2<-brfss_17
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$incknock<-dat2$inc
dat2$incknock[samp2]<-NA
head(dat2[, c("incknock","inc")])
## # A tibble: 6 x 2
## incknock inc
## <dbl> <dbl>
## 1 5 5
## 2 NA NA
## 3 3 3
## 4 NA NA
## 5 5 5
## 6 4 4
imp<-mice(data = dat2[,c("white", "exercise", "ecigaret", "black", "badhealth", "inc")],m = 5)
##
## iter imp variable
## 1 1 white exercise ecigaret black badhealth inc
## 1 2 white exercise ecigaret black badhealth inc
## 1 3 white exercise ecigaret black badhealth inc
## 1 4 white exercise ecigaret black badhealth inc
## 1 5 white exercise ecigaret black badhealth inc
## 2 1 white exercise ecigaret black badhealth inc
## 2 2 white exercise ecigaret black badhealth inc
## 2 3 white exercise ecigaret black badhealth inc
## 2 4 white exercise ecigaret black badhealth inc
## 2 5 white exercise ecigaret black badhealth inc
## 3 1 white exercise ecigaret black badhealth inc
## 3 2 white exercise ecigaret black badhealth inc
## 3 3 white exercise ecigaret black badhealth inc
## 3 4 white exercise ecigaret black badhealth inc
## 3 5 white exercise ecigaret black badhealth inc
## 4 1 white exercise ecigaret black badhealth inc
## 4 2 white exercise ecigaret black badhealth inc
## 4 3 white exercise ecigaret black badhealth inc
## 4 4 white exercise ecigaret black badhealth inc
## 4 5 white exercise ecigaret black badhealth inc
## 5 1 white exercise ecigaret black badhealth inc
## 5 2 white exercise ecigaret black badhealth inc
## 5 3 white exercise ecigaret black badhealth inc
## 5 4 white exercise ecigaret black badhealth inc
## 5 5 white exercise ecigaret black badhealth inc
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## white exercise ecigaret black badhealth inc
## "pmm" "pmm" "pmm" "pmm" "pmm" "pmm"
## PredictorMatrix:
## white exercise ecigaret black badhealth inc
## white 0 1 1 1 1 1
## exercise 1 0 1 1 1 1
## ecigaret 1 1 0 1 1 1
## black 1 1 1 0 1 1
## badhealth 1 1 1 1 0 1
## inc 1 1 1 1 1 0
##Here we see what variables were used for the imputations and gow many imputation were done
Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?
barplot(prop.table(table(brfss_17$inc)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss_17$inc.imp.mean)), main="Imputed Data", ylim=c(0, .6))
##Although there is little change in the distribution, there is still a lot of missing data among the insurance. This has the mostg missing values
library(lattice)
stripplot(imp,inc~black|.imp, pch=20)
##here we see that the imputed values are not consisted with those seen in the original sample due to the number of red dots we see in the table below
fit.inc<-with(data=imp ,expr=lm(inc~black+white+ecigaret+exercise))
fit.inc
## call :
## with.mids(data = imp, expr = lm(inc ~ black + white + ecigaret +
## exercise))
##
## call1 :
## mice(data = dat2[, c("white", "exercise", "ecigaret", "black",
## "badhealth", "inc")], m = 5)
##
## nmis :
## white exercise ecigaret black badhealth inc
## 523 1945 1166 523 67 4202
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
##
## Coefficients:
## (Intercept) black white ecigaret exercise
## 2.95964 0.02726 0.78978 -0.32079 0.49686
##
##
## [[2]]
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
##
## Coefficients:
## (Intercept) black white ecigaret exercise
## 2.8015 0.0969 0.8285 -0.2875 0.6747
##
##
## [[3]]
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
##
## Coefficients:
## (Intercept) black white ecigaret exercise
## 2.89392 -0.06399 0.67979 -0.23231 0.61754
##
##
## [[4]]
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
##
## Coefficients:
## (Intercept) black white ecigaret exercise
## 2.98393 -0.03769 0.63665 -0.15907 0.55846
##
##
## [[5]]
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
##
## Coefficients:
## (Intercept) black white ecigaret exercise
## 3.0099 -0.1103 0.7155 -0.2652 0.5294
##here see if income changes in regards to race, smoking and exercise. And how these relationships change with the imputed data sets.
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
## white exercise ecigaret black badhealth inc
## 1 1 1 0 0 0 5
## 2 1 0 0 0 0 4
## 3 1 1 0 0 1 3
## 4 1 1 0 0 0 4
## 5 1 1 1 0 0 5
## 6 0 1 0 0 0 4
## 7 0 1 0 1 0 3
## 8 0 0 0 1 1 5
## 9 1 1 0 0 0 5
## 10 1 0 0 0 0 4
#Compare to the original data
head(brfss_17[,c("white", "inc", "black","ecigaret","exercise")], n=10)
## # A tibble: 10 x 5
## white inc black ecigaret exercise
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5 0 0 1
## 2 1 NA 0 0 NA
## 3 1 3 0 0 1
## 4 1 NA 0 0 1
## 5 1 5 0 1 1
## 6 0 4 0 0 1
## 7 0 3 1 0 1
## 8 0 NA 1 0 0
## 9 1 5 0 0 1
## 10 1 4 0 0 0
##here we see that the imputed data set does not have any missing data because all the missing gaps has been filled with realistic values compared to the original data
fit1<-lm(inc~black+white+ecigaret+exercise, data=brfss_17)
summary(fit1)
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise, data = brfss_17)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2621 -1.0151 0.7379 0.7379 2.3425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.90447 0.03004 96.698 <2e-16 ***
## black 0.04094 0.03935 1.040 0.298
## white 0.76984 0.02740 28.093 <2e-16 ***
## ecigaret -0.24697 0.02617 -9.437 <2e-16 ***
## exercise 0.58780 0.02229 26.372 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.336 on 19206 degrees of freedom
## (5789 observations deleted due to missingness)
## Multiple R-squared: 0.09691, Adjusted R-squared: 0.09672
## F-statistic: 515.2 on 4 and 19206 DF, p-value: < 2.2e-16
fit.imp<-lm(inc~black+white+ecigaret+exercise, data=dat.imp)
summary(fit.imp)
##
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise, data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2463 -1.2463 0.7537 0.7537 2.3612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.95964 0.02562 115.522 <2e-16 ***
## black 0.02726 0.03361 0.811 0.417
## white 0.78978 0.02352 33.580 <2e-16 ***
## ecigaret -0.32079 0.02310 -13.889 <2e-16 ***
## exercise 0.49686 0.01917 25.917 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.333 on 24995 degrees of freedom
## Multiple R-squared: 0.09593, Adjusted R-squared: 0.09579
## F-statistic: 663.1 on 4 and 24995 DF, p-value: < 2.2e-16
##when comparing both the original and imputed data, we see that blacks have a lower significance effect on insurance in the imputed data compared to the original data. This shows that the black race does not have a significant effect on insurance compared to other variables such as white race, e-smoking and exercise