library(car)
library(mice)
## Warning: package 'mice' was built under R version 4.0.4
library(ggplot2)
library(dplyr)
library(haven)
#load data
load("C:/Users/okabe/OneDrive/Pictures/Stats 2/brfss_2017 (1).Rdata")


View(brfss_17)

set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000) #smaller sample for brevity
brfss_17<-brfss_17[samp,]
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")

#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")

Measure an outcome variable and at least 5 predictors Report the pattern of missingness among all of these variables

summary(brfss_17[, c( "white",  "exercise", "ecigaret", "black",   "badhealth", "inc")])
##      white           exercise         ecigaret          black      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :1.0000   Median :1.0000   Median :0.0000   Median :0.000  
##  Mean   :0.7397   Mean   :0.7427   Mean   :0.1576   Mean   :0.106  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##  NA's   :523      NA's   :1945     NA's   :1166     NA's   :523    
##    badhealth           inc      
##  Min.   :0.0000   Min.   :1.00  
##  1st Qu.:0.0000   1st Qu.:3.00  
##  Median :0.0000   Median :5.00  
##  Mean   :0.1756   Mean   :3.87  
##  3rd Qu.:0.0000   3rd Qu.:5.00  
##  Max.   :1.0000   Max.   :5.00  
##  NA's   :67       NA's   :4202

Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?

summary(brfss_17$inc) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    3.00    5.00    3.87    5.00    5.00    4202
#what happens when we replace the missings with the mean?
brfss_17$inc.imp.mean<-ifelse (is.na(brfss_17$inc)==T, mean(brfss_17$inc,na.rm=T), brfss_17$inc)
brfss_17$exercise.imp.mean<-ifelse (is.na(brfss_17$exercise)==T, mean(brfss_17$exercise,na.rm=T), brfss_17$exercise)
brfss_17$black.imp.mean<-ifelse(is.na(brfss_17$black)==T, mean(brfss_17$black, na.rm=T),brfss_17$black)
brfss_17$ecigaret.imp.mean<-ifelse(is.na(brfss_17$ecigaret)==T, mean(brfss_17$ecigaret, na.rm=T), brfss_17$ecigaret)
brfss_17$badhealth.imp.mean<-ifelse(is.na(brfss_17$badhealth )==T, mean(brfss_17$badhealth, na.rm=T),  brfss_17$badhealth)
brfss_17$white.imp.mean<-ifelse (is.na(brfss_17$white)==T, mean(brfss_17$white,na.rm=T), brfss_17$white)

##insurance
mean(brfss_17$inc, na.rm=T)
## [1] 3.869651
mean(brfss_17$inc.imp.mean) #no difference!
## [1] 3.869651
##exercise
mean(brfss_17$exercise, na.rm=T)
## [1] 0.7426589
mean(brfss_17$exercise.imp.mean)
## [1] 0.7426589
##white
mean(brfss_17$white, na.rm=T)
## [1] 0.7397148
mean(brfss_17$white.imp.mean)
## [1] 0.7397148
##black
mean(brfss_17$black, na.rm=T)
## [1] 0.105977
mean(brfss_17$black.imp.mean)
## [1] 0.105977
##ecigeret
mean(brfss_17$ecigaret, na.rm=T)
## [1] 0.15759
mean(brfss_17$ecigaret.imp.mean)
## [1] 0.15759
##badhealth
mean(brfss_17$badhealth, na.rm=T)
## [1] 0.1755906
mean(brfss_17$badhealth.imp.mean)
## [1] 0.1755906
fit<-lm(inc.imp.mean~exercise.imp.mean+white.imp.mean+black.imp.mean+ecigaret.imp.mean+badhealth.imp.mean,brfss_17)
summary(fit)
## 
## Call:
## lm(formula = inc.imp.mean ~ exercise.imp.mean + white.imp.mean + 
##     black.imp.mean + ecigaret.imp.mean + badhealth.imp.mean, 
##     data = brfss_17)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2785 -0.5444  0.4942  0.7215  2.6401 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.34270    0.02472 135.215   <2e-16 ***
## exercise.imp.mean   0.34759    0.01850  18.787   <2e-16 ***
## white.imp.mean      0.58818    0.02155  27.296   <2e-16 ***
## black.imp.mean      0.02805    0.03062   0.916     0.36    
## ecigaret.imp.mean  -0.18451    0.02138  -8.631   <2e-16 ***
## badhealth.imp.mean -0.79829    0.02050 -38.946   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.201 on 24994 degrees of freedom
## Multiple R-squared:  0.1319, Adjusted R-squared:  0.1317 
## F-statistic: 759.4 on 5 and 24994 DF,  p-value: < 2.2e-16
##while looking at mean values of the the original data and imputed data, we see little or no difference among the mean values

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

md.pattern(brfss_17[,c("white",  "exercise", "ecigaret", "black", "badhealth", "inc")])

##       badhealth white black ecigaret exercise  inc     
## 19173         1     1     1        1        1    1    0
## 3359          1     1     1        1        1    0    1
## 612           1     1     1        1        0    1    1
## 169           1     1     1        1        0    0    2
## 26            1     1     1        0        1    1    1
## 16            1     1     1        0        1    0    2
## 619           1     1     1        0        0    1    2
## 440           1     1     1        0        0    0    3
## 286           1     0     0        1        1    1    2
## 133           1     0     0        1        1    0    3
## 20            1     0     0        1        0    1    3
## 20            1     0     0        1        0    0    4
## 2             1     0     0        0        1    0    4
## 19            1     0     0        0        0    1    4
## 39            1     0     0        0        0    0    5
## 38            0     1     1        1        1    1    1
## 18            0     1     1        1        1    0    2
## 1             0     1     1        1        0    1    2
## 2             0     1     1        1        0    0    3
## 1             0     1     1        0        1    0    3
## 2             0     1     1        0        0    1    3
## 1             0     1     1        0        0    0    4
## 1             0     0     0        1        1    1    3
## 2             0     0     0        1        1    0    4
## 1             0     0     0        0        0    1    5
##              67   523   523     1166     1945 4202 8426
md.pairs(brfss_17[,c("white",  "exercise", "ecigaret", "black",   "badhealth", "inc")])
## $rr
##           white exercise ecigaret black badhealth   inc
## white     24477    22631    23372 24477     24414 20471
## exercise  22631    23055    23010 22631     22995 19524
## ecigaret  23372    23010    23834 23372     23772 20131
## black     24477    22631    23372 24477     24414 20471
## badhealth 24414    22995    23772 24414     24933 20755
## inc       20471    19524    20131 20471     20755 20798
## 
## $rm
##           white exercise ecigaret black badhealth  inc
## white         0     1846     1105     0        63 4006
## exercise    424        0       45   424        60 3531
## ecigaret    462      824        0   462        62 3703
## black         0     1846     1105     0        63 4006
## badhealth   519     1938     1161   519         0 4178
## inc         327     1274      667   327        43    0
## 
## $mr
##           white exercise ecigaret black badhealth  inc
## white         0      424      462     0       519  327
## exercise   1846        0      824  1846      1938 1274
## ecigaret   1105       45        0  1105      1161  667
## black         0      424      462     0       519  327
## badhealth    63       60       62    63         0   43
## inc        4006     3531     3703  4006      4178    0
## 
## $mm
##           white exercise ecigaret black badhealth  inc
## white       523       99       61   523         4  196
## exercise     99     1945     1121    99         7  671
## ecigaret     61     1121     1166    61         5  499
## black       523       99       61   523         4  196
## badhealth     4        7        5     4        67   24
## inc         196      671      499   196        24 4202
##In here we see that there is a lot of missing data among my variables 
dat2<-brfss_17
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$incknock<-dat2$inc
dat2$incknock[samp2]<-NA





head(dat2[, c("incknock","inc")])
## # A tibble: 6 x 2
##   incknock   inc
##      <dbl> <dbl>
## 1        5     5
## 2       NA    NA
## 3        3     3
## 4       NA    NA
## 5        5     5
## 6        4     4
imp<-mice(data = dat2[,c("white",  "exercise", "ecigaret", "black", "badhealth", "inc")],m = 5)
## 
##  iter imp variable
##   1   1  white  exercise  ecigaret  black  badhealth  inc
##   1   2  white  exercise  ecigaret  black  badhealth  inc
##   1   3  white  exercise  ecigaret  black  badhealth  inc
##   1   4  white  exercise  ecigaret  black  badhealth  inc
##   1   5  white  exercise  ecigaret  black  badhealth  inc
##   2   1  white  exercise  ecigaret  black  badhealth  inc
##   2   2  white  exercise  ecigaret  black  badhealth  inc
##   2   3  white  exercise  ecigaret  black  badhealth  inc
##   2   4  white  exercise  ecigaret  black  badhealth  inc
##   2   5  white  exercise  ecigaret  black  badhealth  inc
##   3   1  white  exercise  ecigaret  black  badhealth  inc
##   3   2  white  exercise  ecigaret  black  badhealth  inc
##   3   3  white  exercise  ecigaret  black  badhealth  inc
##   3   4  white  exercise  ecigaret  black  badhealth  inc
##   3   5  white  exercise  ecigaret  black  badhealth  inc
##   4   1  white  exercise  ecigaret  black  badhealth  inc
##   4   2  white  exercise  ecigaret  black  badhealth  inc
##   4   3  white  exercise  ecigaret  black  badhealth  inc
##   4   4  white  exercise  ecigaret  black  badhealth  inc
##   4   5  white  exercise  ecigaret  black  badhealth  inc
##   5   1  white  exercise  ecigaret  black  badhealth  inc
##   5   2  white  exercise  ecigaret  black  badhealth  inc
##   5   3  white  exercise  ecigaret  black  badhealth  inc
##   5   4  white  exercise  ecigaret  black  badhealth  inc
##   5   5  white  exercise  ecigaret  black  badhealth  inc
print(imp)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##     white  exercise  ecigaret     black badhealth       inc 
##     "pmm"     "pmm"     "pmm"     "pmm"     "pmm"     "pmm" 
## PredictorMatrix:
##           white exercise ecigaret black badhealth inc
## white         0        1        1     1         1   1
## exercise      1        0        1     1         1   1
## ecigaret      1        1        0     1         1   1
## black         1        1        1     0         1   1
## badhealth     1        1        1     1         0   1
## inc           1        1        1     1         1   0
##Here we see what variables were used for the imputations and gow many imputation were done

Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

barplot(prop.table(table(brfss_17$inc)), main="Original Data", ylim=c(0, .6))

barplot(prop.table(table(brfss_17$inc.imp.mean)), main="Imputed Data", ylim=c(0, .6))

##Although there is little change in the distribution, there is still a lot of missing data among the insurance. This has the mostg missing values
library(lattice)
stripplot(imp,inc~black|.imp, pch=20)

##here we see that the imputed values are not consisted with those seen in the original sample due to the number of red dots we see in the table below
fit.inc<-with(data=imp ,expr=lm(inc~black+white+ecigaret+exercise))
fit.inc
## call :
## with.mids(data = imp, expr = lm(inc ~ black + white + ecigaret + 
##     exercise))
## 
## call1 :
## mice(data = dat2[, c("white", "exercise", "ecigaret", "black", 
##     "badhealth", "inc")], m = 5)
## 
## nmis :
##     white  exercise  ecigaret     black badhealth       inc 
##       523      1945      1166       523        67      4202 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
## 
## Coefficients:
## (Intercept)        black        white     ecigaret     exercise  
##     2.95964      0.02726      0.78978     -0.32079      0.49686  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
## 
## Coefficients:
## (Intercept)        black        white     ecigaret     exercise  
##      2.8015       0.0969       0.8285      -0.2875       0.6747  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
## 
## Coefficients:
## (Intercept)        black        white     ecigaret     exercise  
##     2.89392     -0.06399      0.67979     -0.23231      0.61754  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
## 
## Coefficients:
## (Intercept)        black        white     ecigaret     exercise  
##     2.98393     -0.03769      0.63665     -0.15907      0.55846  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise)
## 
## Coefficients:
## (Intercept)        black        white     ecigaret     exercise  
##      3.0099      -0.1103       0.7155      -0.2652       0.5294
##here see if income changes in regards to race, smoking and exercise. And how these relationships change with the imputed data sets.
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
##    white exercise ecigaret black badhealth inc
## 1      1        1        0     0         0   5
## 2      1        0        0     0         0   4
## 3      1        1        0     0         1   3
## 4      1        1        0     0         0   4
## 5      1        1        1     0         0   5
## 6      0        1        0     0         0   4
## 7      0        1        0     1         0   3
## 8      0        0        0     1         1   5
## 9      1        1        0     0         0   5
## 10     1        0        0     0         0   4
#Compare to the original data
head(brfss_17[,c("white", "inc", "black","ecigaret","exercise")], n=10)
## # A tibble: 10 x 5
##    white   inc black ecigaret exercise
##    <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1     1     5     0        0        1
##  2     1    NA     0        0       NA
##  3     1     3     0        0        1
##  4     1    NA     0        0        1
##  5     1     5     0        1        1
##  6     0     4     0        0        1
##  7     0     3     1        0        1
##  8     0    NA     1        0        0
##  9     1     5     0        0        1
## 10     1     4     0        0        0
##here we see that the imputed data set does not have any missing data because all the missing gaps has been filled with realistic values compared to the original data

fit1<-lm(inc~black+white+ecigaret+exercise, data=brfss_17)
summary(fit1)
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise, data = brfss_17)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2621 -1.0151  0.7379  0.7379  2.3425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.90447    0.03004  96.698   <2e-16 ***
## black        0.04094    0.03935   1.040    0.298    
## white        0.76984    0.02740  28.093   <2e-16 ***
## ecigaret    -0.24697    0.02617  -9.437   <2e-16 ***
## exercise     0.58780    0.02229  26.372   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.336 on 19206 degrees of freedom
##   (5789 observations deleted due to missingness)
## Multiple R-squared:  0.09691,    Adjusted R-squared:  0.09672 
## F-statistic: 515.2 on 4 and 19206 DF,  p-value: < 2.2e-16
fit.imp<-lm(inc~black+white+ecigaret+exercise, data=dat.imp)
summary(fit.imp)
## 
## Call:
## lm(formula = inc ~ black + white + ecigaret + exercise, data = dat.imp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2463 -1.2463  0.7537  0.7537  2.3612 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.95964    0.02562 115.522   <2e-16 ***
## black        0.02726    0.03361   0.811    0.417    
## white        0.78978    0.02352  33.580   <2e-16 ***
## ecigaret    -0.32079    0.02310 -13.889   <2e-16 ***
## exercise     0.49686    0.01917  25.917   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.333 on 24995 degrees of freedom
## Multiple R-squared:  0.09593,    Adjusted R-squared:  0.09579 
## F-statistic: 663.1 on 4 and 24995 DF,  p-value: < 2.2e-16
##when comparing both the original and imputed data, we see that blacks have a lower significance effect on insurance in the imputed data compared to the original data. This shows that the black race does not have a significant effect on insurance compared to other variables such as white race, e-smoking and exercise