#All of the variables in this dataset are from brfss 2017.
library(haven)
library(readr)
library (car)
## Loading required package: carData
library (mice)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library (dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (ggplot2)
setwd('C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets')
load("brfss_2017.rdata")
library(questionr)
## Warning: package 'questionr' was built under R version 3.6.2
names(brfss_17)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2" "medcost" "checkup1"
## [13] "bphigh4" "bpmeds" "cholchk1" "toldhi2" "cholmed1" "cvdinfr4"
## [19] "cvdcrhd4" "cvdstrk3" "asthma3" "asthnow" "chcscncr" "chcocncr"
## [25] "chccopd1" "havarth3" "addepev2" "chckidny" "diabete3" "diabage2"
## [31] "lmtjoin3" "arthdis2" "arthsocl" "joinpai1" "sex" "marital"
## [37] "educa" "renthom1" "numhhol2" "numphon2" "cpdemo1a" "veteran3"
## [43] "employ1" "children" "income2" "internet" "weight2" "height3"
## [49] "pregnant" "deaf" "blind" "decide" "diffwalk" "diffdres"
## [55] "diffalon" "smoke100" "smokday2" "stopsmk2" "lastsmk2" "usenow3"
## [61] "ecigaret" "ecignow" "alcday5" "avedrnk2" "drnk3ge5" "maxdrnks"
## [67] "fruit2" "fruitju2" "fvgreen1" "frenchf1" "potatoe1" "vegetab2"
## [73] "exerany2" "exract11" "exeroft1" "exerhmm1" "exract21" "exeroft2"
## [79] "exerhmm2" "strength" "seatbelt" "flushot6" "flshtmy2" "pneuvac3"
## [85] "shingle2" "hivtst6" "hivtstd3" "hivrisk5" "casthdx2" "casthno2"
## [91] "callbckz" "wdusenow" "wdinftrk" "wdhowoft" "wdshare" "namtribe"
## [97] "namothr" "urbnrrl" "ststr" "impsex" "rfhlth" "phys14d"
## [103] "ment14d" "hcvu651" "rfhype5" "cholch1" "rfchol1" "michd"
## [109] "ltasth1" "casthm1" "asthms1" "drdxar1" "lmtact1" "lmtwrk1"
## [115] "lmtscl1" "prace1" "mrace1" "hispanc" "race" "raceg21"
## [121] "racegr3" "ageg5yr" "age65yr" "age80" "ageg" "wtkg3"
## [127] "bmi5" "bmi5cat" "rfbmi5" "educag" "incomg" "smoker3"
## [133] "rfsmok3" "ecigsts" "curecig" "drnkany5" "rfbing5" "drnkwek"
## [139] "rfdrhv5" "ftjuda2" "frutda2" "grenda1" "frnchda" "potada1"
## [145] "vegeda2" "misfrt1" "misveg1" "frtres1" "vegres1" "frutsu1"
## [151] "vegesu1" "frtlt1a" "veglt1a" "frt16a" "veg23a" "fruite1"
## [157] "vegete1" "totinda" "minac11" "minac21" "pacat1" "paindx1"
## [163] "pa150r2" "pa300r2" "pa30021" "pastrng" "parec1" "pastae1"
## [169] "rfseat2" "rfseat3" "flshot6" "pneumo2" "aidtst3" "mmsa"
## [175] "mmsawt" "seqno" "mmsaname"
##Recoding of variables
table(brfss_17$menthlth)
##
## 1 2 3 4 5 6 7 8 9 10
## 7526 11574 6897 3484 8490 973 3285 672 112 6041
## 11 12 13 14 15 16 17 18 19 20
## 42 467 72 1244 5567 110 84 130 8 3405
## 21 22 23 24 25 26 27 28 29 30
## 217 65 39 58 1169 49 98 329 198 12112
## 77 88 99
## 2433 152826 1099
brfss_17$mentaldays<-Recode(brfss_17$menthlth, recodes="88=0; 77=NA; 99=NA")
table(brfss_17$mentaldays)
##
## 0 1 2 3 4 5 6 7 8 9
## 152826 7526 11574 6897 3484 8490 973 3285 672 112
## 10 11 12 13 14 15 16 17 18 19
## 6041 42 467 72 1244 5567 110 84 130 8
## 20 21 22 23 24 25 26 27 28 29
## 3405 217 65 39 58 1169 49 98 329 198
## 30
## 12112
hist(brfss_17$mentaldays)
summary(brfss_17$mentaldays)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 3.54 2.00 30.00 3532
#The average number of mental days is about 4 days.
table(brfss_17$internet)
##
## 1 2 7 9
## 191714 36199 228 409
brfss_17$usage<-Recode(brfss_17$internet, recodes="1=1; 2=0; 7=NA; 9=NA; else=NA")
table(brfss_17$usage)
##
## 0 1
## 36199 191714
brfss_17$usage2<-Recode(brfss_17$usage, recodes="1='usage'; 0='no usage'; else=NA", as.factor=T)
brfss_17$usage2<-relevel(brfss_17$usage2, ref = 'usage')
table(brfss_17$usage2)
##
## usage no usage
## 191714 36199
table(brfss_17$sex)
##
## 1 2 9
## 102083 128633 159
brfss_17$sex2<-Recode(brfss_17$sex, recodes="1=1; 2=0; else=NA")
table(brfss_17$sex2)
##
## 0 1
## 128633 102083
brfss_17$sex2b<-Recode(brfss_17$sex2, recodes="1='male'; 0='female'; else=NA", as.factor=T)
brfss_17$sex2b<-relevel(brfss_17$sex2b, ref = 'male')
table(brfss_17$sex2b)
##
## male female
## 102083 128633
table(brfss_17$income2)
##
## 1 2 3 4 5 6 7 8 77 99
## 8193 8366 12695 15902 18533 25814 30049 71998 16588 21037
brfss_17$newincome<-Recode(brfss_17$income2, recodes="1:2=1; 3:4=2; 5=3; 6=4; 7=5; 8=6; 77=NA; 99=NA")
table(brfss_17$newincome)
##
## 1 2 3 4 5 6
## 16559 28597 18533 25814 30049 71998
brfss_17$newincome2<-Recode(brfss_17$newincome, recodes="1='less than 15k'; 2='less than 25k'; 3='less than 35k'; 4='less than 50k'; 5='less than 75k'; 6='more than 75k'; else=NA", as.factor=T)
brfss_17$newincome2<-relevel(brfss_17$newincome2, ref = 'more than 75k')
table(brfss_17$newincome2)
##
## more than 75k less than 15k less than 25k less than 35k less than 50k
## 71998 16559 28597 18533 25814
## less than 75k
## 30049
brfss_17$educa2<-Recode(brfss_17$educa, recodes="1:2='0 Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad'; 9=NA", as.factor=T)
brfss_17$educa2<-relevel(brfss_17$educa2, ref = '2hsgrad')
table(brfss_17$educa2)
##
## 2hsgrad 0 Prim 1somehs 3somecol 4colgrad
## 55951 5133 9493 62309 97008
table(brfss_17$hispanc)
##
## 1 2 9
## 22022 206413 2440
brfss_17$hispanic<-Recode(brfss_17$hispanc, recodes="1=1; 2=0; else=NA")
table(brfss_17$hispanic)
##
## 0 1
## 206413 22022
brfss_17$hispanic2<-Recode(brfss_17$hispanic, recodes="1='Hispanic'; 0='NonHispanic'; else=NA", as.factor=T)
brfss_17$hispanic2<-relevel(brfss_17$hispanic2, ref = 'NonHispanic')
#Pattern of Missingness
summary(brfss_17[, c("hispanic2","usage2", "sex2b", "newincome2", "educa2", "mentaldays")])
## hispanic2 usage2 sex2b
## NonHispanic:206413 usage :191714 male :102083
## Hispanic : 22022 no usage: 36199 female:128633
## NA's : 2440 NA's : 2962 NA's : 159
##
##
##
##
## newincome2 educa2 mentaldays
## more than 75k:71998 2hsgrad :55951 Min. : 0.00
## less than 15k:16559 0 Prim : 5133 1st Qu.: 0.00
## less than 25k:28597 1somehs : 9493 Median : 0.00
## less than 35k:18533 3somecol:62309 Mean : 3.54
## less than 50k:25814 4colgrad:97008 3rd Qu.: 2.00
## less than 75k:30049 NA's : 981 Max. :30.00
## NA's :39325 NA's :3532
##The lowest numbers of missing belongs to biological sex: sex2b with only 159 missing. This is followed by education: educa2 which only has 981. Newincome2 has the highest with over 39 thousand missing.
summary(brfss_17$mentaldays)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 3.54 2.00 30.00 3532
#Mean for numeric data
brfss_17$mentaldays.imp.mean<-ifelse(is.na(brfss_17$mentaldays)==T, mean(brfss_17$mentaldays, na.rm=T), brfss_17$mentaldays)
mean(brfss_17$mentaldays, na.rm=T)
## [1] 3.540096
median(brfss_17$mentaldays, na.rm=T)
## [1] 0
median(brfss_17$mentaldays.imp.mean)
## [1] 0
var(brfss_17$mentaldays, na.rm=T)
## [1] 60.56478
var(brfss_17$mentaldays.imp.mean)
## [1] 59.63824
library(reshape2)
brfss_17%>%
select(mentaldays.imp.mean, mentaldays)%>%
melt()%>%
ggplot()+geom_freqpoly(aes(x=value, y=..density.., colour =variable))
## No id variables; using all as measure variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3532 rows containing non-finite values (stat_bin).
#Model Imputaqtion for categorical data
table(brfss_17$newincome2)
##
## more than 75k less than 15k less than 25k less than 35k less than 50k
## 71998 16559 28597 18533 25814
## less than 75k
## 30049
mcv.newincome2<-factor(names(which.max(table(brfss_17$newincome2))), levels=levels(brfss_17$newincome2))
mcv.newincome2
## [1] more than 75k
## 6 Levels: more than 75k less than 15k less than 25k ... less than 75k
#impute the cases
brfss_17$newincome2.imp<-as.factor(ifelse(is.na(brfss_17$newincome2)==T, mcv.newincome2, brfss_17$newincome2))
levels(brfss_17$newincome2.imp)<-levels(brfss_17$newincome2)
prop.table(table(brfss_17$newincome2))
##
## more than 75k less than 15k less than 25k less than 35k less than 50k
## 0.37587053 0.08644740 0.14929261 0.09675281 0.13476377
## less than 75k
## 0.15687288
prop.table(table(brfss_17$newincome2.imp))
##
## more than 75k less than 15k less than 25k less than 35k less than 50k
## 0.48217867 0.07172279 0.12386356 0.08027287 0.11180942
## less than 75k
## 0.13015268
barplot(prop.table(table(brfss_17$newincome2)), main="Original Data", ylim = c(0, .6))
barplot(prop.table(table(brfss_17$newincome2)), main = "Imputed Data", ylim = c(0,.6))
#There appears to be slight differences. There are more than 39 missing cases.
##Multiple Imputation
md.pattern(brfss_17[,c("mentaldays", "usage2", "sex2b", "newincome2", "educa2", "hispanic2")])
## sex2b educa2 hispanic2 usage2 mentaldays newincome2
## 187118 1 1 1 1 1 1 0
## 34399 1 1 1 1 1 0 1
## 2197 1 1 1 1 0 1 1
## 1068 1 1 1 1 0 0 2
## 478 1 1 1 0 1 1 1
## 2179 1 1 1 0 1 0 2
## 15 1 1 1 0 0 1 2
## 72 1 1 1 0 0 0 3
## 1268 1 1 0 1 1 1 1
## 769 1 1 0 1 1 0 2
## 52 1 1 0 1 0 1 2
## 51 1 1 0 1 0 0 3
## 13 1 1 0 0 1 1 2
## 70 1 1 0 0 1 0 3
## 7 1 1 0 0 0 0 4
## 268 1 0 1 1 1 1 1
## 393 1 0 1 1 1 0 2
## 20 1 0 1 1 0 1 2
## 30 1 0 1 1 0 0 3
## 3 1 0 1 0 1 1 2
## 65 1 0 1 0 1 0 3
## 5 1 0 1 0 0 0 4
## 20 1 0 0 1 1 1 2
## 104 1 0 0 1 1 0 3
## 7 1 0 0 1 0 0 4
## 1 1 0 0 0 1 1 3
## 41 1 0 0 0 1 0 4
## 3 1 0 0 0 0 0 5
## 82 0 1 1 1 1 1 1
## 31 0 1 1 1 1 0 2
## 4 0 1 1 1 0 1 2
## 2 0 1 1 0 1 0 3
## 7 0 1 0 1 1 1 2
## 11 0 1 0 1 1 0 3
## 1 0 1 0 0 1 0 4
## 2 0 0 1 1 1 1 2
## 3 0 0 1 1 1 0 3
## 1 0 0 1 0 1 0 4
## 2 0 0 0 1 1 1 3
## 7 0 0 0 1 1 0 4
## 5 0 0 0 0 1 0 5
## 1 0 0 0 0 0 0 6
## 159 981 2440 2962 3532 39325 49399
md.pairs(brfss_17[, c("mentaldays", "usage2", "sex2b", "newincome2", "educa2", "hispanic2")])
## $rr
## mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays 227343 224484 227189 189262 226428 225024
## usage2 224484 227913 227764 191040 227057 225615
## sex2b 227189 227764 230716 191453 229756 228310
## newincome2 189262 191040 191453 191550 191234 190187
## educa2 226428 227057 229756 191234 229894 227645
## hispanic2 225024 225615 228310 190187 227645 228435
##
## $rm
## mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays 0 2859 154 38081 915 2319
## usage2 3429 0 149 36873 856 2298
## sex2b 3527 2952 0 39263 960 2406
## newincome2 2288 510 97 0 316 1363
## educa2 3466 2837 138 38660 0 2249
## hispanic2 3411 2820 125 38248 790 0
##
## $mr
## mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays 0 3429 3527 2288 3466 3411
## usage2 2859 0 2952 510 2837 2820
## sex2b 154 149 0 97 138 125
## newincome2 38081 36873 39263 0 38660 38248
## educa2 915 856 960 316 0 790
## hispanic2 2319 2298 2406 1363 2249 0
##
## $mm
## mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays 3532 103 5 1244 66 121
## usage2 103 2962 10 2452 125 142
## sex2b 5 10 159 62 21 34
## newincome2 1244 2452 62 39325 665 1077
## educa2 66 125 21 665 981 191
## hispanic2 121 142 34 1077 191 2440
#Results are similar as the missing number for newincome 2 was 39,325; however, itn the multiple imputation, the missing value was 34399