#All of the variables in this dataset are from brfss 2017.

library(haven)
library(readr)
library (car)
## Loading required package: carData
library (mice)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library (dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (ggplot2)
setwd('C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets')
load("brfss_2017.rdata")
library(questionr)
## Warning: package 'questionr' was built under R version 3.6.2
names(brfss_17)
##   [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##   [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2" "medcost"  "checkup1"
##  [13] "bphigh4"  "bpmeds"   "cholchk1" "toldhi2"  "cholmed1" "cvdinfr4"
##  [19] "cvdcrhd4" "cvdstrk3" "asthma3"  "asthnow"  "chcscncr" "chcocncr"
##  [25] "chccopd1" "havarth3" "addepev2" "chckidny" "diabete3" "diabage2"
##  [31] "lmtjoin3" "arthdis2" "arthsocl" "joinpai1" "sex"      "marital" 
##  [37] "educa"    "renthom1" "numhhol2" "numphon2" "cpdemo1a" "veteran3"
##  [43] "employ1"  "children" "income2"  "internet" "weight2"  "height3" 
##  [49] "pregnant" "deaf"     "blind"    "decide"   "diffwalk" "diffdres"
##  [55] "diffalon" "smoke100" "smokday2" "stopsmk2" "lastsmk2" "usenow3" 
##  [61] "ecigaret" "ecignow"  "alcday5"  "avedrnk2" "drnk3ge5" "maxdrnks"
##  [67] "fruit2"   "fruitju2" "fvgreen1" "frenchf1" "potatoe1" "vegetab2"
##  [73] "exerany2" "exract11" "exeroft1" "exerhmm1" "exract21" "exeroft2"
##  [79] "exerhmm2" "strength" "seatbelt" "flushot6" "flshtmy2" "pneuvac3"
##  [85] "shingle2" "hivtst6"  "hivtstd3" "hivrisk5" "casthdx2" "casthno2"
##  [91] "callbckz" "wdusenow" "wdinftrk" "wdhowoft" "wdshare"  "namtribe"
##  [97] "namothr"  "urbnrrl"  "ststr"    "impsex"   "rfhlth"   "phys14d" 
## [103] "ment14d"  "hcvu651"  "rfhype5"  "cholch1"  "rfchol1"  "michd"   
## [109] "ltasth1"  "casthm1"  "asthms1"  "drdxar1"  "lmtact1"  "lmtwrk1" 
## [115] "lmtscl1"  "prace1"   "mrace1"   "hispanc"  "race"     "raceg21" 
## [121] "racegr3"  "ageg5yr"  "age65yr"  "age80"    "ageg"     "wtkg3"   
## [127] "bmi5"     "bmi5cat"  "rfbmi5"   "educag"   "incomg"   "smoker3" 
## [133] "rfsmok3"  "ecigsts"  "curecig"  "drnkany5" "rfbing5"  "drnkwek" 
## [139] "rfdrhv5"  "ftjuda2"  "frutda2"  "grenda1"  "frnchda"  "potada1" 
## [145] "vegeda2"  "misfrt1"  "misveg1"  "frtres1"  "vegres1"  "frutsu1" 
## [151] "vegesu1"  "frtlt1a"  "veglt1a"  "frt16a"   "veg23a"   "fruite1" 
## [157] "vegete1"  "totinda"  "minac11"  "minac21"  "pacat1"   "paindx1" 
## [163] "pa150r2"  "pa300r2"  "pa30021"  "pastrng"  "parec1"   "pastae1" 
## [169] "rfseat2"  "rfseat3"  "flshot6"  "pneumo2"  "aidtst3"  "mmsa"    
## [175] "mmsawt"   "seqno"    "mmsaname"

##Recoding of variables

table(brfss_17$menthlth)
## 
##      1      2      3      4      5      6      7      8      9     10 
##   7526  11574   6897   3484   8490    973   3285    672    112   6041 
##     11     12     13     14     15     16     17     18     19     20 
##     42    467     72   1244   5567    110     84    130      8   3405 
##     21     22     23     24     25     26     27     28     29     30 
##    217     65     39     58   1169     49     98    329    198  12112 
##     77     88     99 
##   2433 152826   1099
brfss_17$mentaldays<-Recode(brfss_17$menthlth, recodes="88=0; 77=NA; 99=NA")
table(brfss_17$mentaldays)
## 
##      0      1      2      3      4      5      6      7      8      9 
## 152826   7526  11574   6897   3484   8490    973   3285    672    112 
##     10     11     12     13     14     15     16     17     18     19 
##   6041     42    467     72   1244   5567    110     84    130      8 
##     20     21     22     23     24     25     26     27     28     29 
##   3405    217     65     39     58   1169     49     98    329    198 
##     30 
##  12112
hist(brfss_17$mentaldays)

summary(brfss_17$mentaldays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    3.54    2.00   30.00    3532

#The average number of mental days is about 4 days.

table(brfss_17$internet)
## 
##      1      2      7      9 
## 191714  36199    228    409
brfss_17$usage<-Recode(brfss_17$internet, recodes="1=1; 2=0; 7=NA; 9=NA; else=NA")
table(brfss_17$usage)
## 
##      0      1 
##  36199 191714
brfss_17$usage2<-Recode(brfss_17$usage, recodes="1='usage'; 0='no usage'; else=NA", as.factor=T)
brfss_17$usage2<-relevel(brfss_17$usage2, ref = 'usage')
table(brfss_17$usage2)
## 
##    usage no usage 
##   191714    36199
table(brfss_17$sex)
## 
##      1      2      9 
## 102083 128633    159
brfss_17$sex2<-Recode(brfss_17$sex, recodes="1=1; 2=0; else=NA")
table(brfss_17$sex2)
## 
##      0      1 
## 128633 102083
brfss_17$sex2b<-Recode(brfss_17$sex2, recodes="1='male'; 0='female'; else=NA", as.factor=T)
brfss_17$sex2b<-relevel(brfss_17$sex2b, ref = 'male')
table(brfss_17$sex2b)
## 
##   male female 
## 102083 128633
table(brfss_17$income2)
## 
##     1     2     3     4     5     6     7     8    77    99 
##  8193  8366 12695 15902 18533 25814 30049 71998 16588 21037
brfss_17$newincome<-Recode(brfss_17$income2, recodes="1:2=1; 3:4=2; 5=3; 6=4; 7=5; 8=6; 77=NA; 99=NA")
table(brfss_17$newincome)
## 
##     1     2     3     4     5     6 
## 16559 28597 18533 25814 30049 71998
brfss_17$newincome2<-Recode(brfss_17$newincome, recodes="1='less than 15k'; 2='less than 25k'; 3='less than 35k'; 4='less than 50k'; 5='less than 75k'; 6='more than 75k'; else=NA", as.factor=T)
brfss_17$newincome2<-relevel(brfss_17$newincome2, ref = 'more than 75k')
table(brfss_17$newincome2)
## 
## more than 75k less than 15k less than 25k less than 35k less than 50k 
##         71998         16559         28597         18533         25814 
## less than 75k 
##         30049
brfss_17$educa2<-Recode(brfss_17$educa, recodes="1:2='0 Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad'; 9=NA", as.factor=T)
brfss_17$educa2<-relevel(brfss_17$educa2, ref = '2hsgrad')
table(brfss_17$educa2)
## 
##  2hsgrad   0 Prim  1somehs 3somecol 4colgrad 
##    55951     5133     9493    62309    97008
table(brfss_17$hispanc)
## 
##      1      2      9 
##  22022 206413   2440
brfss_17$hispanic<-Recode(brfss_17$hispanc, recodes="1=1; 2=0; else=NA")
table(brfss_17$hispanic)
## 
##      0      1 
## 206413  22022
brfss_17$hispanic2<-Recode(brfss_17$hispanic, recodes="1='Hispanic'; 0='NonHispanic'; else=NA", as.factor=T)
brfss_17$hispanic2<-relevel(brfss_17$hispanic2, ref = 'NonHispanic')

#Pattern of Missingness

summary(brfss_17[, c("hispanic2","usage2", "sex2b", "newincome2", "educa2", "mentaldays")])
##        hispanic2           usage2          sex2b       
##  NonHispanic:206413   usage   :191714   male  :102083  
##  Hispanic   : 22022   no usage: 36199   female:128633  
##  NA's       :  2440   NA's    :  2962   NA's  :   159  
##                                                        
##                                                        
##                                                        
##                                                        
##          newincome2         educa2        mentaldays   
##  more than 75k:71998   2hsgrad :55951   Min.   : 0.00  
##  less than 15k:16559   0 Prim  : 5133   1st Qu.: 0.00  
##  less than 25k:28597   1somehs : 9493   Median : 0.00  
##  less than 35k:18533   3somecol:62309   Mean   : 3.54  
##  less than 50k:25814   4colgrad:97008   3rd Qu.: 2.00  
##  less than 75k:30049   NA's    :  981   Max.   :30.00  
##  NA's         :39325                    NA's   :3532

##The lowest numbers of missing belongs to biological sex: sex2b with only 159 missing. This is followed by education: educa2 which only has 981. Newincome2 has the highest with over 39 thousand missing.

summary(brfss_17$mentaldays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    3.54    2.00   30.00    3532

#Mean for numeric data

brfss_17$mentaldays.imp.mean<-ifelse(is.na(brfss_17$mentaldays)==T, mean(brfss_17$mentaldays, na.rm=T), brfss_17$mentaldays)
mean(brfss_17$mentaldays, na.rm=T)
## [1] 3.540096
median(brfss_17$mentaldays, na.rm=T)
## [1] 0
median(brfss_17$mentaldays.imp.mean)
## [1] 0
var(brfss_17$mentaldays, na.rm=T)
## [1] 60.56478
var(brfss_17$mentaldays.imp.mean)
## [1] 59.63824
library(reshape2)
brfss_17%>%
  select(mentaldays.imp.mean, mentaldays)%>%
  melt()%>%
  ggplot()+geom_freqpoly(aes(x=value, y=..density.., colour =variable))
## No id variables; using all as measure variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3532 rows containing non-finite values (stat_bin).

#Model Imputaqtion for categorical data

table(brfss_17$newincome2)
## 
## more than 75k less than 15k less than 25k less than 35k less than 50k 
##         71998         16559         28597         18533         25814 
## less than 75k 
##         30049
mcv.newincome2<-factor(names(which.max(table(brfss_17$newincome2))), levels=levels(brfss_17$newincome2))
mcv.newincome2
## [1] more than 75k
## 6 Levels: more than 75k less than 15k less than 25k ... less than 75k

#impute the cases

brfss_17$newincome2.imp<-as.factor(ifelse(is.na(brfss_17$newincome2)==T, mcv.newincome2, brfss_17$newincome2))
levels(brfss_17$newincome2.imp)<-levels(brfss_17$newincome2)
prop.table(table(brfss_17$newincome2))
## 
## more than 75k less than 15k less than 25k less than 35k less than 50k 
##    0.37587053    0.08644740    0.14929261    0.09675281    0.13476377 
## less than 75k 
##    0.15687288
prop.table(table(brfss_17$newincome2.imp))
## 
## more than 75k less than 15k less than 25k less than 35k less than 50k 
##    0.48217867    0.07172279    0.12386356    0.08027287    0.11180942 
## less than 75k 
##    0.13015268
barplot(prop.table(table(brfss_17$newincome2)), main="Original Data", ylim = c(0, .6))

barplot(prop.table(table(brfss_17$newincome2)), main = "Imputed Data", ylim = c(0,.6))

#There appears to be slight differences. There are more than 39 missing cases.

##Multiple Imputation

md.pattern(brfss_17[,c("mentaldays", "usage2", "sex2b", "newincome2", "educa2", "hispanic2")])

##        sex2b educa2 hispanic2 usage2 mentaldays newincome2      
## 187118     1      1         1      1          1          1     0
## 34399      1      1         1      1          1          0     1
## 2197       1      1         1      1          0          1     1
## 1068       1      1         1      1          0          0     2
## 478        1      1         1      0          1          1     1
## 2179       1      1         1      0          1          0     2
## 15         1      1         1      0          0          1     2
## 72         1      1         1      0          0          0     3
## 1268       1      1         0      1          1          1     1
## 769        1      1         0      1          1          0     2
## 52         1      1         0      1          0          1     2
## 51         1      1         0      1          0          0     3
## 13         1      1         0      0          1          1     2
## 70         1      1         0      0          1          0     3
## 7          1      1         0      0          0          0     4
## 268        1      0         1      1          1          1     1
## 393        1      0         1      1          1          0     2
## 20         1      0         1      1          0          1     2
## 30         1      0         1      1          0          0     3
## 3          1      0         1      0          1          1     2
## 65         1      0         1      0          1          0     3
## 5          1      0         1      0          0          0     4
## 20         1      0         0      1          1          1     2
## 104        1      0         0      1          1          0     3
## 7          1      0         0      1          0          0     4
## 1          1      0         0      0          1          1     3
## 41         1      0         0      0          1          0     4
## 3          1      0         0      0          0          0     5
## 82         0      1         1      1          1          1     1
## 31         0      1         1      1          1          0     2
## 4          0      1         1      1          0          1     2
## 2          0      1         1      0          1          0     3
## 7          0      1         0      1          1          1     2
## 11         0      1         0      1          1          0     3
## 1          0      1         0      0          1          0     4
## 2          0      0         1      1          1          1     2
## 3          0      0         1      1          1          0     3
## 1          0      0         1      0          1          0     4
## 2          0      0         0      1          1          1     3
## 7          0      0         0      1          1          0     4
## 5          0      0         0      0          1          0     5
## 1          0      0         0      0          0          0     6
##          159    981      2440   2962       3532      39325 49399
md.pairs(brfss_17[, c("mentaldays", "usage2", "sex2b", "newincome2", "educa2", "hispanic2")])
## $rr
##            mentaldays usage2  sex2b newincome2 educa2 hispanic2
## mentaldays     227343 224484 227189     189262 226428    225024
## usage2         224484 227913 227764     191040 227057    225615
## sex2b          227189 227764 230716     191453 229756    228310
## newincome2     189262 191040 191453     191550 191234    190187
## educa2         226428 227057 229756     191234 229894    227645
## hispanic2      225024 225615 228310     190187 227645    228435
## 
## $rm
##            mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays          0   2859   154      38081    915      2319
## usage2           3429      0   149      36873    856      2298
## sex2b            3527   2952     0      39263    960      2406
## newincome2       2288    510    97          0    316      1363
## educa2           3466   2837   138      38660      0      2249
## hispanic2        3411   2820   125      38248    790         0
## 
## $mr
##            mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays          0   3429  3527       2288   3466      3411
## usage2           2859      0  2952        510   2837      2820
## sex2b             154    149     0         97    138       125
## newincome2      38081  36873 39263          0  38660     38248
## educa2            915    856   960        316      0       790
## hispanic2        2319   2298  2406       1363   2249         0
## 
## $mm
##            mentaldays usage2 sex2b newincome2 educa2 hispanic2
## mentaldays       3532    103     5       1244     66       121
## usage2            103   2962    10       2452    125       142
## sex2b               5     10   159         62     21        34
## newincome2       1244   2452    62      39325    665      1077
## educa2             66    125    21        665    981       191
## hispanic2         121    142    34       1077    191      2440

#Results are similar as the missing number for newincome 2 was 39,325; however, itn the multiple imputation, the missing value was 34399