library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")
nams<-names(tb19)
head(nams, n=10)
##  [1] "year"     "ststr"    "state"    "fmonth"   "dispcode" "orgseqno"
##  [7] "c01q01"   "fairpoor" "c02q01"   "phyng"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(tb19)<-newnames

#recode

#child_ins
#tb19$tx05q02 = as.factor(tb19$tx05q02)
tb19$c_ins<-Recode(tb19$tx05q02, recodes="1='parents_employer';2='medicaid/medicare'; 3='CHIP';4='other'; 5='none'; else=NA", as.factor=T)

#harveyhomedamage
tb19$tx04q04 = as.factor(tb19$tx04q04)
tb19$hdamage<-Recode(tb19$tx04q04, recodes="1='midly';2='moderate'; 3='severe';4='none'; else=NA", as.factor=T)

#harveyvermon
tb19$tx05q07 = as.factor(tb19$tx05q07)
tb19$vermon<-Recode(tb19$tx05q07, recodes="1=1;2=0; else=NA", as.factor=F)

#harveymold
tb19$tx05q06 = as.factor(tb19$tx05q06)
tb19$mold<-Recode(tb19$tx05q06, recodes="1=1;2=0; else=NA", as.factor=F)

#boygirl
#tb19$m30q02 = as.factor(tb19$m30q02)
tb19$cg<-Recode(tb19$m30q02, recodes="1='Boy';2='Girl'; else=NA", as.factor=T)

#ChildAsthma(outcome,binary1,0)
tb19$m31q01 = as.factor(tb19$m31q01)
tb19$casthma<-Recode(tb19$m31q01, recodes="1=1;2=0; else=NA", as.factor=F)

Measure an outcome variable and at least 5 predictors. Report the pattern of missingness among all of these variables

#outcome variable is casthma, five predictors are child insurance, mold, home damage, child gender, and vermon. 

summary(tb19[, c("casthma", "c_ins",  "mold", "hdamage","cg","vermon" )])
##     casthma                    c_ins            mold           hdamage     
##  Min.   :0.000   CHIP             :   20   Min.   :0.000   midly   :  620  
##  1st Qu.:0.000   medicaid/medicare:   99   1st Qu.:0.000   moderate:  519  
##  Median :0.000   none             :   16   Median :0.000   none    :  815  
##  Mean   :0.114   other            :   10   Mean   :0.153   severe  :  337  
##  3rd Qu.:0.000   parents_employer :   98   3rd Qu.:0.000   NA's    :10006  
##  Max.   :1.000   NA's             :12054   Max.   :1.000                   
##  NA's   :9892                              NA's   :12049                   
##     cg           vermon     
##  Boy :1301   Min.   :0.000  
##  Girl:1123   1st Qu.:0.000  
##  NA's:9873   Median :0.000  
##              Mean   :0.169  
##              3rd Qu.:0.000  
##              Max.   :1.000  
##              NA's   :12055

Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?

summary(tb19$casthma)#numeric variable 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.114   0.000   1.000    9892
#mean for Child Asthma (casthma) is 0.144

tb19$casthma.imp.mean<-ifelse(is.na(tb19$casthma)==T, mean(tb19$casthma, na.rm=T), tb19$casthma)
mean(tb19$casthma, na.rm=T)
## [1] 0.1135135
mean(tb19$casthma.imp.mean) 
## [1] 0.1135135
#mean when replacing the missing with mean is 0.1135135

fit<-lm(casthma~c_ins+mold+hdamage+cg, tb19)
print(fit)
## 
## Call:
## lm(formula = casthma ~ c_ins + mold + hdamage + cg, data = tb19)
## 
## Coefficients:
##            (Intercept)  c_insmedicaid/medicare               c_insnone  
##              1.000e+00              -2.493e-16              -5.667e-16  
##             c_insother   c_insparents_employer                    mold  
##             -4.573e-16              -8.441e-16               2.224e-16  
##        hdamagemoderate             hdamagenone           hdamagesevere  
##             -3.755e-16              -8.143e-16              -4.035e-16  
##                 cgGirl  
##             -9.850e-16
median(tb19$casthma, na.rm=T)
## [1] 0
median(tb19$casthma.imp.mean) 
## [1] 0.1135135
var(tb19$casthma, na.rm=T)
## [1] 0.1006701
var(tb19$casthma.imp.mean) 
## [1] 0.01968208
#original data for child asthma has the mean at zero, but with imputted mean the mean goes changes to 0.1135135. When it comes to variance this is some small variation between original and imputted data.  

for mold

summary(tb19$mold)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.153   0.000   1.000   12049
tb19$mold.imp.mean<-ifelse(is.na(tb19$mold)==T, mean(tb19$mold, na.rm=T), tb19$mold)
mean(tb19$mold, na.rm=T)
## [1] 0.1532258
mean(tb19$mold.imp.mean) 
## [1] 0.1532258
#The mean for original and imputted data for mold is not different. They are the same. 

Vermon

summary(tb19$vermon)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.169   0.000   1.000   12055
tb19$vermon.imp.mean<-ifelse(is.na(tb19$vermon)==T, mean(tb19$vermon, na.rm=T), tb19$vermon)
mean(tb19$vermon, na.rm=T)
## [1] 0.1694215
mean(tb19$vermon.imp.mean) 
## [1] 0.1694215
#The means for both original and imputted data for vermon are the same....not sure if this is suppose to happen. 
#plot the histogram
library(reshape2)

tb19%>%
  select(casthma.imp.mean, casthma)%>%
  melt()%>%
  ggplot()+geom_freqpoly(aes(x = value,
     y = ..density.., colour = variable))
## No id variables; using all as measure variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9892 rows containing non-finite values (stat_bin).

#Based on the histogram, the peak of the distribution is lower when using imputted values. 

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

Modal Imputation for categorical data

#categorical variable will be child's gender: boy or girl. 
table(tb19$cg)
## 
##  Boy Girl 
## 1301 1123
#find the most common value
mcv.cg<-factor(names(which.max(table(tb19$cg))), levels=levels(tb19$cg))
mcv.cg
## [1] Boy
## Levels: Boy Girl
#most common value appears to be boy

#Impute Cases

tb19$cg.imp<-as.factor(ifelse(is.na(tb19$cg)==T, mcv.cg, tb19$cg))
levels(tb19$cg.imp)<-levels(tb19$cg)

prop.table(table(tb19$cg))
## 
##       Boy      Girl 
## 0.5367162 0.4632838
prop.table(table(tb19$cg.imp))
## 
##        Boy       Girl 
## 0.90867691 0.09132309
barplot(prop.table(table(tb19$cg)), main="Original Data For Child Gender", ylim=c(0, .6))

barplot(prop.table(table(tb19$cg.imp)), main="Imputed Data for Child Gender",ylim=c(0, 1))

#There appears to be some difference between original child gender data vs imputatted child gender data. The data for girl is lower when using imputted data. 

Now we will try with home damage variable another categorical variable

#variable: hdamage - home damage due to Hurricane Harvey 
table(tb19$hdamage)
## 
##    midly moderate     none   severe 
##      620      519      815      337
# Most common value: home damage
mcv.hdamage<-factor(names(which.max(table(tb19$hdamage))), levels = levels(tb19$hdamage))
mcv.hdamage
## [1] none
## Levels: midly moderate none severe
 #impute cases home damage
tb19$hdamage.imp<-as.factor(ifelse(is.na(tb19$hdamage)==T, mcv.hdamage, tb19$hdamage))
levels(tb19$hdamage.imp)<-levels(as.factor(tb19$hdamage))

prop.table(table(tb19$hdamage))
## 
##     midly  moderate      none    severe 
## 0.2706242 0.2265386 0.3557399 0.1470973
# Bar plot Home Damage
barplot(prop.table(table(tb19$hdamage)), main="Original Data Home Damage", ylim=c(0, .6))

# Bar Plot Imputed Data Home Damage
barplot(prop.table(table(tb19$hdamage.imp)), main="Imputed Data Home Damage", ylim=c(0, 1))

#data for home damage due to Hurricane Harvey is has more variance with the original data, with the imputted data, however, there seems to be higher numbers for no home damage. 

Child Insurance

table(tb19$c_ins)
## 
##              CHIP medicaid/medicare              none             other 
##                20                99                16                10 
##  parents_employer 
##                98
mcv.c_ins<-factor(names(which.max(table(tb19$c_ins))), levels = levels(tb19$c_ins))
mcv.c_ins
## [1] medicaid/medicare
## Levels: CHIP medicaid/medicare none other parents_employer
tb19$c_ins.imp<-as.factor(ifelse(is.na(tb19$c_ins)==T, mcv.c_ins, tb19$c_ins))
levels(tb19$c_ins.imp)<-levels(as.factor(tb19$c_ins))

prop.table(table(tb19$c_ins))
## 
##              CHIP medicaid/medicare              none             other 
##        0.08230453        0.40740741        0.06584362        0.04115226 
##  parents_employer 
##        0.40329218
barplot(prop.table(table(tb19$c_ins)), main="Original Data Child Insurance", ylim=c(0, .6))

barplot(prop.table(table(tb19$c_ins.imp)), main="Imputed Data for Child Insurance", ylim=c(0, 1))

#the original data for child insurance vs imputted data for child insurance are very different. There is more variation with original child insurance data. With the imputted child insurance data, medicaid/medicare seems to be the main type of child insurance. 

Testing for MAR

fit1<-lm(casthma~vermon+cg+c_ins+is.na(hdamage), data=tb19)
summary(fit1)
## 
## Call:
## lm(formula = casthma ~ vermon + cg + c_ins + is.na(hdamage), 
##     data = tb19)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.130e-16 -3.230e-16 -1.830e-16  7.800e-17  3.321e-14 
## 
## Coefficients:
##                          Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)             1.000e+00  5.508e-16  1.816e+15   <2e-16 ***
## vermon                 -1.484e-16  4.196e-16 -3.540e-01    0.724    
## cgGirl                 -2.898e-16  3.076e-16 -9.420e-01    0.347    
## c_insmedicaid/medicare  3.351e-16  5.551e-16  6.040e-01    0.547    
## c_insnone              -3.481e-17  7.758e-16 -4.500e-02    0.964    
## c_insother             -9.252e-17  9.074e-16 -1.020e-01    0.919    
## c_insparents_employer  -9.435e-17  5.580e-16 -1.690e-01    0.866    
## is.na(hdamage)TRUE      2.526e-16  3.356e-16  7.530e-01    0.452    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.239e-15 on 224 degrees of freedom
##   (12065 observations deleted due to missingness)
## Multiple R-squared:  0.5002, Adjusted R-squared:  0.4846 
## F-statistic: 32.03 on 7 and 224 DF,  p-value: < 2.2e-16
#home damage is the variable used to see whether we have missing at random within the data. Data indicates that those with missing home damage have a ....higher rate of child asthma. I dont think this is correct. 

Patterns of missingness

md.pattern(tb19[,c("casthma", "hdamage", "cg","c_ins","vermon")])

##        cg casthma hdamage c_ins vermon      
## 79      1       1       1     1      1     0
## 4       1       1       1     1      0     1
## 2       1       1       1     0      1     1
## 523     1       1       1     0      0     2
## 153     1       1       0     1      1     1
## 5       1       1       0     1      0     2
## 5       1       1       0     0      1     2
## 1599    1       1       0     0      0     3
## 11      1       0       1     0      0     3
## 43      1       0       0     0      0     4
## 9       0       1       1     0      0     3
## 2       0       1       0     1      1     2
## 1       0       1       0     0      1     3
## 23      0       1       0     0      0     4
## 1663    0       0       1     0      0     4
## 8175    0       0       0     0      0     5
##      9873    9892   10006 12054  12055 53880
md.pairs(tb19[,c("casthma", "hdamage", "cg","c_ins","vermon")])
## $rr
##         casthma hdamage   cg c_ins vermon
## casthma    2405     617 2370   243    242
## hdamage     617    2291  619    83     81
## cg         2370     619 2424   241    239
## c_ins       243      83  241   243    234
## vermon      242      81  239   234    242
## 
## $rm
##         casthma hdamage   cg c_ins vermon
## casthma       0    1788   35  2162   2163
## hdamage    1674       0 1672  2208   2210
## cg           54    1805    0  2183   2185
## c_ins         0     160    2     0      9
## vermon        0     161    3     8      0
## 
## $mr
##         casthma hdamage   cg c_ins vermon
## casthma       0    1674   54     0      0
## hdamage    1788       0 1805   160    161
## cg           35    1672    0     2      3
## c_ins      2162    2208 2183     0      8
## vermon     2163    2210 2185     9      0
## 
## $mm
##         casthma hdamage   cg c_ins vermon
## casthma    9892    8218 9838  9892   9892
## hdamage    8218   10006 8201  9846   9845
## cg         9838    8201 9873  9871   9870
## c_ins      9892    9846 9871 12054  12046
## vermon     9892    9845 9870 12046  12055
#There is a lot of missing data in my data...

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

dat2<-tb19
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$casthmami<-dat2$casthma
dat2$casthmami[samp2]<-NA

head(dat2[, c("casthmami","casthma")])
#data show NAs on both casthmmami and casthma. 

imp<-mice(data = dat2[,c("casthma", "hdamage", "cg","c_ins","vermon")], m = 5)
## 
##  iter imp variable
##   1   1  casthma  hdamage  cg  c_ins  vermon
##   1   2  casthma  hdamage  cg  c_ins  vermon
##   1   3  casthma  hdamage  cg  c_ins  vermon
##   1   4  casthma  hdamage  cg  c_ins  vermon
##   1   5  casthma  hdamage  cg  c_ins  vermon
##   2   1  casthma  hdamage  cg  c_ins  vermon
##   2   2  casthma  hdamage  cg  c_ins  vermon
##   2   3  casthma  hdamage  cg  c_ins  vermon
##   2   4  casthma  hdamage  cg  c_ins  vermon
##   2   5  casthma  hdamage  cg  c_ins  vermon
##   3   1  casthma  hdamage  cg  c_ins  vermon
##   3   2  casthma  hdamage  cg  c_ins  vermon
##   3   3  casthma  hdamage  cg  c_ins  vermon
##   3   4  casthma  hdamage  cg  c_ins  vermon
##   3   5  casthma  hdamage  cg  c_ins  vermon
##   4   1  casthma  hdamage  cg  c_ins  vermon
##   4   2  casthma  hdamage  cg  c_ins  vermon
##   4   3  casthma  hdamage  cg  c_ins  vermon
##   4   4  casthma  hdamage  cg  c_ins  vermon
##   4   5  casthma  hdamage  cg  c_ins  vermon
##   5   1  casthma  hdamage  cg  c_ins  vermon
##   5   2  casthma  hdamage  cg  c_ins  vermon
##   5   3  casthma  hdamage  cg  c_ins  vermon
##   5   4  casthma  hdamage  cg  c_ins  vermon
##   5   5  casthma  hdamage  cg  c_ins  vermon
## Warning: Number of logged events: 50
print(imp)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##   casthma   hdamage        cg     c_ins    vermon 
##     "pmm" "polyreg"  "logreg" "polyreg"     "pmm" 
## PredictorMatrix:
##         casthma hdamage cg c_ins vermon
## casthma       0       1  1     1      1
## hdamage       1       0  1     1      1
## cg            1       1  0     1      1
## c_ins         1       1  1     0      1
## vermon        1       1  1     1      0
## Number of logged events:  50 
##   it im    dep    meth     out
## 1  1  1  c_ins polyreg casthma
## 2  1  1 vermon     pmm casthma
## 3  1  2  c_ins polyreg casthma
## 4  1  2 vermon     pmm casthma
## 5  1  3  c_ins polyreg casthma
## 6  1  3 vermon     pmm casthma
#data shows the number of imputations done  and the total missingness and which imputation method for each variable since they are all different class.My data shows all NAs for casthmami and casthma. What am i doing wrong?! 
head(imp$imp$casthma)

Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
#Compare to the original data
head(tb19[,c("casthma", "c_ins",  "mold", "hdamage","cg","vermon" )], n=10)
#There is something wrong with my data. I dont know if it is the recoding or something else but i keep getting NAs when i run the model fits.