library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")
nams<-names(tb19)
head(nams, n=10)
## [1] "year" "ststr" "state" "fmonth" "dispcode" "orgseqno"
## [7] "c01q01" "fairpoor" "c02q01" "phyng"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(tb19)<-newnames
#recode
#child_ins
#tb19$tx05q02 = as.factor(tb19$tx05q02)
tb19$c_ins<-Recode(tb19$tx05q02, recodes="1='parents_employer';2='medicaid/medicare'; 3='CHIP';4='other'; 5='none'; else=NA", as.factor=T)
#harveyhomedamage
tb19$tx04q04 = as.factor(tb19$tx04q04)
tb19$hdamage<-Recode(tb19$tx04q04, recodes="1='midly';2='moderate'; 3='severe';4='none'; else=NA", as.factor=T)
#harveyvermon
tb19$tx05q07 = as.factor(tb19$tx05q07)
tb19$vermon<-Recode(tb19$tx05q07, recodes="1=1;2=0; else=NA", as.factor=F)
#harveymold
tb19$tx05q06 = as.factor(tb19$tx05q06)
tb19$mold<-Recode(tb19$tx05q06, recodes="1=1;2=0; else=NA", as.factor=F)
#boygirl
#tb19$m30q02 = as.factor(tb19$m30q02)
tb19$cg<-Recode(tb19$m30q02, recodes="1='Boy';2='Girl'; else=NA", as.factor=T)
#ChildAsthma(outcome,binary1,0)
tb19$m31q01 = as.factor(tb19$m31q01)
tb19$casthma<-Recode(tb19$m31q01, recodes="1=1;2=0; else=NA", as.factor=F)
Measure an outcome variable and at least 5 predictors. Report the pattern of missingness among all of these variables
#outcome variable is casthma, five predictors are child insurance, mold, home damage, child gender, and vermon.
summary(tb19[, c("casthma", "c_ins", "mold", "hdamage","cg","vermon" )])
## casthma c_ins mold hdamage
## Min. :0.000 CHIP : 20 Min. :0.000 midly : 620
## 1st Qu.:0.000 medicaid/medicare: 99 1st Qu.:0.000 moderate: 519
## Median :0.000 none : 16 Median :0.000 none : 815
## Mean :0.114 other : 10 Mean :0.153 severe : 337
## 3rd Qu.:0.000 parents_employer : 98 3rd Qu.:0.000 NA's :10006
## Max. :1.000 NA's :12054 Max. :1.000
## NA's :9892 NA's :12049
## cg vermon
## Boy :1301 Min. :0.000
## Girl:1123 1st Qu.:0.000
## NA's:9873 Median :0.000
## Mean :0.169
## 3rd Qu.:0.000
## Max. :1.000
## NA's :12055
Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?
summary(tb19$casthma)#numeric variable
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.114 0.000 1.000 9892
#mean for Child Asthma (casthma) is 0.144
tb19$casthma.imp.mean<-ifelse(is.na(tb19$casthma)==T, mean(tb19$casthma, na.rm=T), tb19$casthma)
mean(tb19$casthma, na.rm=T)
## [1] 0.1135135
mean(tb19$casthma.imp.mean)
## [1] 0.1135135
#mean when replacing the missing with mean is 0.1135135
fit<-lm(casthma~c_ins+mold+hdamage+cg, tb19)
print(fit)
##
## Call:
## lm(formula = casthma ~ c_ins + mold + hdamage + cg, data = tb19)
##
## Coefficients:
## (Intercept) c_insmedicaid/medicare c_insnone
## 1.000e+00 -2.493e-16 -5.667e-16
## c_insother c_insparents_employer mold
## -4.573e-16 -8.441e-16 2.224e-16
## hdamagemoderate hdamagenone hdamagesevere
## -3.755e-16 -8.143e-16 -4.035e-16
## cgGirl
## -9.850e-16
median(tb19$casthma, na.rm=T)
## [1] 0
median(tb19$casthma.imp.mean)
## [1] 0.1135135
var(tb19$casthma, na.rm=T)
## [1] 0.1006701
var(tb19$casthma.imp.mean)
## [1] 0.01968208
#original data for child asthma has the mean at zero, but with imputted mean the mean goes changes to 0.1135135. When it comes to variance this is some small variation between original and imputted data.
for mold
summary(tb19$mold)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.153 0.000 1.000 12049
tb19$mold.imp.mean<-ifelse(is.na(tb19$mold)==T, mean(tb19$mold, na.rm=T), tb19$mold)
mean(tb19$mold, na.rm=T)
## [1] 0.1532258
mean(tb19$mold.imp.mean)
## [1] 0.1532258
#The mean for original and imputted data for mold is not different. They are the same.
Vermon
summary(tb19$vermon)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.169 0.000 1.000 12055
tb19$vermon.imp.mean<-ifelse(is.na(tb19$vermon)==T, mean(tb19$vermon, na.rm=T), tb19$vermon)
mean(tb19$vermon, na.rm=T)
## [1] 0.1694215
mean(tb19$vermon.imp.mean)
## [1] 0.1694215
#The means for both original and imputted data for vermon are the same....not sure if this is suppose to happen.
#plot the histogram
library(reshape2)
tb19%>%
select(casthma.imp.mean, casthma)%>%
melt()%>%
ggplot()+geom_freqpoly(aes(x = value,
y = ..density.., colour = variable))
## No id variables; using all as measure variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9892 rows containing non-finite values (stat_bin).
#Based on the histogram, the peak of the distribution is lower when using imputted values.
Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?
Modal Imputation for categorical data
#categorical variable will be child's gender: boy or girl.
table(tb19$cg)
##
## Boy Girl
## 1301 1123
#find the most common value
mcv.cg<-factor(names(which.max(table(tb19$cg))), levels=levels(tb19$cg))
mcv.cg
## [1] Boy
## Levels: Boy Girl
#most common value appears to be boy
#Impute Cases
tb19$cg.imp<-as.factor(ifelse(is.na(tb19$cg)==T, mcv.cg, tb19$cg))
levels(tb19$cg.imp)<-levels(tb19$cg)
prop.table(table(tb19$cg))
##
## Boy Girl
## 0.5367162 0.4632838
prop.table(table(tb19$cg.imp))
##
## Boy Girl
## 0.90867691 0.09132309
barplot(prop.table(table(tb19$cg)), main="Original Data For Child Gender", ylim=c(0, .6))
barplot(prop.table(table(tb19$cg.imp)), main="Imputed Data for Child Gender",ylim=c(0, 1))
#There appears to be some difference between original child gender data vs imputatted child gender data. The data for girl is lower when using imputted data.
Now we will try with home damage variable another categorical variable
#variable: hdamage - home damage due to Hurricane Harvey
table(tb19$hdamage)
##
## midly moderate none severe
## 620 519 815 337
# Most common value: home damage
mcv.hdamage<-factor(names(which.max(table(tb19$hdamage))), levels = levels(tb19$hdamage))
mcv.hdamage
## [1] none
## Levels: midly moderate none severe
#impute cases home damage
tb19$hdamage.imp<-as.factor(ifelse(is.na(tb19$hdamage)==T, mcv.hdamage, tb19$hdamage))
levels(tb19$hdamage.imp)<-levels(as.factor(tb19$hdamage))
prop.table(table(tb19$hdamage))
##
## midly moderate none severe
## 0.2706242 0.2265386 0.3557399 0.1470973
# Bar plot Home Damage
barplot(prop.table(table(tb19$hdamage)), main="Original Data Home Damage", ylim=c(0, .6))
# Bar Plot Imputed Data Home Damage
barplot(prop.table(table(tb19$hdamage.imp)), main="Imputed Data Home Damage", ylim=c(0, 1))
#data for home damage due to Hurricane Harvey is has more variance with the original data, with the imputted data, however, there seems to be higher numbers for no home damage.
Child Insurance
table(tb19$c_ins)
##
## CHIP medicaid/medicare none other
## 20 99 16 10
## parents_employer
## 98
mcv.c_ins<-factor(names(which.max(table(tb19$c_ins))), levels = levels(tb19$c_ins))
mcv.c_ins
## [1] medicaid/medicare
## Levels: CHIP medicaid/medicare none other parents_employer
tb19$c_ins.imp<-as.factor(ifelse(is.na(tb19$c_ins)==T, mcv.c_ins, tb19$c_ins))
levels(tb19$c_ins.imp)<-levels(as.factor(tb19$c_ins))
prop.table(table(tb19$c_ins))
##
## CHIP medicaid/medicare none other
## 0.08230453 0.40740741 0.06584362 0.04115226
## parents_employer
## 0.40329218
barplot(prop.table(table(tb19$c_ins)), main="Original Data Child Insurance", ylim=c(0, .6))
barplot(prop.table(table(tb19$c_ins.imp)), main="Imputed Data for Child Insurance", ylim=c(0, 1))
#the original data for child insurance vs imputted data for child insurance are very different. There is more variation with original child insurance data. With the imputted child insurance data, medicaid/medicare seems to be the main type of child insurance.
Testing for MAR
fit1<-lm(casthma~vermon+cg+c_ins+is.na(hdamage), data=tb19)
summary(fit1)
##
## Call:
## lm(formula = casthma ~ vermon + cg + c_ins + is.na(hdamage),
## data = tb19)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.130e-16 -3.230e-16 -1.830e-16 7.800e-17 3.321e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 5.508e-16 1.816e+15 <2e-16 ***
## vermon -1.484e-16 4.196e-16 -3.540e-01 0.724
## cgGirl -2.898e-16 3.076e-16 -9.420e-01 0.347
## c_insmedicaid/medicare 3.351e-16 5.551e-16 6.040e-01 0.547
## c_insnone -3.481e-17 7.758e-16 -4.500e-02 0.964
## c_insother -9.252e-17 9.074e-16 -1.020e-01 0.919
## c_insparents_employer -9.435e-17 5.580e-16 -1.690e-01 0.866
## is.na(hdamage)TRUE 2.526e-16 3.356e-16 7.530e-01 0.452
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.239e-15 on 224 degrees of freedom
## (12065 observations deleted due to missingness)
## Multiple R-squared: 0.5002, Adjusted R-squared: 0.4846
## F-statistic: 32.03 on 7 and 224 DF, p-value: < 2.2e-16
#home damage is the variable used to see whether we have missing at random within the data. Data indicates that those with missing home damage have a ....higher rate of child asthma. I dont think this is correct.
Patterns of missingness
md.pattern(tb19[,c("casthma", "hdamage", "cg","c_ins","vermon")])
## cg casthma hdamage c_ins vermon
## 79 1 1 1 1 1 0
## 4 1 1 1 1 0 1
## 2 1 1 1 0 1 1
## 523 1 1 1 0 0 2
## 153 1 1 0 1 1 1
## 5 1 1 0 1 0 2
## 5 1 1 0 0 1 2
## 1599 1 1 0 0 0 3
## 11 1 0 1 0 0 3
## 43 1 0 0 0 0 4
## 9 0 1 1 0 0 3
## 2 0 1 0 1 1 2
## 1 0 1 0 0 1 3
## 23 0 1 0 0 0 4
## 1663 0 0 1 0 0 4
## 8175 0 0 0 0 0 5
## 9873 9892 10006 12054 12055 53880
md.pairs(tb19[,c("casthma", "hdamage", "cg","c_ins","vermon")])
## $rr
## casthma hdamage cg c_ins vermon
## casthma 2405 617 2370 243 242
## hdamage 617 2291 619 83 81
## cg 2370 619 2424 241 239
## c_ins 243 83 241 243 234
## vermon 242 81 239 234 242
##
## $rm
## casthma hdamage cg c_ins vermon
## casthma 0 1788 35 2162 2163
## hdamage 1674 0 1672 2208 2210
## cg 54 1805 0 2183 2185
## c_ins 0 160 2 0 9
## vermon 0 161 3 8 0
##
## $mr
## casthma hdamage cg c_ins vermon
## casthma 0 1674 54 0 0
## hdamage 1788 0 1805 160 161
## cg 35 1672 0 2 3
## c_ins 2162 2208 2183 0 8
## vermon 2163 2210 2185 9 0
##
## $mm
## casthma hdamage cg c_ins vermon
## casthma 9892 8218 9838 9892 9892
## hdamage 8218 10006 8201 9846 9845
## cg 9838 8201 9873 9871 9870
## c_ins 9892 9846 9871 12054 12046
## vermon 9892 9845 9870 12046 12055
#There is a lot of missing data in my data...
Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?
dat2<-tb19
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$casthmami<-dat2$casthma
dat2$casthmami[samp2]<-NA
head(dat2[, c("casthmami","casthma")])
#data show NAs on both casthmmami and casthma.
imp<-mice(data = dat2[,c("casthma", "hdamage", "cg","c_ins","vermon")], m = 5)
##
## iter imp variable
## 1 1 casthma hdamage cg c_ins vermon
## 1 2 casthma hdamage cg c_ins vermon
## 1 3 casthma hdamage cg c_ins vermon
## 1 4 casthma hdamage cg c_ins vermon
## 1 5 casthma hdamage cg c_ins vermon
## 2 1 casthma hdamage cg c_ins vermon
## 2 2 casthma hdamage cg c_ins vermon
## 2 3 casthma hdamage cg c_ins vermon
## 2 4 casthma hdamage cg c_ins vermon
## 2 5 casthma hdamage cg c_ins vermon
## 3 1 casthma hdamage cg c_ins vermon
## 3 2 casthma hdamage cg c_ins vermon
## 3 3 casthma hdamage cg c_ins vermon
## 3 4 casthma hdamage cg c_ins vermon
## 3 5 casthma hdamage cg c_ins vermon
## 4 1 casthma hdamage cg c_ins vermon
## 4 2 casthma hdamage cg c_ins vermon
## 4 3 casthma hdamage cg c_ins vermon
## 4 4 casthma hdamage cg c_ins vermon
## 4 5 casthma hdamage cg c_ins vermon
## 5 1 casthma hdamage cg c_ins vermon
## 5 2 casthma hdamage cg c_ins vermon
## 5 3 casthma hdamage cg c_ins vermon
## 5 4 casthma hdamage cg c_ins vermon
## 5 5 casthma hdamage cg c_ins vermon
## Warning: Number of logged events: 50
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## casthma hdamage cg c_ins vermon
## "pmm" "polyreg" "logreg" "polyreg" "pmm"
## PredictorMatrix:
## casthma hdamage cg c_ins vermon
## casthma 0 1 1 1 1
## hdamage 1 0 1 1 1
## cg 1 1 0 1 1
## c_ins 1 1 1 0 1
## vermon 1 1 1 1 0
## Number of logged events: 50
## it im dep meth out
## 1 1 1 c_ins polyreg casthma
## 2 1 1 vermon pmm casthma
## 3 1 2 c_ins polyreg casthma
## 4 1 2 vermon pmm casthma
## 5 1 3 c_ins polyreg casthma
## 6 1 3 vermon pmm casthma
#data shows the number of imputations done and the total missingness and which imputation method for each variable since they are all different class.My data shows all NAs for casthmami and casthma. What am i doing wrong?!
head(imp$imp$casthma)
Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
#Compare to the original data
head(tb19[,c("casthma", "c_ins", "mold", "hdamage","cg","vermon" )], n=10)
#There is something wrong with my data. I dont know if it is the recoding or something else but i keep getting NAs when i run the model fits.