library(ipumsr)
usa_00013a <- read_ipums_ddi("usa_00013.xml")
tx_00013a <- read_ipums_micro(usa_00013a, data_file = ("usa_00013.dat.gz"), verbose = FALSE)
library(stringr)
names(tx_00013a)<-tolower(names(tx_00013a))
names(tx_00013a)
## [1] "year" "multyear" "sample" "serial" "cbserial" "hhwt"
## [7] "cluster" "statefip" "puma" "strata" "gq" "ownershp"
## [13] "ownershpd" "mortgage" "multgen" "multgend" "pernum" "perwt"
## [19] "sex" "age" "fertyr" "race" "raced" "hispan"
## [25] "hispand" "hcovany" "educ" "educd" "empstat" "empstatd"
## [31] "labforce" "occ" "ind" "uhrswork" "inctot" "poverty"
## [37] "presgl" "migrate1" "migrate1d"
#CS Code
tx_00013a<-zap_labels(tx_00013a)
tx_00013a$newpuma<- paste (str_pad(tx_00013a$statefip, 2,"left", "0"), str_pad(tx_00013a$puma,5,"left", "0") , sep="")
table(tx_00013a$newpuma)
##
## 4800100 4800200 4800300 4800400 4800501 4800502 4800600 4800700 4800800 4800900
## 8976 4362 4674 6757 3800 5870 7094 4974 8381 6331
## 4801000 4801100 4801200 4801300 4801400 4801501 4801502 4801600 4801700 4801800
## 6359 4275 3594 4239 4289 2823 3451 3754 4852 5583
## 4801901 4801902 4801903 4801904 4801905 4801906 4801907 4802001 4802002 4802003
## 4402 4245 3728 3195 4205 6061 4681 4436 4103 6084
## 4802004 4802005 4802006 4802101 4802102 4802200 4802301 4802302 4802303 4802304
## 4873 4049 5078 5882 5309 4226 2988 3015 3388 2921
## 4802305 4802306 4802307 4802308 4802309 4802310 4802311 4802312 4802313 4802314
## 3255 3718 3948 3274 3816 3944 4200 4436 5082 3627
## 4802315 4802316 4802317 4802318 4802319 4802320 4802321 4802322 4802400 4802501
## 3899 2974 2488 3253 3424 3566 3905 4536 4496 3737
## 4802502 4802503 4802504 4802505 4802506 4802507 4802508 4802509 4802510 4802511
## 4185 3696 2676 3742 3053 3873 3622 4406 4376 2785
## 4802512 4802513 4802514 4802515 4802516 4802600 4802700 4802800 4802900 4803000
## 3138 4605 4330 3849 3494 9343 5077 4847 3644 3894
## 4803100 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4803400 4803501
## 4029 4379 4575 4536 3704 2996 3661 4159 6975 4733
## 4803502 4803601 4803602 4803700 4803801 4803802 4803900 4804000 4804100 4804200
## 5277 5462 7410 8434 3518 5522 6597 5112 3565 4323
## 4804301 4804302 4804400 4804501 4804502 4804503 4804504 4804601 4804602 4804603
## 3282 4948 3389 3376 3264 2591 2489 3635 3768 4783
## 4804604 4804605 4804606 4804607 4804608 4804609 4804610 4804611 4804612 4804613
## 4719 3325 3548 3001 3208 3701 2969 2641 3627 3319
## 4804614 4804615 4804616 4804617 4804618 4804619 4804620 4804621 4804622 4804623
## 3146 2923 3399 2564 2998 2644 2778 3648 2757 2811
## 4804624 4804625 4804626 4804627 4804628 4804629 4804630 4804631 4804632 4804633
## 3105 2733 3005 2988 3085 2941 2632 3105 3428 2002
## 4804634 4804635 4804636 4804637 4804638 4804701 4804702 4804801 4804802 4804803
## 2120 2990 2901 3075 2706 5956 3790 2101 3777 3715
## 4804901 4804902 4804903 4804904 4804905 4805000 4805100 4805201 4805202 4805203
## 3025 3696 2939 2716 4373 5532 5005 4511 3605 3445
## 4805204 4805301 4805302 4805303 4805304 4805305 4805306 4805307 4805308 4805309
## 4498 3153 3689 4928 4124 3561 6162 4390 4123 4819
## 4805400 4805500 4805600 4805700 4805800 4805901 4805902 4805903 4805904 4805905
## 5755 6315 3466 5226 3730 3654 3649 3516 3355 3693
## 4805906 4805907 4805908 4805909 4805910 4805911 4805912 4805913 4805914 4805915
## 3386 4084 3118 3324 3944 3566 3237 2997 4406 3570
## 4805916 4806000 4806100 4806200 4806301 4806302 4806400 4806500 4806601 4806602
## 3317 3527 3634 4059 4778 3826 3097 4814 3861 3349
## 4806603 4806701 4806702 4806703 4806801 4806802 4806803 4806804 4806805 4806806
## 3786 3088 4194 4537 2613 2169 2594 2336 3324 2414
## 4806807 4806900
## 2775 2869
bordp<-readr::read_csv("C:/Users/codar/OneDrive/Documents/Stats II/Data/border_100mi_pumas_table.csv")
## Parsed with column specification:
## cols(
## fid = col_double(),
## STATEFP10 = col_double(),
## PUMACE10 = col_character(),
## AFFGEOID10 = col_character(),
## GEOID10 = col_double(),
## NAME10 = col_character(),
## LSAD10 = col_character(),
## ALAND10 = col_double(),
## AWATER10 = col_double()
## )
mdat<-merge(tx_00013a, bordp, by.x="newpuma", by.y="GEOID10")
table(mdat$newpuma)
##
## 4802800 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4806000 4806100
## 4847 4379 4575 4536 3704 2996 3661 4159 3527 3634
## 4806200 4806301 4806302 4806400 4806701 4806702 4806703 4806801 4806802 4806803
## 4059 4778 3826 3097 3088 4194 4537 2613 2169 2594
## 4806804 4806805 4806806 4806807 4806900
## 2336 3324 2414 2775 2869
library(dplyr)
tx_00013a<-tx_00013a%>%
filter(newpuma %in% c( "4802800", "4803200","4806000", "4806100", "4806200", "4806301", "4806302", "4806701", "4806702", "4806703", "4806900" ))
# View(tx_00012a)
# names(tx_00012a)
# class(tx_00012a)
hw8 <-tx_00013a %>%
mutate(sex=case_when(sex == 1~0,
sex == 2~ 1,
TRUE ~ NA_real_),
lfpart=case_when(labforce== 1 ~ 0,
labforce== 2 ~ 1,
TRUE ~ NA_real_),
edu=case_when(educ== 0 ~ 'none',
educ %in% 1:5 ~ 'hs incomplete',
educ %in% 6 ~ 'hs complete',
educ %in% 7:11 ~ 'college',
TRUE ~ NA_character_),
race=case_when(race== 1 ~ 'white',
race== 2 ~ 'black',
# race== 3 ~'aian',
race %in% 4:5 ~ 'asian',
race== 6 ~ 'oapi',
race== 7 ~ 'other',
race== 8 ~ 'twomajor',
race== 9 ~ 'threemoremaj',
TRUE ~ NA_character_),
hisp= case_when(hispan !=0 ~ "Latino",
hispan==0 ~'NL',
hispan==9 ~ 'NL',
TRUE ~ NA_character_),
migrate1=case_when(migrate1==1 ~ 'same house',
migrate1==2 ~ 'movinstate',
migrate1==3 ~ 'abroad1yr',
TRUE ~ NA_character_),
fertyr=case_when(fertyr== 1 ~ 0,
fertyr==2 ~ 1,
TRUE~ NA_real_ ),
poverty1=case_when(poverty==001 ~ "1% or less",
poverty ==501 ~ "501% or more"),
hcov=case_when(hcovany == 1 ~ 0,
hcovany == 2 ~ 1,
TRUE~NA_real_),
ownhome=case_when(ownershp==1 ~ 1,
ownershp==2 ~ 0,
TRUE ~ NA_real_),
multgen1=case_when(multgen==1 ~ 1,
multgen==2 ~ 2,
multgen==3 ~ 3,
TRUE~NA_real_),
edu3=case_when(educ %in% 1:5 ~ 1,
educ %in% 6 ~ 2,
educ %in% 7:11 ~ 3,
TRUE~NA_real_),
ownhome2=case_when(ownershp==1 ~"Own",
ownershp==2 ~ "Rent"),
multgen2=case_when(multgen==1 ~ "One Gen",
multgen ==2 ~ "Two Gen",
multgen == 3 ~ "Three More Gens"))
sapply(hw8, class)
## year multyear sample serial cbserial hhwt
## "integer" "numeric" "integer" "numeric" "numeric" "numeric"
## cluster statefip puma strata gq ownershp
## "numeric" "integer" "numeric" "numeric" "integer" "integer"
## ownershpd mortgage multgen multgend pernum perwt
## "integer" "integer" "integer" "integer" "numeric" "numeric"
## sex age fertyr race raced hispan
## "numeric" "integer" "numeric" "character" "integer" "integer"
## hispand hcovany educ educd empstat empstatd
## "integer" "integer" "integer" "integer" "integer" "integer"
## labforce occ ind uhrswork inctot poverty
## "integer" "numeric" "numeric" "integer" "numeric" "numeric"
## presgl migrate1 migrate1d newpuma lfpart edu
## "numeric" "character" "integer" "character" "numeric" "character"
## hisp poverty1 hcov ownhome multgen1 edu3
## "character" "character" "numeric" "numeric" "numeric" "numeric"
## ownhome2 multgen2
## "character" "character"
# Missing data homework
#
# Use a data source you are familiar with
#
#Measure an outcome variable and at least 5 predictors
##Outcome variable: Total personal income
##Predictors: race, sex, hcov, edu, multgen1, ownhome
#Report the pattern of missingness among all of these variables
summary(hw8[, c("inctot", "race", "sex", "hcov", "edu3", "multgen1", "ownhome", "ownhome2", "multgen2")])
## inctot race sex hcov
## Min. : -6600 Length:43738 Min. :0.0000 Min. :0.0000
## 1st Qu.: 317 Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 15000 Mode :character Median :0.0000 Median :1.0000
## Mean : 27859 Mean :0.4953 Mean :0.6712
## 3rd Qu.: 37899 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :816000 Max. :1.0000 Max. :1.0000
##
## edu3 multgen1 ownhome ownhome2
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:43738
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :2.000 Median :2.000 Median :1.0000 Mode :character
## Mean :2.113 Mean :1.874 Mean :0.7168
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.000 Max. :3.000 Max. :1.0000
## NA's :940 NA's :2982 NA's :2982
## multgen2
## Length:43738
## Class :character
## Mode :character
##
##
##
##
##My edu3, ownhome, and multgen1 variables have missing data. edu3 has 940 NAs, multgen1 has 2982 NAs and ownhome has 2982 NAs.
sapply(hw8, class)
## year multyear sample serial cbserial hhwt
## "integer" "numeric" "integer" "numeric" "numeric" "numeric"
## cluster statefip puma strata gq ownershp
## "numeric" "integer" "numeric" "numeric" "integer" "integer"
## ownershpd mortgage multgen multgend pernum perwt
## "integer" "integer" "integer" "integer" "numeric" "numeric"
## sex age fertyr race raced hispan
## "numeric" "integer" "numeric" "character" "integer" "integer"
## hispand hcovany educ educd empstat empstatd
## "integer" "integer" "integer" "integer" "integer" "integer"
## labforce occ ind uhrswork inctot poverty
## "integer" "numeric" "numeric" "integer" "numeric" "numeric"
## presgl migrate1 migrate1d newpuma lfpart edu
## "numeric" "character" "integer" "character" "numeric" "character"
## hisp poverty1 hcov ownhome multgen1 edu3
## "character" "character" "numeric" "numeric" "numeric" "numeric"
## ownhome2 multgen2
## "character" "character"
#Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?
##Missing data was found in my edu3, multgen1, and ownhome variables. I will use the mean imputation method. Using the mean method of imputation will reduce the variance because all of the NA's will receive the mean. This might lower the effect size/beta of my model, which according to CS, is not good.
library(car)
library(mice)
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(dplyr)
hw8$edu3.imp.mean<-ifelse(is.na(hw8$edu3)==T, mean(hw8$edu3, na.rm=T), hw8$edu3)
mean(hw8$edu3, na.rm=T) #both have the same mean
## [1] 2.11344
mean(hw8$edu3.imp.mean)
## [1] 2.11344
hw8$ownhome.imp.mean<-ifelse(is.na(hw8$ownhome)==T, mean(hw8$ownhome, na.rm=T), hw8$ownhome)
mean(hw8$ownhome, na.rm=T)
## [1] 0.716827
mean(hw8$ownhome.imp.mean)
## [1] 0.716827
hw8$multgen1.imp.mean<-ifelse(is.na(hw8$multgen1)==T, mean(hw8$multgen1, na.rm=T), hw8$multgen1)
mean(hw8$multgen1, na.rm=T) #same means
## [1] 1.873835
mean(hw8$multgen1.imp.mean)
## [1] 1.873835
fit<-lm(inctot~race+sex+hcov+ownhome.imp.mean+multgen1.imp.mean+edu3.imp.mean, hw8)
fit
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome.imp.mean +
## multgen1.imp.mean + edu3.imp.mean, data = hw8)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 15235 -22100 -11627 -14357
## racethreemoremaj racetwomajor racewhite sex
## -17402 -17860 -11149 -17047
## hcov ownhome.imp.mean multgen1.imp.mean edu3.imp.mean
## 14393 8002 -7086 14470
#looking at variance differences between original variable and imputed variable for ownhome. There is a slight difference. The variance went down by a couple of variance points. Reduction in variation in the sample is not good. I could do the same for the other variables.
var(hw8$ownhome, na.rm=T)
## [1] 0.202991
var(hw8$ownhome.imp.mean, na.rm=T)
## [1] 0.1891511
#Doesn't really show me too much bc of binary variables.
hist(hw8$ownhome)

hist(hw8$ownhome.imp.mean)

#Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?
##So the flag variable for home ownership returns TRUE, meaning that there are missing values and they could have a meaningful impact on my model. I know this because I ran a linear model with just the dependent variable and home ownership significantly impacts the dependent variable negatively.The other two variables are also missing at random.
fit1<-lm(inctot~is.na(ownhome), data=hw8)
fit1<-lm(inctot~is.na(edu3), data=hw8)
fit1<-lm(inctot~is.na(multgen1), data=hw8)
summary(fit1)
##
## Call:
## lm(formula = inctot ~ is.na(multgen1), data = hw8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36091 -26846 -10023 9264 786509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29491.1 221.1 133.38 <2e-16 ***
## is.na(multgen1)TRUE -23936.9 846.8 -28.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44640 on 43736 degrees of freedom
## Multiple R-squared: 0.01794, Adjusted R-squared: 0.01792
## F-statistic: 799.1 on 1 and 43736 DF, p-value: < 2.2e-16
#Looking at patterns for missing values.
library(mice)
md.pattern(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")])

## sex inctot empstat hcov race edu3 ownhome multgen1
## 39655 1 1 1 1 1 1 1 1 0
## 2889 1 1 1 1 1 1 0 0 2
## 864 1 1 1 1 1 0 1 1 1
## 60 1 1 1 1 1 0 0 0 3
## 221 1 1 1 1 0 1 1 1 1
## 33 1 1 1 1 0 1 0 0 3
## 16 1 1 1 1 0 0 1 1 2
## 0 0 0 0 270 940 2982 2982 7174
md.pairs(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")])
## $rr
## sex inctot empstat edu3 race ownhome hcov multgen1
## sex 43738 43738 43738 42798 43468 40756 43738 40756
## inctot 43738 43738 43738 42798 43468 40756 43738 40756
## empstat 43738 43738 43738 42798 43468 40756 43738 40756
## edu3 42798 42798 42798 42798 42544 39876 42798 39876
## race 43468 43468 43468 42544 43468 40519 43468 40519
## ownhome 40756 40756 40756 39876 40519 40756 40756 40756
## hcov 43738 43738 43738 42798 43468 40756 43738 40756
## multgen1 40756 40756 40756 39876 40519 40756 40756 40756
##
## $rm
## sex inctot empstat edu3 race ownhome hcov multgen1
## sex 0 0 0 940 270 2982 0 2982
## inctot 0 0 0 940 270 2982 0 2982
## empstat 0 0 0 940 270 2982 0 2982
## edu3 0 0 0 0 254 2922 0 2922
## race 0 0 0 924 0 2949 0 2949
## ownhome 0 0 0 880 237 0 0 0
## hcov 0 0 0 940 270 2982 0 2982
## multgen1 0 0 0 880 237 0 0 0
##
## $mr
## sex inctot empstat edu3 race ownhome hcov multgen1
## sex 0 0 0 0 0 0 0 0
## inctot 0 0 0 0 0 0 0 0
## empstat 0 0 0 0 0 0 0 0
## edu3 940 940 940 0 924 880 940 880
## race 270 270 270 254 0 237 270 237
## ownhome 2982 2982 2982 2922 2949 0 2982 0
## hcov 0 0 0 0 0 0 0 0
## multgen1 2982 2982 2982 2922 2949 0 2982 0
##
## $mm
## sex inctot empstat edu3 race ownhome hcov multgen1
## sex 0 0 0 0 0 0 0 0
## inctot 0 0 0 0 0 0 0 0
## empstat 0 0 0 0 0 0 0 0
## edu3 0 0 0 940 16 60 0 60
## race 0 0 0 16 270 33 0 33
## ownhome 0 0 0 60 33 2982 0 2982
## hcov 0 0 0 0 0 0 0 0
## multgen1 0 0 0 60 33 2982 0 2982
#Basic Imputation
dat2<-hw8
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$inctotaa<-dat2$inctot
dat2$inctotaa[samp2]<-NA
head(dat2[, c("inctot","inctot")])
## # A tibble: 6 x 2
## inctot inctot
## <dbl> <dbl>
## 1 37029 37029
## 2 31739 31739
## 3 7406 7406
## 4 0 0
## 5 53957 53957
## 6 46974 46974
imp<-mice(data = dat2[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")], m = 10)
##
## iter imp variable
## 1 1 edu3 ownhome multgen1
## 1 2 edu3 ownhome multgen1
## 1 3 edu3 ownhome multgen1
## 1 4 edu3 ownhome multgen1
## 1 5 edu3 ownhome multgen1
## 1 6 edu3 ownhome multgen1
## 1 7 edu3 ownhome multgen1
## 1 8 edu3 ownhome multgen1
## 1 9 edu3 ownhome multgen1
## 1 10 edu3 ownhome multgen1
## 2 1 edu3 ownhome multgen1
## 2 2 edu3 ownhome multgen1
## 2 3 edu3 ownhome multgen1
## 2 4 edu3 ownhome multgen1
## 2 5 edu3 ownhome multgen1
## 2 6 edu3 ownhome multgen1
## 2 7 edu3 ownhome multgen1
## 2 8 edu3 ownhome multgen1
## 2 9 edu3 ownhome multgen1
## 2 10 edu3 ownhome multgen1
## 3 1 edu3 ownhome multgen1
## 3 2 edu3 ownhome multgen1
## 3 3 edu3 ownhome multgen1
## 3 4 edu3 ownhome multgen1
## 3 5 edu3 ownhome multgen1
## 3 6 edu3 ownhome multgen1
## 3 7 edu3 ownhome multgen1
## 3 8 edu3 ownhome multgen1
## 3 9 edu3 ownhome multgen1
## 3 10 edu3 ownhome multgen1
## 4 1 edu3 ownhome multgen1
## 4 2 edu3 ownhome multgen1
## 4 3 edu3 ownhome multgen1
## 4 4 edu3 ownhome multgen1
## 4 5 edu3 ownhome multgen1
## 4 6 edu3 ownhome multgen1
## 4 7 edu3 ownhome multgen1
## 4 8 edu3 ownhome multgen1
## 4 9 edu3 ownhome multgen1
## 4 10 edu3 ownhome multgen1
## 5 1 edu3 ownhome multgen1
## 5 2 edu3 ownhome multgen1
## 5 3 edu3 ownhome multgen1
## 5 4 edu3 ownhome multgen1
## 5 5 edu3 ownhome multgen1
## 5 6 edu3 ownhome multgen1
## 5 7 edu3 ownhome multgen1
## 5 8 edu3 ownhome multgen1
## 5 9 edu3 ownhome multgen1
## 5 10 edu3 ownhome multgen1
## Warning: Number of logged events: 1
#To see what methods were used for the imputations:
print(imp)
## Class: mids
## Number of multiple imputations: 10
## Imputation methods:
## sex inctot empstat edu3 race ownhome hcov multgen1
## "" "" "" "pmm" "" "pmm" "" "pmm"
## PredictorMatrix:
## sex inctot empstat edu3 race ownhome hcov multgen1
## sex 0 1 1 1 0 1 1 1
## inctot 1 0 1 1 0 1 1 1
## empstat 1 1 0 1 0 1 1 1
## edu3 1 1 1 0 0 1 1 1
## race 0 0 0 0 0 0 0 0
## ownhome 1 1 1 1 0 0 1 1
## Number of logged events: 1
## it im dep meth out
## 1 0 0 constant race
##For all three of my variables, pmm (predictive means method) was used.
# head(imp$imp$ownhome)
#
# summary(imp$imp$edu3)
#
# summary(hw8$ownhome)
#
# summary(imp$imp$race)
# summary(hw8$race)
#To get one of the imputed datasets. Randomly choosing the first set of imputed data.
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
## sex inctot empstat edu3 race ownhome hcov multgen1
## 1 1 37029 1 3 white 1 1 2
## 2 0 31739 1 1 white 1 1 2
## 3 1 7406 1 3 white 0 0 2
## 4 0 0 2 2 white 0 0 2
## 5 1 53957 1 3 white 1 1 2
## 6 0 46974 1 3 white 1 1 2
## 7 0 0 3 2 white 1 1 2
## 8 1 31739 1 3 white 1 1 2
## 9 0 74058 1 2 white 1 1 2
## 10 0 0 3 1 white 1 1 2
#Compare to the original data
head(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")], n=10)
## # A tibble: 10 x 8
## sex inctot empstat edu3 race ownhome hcov multgen1
## <dbl> <dbl> <int> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1 37029 1 3 white 1 1 2
## 2 0 31739 1 1 white 1 1 2
## 3 1 7406 1 3 white 0 0 2
## 4 0 0 2 2 white 0 0 2
## 5 1 53957 1 3 white 1 1 2
## 6 0 46974 1 3 white 1 1 2
## 7 0 0 3 2 white 1 1 2
## 8 1 31739 1 3 white 1 1 2
## 9 0 74058 1 2 white 1 1 2
## 10 0 0 3 1 white 1 1 2
#Now, I will see the variability in the 10 different imputations for each outcome
fit.inctot<-with(data=imp ,expr=lm(inctot~race+sex+hcov+ownhome+edu3+empstat))
fit.inctot
## call :
## with.mids(data = imp, expr = lm(inctot ~ race + sex + hcov +
## ownhome + edu3 + empstat))
##
## call1 :
## mice(data = dat2[, c("sex", "inctot", "empstat", "edu3", "race",
## "ownhome", "hcov", "multgen1")], m = 10)
##
## nmis :
## sex inctot empstat edu3 race ownhome hcov multgen1
## 0 0 0 940 270 2982 0 2982
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30257 -16733 -12380 -17879
## racethreemoremaj racetwomajor racewhite sex
## -22531 -20416 -14167 -15324
## hcov ownhome edu3 empstat
## 11868 6907 12533 -11466
##
##
## [[2]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30009 -16831 -12187 -17798
## racethreemoremaj racetwomajor racewhite sex
## -22388 -20340 -14042 -15283
## hcov ownhome edu3 empstat
## 11854 7115 12520 -11469
##
##
## [[3]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 29768 -16590 -12405 -17757
## racethreemoremaj racetwomajor racewhite sex
## -22361 -20191 -14028 -15342
## hcov ownhome edu3 empstat
## 11823 7330 12564 -11452
##
##
## [[4]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30009 -16915 -12612 -17928
## racethreemoremaj racetwomajor racewhite sex
## -22616 -20498 -14270 -15307
## hcov ownhome edu3 empstat
## 11830 7254 12573 -11435
##
##
## [[5]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 29915 -16677 -12214 -17753
## racethreemoremaj racetwomajor racewhite sex
## -22375 -20271 -14017 -15311
## hcov ownhome edu3 empstat
## 11857 7081 12553 -11441
##
##
## [[6]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 29973 -16916 -12600 -17856
## racethreemoremaj racetwomajor racewhite sex
## -22275 -20579 -14160 -15268
## hcov ownhome edu3 empstat
## 11857 7236 12545 -11466
##
##
## [[7]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30039 -17005 -12331 -17853
## racethreemoremaj racetwomajor racewhite sex
## -22284 -20335 -14146 -15327
## hcov ownhome edu3 empstat
## 11858 7094 12554 -11443
##
##
## [[8]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30303 -17308 -12889 -18304
## racethreemoremaj racetwomajor racewhite sex
## -22883 -20709 -14539 -15304
## hcov ownhome edu3 empstat
## 11810 7233 12586 -11454
##
##
## [[9]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30063 -16976 -12407 -17778
## racethreemoremaj racetwomajor racewhite sex
## -22397 -20346 -14034 -15341
## hcov ownhome edu3 empstat
## 11856 6980 12545 -11462
##
##
## [[10]]
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
##
## Coefficients:
## (Intercept) raceblack raceoapi raceother
## 30081 -17342 -12551 -17843
## racethreemoremaj racetwomajor racewhite sex
## -22523 -20547 -14126 -15273
## hcov ownhome edu3 empstat
## 11973 6842 12580 -11477
#variation in inctot - they are all the same
with (data=imp, exp=(sd(inctot)))
## call :
## with.mids(data = imp, expr = (sd(inctot)))
##
## call1 :
## mice(data = dat2[, c("sex", "inctot", "empstat", "edu3", "race",
## "ownhome", "hcov", "multgen1")], m = 10)
##
## nmis :
## sex inctot empstat edu3 race ownhome hcov multgen1
## 0 0 0 940 270 2982 0 2982
##
## analyses :
## [[1]]
## [1] 45042.26
##
## [[2]]
## [1] 45042.26
##
## [[3]]
## [1] 45042.26
##
## [[4]]
## [1] 45042.26
##
## [[5]]
## [1] 45042.26
##
## [[6]]
## [1] 45042.26
##
## [[7]]
## [1] 45042.26
##
## [[8]]
## [1] 45042.26
##
## [[9]]
## [1] 45042.26
##
## [[10]]
## [1] 45042.26
#Freq table
# with (data=imp, exp=(prop.table(table(inctot))))
#Pool the separate models- combine the estimates - we get a regression model for the totality of the imputed data. The focus is on lambda and fmi. When they are closer to 1, the implication is that there is a lot of variation in that parameter across the imputations. In my result, home ownership is the highest value(.13) which means that the imputed data is not good. However, it's not super high...so the variations may be okay?
est.p<-pool(fit.inctot)
print(est.p)
## Class: mipo m = 10
## term m estimate ubar b t dfcom
## 1 (Intercept) 10 30041.788 29928193.25 23884.3342 29954466.02 43456
## 2 raceblack 10 -16929.358 31393940.12 61034.7954 31461078.40 43456
## 3 raceoapi 10 -12457.614 34780319.99 44329.2304 34829082.15 43456
## 4 raceother 10 -17875.074 29800958.81 25892.8074 29829440.90 43456
## 5 racethreemoremaj 10 -22463.329 79268933.16 33717.0322 79306021.90 43456
## 6 racetwomajor 10 -20423.136 31905614.35 24930.0890 31933037.45 43456
## 7 racewhite 10 -14152.894 29210153.11 24936.4960 29237583.25 43456
## 8 sex 10 -15308.106 150676.85 706.8194 151454.35 43456
## 9 hcov 10 11858.650 182748.16 1967.0458 184911.91 43456
## 10 ownhome 10 7107.196 185967.82 25738.1237 214279.75 43456
## 11 edu3 10 12555.345 65792.13 424.1183 66258.66 43456
## 12 empstat 10 -11456.529 41287.71 189.6443 41496.32 43456
## df riv lambda fmi
## 1 43255.3650 0.0008778601 0.0008770902 0.0009232834
## 2 42430.3135 0.0021385743 0.0021340106 0.0021810428
## 3 42986.9096 0.0014020042 0.0014000413 0.0014464987
## 4 43222.4295 0.0009557440 0.0009548314 0.0010010563
## 5 43387.8825 0.0004678849 0.0004676661 0.0005137371
## 6 43262.7682 0.0008595070 0.0008587689 0.0009049551
## 7 43229.6906 0.0009390620 0.0009381810 0.0009843989
## 8 38373.3415 0.0051600580 0.0051335685 0.0051854164
## 9 25974.5207 0.0118400667 0.0117015199 0.0117776087
## 10 508.5913 0.1522410516 0.1321260438 0.1355188849
## 11 34862.0324 0.0070909709 0.0070410431 0.0070980033
## 12 38554.7302 0.0050525621 0.0050271620 0.0050787715
summary (est.p) #yeah, so I got low p-values for a few of the parameters.
## term estimate std.error statistic df p.value
## 1 (Intercept) 30041.788 5473.0673 5.489022 43255.3650 4.064295e-08
## 2 raceblack -16929.358 5609.0176 -3.018239 42430.3135 2.543988e-03
## 3 raceoapi -12457.614 5901.6169 -2.110882 42986.9096 3.478825e-02
## 4 raceother -17875.074 5461.6335 -3.272844 43222.4295 1.065548e-03
## 5 racethreemoremaj -22463.329 8905.3929 -2.522441 43387.8825 1.165789e-02
## 6 racetwomajor -20423.136 5650.9324 -3.614118 43262.7682 3.017137e-04
## 7 racewhite -14152.894 5407.1789 -2.617427 43229.6906 8.862638e-03
## 8 sex -15308.106 389.1714 -39.335129 38373.3415 0.000000e+00
## 9 hcov 11858.650 430.0139 27.577369 25974.5207 0.000000e+00
## 10 ownhome 7107.196 462.9036 15.353512 508.5913 0.000000e+00
## 11 edu3 12555.345 257.4076 48.776129 34862.0324 0.000000e+00
## 12 empstat -11456.529 203.7064 -56.240384 38554.7302 0.000000e+00
#Plotting the pooled values
lam<-data.frame(lam=est.p$pooled$lambda, param=row.names(est.p$pooled))
ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))

#10 (home ownership) has the greatest degree of impact on the regression coefficient.
#Compare imputed model to original data
fit2<-lm(inctot~race+sex+hcov+ownhome+edu3+empstat+multgen1, data=hw8)
summary(fit2)
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat +
## multgen1, data = hw8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68002 -18968 -5055 9346 755123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44328.6 5909.8 7.501 6.47e-14 ***
## raceblack -12238.3 6268.7 -1.952 0.050912 .
## raceoapi -12140.0 6374.5 -1.904 0.056859 .
## raceother -18930.9 5863.2 -3.229 0.001244 **
## racethreemoremaj -23593.5 9370.8 -2.518 0.011814 *
## racetwomajor -20594.8 6076.9 -3.389 0.000702 ***
## racewhite -15008.3 5802.9 -2.586 0.009704 **
## sex -15824.6 418.3 -37.832 < 2e-16 ***
## hcov 10773.1 468.0 23.020 < 2e-16 ***
## ownhome 8633.1 466.8 18.493 < 2e-16 ***
## edu3 12650.1 276.4 45.768 < 2e-16 ***
## empstat -11171.0 222.3 -50.243 < 2e-16 ***
## multgen1 -7314.6 333.0 -21.967 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40990 on 39642 degrees of freedom
## (4083 observations deleted due to missingness)
## Multiple R-squared: 0.2131, Adjusted R-squared: 0.2128
## F-statistic: 894.5 on 12 and 39642 DF, p-value: < 2.2e-16
fit.imp<-lm(inctot~race+sex+hcov+ownhome+edu3+empstat+multgen1, data=dat.imp)
summary(fit.imp)
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat +
## multgen1, data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65547 -18232 -5081 8864 756676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42974.2 5477.2 7.846 4.39e-15 ***
## raceblack -16500.8 5575.5 -2.960 0.003083 **
## raceoapi -11831.0 5868.6 -2.016 0.043807 *
## raceother -17216.7 5432.3 -3.169 0.001529 **
## racethreemoremaj -22538.0 8859.5 -2.544 0.010965 *
## racetwomajor -19182.4 5621.0 -3.413 0.000644 ***
## racewhite -13356.0 5378.2 -2.483 0.013018 *
## sex -14899.8 386.8 -38.523 < 2e-16 ***
## hcov 11193.5 426.8 26.228 < 2e-16 ***
## ownhome 7557.4 430.4 17.560 < 2e-16 ***
## edu3 11771.5 257.9 45.643 < 2e-16 ***
## empstat -11535.5 202.2 -57.064 < 2e-16 ***
## multgen1 -6412.0 306.8 -20.900 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39840 on 43455 degrees of freedom
## (270 observations deleted due to missingness)
## Multiple R-squared: 0.2168, Adjusted R-squared: 0.2166
## F-statistic: 1002 on 12 and 43455 DF, p-value: < 2.2e-16
#There were some changes between the original and imputed data. Where Black and API races were not significant in the original data, after imputation, they became significant (at different levels). The Black race parameter increased by more points than any other parameter. Additionally, the White race became less significant after imputation. Health coverage and home ownership values went up and remained significant. Education and multigenerational households remained significant, but the values decreased slightly.
# #get the coefficients from each of the 5 imputations of bmi
# coefs<-as.data.frame(matrix(unlist(lapply(fit.inctot$analyses, coef)), nrow=5, ncol=17, byrow=T))
# names(coefs)<-names(fit.inctot$analyses[[1]]$coef)
# #plot the coefficients from each of the different rounds of imputation to see the variability in the
# #results
# coefs%>%
# select(`ownhome`:`raceblack other`)%>%
# melt()%>%
# mutate(imp=rep(1:5, 16))%>%
# ggplot()+geom_point(aes(y=value,x=variable, group=variable, color=as.factor(imp) ))+ggtitle("Estimated Betas from Each Imputed Regression Model for BMI Outcome")+theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?
##Results from the mean imputation
summary(fit)
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome.imp.mean +
## multgen1.imp.mean + edu3.imp.mean, data = hw8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73954 -19968 -6283 10525 760281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15234.6 5651.8 2.696 0.007031 **
## raceblack -22099.8 5771.6 -3.829 0.000129 ***
## raceoapi -11626.7 6075.9 -1.914 0.055680 .
## raceother -14357.2 5624.1 -2.553 0.010689 *
## racethreemoremaj -17402.5 9172.2 -1.897 0.057793 .
## racetwomajor -17859.7 5819.6 -3.069 0.002150 **
## racewhite -11148.9 5568.1 -2.002 0.045262 *
## sex -17047.5 398.4 -42.792 < 2e-16 ***
## hcov 14393.3 437.1 32.926 < 2e-16 ***
## ownhome.imp.mean 8002.0 462.8 17.289 < 2e-16 ***
## multgen1.imp.mean -7086.2 330.0 -21.472 < 2e-16 ***
## edu3.imp.mean 14470.3 265.1 54.580 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41250 on 43456 degrees of freedom
## (270 observations deleted due to missingness)
## Multiple R-squared: 0.1604, Adjusted R-squared: 0.1602
## F-statistic: 754.8 on 11 and 43456 DF, p-value: < 2.2e-16
##Results from the multiple imputation
summary(fit.imp)
##
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat +
## multgen1, data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65547 -18232 -5081 8864 756676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42974.2 5477.2 7.846 4.39e-15 ***
## raceblack -16500.8 5575.5 -2.960 0.003083 **
## raceoapi -11831.0 5868.6 -2.016 0.043807 *
## raceother -17216.7 5432.3 -3.169 0.001529 **
## racethreemoremaj -22538.0 8859.5 -2.544 0.010965 *
## racetwomajor -19182.4 5621.0 -3.413 0.000644 ***
## racewhite -13356.0 5378.2 -2.483 0.013018 *
## sex -14899.8 386.8 -38.523 < 2e-16 ***
## hcov 11193.5 426.8 26.228 < 2e-16 ***
## ownhome 7557.4 430.4 17.560 < 2e-16 ***
## edu3 11771.5 257.9 45.643 < 2e-16 ***
## empstat -11535.5 202.2 -57.064 < 2e-16 ***
## multgen1 -6412.0 306.8 -20.900 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39840 on 43455 degrees of freedom
## (270 observations deleted due to missingness)
## Multiple R-squared: 0.2168, Adjusted R-squared: 0.2166
## F-statistic: 1002 on 12 and 43455 DF, p-value: < 2.2e-16
#Raceblack was more statistically significant using the mean method, but was significant nonetheless using the multiple imputation model. Race api and three or more races was significant using the multiple imputation method but not using the mean imputation method. White was significant in both models. Sex was significant in both models, but had a higher value using the mean imputation method. Health coverage had a lower value using the multiple imputation method, but remained significant. The biggest difference in outcome really is with the significance of the race parameter, particularly for Black, API, and three or more races. If additional testing was possible to see which method to prefer, I would pursue that. Otherwise, I would go with the multiple imputation model.