HW8_mydata

library(ipumsr)
usa_00013a <- read_ipums_ddi("usa_00013.xml")
tx_00013a <- read_ipums_micro(usa_00013a, data_file = ("usa_00013.dat.gz"), verbose = FALSE)

library(stringr)
names(tx_00013a)<-tolower(names(tx_00013a))

names(tx_00013a)

##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d"

#CS Code
tx_00013a<-zap_labels(tx_00013a)
tx_00013a$newpuma<- paste (str_pad(tx_00013a$statefip, 2,"left", "0"), str_pad(tx_00013a$puma,5,"left", "0") , sep="")
table(tx_00013a$newpuma)

## 
## 4800100 4800200 4800300 4800400 4800501 4800502 4800600 4800700 4800800 4800900 
##    8976    4362    4674    6757    3800    5870    7094    4974    8381    6331 
## 4801000 4801100 4801200 4801300 4801400 4801501 4801502 4801600 4801700 4801800 
##    6359    4275    3594    4239    4289    2823    3451    3754    4852    5583 
## 4801901 4801902 4801903 4801904 4801905 4801906 4801907 4802001 4802002 4802003 
##    4402    4245    3728    3195    4205    6061    4681    4436    4103    6084 
## 4802004 4802005 4802006 4802101 4802102 4802200 4802301 4802302 4802303 4802304 
##    4873    4049    5078    5882    5309    4226    2988    3015    3388    2921 
## 4802305 4802306 4802307 4802308 4802309 4802310 4802311 4802312 4802313 4802314 
##    3255    3718    3948    3274    3816    3944    4200    4436    5082    3627 
## 4802315 4802316 4802317 4802318 4802319 4802320 4802321 4802322 4802400 4802501 
##    3899    2974    2488    3253    3424    3566    3905    4536    4496    3737 
## 4802502 4802503 4802504 4802505 4802506 4802507 4802508 4802509 4802510 4802511 
##    4185    3696    2676    3742    3053    3873    3622    4406    4376    2785 
## 4802512 4802513 4802514 4802515 4802516 4802600 4802700 4802800 4802900 4803000 
##    3138    4605    4330    3849    3494    9343    5077    4847    3644    3894 
## 4803100 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4803400 4803501 
##    4029    4379    4575    4536    3704    2996    3661    4159    6975    4733 
## 4803502 4803601 4803602 4803700 4803801 4803802 4803900 4804000 4804100 4804200 
##    5277    5462    7410    8434    3518    5522    6597    5112    3565    4323 
## 4804301 4804302 4804400 4804501 4804502 4804503 4804504 4804601 4804602 4804603 
##    3282    4948    3389    3376    3264    2591    2489    3635    3768    4783 
## 4804604 4804605 4804606 4804607 4804608 4804609 4804610 4804611 4804612 4804613 
##    4719    3325    3548    3001    3208    3701    2969    2641    3627    3319 
## 4804614 4804615 4804616 4804617 4804618 4804619 4804620 4804621 4804622 4804623 
##    3146    2923    3399    2564    2998    2644    2778    3648    2757    2811 
## 4804624 4804625 4804626 4804627 4804628 4804629 4804630 4804631 4804632 4804633 
##    3105    2733    3005    2988    3085    2941    2632    3105    3428    2002 
## 4804634 4804635 4804636 4804637 4804638 4804701 4804702 4804801 4804802 4804803 
##    2120    2990    2901    3075    2706    5956    3790    2101    3777    3715 
## 4804901 4804902 4804903 4804904 4804905 4805000 4805100 4805201 4805202 4805203 
##    3025    3696    2939    2716    4373    5532    5005    4511    3605    3445 
## 4805204 4805301 4805302 4805303 4805304 4805305 4805306 4805307 4805308 4805309 
##    4498    3153    3689    4928    4124    3561    6162    4390    4123    4819 
## 4805400 4805500 4805600 4805700 4805800 4805901 4805902 4805903 4805904 4805905 
##    5755    6315    3466    5226    3730    3654    3649    3516    3355    3693 
## 4805906 4805907 4805908 4805909 4805910 4805911 4805912 4805913 4805914 4805915 
##    3386    4084    3118    3324    3944    3566    3237    2997    4406    3570 
## 4805916 4806000 4806100 4806200 4806301 4806302 4806400 4806500 4806601 4806602 
##    3317    3527    3634    4059    4778    3826    3097    4814    3861    3349 
## 4806603 4806701 4806702 4806703 4806801 4806802 4806803 4806804 4806805 4806806 
##    3786    3088    4194    4537    2613    2169    2594    2336    3324    2414 
## 4806807 4806900 
##    2775    2869

bordp<-readr::read_csv("C:/Users/codar/OneDrive/Documents/Stats II/Data/border_100mi_pumas_table.csv")

## Parsed with column specification:
## cols(
##   fid = col_double(),
##   STATEFP10 = col_double(),
##   PUMACE10 = col_character(),
##   AFFGEOID10 = col_character(),
##   GEOID10 = col_double(),
##   NAME10 = col_character(),
##   LSAD10 = col_character(),
##   ALAND10 = col_double(),
##   AWATER10 = col_double()
## )

mdat<-merge(tx_00013a, bordp, by.x="newpuma", by.y="GEOID10")
table(mdat$newpuma)

## 
## 4802800 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4806000 4806100 
##    4847    4379    4575    4536    3704    2996    3661    4159    3527    3634 
## 4806200 4806301 4806302 4806400 4806701 4806702 4806703 4806801 4806802 4806803 
##    4059    4778    3826    3097    3088    4194    4537    2613    2169    2594 
## 4806804 4806805 4806806 4806807 4806900 
##    2336    3324    2414    2775    2869

library(dplyr)
tx_00013a<-tx_00013a%>% 
filter(newpuma %in% c( "4802800", "4803200","4806000", "4806100", "4806200", "4806301", "4806302", "4806701", "4806702", "4806703", "4806900" ))
 # View(tx_00012a)
 #  names(tx_00012a)
 #  class(tx_00012a)

hw8 <-tx_00013a %>%
  mutate(sex=case_when(sex == 1~0,
                       sex == 2~ 1,
                       TRUE ~ NA_real_),
        lfpart=case_when(labforce== 1 ~ 0,
                          labforce== 2 ~ 1,
                         TRUE ~ NA_real_),
         edu=case_when(educ== 0 ~ 'none',
                        educ %in% 1:5 ~ 'hs incomplete',
                        educ %in% 6 ~ 'hs complete',
                        educ %in% 7:11 ~ 'college',
                       TRUE ~ NA_character_),
         race=case_when(race== 1 ~ 'white',
                        race== 2 ~ 'black',
                        # race== 3 ~'aian',
                        race %in% 4:5 ~ 'asian',
                        race== 6 ~ 'oapi',
                        race== 7 ~ 'other',
                        race== 8 ~ 'twomajor',
                        race== 9 ~ 'threemoremaj',
                        TRUE ~ NA_character_),
         hisp= case_when(hispan !=0 ~ "Latino",
                         hispan==0 ~'NL',
                         hispan==9 ~ 'NL',
                         TRUE ~ NA_character_),
         migrate1=case_when(migrate1==1 ~ 'same house',
                            migrate1==2 ~ 'movinstate',
                            migrate1==3 ~ 'abroad1yr',
                            TRUE ~ NA_character_),
         fertyr=case_when(fertyr== 1 ~ 0, 
                          fertyr==2 ~ 1,
                          TRUE~ NA_real_ ),
         poverty1=case_when(poverty==001 ~ "1% or less",
                           poverty ==501 ~ "501% or more"),
          hcov=case_when(hcovany == 1 ~ 0,
                        hcovany == 2 ~ 1,
                        TRUE~NA_real_),
         ownhome=case_when(ownershp==1 ~ 1,
                            ownershp==2 ~ 0,
                            TRUE ~ NA_real_),
        multgen1=case_when(multgen==1 ~ 1,
                           multgen==2 ~ 2,
                           multgen==3 ~ 3,
                           TRUE~NA_real_),
        edu3=case_when(educ %in% 1:5 ~ 1,
                       educ %in% 6 ~ 2,
                       educ %in% 7:11 ~ 3,
                       TRUE~NA_real_),
        ownhome2=case_when(ownershp==1 ~"Own",
                           ownershp==2 ~ "Rent"),
        multgen2=case_when(multgen==1 ~ "One Gen",
                           multgen ==2 ~ "Two Gen",
                           multgen == 3 ~ "Three More Gens"))


 sapply(hw8, class)

##        year    multyear      sample      serial    cbserial        hhwt 
##   "integer"   "numeric"   "integer"   "numeric"   "numeric"   "numeric" 
##     cluster    statefip        puma      strata          gq    ownershp 
##   "numeric"   "integer"   "numeric"   "numeric"   "integer"   "integer" 
##   ownershpd    mortgage     multgen    multgend      pernum       perwt 
##   "integer"   "integer"   "integer"   "integer"   "numeric"   "numeric" 
##         sex         age      fertyr        race       raced      hispan 
##   "numeric"   "integer"   "numeric" "character"   "integer"   "integer" 
##     hispand     hcovany        educ       educd     empstat    empstatd 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##    labforce         occ         ind    uhrswork      inctot     poverty 
##   "integer"   "numeric"   "numeric"   "integer"   "numeric"   "numeric" 
##      presgl    migrate1   migrate1d     newpuma      lfpart         edu 
##   "numeric" "character"   "integer" "character"   "numeric" "character" 
##        hisp    poverty1        hcov     ownhome    multgen1        edu3 
## "character" "character"   "numeric"   "numeric"   "numeric"   "numeric" 
##    ownhome2    multgen2 
## "character" "character"

# Missing data homework
#  
# Use a data source you are familiar with
#  
#Measure an outcome variable and at least 5 predictors
##Outcome variable: Total personal income
##Predictors: race, sex, hcov, edu, multgen1, ownhome

 
#Report the pattern of missingness among all of these variables
summary(hw8[, c("inctot", "race",  "sex", "hcov", "edu3",  "multgen1", "ownhome", "ownhome2", "multgen2")])

##      inctot           race                sex              hcov       
##  Min.   : -6600   Length:43738       Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:   317   Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 15000   Mode  :character   Median :0.0000   Median :1.0000  
##  Mean   : 27859                      Mean   :0.4953   Mean   :0.6712  
##  3rd Qu.: 37899                      3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :816000                      Max.   :1.0000   Max.   :1.0000  
##                                                                       
##       edu3          multgen1        ownhome         ownhome2        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Length:43738      
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :2.000   Median :2.000   Median :1.0000   Mode  :character  
##  Mean   :2.113   Mean   :1.874   Mean   :0.7168                     
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.0000                     
##  Max.   :3.000   Max.   :3.000   Max.   :1.0000                     
##  NA's   :940     NA's   :2982    NA's   :2982                       
##    multgen2        
##  Length:43738      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

##My edu3, ownhome, and multgen1 variables have missing data. edu3 has 940 NAs, multgen1 has 2982 NAs and ownhome has 2982 NAs.   


sapply(hw8, class)

##        year    multyear      sample      serial    cbserial        hhwt 
##   "integer"   "numeric"   "integer"   "numeric"   "numeric"   "numeric" 
##     cluster    statefip        puma      strata          gq    ownershp 
##   "numeric"   "integer"   "numeric"   "numeric"   "integer"   "integer" 
##   ownershpd    mortgage     multgen    multgend      pernum       perwt 
##   "integer"   "integer"   "integer"   "integer"   "numeric"   "numeric" 
##         sex         age      fertyr        race       raced      hispan 
##   "numeric"   "integer"   "numeric" "character"   "integer"   "integer" 
##     hispand     hcovany        educ       educd     empstat    empstatd 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##    labforce         occ         ind    uhrswork      inctot     poverty 
##   "integer"   "numeric"   "numeric"   "integer"   "numeric"   "numeric" 
##      presgl    migrate1   migrate1d     newpuma      lfpart         edu 
##   "numeric" "character"   "integer" "character"   "numeric" "character" 
##        hisp    poverty1        hcov     ownhome    multgen1        edu3 
## "character" "character"   "numeric"   "numeric"   "numeric"   "numeric" 
##    ownhome2    multgen2 
## "character" "character"

#Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?

##Missing data was found in my edu3, multgen1, and ownhome variables. I will use the mean imputation method. Using the mean method of imputation will reduce the variance because all of the NA's will receive the mean. This might lower the effect size/beta of my model, which according to CS, is not good. 
library(car)
library(mice)

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(ggplot2)
library(dplyr)
hw8$edu3.imp.mean<-ifelse(is.na(hw8$edu3)==T, mean(hw8$edu3, na.rm=T), hw8$edu3)
mean(hw8$edu3, na.rm=T) #both have the same mean

## [1] 2.11344

mean(hw8$edu3.imp.mean)

## [1] 2.11344

hw8$ownhome.imp.mean<-ifelse(is.na(hw8$ownhome)==T, mean(hw8$ownhome, na.rm=T), hw8$ownhome)
mean(hw8$ownhome, na.rm=T)

## [1] 0.716827

mean(hw8$ownhome.imp.mean)

## [1] 0.716827

hw8$multgen1.imp.mean<-ifelse(is.na(hw8$multgen1)==T, mean(hw8$multgen1, na.rm=T), hw8$multgen1)
mean(hw8$multgen1, na.rm=T) #same means

## [1] 1.873835

mean(hw8$multgen1.imp.mean)

## [1] 1.873835

fit<-lm(inctot~race+sex+hcov+ownhome.imp.mean+multgen1.imp.mean+edu3.imp.mean, hw8)
fit

## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome.imp.mean + 
##     multgen1.imp.mean + edu3.imp.mean, data = hw8)
## 
## Coefficients:
##       (Intercept)          raceblack           raceoapi          raceother  
##             15235             -22100             -11627             -14357  
##  racethreemoremaj       racetwomajor          racewhite                sex  
##            -17402             -17860             -11149             -17047  
##              hcov   ownhome.imp.mean  multgen1.imp.mean      edu3.imp.mean  
##             14393               8002              -7086              14470

#looking at variance differences between original variable and imputed variable for ownhome. There is a slight difference. The variance went down by a couple of variance points. Reduction in variation in the sample is not good. I could do the same for the other variables.  
var(hw8$ownhome, na.rm=T)

## [1] 0.202991

var(hw8$ownhome.imp.mean, na.rm=T)

## [1] 0.1891511

#Doesn't really show me too much bc of binary variables. 
hist(hw8$ownhome)

hist(hw8$ownhome.imp.mean)

#Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

##So the flag variable for home ownership returns TRUE, meaning that there are missing values and they could have a meaningful impact on my model. I know this because I ran a linear model with just the dependent variable and home ownership significantly impacts the dependent variable negatively.The other two variables are also missing at random.  
fit1<-lm(inctot~is.na(ownhome), data=hw8)
fit1<-lm(inctot~is.na(edu3), data=hw8)
fit1<-lm(inctot~is.na(multgen1), data=hw8)
summary(fit1)

## 
## Call:
## lm(formula = inctot ~ is.na(multgen1), data = hw8)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36091 -26846 -10023   9264 786509 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          29491.1      221.1  133.38   <2e-16 ***
## is.na(multgen1)TRUE -23936.9      846.8  -28.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44640 on 43736 degrees of freedom
## Multiple R-squared:  0.01794,    Adjusted R-squared:  0.01792 
## F-statistic: 799.1 on 1 and 43736 DF,  p-value: < 2.2e-16

#Looking at patterns for missing values. 
library(mice)
md.pattern(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")])

##       sex inctot empstat hcov race edu3 ownhome multgen1     
## 39655   1      1       1    1    1    1       1        1    0
## 2889    1      1       1    1    1    1       0        0    2
## 864     1      1       1    1    1    0       1        1    1
## 60      1      1       1    1    1    0       0        0    3
## 221     1      1       1    1    0    1       1        1    1
## 33      1      1       1    1    0    1       0        0    3
## 16      1      1       1    1    0    0       1        1    2
##         0      0       0    0  270  940    2982     2982 7174

md.pairs(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")])

## $rr
##            sex inctot empstat  edu3  race ownhome  hcov multgen1
## sex      43738  43738   43738 42798 43468   40756 43738    40756
## inctot   43738  43738   43738 42798 43468   40756 43738    40756
## empstat  43738  43738   43738 42798 43468   40756 43738    40756
## edu3     42798  42798   42798 42798 42544   39876 42798    39876
## race     43468  43468   43468 42544 43468   40519 43468    40519
## ownhome  40756  40756   40756 39876 40519   40756 40756    40756
## hcov     43738  43738   43738 42798 43468   40756 43738    40756
## multgen1 40756  40756   40756 39876 40519   40756 40756    40756
## 
## $rm
##          sex inctot empstat edu3 race ownhome hcov multgen1
## sex        0      0       0  940  270    2982    0     2982
## inctot     0      0       0  940  270    2982    0     2982
## empstat    0      0       0  940  270    2982    0     2982
## edu3       0      0       0    0  254    2922    0     2922
## race       0      0       0  924    0    2949    0     2949
## ownhome    0      0       0  880  237       0    0        0
## hcov       0      0       0  940  270    2982    0     2982
## multgen1   0      0       0  880  237       0    0        0
## 
## $mr
##           sex inctot empstat edu3 race ownhome hcov multgen1
## sex         0      0       0    0    0       0    0        0
## inctot      0      0       0    0    0       0    0        0
## empstat     0      0       0    0    0       0    0        0
## edu3      940    940     940    0  924     880  940      880
## race      270    270     270  254    0     237  270      237
## ownhome  2982   2982    2982 2922 2949       0 2982        0
## hcov        0      0       0    0    0       0    0        0
## multgen1 2982   2982    2982 2922 2949       0 2982        0
## 
## $mm
##          sex inctot empstat edu3 race ownhome hcov multgen1
## sex        0      0       0    0    0       0    0        0
## inctot     0      0       0    0    0       0    0        0
## empstat    0      0       0    0    0       0    0        0
## edu3       0      0       0  940   16      60    0       60
## race       0      0       0   16  270      33    0       33
## ownhome    0      0       0   60   33    2982    0     2982
## hcov       0      0       0    0    0       0    0        0
## multgen1   0      0       0   60   33    2982    0     2982

#Basic Imputation 
dat2<-hw8
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$inctotaa<-dat2$inctot
dat2$inctotaa[samp2]<-NA

head(dat2[, c("inctot","inctot")])

## # A tibble: 6 x 2
##   inctot inctot
##    <dbl>  <dbl>
## 1  37029  37029
## 2  31739  31739
## 3   7406   7406
## 4      0      0
## 5  53957  53957
## 6  46974  46974

imp<-mice(data = dat2[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")], m = 10)

## 
##  iter imp variable
##   1   1  edu3  ownhome  multgen1
##   1   2  edu3  ownhome  multgen1
##   1   3  edu3  ownhome  multgen1
##   1   4  edu3  ownhome  multgen1
##   1   5  edu3  ownhome  multgen1
##   1   6  edu3  ownhome  multgen1
##   1   7  edu3  ownhome  multgen1
##   1   8  edu3  ownhome  multgen1
##   1   9  edu3  ownhome  multgen1
##   1   10  edu3  ownhome  multgen1
##   2   1  edu3  ownhome  multgen1
##   2   2  edu3  ownhome  multgen1
##   2   3  edu3  ownhome  multgen1
##   2   4  edu3  ownhome  multgen1
##   2   5  edu3  ownhome  multgen1
##   2   6  edu3  ownhome  multgen1
##   2   7  edu3  ownhome  multgen1
##   2   8  edu3  ownhome  multgen1
##   2   9  edu3  ownhome  multgen1
##   2   10  edu3  ownhome  multgen1
##   3   1  edu3  ownhome  multgen1
##   3   2  edu3  ownhome  multgen1
##   3   3  edu3  ownhome  multgen1
##   3   4  edu3  ownhome  multgen1
##   3   5  edu3  ownhome  multgen1
##   3   6  edu3  ownhome  multgen1
##   3   7  edu3  ownhome  multgen1
##   3   8  edu3  ownhome  multgen1
##   3   9  edu3  ownhome  multgen1
##   3   10  edu3  ownhome  multgen1
##   4   1  edu3  ownhome  multgen1
##   4   2  edu3  ownhome  multgen1
##   4   3  edu3  ownhome  multgen1
##   4   4  edu3  ownhome  multgen1
##   4   5  edu3  ownhome  multgen1
##   4   6  edu3  ownhome  multgen1
##   4   7  edu3  ownhome  multgen1
##   4   8  edu3  ownhome  multgen1
##   4   9  edu3  ownhome  multgen1
##   4   10  edu3  ownhome  multgen1
##   5   1  edu3  ownhome  multgen1
##   5   2  edu3  ownhome  multgen1
##   5   3  edu3  ownhome  multgen1
##   5   4  edu3  ownhome  multgen1
##   5   5  edu3  ownhome  multgen1
##   5   6  edu3  ownhome  multgen1
##   5   7  edu3  ownhome  multgen1
##   5   8  edu3  ownhome  multgen1
##   5   9  edu3  ownhome  multgen1
##   5   10  edu3  ownhome  multgen1

## Warning: Number of logged events: 1

#To see what methods were used for the imputations:
print(imp)

## Class: mids
## Number of multiple imputations:  10 
## Imputation methods:
##      sex   inctot  empstat     edu3     race  ownhome     hcov multgen1 
##       ""       ""       ""    "pmm"       ""    "pmm"       ""    "pmm" 
## PredictorMatrix:
##         sex inctot empstat edu3 race ownhome hcov multgen1
## sex       0      1       1    1    0       1    1        1
## inctot    1      0       1    1    0       1    1        1
## empstat   1      1       0    1    0       1    1        1
## edu3      1      1       1    0    0       1    1        1
## race      0      0       0    0    0       0    0        0
## ownhome   1      1       1    1    0       0    1        1
## Number of logged events:  1 
##   it im dep     meth  out
## 1  0  0     constant race

##For all three of my variables, pmm (predictive means method) was used. 

# head(imp$imp$ownhome)
# 
# summary(imp$imp$edu3)
# 
# summary(hw8$ownhome)
# 
# summary(imp$imp$race)
# summary(hw8$race)

#To get one of the imputed datasets. Randomly choosing the first set of imputed data. 
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)

##    sex inctot empstat edu3  race ownhome hcov multgen1
## 1    1  37029       1    3 white       1    1        2
## 2    0  31739       1    1 white       1    1        2
## 3    1   7406       1    3 white       0    0        2
## 4    0      0       2    2 white       0    0        2
## 5    1  53957       1    3 white       1    1        2
## 6    0  46974       1    3 white       1    1        2
## 7    0      0       3    2 white       1    1        2
## 8    1  31739       1    3 white       1    1        2
## 9    0  74058       1    2 white       1    1        2
## 10   0      0       3    1 white       1    1        2

#Compare to the original data 
head(hw8[,c("sex", "inctot", "empstat","edu3","race", "ownhome", "hcov", "multgen1")], n=10)

## # A tibble: 10 x 8
##      sex inctot empstat  edu3 race  ownhome  hcov multgen1
##    <dbl>  <dbl>   <int> <dbl> <chr>   <dbl> <dbl>    <dbl>
##  1     1  37029       1     3 white       1     1        2
##  2     0  31739       1     1 white       1     1        2
##  3     1   7406       1     3 white       0     0        2
##  4     0      0       2     2 white       0     0        2
##  5     1  53957       1     3 white       1     1        2
##  6     0  46974       1     3 white       1     1        2
##  7     0      0       3     2 white       1     1        2
##  8     1  31739       1     3 white       1     1        2
##  9     0  74058       1     2 white       1     1        2
## 10     0      0       3     1 white       1     1        2

#Now, I will see the variability in the 10 different imputations for each outcome
fit.inctot<-with(data=imp ,expr=lm(inctot~race+sex+hcov+ownhome+edu3+empstat))
fit.inctot

## call :
## with.mids(data = imp, expr = lm(inctot ~ race + sex + hcov + 
##     ownhome + edu3 + empstat))
## 
## call1 :
## mice(data = dat2[, c("sex", "inctot", "empstat", "edu3", "race", 
##     "ownhome", "hcov", "multgen1")], m = 10)
## 
## nmis :
##      sex   inctot  empstat     edu3     race  ownhome     hcov multgen1 
##        0        0        0      940      270     2982        0     2982 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30257            -16733            -12380            -17879  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22531            -20416            -14167            -15324  
##             hcov           ownhome              edu3           empstat  
##            11868              6907             12533            -11466  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30009            -16831            -12187            -17798  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22388            -20340            -14042            -15283  
##             hcov           ownhome              edu3           empstat  
##            11854              7115             12520            -11469  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            29768            -16590            -12405            -17757  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22361            -20191            -14028            -15342  
##             hcov           ownhome              edu3           empstat  
##            11823              7330             12564            -11452  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30009            -16915            -12612            -17928  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22616            -20498            -14270            -15307  
##             hcov           ownhome              edu3           empstat  
##            11830              7254             12573            -11435  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            29915            -16677            -12214            -17753  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22375            -20271            -14017            -15311  
##             hcov           ownhome              edu3           empstat  
##            11857              7081             12553            -11441  
## 
## 
## [[6]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            29973            -16916            -12600            -17856  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22275            -20579            -14160            -15268  
##             hcov           ownhome              edu3           empstat  
##            11857              7236             12545            -11466  
## 
## 
## [[7]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30039            -17005            -12331            -17853  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22284            -20335            -14146            -15327  
##             hcov           ownhome              edu3           empstat  
##            11858              7094             12554            -11443  
## 
## 
## [[8]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30303            -17308            -12889            -18304  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22883            -20709            -14539            -15304  
##             hcov           ownhome              edu3           empstat  
##            11810              7233             12586            -11454  
## 
## 
## [[9]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30063            -16976            -12407            -17778  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22397            -20346            -14034            -15341  
##             hcov           ownhome              edu3           empstat  
##            11856              6980             12545            -11462  
## 
## 
## [[10]]
## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat)
## 
## Coefficients:
##      (Intercept)         raceblack          raceoapi         raceother  
##            30081            -17342            -12551            -17843  
## racethreemoremaj      racetwomajor         racewhite               sex  
##           -22523            -20547            -14126            -15273  
##             hcov           ownhome              edu3           empstat  
##            11973              6842             12580            -11477

#variation in inctot - they are all the same
with (data=imp, exp=(sd(inctot)))

## call :
## with.mids(data = imp, expr = (sd(inctot)))
## 
## call1 :
## mice(data = dat2[, c("sex", "inctot", "empstat", "edu3", "race", 
##     "ownhome", "hcov", "multgen1")], m = 10)
## 
## nmis :
##      sex   inctot  empstat     edu3     race  ownhome     hcov multgen1 
##        0        0        0      940      270     2982        0     2982 
## 
## analyses :
## [[1]]
## [1] 45042.26
## 
## [[2]]
## [1] 45042.26
## 
## [[3]]
## [1] 45042.26
## 
## [[4]]
## [1] 45042.26
## 
## [[5]]
## [1] 45042.26
## 
## [[6]]
## [1] 45042.26
## 
## [[7]]
## [1] 45042.26
## 
## [[8]]
## [1] 45042.26
## 
## [[9]]
## [1] 45042.26
## 
## [[10]]
## [1] 45042.26

#Freq table
# with (data=imp, exp=(prop.table(table(inctot))))

#Pool the separate models- combine the estimates - we get a regression model for the totality of the imputed data. The focus is on lambda and fmi. When they are closer to 1, the implication is that there is a lot of variation in that parameter across the imputations. In my result, home ownership is the highest value(.13) which means that the imputed data is not good. However, it's not super high...so the variations may be okay?
est.p<-pool(fit.inctot)
print(est.p)

## Class: mipo    m = 10 
##                term  m   estimate        ubar          b           t dfcom
## 1       (Intercept) 10  30041.788 29928193.25 23884.3342 29954466.02 43456
## 2         raceblack 10 -16929.358 31393940.12 61034.7954 31461078.40 43456
## 3          raceoapi 10 -12457.614 34780319.99 44329.2304 34829082.15 43456
## 4         raceother 10 -17875.074 29800958.81 25892.8074 29829440.90 43456
## 5  racethreemoremaj 10 -22463.329 79268933.16 33717.0322 79306021.90 43456
## 6      racetwomajor 10 -20423.136 31905614.35 24930.0890 31933037.45 43456
## 7         racewhite 10 -14152.894 29210153.11 24936.4960 29237583.25 43456
## 8               sex 10 -15308.106   150676.85   706.8194   151454.35 43456
## 9              hcov 10  11858.650   182748.16  1967.0458   184911.91 43456
## 10          ownhome 10   7107.196   185967.82 25738.1237   214279.75 43456
## 11             edu3 10  12555.345    65792.13   424.1183    66258.66 43456
## 12          empstat 10 -11456.529    41287.71   189.6443    41496.32 43456
##            df          riv       lambda          fmi
## 1  43255.3650 0.0008778601 0.0008770902 0.0009232834
## 2  42430.3135 0.0021385743 0.0021340106 0.0021810428
## 3  42986.9096 0.0014020042 0.0014000413 0.0014464987
## 4  43222.4295 0.0009557440 0.0009548314 0.0010010563
## 5  43387.8825 0.0004678849 0.0004676661 0.0005137371
## 6  43262.7682 0.0008595070 0.0008587689 0.0009049551
## 7  43229.6906 0.0009390620 0.0009381810 0.0009843989
## 8  38373.3415 0.0051600580 0.0051335685 0.0051854164
## 9  25974.5207 0.0118400667 0.0117015199 0.0117776087
## 10   508.5913 0.1522410516 0.1321260438 0.1355188849
## 11 34862.0324 0.0070909709 0.0070410431 0.0070980033
## 12 38554.7302 0.0050525621 0.0050271620 0.0050787715

summary (est.p) #yeah, so I got low p-values for a few of the parameters.

##                term   estimate std.error  statistic         df      p.value
## 1       (Intercept)  30041.788 5473.0673   5.489022 43255.3650 4.064295e-08
## 2         raceblack -16929.358 5609.0176  -3.018239 42430.3135 2.543988e-03
## 3          raceoapi -12457.614 5901.6169  -2.110882 42986.9096 3.478825e-02
## 4         raceother -17875.074 5461.6335  -3.272844 43222.4295 1.065548e-03
## 5  racethreemoremaj -22463.329 8905.3929  -2.522441 43387.8825 1.165789e-02
## 6      racetwomajor -20423.136 5650.9324  -3.614118 43262.7682 3.017137e-04
## 7         racewhite -14152.894 5407.1789  -2.617427 43229.6906 8.862638e-03
## 8               sex -15308.106  389.1714 -39.335129 38373.3415 0.000000e+00
## 9              hcov  11858.650  430.0139  27.577369 25974.5207 0.000000e+00
## 10          ownhome   7107.196  462.9036  15.353512   508.5913 0.000000e+00
## 11             edu3  12555.345  257.4076  48.776129 34862.0324 0.000000e+00
## 12          empstat -11456.529  203.7064 -56.240384 38554.7302 0.000000e+00

#Plotting the pooled values
lam<-data.frame(lam=est.p$pooled$lambda, param=row.names(est.p$pooled))
ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))

#10 (home ownership) has the greatest degree of impact on the regression coefficient.

#Compare imputed model to original data 
fit2<-lm(inctot~race+sex+hcov+ownhome+edu3+empstat+multgen1, data=hw8)
summary(fit2)

## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat + 
##     multgen1, data = hw8)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68002 -18968  -5055   9346 755123 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       44328.6     5909.8   7.501 6.47e-14 ***
## raceblack        -12238.3     6268.7  -1.952 0.050912 .  
## raceoapi         -12140.0     6374.5  -1.904 0.056859 .  
## raceother        -18930.9     5863.2  -3.229 0.001244 ** 
## racethreemoremaj -23593.5     9370.8  -2.518 0.011814 *  
## racetwomajor     -20594.8     6076.9  -3.389 0.000702 ***
## racewhite        -15008.3     5802.9  -2.586 0.009704 ** 
## sex              -15824.6      418.3 -37.832  < 2e-16 ***
## hcov              10773.1      468.0  23.020  < 2e-16 ***
## ownhome            8633.1      466.8  18.493  < 2e-16 ***
## edu3              12650.1      276.4  45.768  < 2e-16 ***
## empstat          -11171.0      222.3 -50.243  < 2e-16 ***
## multgen1          -7314.6      333.0 -21.967  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40990 on 39642 degrees of freedom
##   (4083 observations deleted due to missingness)
## Multiple R-squared:  0.2131, Adjusted R-squared:  0.2128 
## F-statistic: 894.5 on 12 and 39642 DF,  p-value: < 2.2e-16

fit.imp<-lm(inctot~race+sex+hcov+ownhome+edu3+empstat+multgen1, data=dat.imp)
summary(fit.imp)

## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat + 
##     multgen1, data = dat.imp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65547 -18232  -5081   8864 756676 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       42974.2     5477.2   7.846 4.39e-15 ***
## raceblack        -16500.8     5575.5  -2.960 0.003083 ** 
## raceoapi         -11831.0     5868.6  -2.016 0.043807 *  
## raceother        -17216.7     5432.3  -3.169 0.001529 ** 
## racethreemoremaj -22538.0     8859.5  -2.544 0.010965 *  
## racetwomajor     -19182.4     5621.0  -3.413 0.000644 ***
## racewhite        -13356.0     5378.2  -2.483 0.013018 *  
## sex              -14899.8      386.8 -38.523  < 2e-16 ***
## hcov              11193.5      426.8  26.228  < 2e-16 ***
## ownhome            7557.4      430.4  17.560  < 2e-16 ***
## edu3              11771.5      257.9  45.643  < 2e-16 ***
## empstat          -11535.5      202.2 -57.064  < 2e-16 ***
## multgen1          -6412.0      306.8 -20.900  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39840 on 43455 degrees of freedom
##   (270 observations deleted due to missingness)
## Multiple R-squared:  0.2168, Adjusted R-squared:  0.2166 
## F-statistic:  1002 on 12 and 43455 DF,  p-value: < 2.2e-16

#There were some changes between the original and imputed data. Where Black and API races were not significant in the original data, after imputation, they became significant (at different levels). The Black race parameter increased by more points than any other parameter.  Additionally, the White race became less significant after imputation. Health coverage and home ownership values went up and remained significant. Education and multigenerational households remained significant, but the values decreased slightly.

# #get the coefficients from each of the 5 imputations of bmi
# coefs<-as.data.frame(matrix(unlist(lapply(fit.inctot$analyses, coef)), nrow=5, ncol=17, byrow=T))
# names(coefs)<-names(fit.inctot$analyses[[1]]$coef)
# #plot the coefficients from each of the different rounds of imputation to see the variability in the
# #results
# coefs%>%
#   select(`ownhome`:`raceblack other`)%>%
#   melt()%>%
#   mutate(imp=rep(1:5, 16))%>%
#   ggplot()+geom_point(aes(y=value,x=variable, group=variable, color=as.factor(imp) ))+ggtitle("Estimated Betas from Each Imputed Regression Model for BMI Outcome")+theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Were the results similar between the mean/modal and multiply imputed data sets?  How do the results compare to the results from the model fit with the data source with missing values?

##Results from the mean imputation
summary(fit)

## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome.imp.mean + 
##     multgen1.imp.mean + edu3.imp.mean, data = hw8)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -73954 -19968  -6283  10525 760281 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        15234.6     5651.8   2.696 0.007031 ** 
## raceblack         -22099.8     5771.6  -3.829 0.000129 ***
## raceoapi          -11626.7     6075.9  -1.914 0.055680 .  
## raceother         -14357.2     5624.1  -2.553 0.010689 *  
## racethreemoremaj  -17402.5     9172.2  -1.897 0.057793 .  
## racetwomajor      -17859.7     5819.6  -3.069 0.002150 ** 
## racewhite         -11148.9     5568.1  -2.002 0.045262 *  
## sex               -17047.5      398.4 -42.792  < 2e-16 ***
## hcov               14393.3      437.1  32.926  < 2e-16 ***
## ownhome.imp.mean    8002.0      462.8  17.289  < 2e-16 ***
## multgen1.imp.mean  -7086.2      330.0 -21.472  < 2e-16 ***
## edu3.imp.mean      14470.3      265.1  54.580  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41250 on 43456 degrees of freedom
##   (270 observations deleted due to missingness)
## Multiple R-squared:  0.1604, Adjusted R-squared:  0.1602 
## F-statistic: 754.8 on 11 and 43456 DF,  p-value: < 2.2e-16

##Results from the multiple imputation
summary(fit.imp)

## 
## Call:
## lm(formula = inctot ~ race + sex + hcov + ownhome + edu3 + empstat + 
##     multgen1, data = dat.imp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65547 -18232  -5081   8864 756676 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       42974.2     5477.2   7.846 4.39e-15 ***
## raceblack        -16500.8     5575.5  -2.960 0.003083 ** 
## raceoapi         -11831.0     5868.6  -2.016 0.043807 *  
## raceother        -17216.7     5432.3  -3.169 0.001529 ** 
## racethreemoremaj -22538.0     8859.5  -2.544 0.010965 *  
## racetwomajor     -19182.4     5621.0  -3.413 0.000644 ***
## racewhite        -13356.0     5378.2  -2.483 0.013018 *  
## sex              -14899.8      386.8 -38.523  < 2e-16 ***
## hcov              11193.5      426.8  26.228  < 2e-16 ***
## ownhome            7557.4      430.4  17.560  < 2e-16 ***
## edu3              11771.5      257.9  45.643  < 2e-16 ***
## empstat          -11535.5      202.2 -57.064  < 2e-16 ***
## multgen1          -6412.0      306.8 -20.900  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39840 on 43455 degrees of freedom
##   (270 observations deleted due to missingness)
## Multiple R-squared:  0.2168, Adjusted R-squared:  0.2166 
## F-statistic:  1002 on 12 and 43455 DF,  p-value: < 2.2e-16

#Raceblack was more statistically significant using the mean method, but was significant nonetheless using the multiple imputation model. Race api and three or more races was significant using the multiple imputation method but not using the mean imputation method. White was significant in both models. Sex was significant in both models, but had a  higher value using the mean imputation method. Health coverage had a lower value using the multiple imputation method, but remained significant. The biggest difference in outcome really is with the significance of the race parameter, particularly for Black, API, and three or more races. If additional testing was possible to see which method to prefer, I would pursue that. Otherwise, I would go with the multiple imputation model.

HW8_mydata

Coda Rayo-Garza

3/29/2021