options(repos="https://cran.rstudio.com" )
library(htmlTable)
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#
install.packages("ipumsr")
##
## The downloaded binary packages are in
## /var/folders/lr/thl33hwj2jdd01hth_whxlz00000gn/T//RtmpHRHpg1/downloaded_packages
library(ipumsr)
## Registered S3 methods overwritten by 'ipumsr':
## method from
## format.pillar_shaft_haven_labelled_chr haven
## format.pillar_shaft_haven_labelled_num haven
## pillar_shaft.haven_labelled haven
# Note to self: Must set wd to where all files are, then open ddi and feed into read_ipums_micro
setwd("~/Desktop/UTSA/DEM7283_StatsII/HW2")
ddi <- read_ipums_ddi("usa_00011.xml")
data_total <- read_ipums_micro(ddi)
## Use of data from IPUMS-USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
names(data_total)
## [1] "YEAR" "MULTYEAR" "SAMPLE" "SERIAL" "CBSERIAL" "HHWT"
## [7] "CLUSTER" "STATEFIP" "COUNTYFIP" "MET2013" "PUMA" "STRATA"
## [13] "GQ" "OWNERSHP" "OWNERSHPD" "VALUEH" "PERNUM" "PERWT"
## [19] "SEX" "AGE" "RACE" "RACED" "HISPAN" "HISPAND"
## [25] "EMPSTAT" "EMPSTATD" "LABFORCE" "OCC" "IND" "CLASSWKR"
## [31] "CLASSWKRD" "WKSWORK2" "UHRSWORK" "INCWAGE" "POVERTY" "MIGRATE1"
## [37] "MIGRATE1D"
#At first I tried to do the multiple imputation with this whole dataset but it took forever so let's filter out just for the observations in the San Antonio-New Braunfels MSA (code = 33260 ). This leaves me with 6,021 observations.
data <- filter(data_total, MET2013==33260)
Outcome Variable: Homeowner (yes/no)
Predictor Variables: * Age * Employment Status * Moved within last year * Poverty status * Race * Hispanic/Latino Ethncity
#Homeownership
data$tenure_c <-Recode(data$OWNERSHP, recodes="0=NA; 1='Owned/Mortgaged'; 2='Rented'", as.factor=T)
#Age
data$age_c <-Recode(data$AGE, recodes="0:17='0 Child'; 18:64='1 Working Age'; else='2 Senior'", as.factor=T)
#Employment Status
data$empl_c <-Recode(data$EMPSTAT, recodes="0=NA; 1='1 Employed'; 2='2 Unemployed'; 3='3 Not in LF'", as.factor=T)
#Moved within last year
data$move <- Recode(data$MIGRATE1, recodes="0=NA; 1='2. Did not Move'; 2:4='1. Moved within last year'; 9=NA", as.factor=T)
#Poverty Status
data$pov_b <-Recode(data$POVERTY, recodes="000=NA; 1:100='Below Poverty'; else='Above Poverty'", as.factor=T)
#Race
data$race_c <- Recode(data$RACE, recodes = "1='1. White'; 2='2. Black'; 3='3. Native American'; 4:6='4. Asian/PI'; else = '5. Other, or more than one race'", as.factor=T)
#Hispanic/Latino Ethnicity
data$hisp <- Recode(data$HISPAN, recodes = "0='1. Not Hispanic'; 1:4='2. Hispanic/Latino'; else = NA", as.factor=T)
There are no NAs for Race or Ethnicity, which is interesting and I couldn’t find why those would, in particular, not have any NAs (perhaps any NAs go to “other?”). There are also no NAs for age, which I wonder if the ACS already imputes that?
The highest number of NAs (1,339) are for employment status, which might make sense as a question that many might be reluctant to answer, or perhaps might not understand. Housing tenure has 219 NAs, followed by poverty status with 158 and “move within last year” with 90. When examining the missingness pattern, we see that 143 observations are missing poverty and tenure (I think I’m interpreting the pattern correctly?). If this is the case, perhaps there are shared characteristics of those who are not responding to both of those questions.
summary(data[, c("tenure_c", "age_c","empl_c","move","pov_b", "race_c", "hisp")])
## tenure_c age_c empl_c
## Owned/Mortgaged:4423 0 Child :1482 1 Employed :3006
## Rented :1379 1 Working Age:3671 2 Unemployed: 103
## NA's : 219 2 Senior : 868 3 Not in LF :1573
## NA's :1339
##
## move pov_b
## 1. Moved within last year: 939 Above Poverty:5345
## 2. Did not Move :4992 Below Poverty: 518
## NA's : 90 NA's : 158
##
##
## race_c hisp
## 1. White :5158 1. Not Hispanic :3812
## 2. Black : 361 2. Hispanic/Latino:2209
## 3. Native American : 32
## 4. Asian/PI : 112
## 5. Other, or more than one race: 358
library(mice)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
md.pattern(data[, c("tenure_c", "age_c","empl_c","move","pov_b", "race_c", "hisp")])
## age_c race_c hisp move pov_b tenure_c empl_c
## 4465 1 1 1 1 1 1 1 0
## 1237 1 1 1 1 1 1 0 1
## 74 1 1 1 1 1 0 1 1
## 10 1 1 1 1 0 1 0 2
## 143 1 1 1 1 0 0 1 2
## 2 1 1 1 1 0 0 0 3
## 87 1 1 1 0 1 1 0 2
## 3 1 1 1 0 0 1 0 3
## 0 0 0 90 158 219 1339 1806
Results: * Employment Status: (largest shift) the percentages of those employed rose significantly, since this is the modal category and there were so many NAs in the data. This type of imputation would give a skewed perception of the employment rate (a very rosy picture) * Housing tenure: when modally imputed, a slightly lower share of the total are renters, and a slightly higher share of owners. Since there were a relatively low number of NAs, this shift was not dramatic.
* Moved within last year: Very little change in these values, likley because of small number of NAs. * Poverty Status: Also very slight change, perhaps because poverty status is already imputed?
Not imputed: * Age: no missing values * Race: no missing values * Ethnicity: no missing values
#HOUSING TENURE
table(data$tenure_c)
##
## Owned/Mortgaged Rented
## 4423 1379
#find the most common value
mcv.tenure<-factor(names(which.max(table(data$tenure_c))), levels=levels(data$tenure_c))
mcv.tenure
## [1] Owned/Mortgaged
## Levels: Owned/Mortgaged Rented
#impute the cases
data$tenure_c.imp<-as.factor(ifelse(is.na(data$tenure_c)==T, mcv.tenure, data$tenure_c))
levels(data$tenure_c.imp)<-levels(data$tenure_c)
prop.table(table(data$tenure_c))
##
## Owned/Mortgaged Rented
## 0.7623233 0.2376767
prop.table(table(data$tenure_c.imp))
##
## Owned/Mortgaged Rented
## 0.7709683 0.2290317
barplot(prop.table(table(data$tenure_c)), main="Original Data: Housing Tenure", ylim=c(0, 1))
barplot(prop.table(table(data$tenure_c.imp)), main="Imputed Data: Housing Tenure",ylim=c(0, 1))
#EMPLOYMENT STATUS
table(data$empl_c)
##
## 1 Employed 2 Unemployed 3 Not in LF
## 3006 103 1573
#find the most common value
mcv.emp<-factor(names(which.max(table(data$empl_c))), levels=levels(data$empl_c))
mcv.emp
## [1] 1 Employed
## Levels: 1 Employed 2 Unemployed 3 Not in LF
#impute the cases
data$empl_c.imp<-as.factor(ifelse(is.na(data$empl_c)==T, mcv.emp, data$empl_c))
levels(data$empl_c.imp)<-levels(data$empl_c)
prop.table(table(data$empl_c))
##
## 1 Employed 2 Unemployed 3 Not in LF
## 0.64203332 0.02199915 0.33596754
prop.table(table(data$empl_c.imp))
##
## 1 Employed 2 Unemployed 3 Not in LF
## 0.72164092 0.01710679 0.26125228
barplot(prop.table(table(data$empl_c)), main="Original Data: Employment Status", ylim=c(0, 1))
barplot(prop.table(table(data$empl_c.imp)), main="Imputed Data: Employment Status",ylim=c(0, 1))
#MOVED WITHIN LAST YEAR
table(data$move)
##
## 1. Moved within last year 2. Did not Move
## 939 4992
#find the most common value
mcv.move<-factor(names(which.max(table(data$move))), levels=levels(data$move))
mcv.move
## [1] 2. Did not Move
## Levels: 1. Moved within last year 2. Did not Move
#impute the cases
data$move.imp<-as.factor(ifelse(is.na(data$move)==T, mcv.move, data$move))
levels(data$move.imp)<-levels(data$move)
prop.table(table(data$move))
##
## 1. Moved within last year 2. Did not Move
## 0.1583207 0.8416793
prop.table(table(data$move.imp))
##
## 1. Moved within last year 2. Did not Move
## 0.1559542 0.8440458
barplot(prop.table(table(data$move)), main="Original Data: Moved within last year", ylim=c(0, 1))
barplot(prop.table(table(data$move.imp)), main="Imputed Data: Moved within last year",ylim=c(0, 1))
#POVERTY STATUS
table(data$pov_b)
##
## Above Poverty Below Poverty
## 5345 518
#find the most common value
mcv.pov<-factor(names(which.max(table(data$pov_b))), levels=levels(data$pov_b))
mcv.pov
## [1] Above Poverty
## Levels: Above Poverty Below Poverty
#impute the cases
data$pov_b.imp<-as.factor(ifelse(is.na(data$pov_b)==T, mcv.pov, data$pov_b))
levels(data$pov_b.imp)<-levels(data$pov_b)
prop.table(table(data$pov_b))
##
## Above Poverty Below Poverty
## 0.91164933 0.08835067
prop.table(table(data$pov_b.imp))
##
## Above Poverty Below Poverty
## 0.91396778 0.08603222
barplot(prop.table(table(data$pov_b)), main="Original Data: Poverty Status", ylim=c(0, 1))
barplot(prop.table(table(data$pov_b.imp)), main="Imputed Data: Poverty Status",ylim=c(0, 1))
summary(data[, c("tenure_c.imp","empl_c.imp","move.imp","pov_b.imp")])
## tenure_c.imp empl_c.imp move.imp
## Owned/Mortgaged:4642 1 Employed :4345 1. Moved within last year: 939
## Rented :1379 2 Unemployed: 103 2. Did not Move :5082
## 3 Not in LF :1573
## pov_b.imp
## Above Poverty:5503
## Below Poverty: 518
##
#Multiple Imputation ###Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?
dat2<-data
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$tenure_knock<-dat2$tenure_c
dat2$tenure_knock[samp2]<-NA
head(dat2[, c("tenure_knock","tenure_c")])
## # A tibble: 6 x 2
## tenure_knock tenure_c
## <fct> <fct>
## 1 <NA> <NA>
## 2 Owned/Mortgaged Owned/Mortgaged
## 3 Owned/Mortgaged Owned/Mortgaged
## 4 Owned/Mortgaged Owned/Mortgaged
## 5 Owned/Mortgaged Owned/Mortgaged
## 6 Owned/Mortgaged Owned/Mortgaged
imp<-mice(data = dat2[,c("tenure_c", "age_c","empl_c","move","pov_b", "race_c", "hisp")], seed = 22, m = 5)
##
## iter imp variable
## 1 1 tenure_c empl_c move pov_b
## 1 2 tenure_c empl_c move pov_b
## 1 3 tenure_c empl_c move pov_b
## 1 4 tenure_c empl_c move pov_b
## 1 5 tenure_c empl_c move pov_b
## 2 1 tenure_c empl_c move pov_b
## 2 2 tenure_c empl_c move pov_b
## 2 3 tenure_c empl_c move pov_b
## 2 4 tenure_c empl_c move pov_b
## 2 5 tenure_c empl_c move pov_b
## 3 1 tenure_c empl_c move pov_b
## 3 2 tenure_c empl_c move pov_b
## 3 3 tenure_c empl_c move pov_b
## 3 4 tenure_c empl_c move pov_b
## 3 5 tenure_c empl_c move pov_b
## 4 1 tenure_c empl_c move pov_b
## 4 2 tenure_c empl_c move pov_b
## 4 3 tenure_c empl_c move pov_b
## 4 4 tenure_c empl_c move pov_b
## 4 5 tenure_c empl_c move pov_b
## 5 1 tenure_c empl_c move pov_b
## 5 2 tenure_c empl_c move pov_b
## 5 3 tenure_c empl_c move pov_b
## 5 4 tenure_c empl_c move pov_b
## 5 5 tenure_c empl_c move pov_b
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## tenure_c age_c empl_c move pov_b race_c hisp
## "logreg" "" "polyreg" "logreg" "logreg" "" ""
## PredictorMatrix:
## tenure_c age_c empl_c move pov_b race_c hisp
## tenure_c 0 1 1 1 1 1 1
## age_c 1 0 1 1 1 1 1
## empl_c 1 1 0 1 1 1 1
## move 1 1 1 0 1 1 1
## pov_b 1 1 1 1 0 1 1
## race_c 1 1 1 1 1 0 1
Here I’m comparing regressions bewteen data sets (I think this is “the analysis” referenced in homework?). I’m honestly unsure how to compare them, and unsure if I’m doing this right. Is this on the right track?
The coefficients for the modally imputed data set are different than both the original data and the multiply imputed dataset.
#Unchaged data (leaving NAs as is)
library(survey)
des <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=data)
fit.logit1<-svyglm(tenure_c~age_c+empl_c+move+pov_b+race_c+hisp,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
##
## Call:
## svyglm(formula = tenure_c ~ age_c + empl_c + move + pov_b + race_c +
## hisp, design = des, family = binomial)
##
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT,
## data = data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3099425 0.3197897 0.969 0.332547
## age_c1 Working Age 0.1260949 0.2806702 0.449 0.653285
## age_c2 Senior -0.4103804 0.3111046 -1.319 0.187269
## empl_c2 Unemployed -0.0003687 0.3058694 -0.001 0.999038
## empl_c3 Not in LF -0.4527183 0.1122262 -4.034 5.67e-05
## move2. Did not Move -1.8678294 0.1509127 -12.377 < 2e-16
## pov_bBelow Poverty 1.2314764 0.2178243 5.654 1.77e-08
## race_c2. Black 0.8663488 0.2294602 3.776 0.000164
## race_c3. Native American 1.3557348 0.6978477 1.943 0.052174
## race_c4. Asian/PI -0.3028557 0.5347939 -0.566 0.571245
## race_c5. Other, or more than one race 0.0857941 0.2411641 0.356 0.722062
## hisp2. Hispanic/Latino 0.2128376 0.1431861 1.486 0.137304
##
## (Intercept)
## age_c1 Working Age
## age_c2 Senior
## empl_c2 Unemployed
## empl_c3 Not in LF ***
## move2. Did not Move ***
## pov_bBelow Poverty ***
## race_c2. Black ***
## race_c3. Native American .
## race_c4. Asian/PI
## race_c5. Other, or more than one race
## hisp2. Hispanic/Latino
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9912079)
##
## Number of Fisher Scoring iterations: 4
#Modally imputed data
des <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=data)
fit.logit2<-svyglm(tenure_c.imp~age_c+empl_c.imp+move.imp+pov_b.imp+race_c+hisp,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit2)
##
## Call:
## svyglm(formula = tenure_c.imp ~ age_c + empl_c.imp + move.imp +
## pov_b.imp + race_c + hisp, design = des, family = binomial)
##
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT,
## data = data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.249851 0.171123 1.460 0.14440
## age_c1 Working Age 0.006252 0.095811 0.065 0.94798
## age_c2 Senior -0.478671 0.185377 -2.582 0.00988 **
## empl_c.imp2 Unemployed -0.038790 0.304452 -0.127 0.89863
## empl_c.imp3 Not in LF -0.593506 0.104662 -5.671 1.59e-08 ***
## move.imp2. Did not Move -1.736605 0.151531 -11.460 < 2e-16 ***
## pov_b.impBelow Poverty 1.209164 0.243671 4.962 7.45e-07 ***
## race_c2. Black 1.087414 0.255727 4.252 2.20e-05 ***
## race_c3. Native American 1.736709 0.648277 2.679 0.00743 **
## race_c4. Asian/PI -0.109817 0.551454 -0.199 0.84217
## race_c5. Other, or more than one race -0.060030 0.231282 -0.260 0.79523
## hisp2. Hispanic/Latino 0.352091 0.145231 2.424 0.01541 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00653)
##
## Number of Fisher Scoring iterations: 4
#Multiply imputed data
des2 <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=dat2)
fit.logit3<-svyglm(tenure_knock~age_c+empl_c+move+pov_b+race_c+hisp,
design= des2,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit3)
##
## Call:
## svyglm(formula = tenure_knock ~ age_c + empl_c + move + pov_b +
## race_c + hisp, design = des2, family = binomial)
##
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT,
## data = dat2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32132 0.33537 0.958 0.338120
## age_c1 Working Age 0.14611 0.29565 0.494 0.621228
## age_c2 Senior -0.42601 0.32861 -1.296 0.194986
## empl_c2 Unemployed -0.04601 0.31441 -0.146 0.883666
## empl_c3 Not in LF -0.40919 0.11863 -3.449 0.000573 ***
## move2. Did not Move -1.88783 0.15187 -12.431 < 2e-16 ***
## pov_bBelow Poverty 1.20884 0.22296 5.422 6.55e-08 ***
## race_c2. Black 0.76512 0.23382 3.272 0.001084 **
## race_c3. Native American 1.37760 0.70731 1.948 0.051585 .
## race_c4. Asian/PI -0.02362 0.52455 -0.045 0.964090
## race_c5. Other, or more than one race 0.03638 0.25156 0.145 0.885033
## hisp2. Hispanic/Latino 0.19705 0.14671 1.343 0.179382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9919528)
##
## Number of Fisher Scoring iterations: 4