We examine the relationship of individuals who where NOT born in the US and the number of years they have lived in the country. We explore the impact of such lenghtin their academic achievement. Finally, we consider the impact of poverty rates by counties to further understand these relationships.
First, we fit two random intercept models, the second includes a group level variable of percentage of poverty in the county as a predictor, then we compare the fits of each model using a ratio test. Lastly, we compute a cross-level interaction analysis that includes the poverty rates by county.
The sample includes data from 4,871 individuals born between 1978 and 1988 in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.
Multi level propositions
Figures from Kawachi and Berkman (2003)
In the image selection below from Kawachi et.,al(2003): Z = Percentage of poverty in the county, X = Having a bachelor’s degree or higherS, Y = Number of years living in the country.
[Selection of Multi level propositions](C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/fig2_6.png
library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4)
library(ggplot2)
library(sjstats)
library(lmerTest)
library(MuMIn)
library(tidycensus)
library(stringr)
library(utils)
#library(tigris)
#options(tigris_class = "sf")
#pumas<-pumas(state = "TX", year = 2014, cb = T)
testdata<-read_dta('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/usa_00001.dta')
load("C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
names(testdata)<-tolower(names(testdata))
sub2 <- filter(testdata, birthyr %in% 1978:1988, year %in% 2014:2016, gq != 3, gq != 4, gq!=6, citizen==3:5, statefip==48)
## Warning in citizen == 3:5: longer object length is not a multiple of
## shorter object length
save(sub2, file="C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
#rm(testdata)
#gc()
#set.seed(1115)
#samp <- sample(1:dim(sub)[1], replace = F, 100000)
names(sub2) #print the column names
## [1] "year" "datanum" "serial" "hhwt" "hhtype"
## [6] "stateicp" "statefip" "county" "countyfips" "metro"
## [11] "city" "puma" "gq" "hhincome" "foodstmp"
## [16] "pernum" "perwt" "sex" "age" "birthyr"
## [21] "fertyr" "race" "raced" "hispan" "hispand"
## [26] "bpl" "bpld" "citizen" "yrsusa1" "language"
## [31] "languaged" "school" "educ" "educd" "gradeatt"
## [36] "gradeattd" "degfield" "degfieldd" "empstat" "empstatd"
## [41] "labforce" "incwage" "poverty" "trantime"
myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","hhtype","yrsusa1","bpl")
sub3<-sub2[,myvars]
#rm(sub2); gc()
#RACE/ETHNICITY
sub3$hispanic <- Recode(sub3$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
sub3$race_rec <- Recode(sub3$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
sub3$race_eth <- interaction(sub3$hispanic, sub3$race_rec, sep = "_")
sub3$race_eth <- as.factor(ifelse(substr(as.character(sub3$race_eth),1,8) == "Hispanic", "Hispanic", as.character(sub3$race_eth)))
sub3$race_eth <- relevel(sub3$race_eth, ref = "NonHispanic_White")
#SEX
sub3$sex2<- Recode (sub3$sex, recodes= "1=1; 2=2; else=NA")
sub3$male<- Recode (sub3$sex, recodes= "1=1; 2=0; else=NA")
sub3$female<- Recode (sub3$sex, recodes= "2=1; 1=0; else=NA")
#CITIZEN STATUS (1=born in the US or to US citizens, 0=not US citizen)
#sub3$citizenstatus <- Recode(sub3$citizen, recodes = "1:2=1; 3:5=0; else=NA")
#sub3$usborn <-Recode(sub3$citizenstatus, recodes="1=1; 0=0")
#sub3$notusborn <-Recode(sub3$citizenstatus, recodes="0=1; 1=0")
#table(sub3$usborn)
#EDUCATION LEVEL
sub3$educ_level<- Recode(sub3$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
sub3$educ_level<-as.factor(sub3$educ_level)
#College or more(1=college degree or higher and 0=Associates degree or lower)
sub3$collegeormore<- Recode(sub3$educd, recodes = "84:116=1; 2:83=0")
#USBORN
sub3$usborn <- Recode(sub3$bpl, recodes = "1:120=1; 121:900=0; else=NA")
#EMPLOYMENT
sub3$employedetailed <- Recode(sub3$empstatd, recodes = "10:12=1; 20:22=0; else=NA")
sub3$employed<-Recode(sub3$empstat, recodes = "1=1; else=NA")
sub3$unemployed<-Recode(sub3$empstat, recodes = "2=1; else=NA")
sub3$not_in_laborforce<-Recode(sub3$empstat, recodes = "3=1; else=NA")
#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
sub3$inc_adj <- ifelse(sub3$year == 2005, sub3$incwage*1.23,
ifelse(sub3$year == 2006, sub3$incwage*1.18,
ifelse(sub3$year == 2007, sub3$incwage*1.16,
ifelse(sub3$year %in% c(2008,2009), sub3$incwage*1.1,
ifelse(sub3$year == 2010, sub3$incwage*1.1,
ifelse(sub3$year == 2011, sub3$incwage*1.07,
ifelse(sub3$year == 2012,sub3$incwage*1.05,
ifelse(sub3$year == 2013, sub3$incwage*1.03,
ifelse(sub3$year == 2014, sub3$incwage*1.01,
ifelse(sub3$year == 2015, sub3$incwage*1.01, sub3$incwage))))))))))
#TRAVEL TIME TO WORK (does not include zero time commute)
sub3$traveltime<-ifelse(sub3$trantime==000, NA, sub3$trantime)
#IN HS/COLLEGE OR NOT IN HS/COLLEGE
sub3$inhs<- Recode(sub3$gradeatt, recodes = "5=1; else=0")
sub3$incollege <- Recode(sub3$gradeatt, recodes = "6:7=1; else=0")
#YEARS IN THE US (0 indicates less than 1 year or NA - I eliminated all NAs)
sub3$yrsusa2<-Recode(sub3$yrsusa1, recodes="0=NA")
#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
sub3$housetype<-Recode(sub3$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )
#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
sub3$metroarea<-Recode(sub3$metro,recodes="1=1; 2=2; 3=3; else='NA'")
sub3$urban<-Recode(sub3$metroarea, recodes="2:3=1; 1=0")
sub3$rural<-Recode(sub3$metroarea, recodes="1=1; 2:3=0")
library(tidycensus)
usacs<-get_acs(geography = "county", year = 2015,
variables=c( "DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
"DP05_0001E","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
"DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
"DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
"DP03_0119PE","DP04_0046PE","DP05_0072PE","DP05_0073PE",
"DP05_0066PE","DP05_0033","DP05_0032", "DP05_0072PE", "DP02_0113PE", "DP04_0003PE") ,
summary_var = "B01001_001",
geometry = F, output = "wide")
## Getting data from the 2011-2015 5-year ACS
## Using the ACS Data Profile
## Using the ACS Data Profile
usacs2<-mutate(usacs,totpop= DP05_0001E, fertrate = DP02_0040E,pwhite=DP05_0072PE/100, nwhite=DP05_0032E,nblack=DP05_0033E, pblack=DP05_0073PE/100 , phisp=DP05_0066PE/100, pfemhh=DP02_0008PE/100, phsormore=DP02_0066PE/100,punemp=DP03_0009PE/100, medhhinc=DP03_0062E,ppov=DP03_0119PE/100, pforn=DP02_0092PE/100,plep=DP02_0113PE/100, pvacant=DP04_0003PE/100, GEOID=GEOID)
myv <- c("GEOID","NAME", "totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov", "pforn","plep")
acsvars <- usacs2[myv]
acsvars <-filter(acsvars,GEOID %in% 48001:48507)
acsvars$GEOID<- substring(acsvars$GEOID,3)
acsvars$GEOID<-as.numeric(acsvars$GEOID)
head(acsvars)
## # A tibble: 6 x 13
## GEOID NAME totpop pwhite pblack phisp pfemhh phsormore punemp medhhinc
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Ande~ 57915 0.602 0.211 0.168 0.123 0.798 0.043 41327
## 2 3 Andr~ 16775 0.431 0.013 0.532 0.1 0.744 0.039 70423
## 3 5 Ange~ 87748 0.619 0.149 0.209 0.165 0.792 0.091 44223
## 4 7 Aran~ 24292 0.683 0.015 0.261 0.072 0.842 0.068 41690
## 5 9 Arch~ 8779 0.888 0.006 0.081 0.048 0.888 0.03 60275
## 6 11 Arms~ 1943 0.845 0.006 0.118 0.059 0.907 0.011 59737
## # ... with 3 more variables: ppov <dbl>, pforn <dbl>, plep <dbl>
#I fix the error:: `x` and `labels` must be same type with the following:
sub3[] <- lapply(sub3, unclass)
joindata<-merge(acsvars, sub3, by.x="GEOID", by.y="countyfips", all.x=T)
#names(joindata)
jdvars<-c("GEOID","NAME","metro","city","puma","hhincome","incwage","educd","empstatd","race", "hispan", "usborn","raced","sex","age","birthyr","year","gq","hhtype","yrsusa2","totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov","male", "pforn","plep","educ_level","race_eth","collegeormore")
joindata2<-joindata[,jdvars]
joindata2<-joindata2%>%
filter(complete.cases(.))
#rm(joindata); gc()
joindata2$yearsintheUSz<-scale(joindata2$yrsusa2, center = T, scale=T)
joindata2$agez<-scale(joindata2$age, center = T, scale=T)
However, we want to know whether *individuals with less years in the US have also lower levels of academic achievement, net of other demographic and socioeconomic factors?
fit1<-lmer(yearsintheUSz~collegeormore+male+agez+hispan+(1|GEOID), data=joindata2)
fit.mix.cont<-lmer(collegeormore~yearsintheUSz+male+agez+hispan+(1|GEOID),
family=gaussian, data=joindata2,
control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
## Warning in lme4::lmer(formula = collegeormore ~ yearsintheUSz + male + agez
## + : calling lmer with 'family' is deprecated; please use glmer() instead
## Warning in lme4::glmer(formula = collegeormore ~ yearsintheUSz + male + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = collegeormore ~ yearsintheUSz + male
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
## Warning in lme4::lmer(formula = collegeormore ~ yearsintheUSz + male + agez
## + : calling lmer with 'family' is deprecated; please use glmer() instead
## Warning in lme4::glmer(formula = collegeormore ~ yearsintheUSz + male + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = collegeormore ~ yearsintheUSz + male
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
library(MuMIn)
r.squaredGLMM(fit.mix.cont)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help
## page.
## R2m R2c
## [1,] 0.1830048 0.2500407
cor(fitted(fit.mix.cont),fit.mix.cont@frame$yearsintheUSz^2)
## [,1]
## [1,] -0.2510046
#Tests of coefficitns
display(fit.mix.cont, detail=T, digits=4)
## lmer(formula = collegeormore ~ yearsintheUSz + male + agez +
## hispan + (1 | GEOID), data = joindata2, control = lmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 2e+05)), family = gaussian)
## coef.est coef.se t value
## (Intercept) 0.3781 0.0232 16.2708
## yearsintheUSz -0.1345 0.0058 -23.2917
## male -0.0033 0.0112 -0.2924
## agez 0.0302 0.0057 5.3206
## hispan -0.0973 0.0046 -21.1958
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.1159
## Residual 0.3876
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4702.4, DIC = 4610.6
## deviance = 4649.5
#confidence intervals for the Odds ratio
exp(confint(fit.mix.cont, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## .sigma NA NA
## (Intercept) 1.3945023 1.5274872
## yearsintheUSz 0.8642967 0.8840869
## male 0.9751771 1.0187841
## agez 1.0192859 1.0422521
## hispan 0.8991316 0.9154619
We can see that an increase in years in the US is associated with lower likelihood of having a college degree or higher. In terms of race/ethnicity, we see that Hispanics are also less likely to have a college degree or more in comparison to other non-US born individuals. Age has a positive correlation with earning a bachelor’s degree however being male shows very little impact.
The second model integrates a group level variable as a predictor which in this case is the percentage of povery by county.
fit.mix.bin.ml<-lmer(yearsintheUSz~collegeormore+male+agez+hispan+ppov+(1|GEOID),nAGQ=10,
family=gaussian, data=joindata2,
control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))
## Warning in lme4::lmer(formula = yearsintheUSz ~ collegeormore + male + agez
## + : calling lmer with 'family' is deprecated; please use glmer() instead
## Warning in lme4::glmer(formula = yearsintheUSz ~ collegeormore + male + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = yearsintheUSz ~ collegeormore + male
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
## Warning: extra argument(s) 'nAGQ' disregarded
## Warning in lme4::lmer(formula = yearsintheUSz ~ collegeormore + male + agez
## + : calling lmer with 'family' is deprecated; please use glmer() instead
## Warning in lme4::glmer(formula = yearsintheUSz ~ collegeormore + male + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = yearsintheUSz ~ collegeormore + male
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
## Warning: extra argument(s) 'nAGQ' disregarded
r.squaredGLMM(fit.mix.bin.ml)
## R2m R2c
## [1,] 0.1585978 0.1663676
display(fit.mix.bin.ml, detail=T, digits=4)
## lmer(formula = yearsintheUSz ~ collegeormore + male + agez +
## hispan + ppov + (1 | GEOID), data = joindata2, control = lmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 4e+05)), nAGQ = 10, family = gaussian)
## coef.est coef.se t value
## (Intercept) 0.1343 0.0611 2.1977
## collegeormore -0.7489 0.0319 -23.4894
## male 0.0439 0.0263 1.6683
## agez 0.1942 0.0131 14.7727
## hispan 0.0163 0.0113 1.4446
## ppov 0.4580 0.3627 1.2627
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.0883
## Residual 0.9151
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 13018.8, DIC = 12941.9
## deviance = 12972.3
#confidence intervals for the Odds ratio
exp(confint(fit.mix.bin.ml, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## .sigma NA NA
## (Intercept) 1.0146342 1.2892958
## collegeormore 0.4442275 0.5033665
## male 0.9923561 1.1001485
## agez 1.1834039 1.2459694
## hispan 0.9942090 1.0391117
## ppov 0.7765075 3.2187735
We can conclude by doing a comparison of the two models that the first is more accurate at showing the impact of the number of years living in the country for individuals not born in the US.
anova(fit.mix.cont, fit.mix.bin.ml)
## refitting model(s) with ML (instead of REML)
## Data: joindata2
## Models:
## fit.mix.cont: collegeormore ~ yearsintheUSz + male + agez + hispan + (1 | GEOID)
## fit.mix.bin.ml: yearsintheUSz ~ collegeormore + male + agez + hispan + ppov +
## fit.mix.bin.ml: (1 | GEOID)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.cont 7 4663.5 4708.9 -2324.7 4649.5
## fit.mix.bin.ml 8 12988.3 13040.3 -6486.2 12972.3 0 1 1
We expect to see the following patterns:
H1: Invididuals who have lived longer in the US and live in areas with higher rates of poverty are more likely to be Hispanics. H2: In terms of gender,those who have lived longer in the US and live in areas of poverty are less likely to be males. H3: Individuals who have lived longer in the US and live in a poorer area are less likely to have a bachelor’s degree or higher.
fit.mix.bin.cl<-lmer(yearsintheUSz~agez+collegeormore+(collegeormore+yearsintheUSz+male+hispan)*ppov+ (1|GEOID),nAGQ=10,
family=gaussian, data=joindata2,
control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))
## Warning in lme4::lmer(formula = yearsintheUSz ~ agez + collegeormore +
## (collegeormore + : calling lmer with 'family' is deprecated; please use
## glmer() instead
## Warning in lme4::glmer(formula = yearsintheUSz ~ agez + collegeormore + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = yearsintheUSz ~ agez + collegeormore
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
## Warning: extra argument(s) 'nAGQ' disregarded
## Warning in model.matrix.default(fixedform, fr, contrasts): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(fixedform, fr, contrasts): problem with
## term 3 in model.matrix: no columns are assigned
## Warning in lme4::lmer(formula = yearsintheUSz ~ agez + collegeormore +
## (collegeormore + : calling lmer with 'family' is deprecated; please use
## glmer() instead
## Warning in lme4::glmer(formula = yearsintheUSz ~ agez + collegeormore + :
## please use glmerControl() instead of lmerControl()
## Warning in lme4::glmer(formula = yearsintheUSz ~ agez + collegeormore
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
## Warning: extra argument(s) 'nAGQ' disregarded
## Warning in model.matrix.default(fixedform, fr, contrasts): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(fixedform, fr, contrasts): problem with
## term 3 in model.matrix: no columns are assigned
display(fit.mix.bin.cl, detail=T)
## lmer(formula = yearsintheUSz ~ agez + collegeormore + (collegeormore +
## yearsintheUSz + male + hispan) * ppov + (1 | GEOID), data = joindata2,
## control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 4e+05)),
## nAGQ = 10, family = gaussian)
## coef.est coef.se t value
## (Intercept) 0.16 0.04 3.97
## agez 0.04 0.00 8.52
## collegeormore -0.63 0.03 -18.63
## male 0.09 0.03 3.16
## hispan 0.05 0.02 2.84
## ppov -0.98 0.27 -3.61
## collegeormore:ppov 3.36 0.23 14.52
## yearsintheUSz:ppov 5.76 0.03 174.38
## male:ppov -0.56 0.18 -3.20
## hispan:ppov -0.30 0.12 -2.49
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.07
## Residual 0.34
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 3393.4, DIC = 3268.3
## deviance = 3318.9
exp(confint(fit.mix.bin.cl, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## .sigma NA NA
## (Intercept) 1.0853556 1.2737121
## agez 1.0330605 1.0533210
## collegeormore 0.4981931 0.5688600
## male 1.0331251 1.1493790
## hispan 1.0154942 1.0871378
## ppov 0.2212908 0.6394027
## collegeormore:ppov 18.2827293 45.2735063
## yearsintheUSz:ppov 297.0500243 338.1017654
## male:ppov 0.4029371 0.8034690
## hispan:ppov 0.5834211 0.9374964
anova(fit.mix.bin.ml, fit.mix.bin.cl)
## refitting model(s) with ML (instead of REML)
## Data: joindata2
## Models:
## fit.mix.bin.ml: yearsintheUSz ~ collegeormore + male + agez + hispan + ppov +
## fit.mix.bin.ml: (1 | GEOID)
## fit.mix.bin.cl: yearsintheUSz ~ agez + collegeormore + (collegeormore + yearsintheUSz +
## fit.mix.bin.cl: male + hispan) * ppov + (1 | GEOID)
## Df AIC BIC logLik deviance Chisq Chi Df
## fit.mix.bin.ml 8 12988.3 13040.3 -6486.2 12972.3
## fit.mix.bin.cl 12 3342.9 3420.8 -1659.4 3318.9 9653.5 4
## Pr(>Chisq)
## fit.mix.bin.ml
## fit.mix.bin.cl < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Non-US born individuals are less likely to be males if they have lived longer in the US in areas of poverty. Equally, we see that this same individuals are less likely to be Hispanics if they have lived longer and continue to live in areas of poverty. This last finding requires additional exploration possibly, a comparison with US-Born Hispanics and other racial groups.
Finally we also see that an increase in years can result in a higher likelihood of having a bachelor’s degree of higher if the individual lives in counties with higher percentages of poverty.
Here I use the highest and the lowest intercept counties to illustrate the differences which are Bexar County and Bell County accordingly .
which.max(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)
## [1] 36
which.min(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)
## [1] 5
rownames(ranef(fit.mix.bin.cl)$GEOID)[c(4, 25)]
## [1] "41" "329"
joindata2$NAME[36]
## [1] "Bexar County, Texas"
joindata2$NAME[5]
## [1] "Bell County, Texas"