We examine the relationship between having a bachelor’s degree or higher in the state of Texas for individuals who where NOT born in the US.
We fit two random intercept models, the second includes a group level variable of percentage of poverty in the county as a predictor. We then compare the fits of each model using a ratio test.
Individuals with less number of years in the country that live in rural areas have less academic achievement.
Multi level propositions
Figures from Kawachi and Berkman (2003)
In the image selection below from Kawachi et.,al(2003): Z = Percentage of poverty in the county, X = Number of years living in the US. Y = Having a bachelor’s degree or higher,
Selection of Multi level propositions
The sample includes data from 4,871 individuals born between 1978 and 1988 in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.
library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4)
library(ggplot2)
library(sjstats)
library(lmerTest)
library(MuMIn)
library(tidycensus)
library(stringr)
library(utils)
#library(tigris)
#options(tigris_class = "sf")
#pumas<-pumas(state = "TX", year = 2014, cb = T)
testdata<-read_dta('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/usa_00001.dta')
load("C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
names(testdata)<-tolower(names(testdata))
sub2 <- filter(testdata, birthyr %in% 1978:1988, year %in% 2014:2016, gq != 3, gq != 4, gq!=6, citizen==3:5, statefip==48)
## Warning in citizen == 3:5: longer object length is not a multiple of
## shorter object length
save(sub2, file="C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
#rm(testdata)
#gc()
#set.seed(1115)
#samp <- sample(1:dim(sub)[1], replace = F, 100000)
names(sub2) #print the column names
## [1] "year" "datanum" "serial" "hhwt" "hhtype"
## [6] "stateicp" "statefip" "county" "countyfips" "metro"
## [11] "city" "puma" "gq" "hhincome" "foodstmp"
## [16] "pernum" "perwt" "sex" "age" "birthyr"
## [21] "fertyr" "race" "raced" "hispan" "hispand"
## [26] "bpl" "bpld" "citizen" "yrsusa1" "language"
## [31] "languaged" "school" "educ" "educd" "gradeatt"
## [36] "gradeattd" "degfield" "degfieldd" "empstat" "empstatd"
## [41] "labforce" "incwage" "poverty" "trantime"
myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","hhtype","yrsusa1","bpl")
sub3<-sub2[,myvars]
#rm(sub2); gc()
#RACE/ETHNICITY
sub3$hispanic <- Recode(sub3$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
sub3$race_rec <- Recode(sub3$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
sub3$race_eth <- interaction(sub3$hispanic, sub3$race_rec, sep = "_")
sub3$race_eth <- as.factor(ifelse(substr(as.character(sub3$race_eth),1,8) == "Hispanic", "Hispanic", as.character(sub3$race_eth)))
sub3$race_eth <- relevel(sub3$race_eth, ref = "NonHispanic_White")
#SEX
sub3$sex2<- Recode (sub3$sex, recodes= "1=1; 2=2; else=NA")
sub3$male<- Recode (sub3$sex, recodes= "1=1; 2=0; else=NA")
sub3$female<- Recode (sub3$sex, recodes= "2=1; 1=0; else=NA")
#CITIZEN STATUS (1=born in the US or to US citizens, 0=not US citizen)
#sub3$citizenstatus <- Recode(sub3$citizen, recodes = "1:2=1; 3:5=0; else=NA")
#sub3$usborn <-Recode(sub3$citizenstatus, recodes="1=1; 0=0")
#sub3$notusborn <-Recode(sub3$citizenstatus, recodes="0=1; 1=0")
#table(sub3$usborn)
#EDUCATION LEVEL
sub3$educ_level<- Recode(sub3$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
sub3$educ_level<-as.factor(sub3$educ_level)
#College or more(1=college degree or higher and 0=Associates degree or lower)
sub3$collegeormore<- Recode(sub3$educd, recodes = "84:116=1; 2:83=0")
#USBORN
sub3$usborn <- Recode(sub3$bpl, recodes = "1:120=1; 121:900=0; else=NA")
#EMPLOYMENT
sub3$employedetailed <- Recode(sub3$empstatd, recodes = "10:12=1; 20:22=0; else=NA")
sub3$employed<-Recode(sub3$empstat, recodes = "1=1; else=NA")
sub3$unemployed<-Recode(sub3$empstat, recodes = "2=1; else=NA")
sub3$not_in_laborforce<-Recode(sub3$empstat, recodes = "3=1; else=NA")
#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
sub3$inc_adj <- ifelse(sub3$year == 2005, sub3$incwage*1.23,
ifelse(sub3$year == 2006, sub3$incwage*1.18,
ifelse(sub3$year == 2007, sub3$incwage*1.16,
ifelse(sub3$year %in% c(2008,2009), sub3$incwage*1.1,
ifelse(sub3$year == 2010, sub3$incwage*1.1,
ifelse(sub3$year == 2011, sub3$incwage*1.07,
ifelse(sub3$year == 2012,sub3$incwage*1.05,
ifelse(sub3$year == 2013, sub3$incwage*1.03,
ifelse(sub3$year == 2014, sub3$incwage*1.01,
ifelse(sub3$year == 2015, sub3$incwage*1.01, sub3$incwage))))))))))
#TRAVEL TIME TO WORK (does not include zero time commute)
sub3$traveltime<-ifelse(sub3$trantime==000, NA, sub3$trantime)
#IN HS/COLLEGE OR NOT IN HS/COLLEGE
sub3$inhs<- Recode(sub3$gradeatt, recodes = "5=1; else=0")
sub3$incollege <- Recode(sub3$gradeatt, recodes = "6:7=1; else=0")
#YEARS IN THE US (0 indicates less than 1 year or NA - I eliminated all NAs)
sub3$yrsusa2<-Recode(sub3$yrsusa1, recodes="0=NA")
#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
sub3$housetype<-Recode(sub3$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )
#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
sub3$metroarea<-Recode(sub3$metro,recodes="1=1; 2=2; 3=3; else='NA'")
sub3$urban<-Recode(sub3$metroarea, recodes="2:3=1; 1=0")
sub3$rural<-Recode(sub3$metroarea, recodes="1=1; 2:3=0")
library(tidycensus)
usacs<-get_acs(geography = "county", year = 2015,
variables=c( "DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
"DP05_0001E","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
"DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
"DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
"DP03_0119PE","DP04_0046PE","DP05_0072PE","DP05_0073PE",
"DP05_0066PE","DP05_0033","DP05_0032", "DP05_0072PE", "DP02_0113PE", "DP04_0003PE") ,
summary_var = "B01001_001",
geometry = F, output = "wide")
## Getting data from the 2011-2015 5-year ACS
## Using the ACS Data Profile
## Using the ACS Data Profile
usacs2<-mutate(usacs,totpop= DP05_0001E, fertrate = DP02_0040E,pwhite=DP05_0072PE/100, nwhite=DP05_0032E,nblack=DP05_0033E, pblack=DP05_0073PE/100 , phisp=DP05_0066PE/100, pfemhh=DP02_0008PE/100, phsormore=DP02_0066PE/100,punemp=DP03_0009PE/100, medhhinc=DP03_0062E,ppov=DP03_0119PE/100, pforn=DP02_0092PE/100,plep=DP02_0113PE/100, pvacant=DP04_0003PE/100, GEOID=GEOID)
myv <- c("GEOID","NAME", "totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov", "pforn","plep")
acsvars <- usacs2[myv]
acsvars <-filter(acsvars,GEOID %in% 48001:48507)
acsvars$GEOID<- substring(acsvars$GEOID,3)
acsvars$GEOID<-as.numeric(acsvars$GEOID)
head(acsvars)
## # A tibble: 6 x 13
## GEOID NAME totpop pwhite pblack phisp pfemhh phsormore punemp medhhinc
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Ande~ 57915 0.602 0.211 0.168 0.123 0.798 0.043 41327
## 2 3 Andr~ 16775 0.431 0.013 0.532 0.1 0.744 0.039 70423
## 3 5 Ange~ 87748 0.619 0.149 0.209 0.165 0.792 0.091 44223
## 4 7 Aran~ 24292 0.683 0.015 0.261 0.072 0.842 0.068 41690
## 5 9 Arch~ 8779 0.888 0.006 0.081 0.048 0.888 0.03 60275
## 6 11 Arms~ 1943 0.845 0.006 0.118 0.059 0.907 0.011 59737
## # ... with 3 more variables: ppov <dbl>, pforn <dbl>, plep <dbl>
#I fix the error:: `x` and `labels` must be same type with the following:
sub3[] <- lapply(sub3, unclass)
joindata<-merge(acsvars, sub3, by.x="GEOID", by.y="countyfips", all.x=T)
#names(joindata)
jdvars<-c("GEOID","NAME","metro","city","puma","hhincome","incwage","educd","empstatd","race", "hispan", "usborn","raced","sex","age","birthyr","year","gq","hhtype","yrsusa2","totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov","male", "pforn","plep","educ_level","race_eth","collegeormore")
joindata2<-joindata[,jdvars]
joindata2<-joindata2%>%
filter(complete.cases(.))
#rm(joindata); gc()
joindata2$yearsintheUSz<-scale(joindata2$yrsusa2, center = T, scale=T)
joindata2$agez<-scale(joindata2$age, center = T, scale=T)
However, we want to know whether *individuals with less years in the US have also lower levels of academic achievement, net of other demographic and socioeconomic factors?
fit1<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+(1|GEOID), data=joindata2)
## Warning in glmer(collegeormore ~ yearsintheUSz + male + agez + hispan
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
fit.mix.cont<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+(1|GEOID),
family=binomial, data=joindata2,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
library(MuMIn)
r.squaredGLMM(fit.mix.cont)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help
## page.
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
## R2m R2c
## theoretical 0.3467434 0.4142742
## delta 0.2666883 0.3186278
cor(fitted(fit.mix.cont),fit.mix.cont@frame$yearsintheUSz^2)
## [,1]
## [1,] -0.1013087
#Tests of coefficitns
display(fit.mix.cont, detail=T, digits=4)
## glmer(formula = collegeormore ~ yearsintheUSz + male + agez +
## hispan + (1 | GEOID), data = joindata2, family = binomial,
## control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -0.5753 0.1384 -4.1584 0.0000
## yearsintheUSz -0.9206 0.0494 -18.6215 0.0000
## male -0.0104 0.0756 -0.1377 0.8905
## agez 0.2099 0.0392 5.3532 0.0000
## hispan -0.7567 0.0467 -16.1944 0.0000
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.6159
## Residual 1.0000
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4408.8, DIC = 4244.2
## deviance = 4320.5
#confidence intervals for the Odds ratio
exp(confint(fit.mix.cont, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.4289207 0.7377461
## yearsintheUSz 0.3615079 0.4388118
## male 0.8533970 1.1476567
## agez 1.1422776 1.3320186
## hispan 0.4281566 0.5142188
We can see that an increase in years in the US is associated with lower likelihood of having a college degree or higher. In terms of race/ethnicity, we see that Hispanics are also less likely to have a college degree or more in comparison to other non-US born individuals. Age has a positive correlation with earning a bachelor’s degree however being male shows very little impact.
The second model integrates a group level variable as a predictor which in this case is the percentage of povery by county.
fit.mix.bin.ml<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+ppov+(1|GEOID),nAGQ=10,
family=binomial, data=joindata2,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))
r.squaredGLMM(fit.mix.bin.ml)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
## R2m R2c
## theoretical 0.3779277 0.4144762
## delta 0.2907714 0.3188912
display(fit.mix.bin.ml, detail=T, digits=4)
## glmer(formula = collegeormore ~ yearsintheUSz + male + agez +
## hispan + ppov + (1 | GEOID), data = joindata2, family = binomial,
## control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 4e+05)),
## nAGQ = 10)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 0.1768 0.2416 0.7318 0.4643
## yearsintheUSz -0.9205 0.0494 -18.6189 0.0000
## male -0.0149 0.0756 -0.1974 0.8435
## agez 0.2106 0.0392 5.3737 0.0000
## hispan -0.7510 0.0465 -16.1512 0.0000
## ppov -5.3624 1.5623 -3.4323 0.0006
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.4532
## Residual 1.0000
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4402, DIC = 4269.9
## deviance = 4328.9
#confidence intervals for the Odds ratio
exp(confint(fit.mix.bin.ml, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.7432580777 1.9161667
## yearsintheUSz 0.3615482085 0.4388621
## male 0.8495743337 1.1424684
## agez 1.1431342873 1.3329335
## hispan 0.4307724362 0.5169044
## ppov 0.0002194289 0.1002367
We can conclude by doing a comparison of the two models that the second one which includes the percentage of poverty by county has an impact on individuals earning a bachelor’s degree or higher.
anova(fit.mix.cont, fit.mix.bin.ml)
## Data: joindata2
## Models:
## fit.mix.cont: collegeormore ~ yearsintheUSz + male + agez + hispan + (1 | GEOID)
## fit.mix.bin.ml: collegeormore ~ yearsintheUSz + male + agez + hispan + ppov +
## fit.mix.bin.ml: (1 | GEOID)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.cont 6 4408.8 4447.8 -2198.4 4396.8
## fit.mix.bin.ml 7 4402.0 4447.4 -2194.0 4388.0 8.8466 1 0.002936
##
## fit.mix.cont
## fit.mix.bin.ml **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We expect to see the following patterns:
H1: Hispanics in areas of poverty will be less likely to have a college degree or higher H2: Men in areas of poverty will be more likely to have a college degree or higher H3: An increase in the number of years will be positively associated to the likelihood of having a bachelor’s degree or higher.
fit.mix.bin.cl<-glmer(collegeormore~agez+(male+hispan+yearsintheUSz)*ppov+ (1|GEOID),nAGQ=10,
family=binomial, data=joindata2,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))
display(fit.mix.bin.cl, detail=T)
## glmer(formula = collegeormore ~ agez + (male + hispan + yearsintheUSz) *
## ppov + (1 | GEOID), data = joindata2, family = binomial,
## control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 4e+05)),
## nAGQ = 10)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 0.43 0.30 1.45 0.15
## agez 0.22 0.04 5.49 0.00
## male -0.42 0.22 -1.96 0.05
## hispan -0.91 0.19 -4.91 0.00
## yearsintheUSz -1.17 0.13 -8.73 0.00
## ppov -7.32 2.06 -3.54 0.00
## male:ppov 2.96 1.45 2.03 0.04
## hispan:ppov 1.21 1.33 0.91 0.36
## yearsintheUSz:ppov 1.81 0.88 2.05 0.04
##
## Error terms:
## Groups Name Std.Dev.
## GEOID (Intercept) 0.45
## Residual 1.00
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4397.9, DIC = 4261.2
## deviance = 4319.6
exp(confint(fit.mix.bin.cl, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.8566151628 2.78348661
## agez 1.1492833880 1.34104453
## male 0.4301617064 0.99965653
## hispan 0.2791552476 0.57827861
## yearsintheUSz 0.2381683227 0.40305273
## ppov 0.0000116185 0.03796622
## male:ppov 1.1105339832 332.78752773
## hispan:ppov 0.2471513810 45.83080606
## yearsintheUSz:ppov 1.0857024879 34.10576228
anova(fit.mix.bin.ml, fit.mix.bin.cl)
## Data: joindata2
## Models:
## fit.mix.bin.ml: collegeormore ~ yearsintheUSz + male + agez + hispan + ppov +
## fit.mix.bin.ml: (1 | GEOID)
## fit.mix.bin.cl: collegeormore ~ agez + (male + hispan + yearsintheUSz) * ppov +
## fit.mix.bin.cl: (1 | GEOID)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.bin.ml 7 4402.0 4447.4 -2194.0 4388.0
## fit.mix.bin.cl 10 4397.9 4462.8 -2188.9 4377.9 10.081 3 0.01789
##
## fit.mix.bin.ml
## fit.mix.bin.cl *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Findings show that Non-US born Hispanics in areas of poverty will slightly more likely to have a college degree or higher contrary to our expectations. A comparison with US-Born Hispacs as well as other groups by race/ethnicity may provide more insight.
Non-US born men living in areas of poverty are also more likely to have a college degree or higher when compared to women as expected.
Finally, we do see a positive relationship betweenthe number of years living in the US and the likelihood of having a bachelor’s degree or higher for individuals living in higher concentrations of poverty.
Here I use the highest and the lowest intercept counties to illustrate the differences which are joindata2$NAME[41] and joindata2$NAME[329]accordingly .
which.max(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)
## [1] 4
which.min(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)
## [1] 25
rownames(ranef(fit.mix.bin.cl)$GEOID)[c(4, 25)]
## [1] "41" "329"
joindata2$NAME[41]
## [1] "Bexar County, Texas"
joindata2$NAME[329]
## [1] "Brazos County, Texas"