We examine the relationship between having a bachelor’s degree or higher in the state of Texas for individuals who where NOT born in the US.

We fit two random intercept models, the second includes a group level variable of percentage of poverty in the county as a predictor. We then compare the fits of each model using a ratio test.

Individuals with less number of years in the country that live in rural areas have less academic achievement.

Multi level propositions

Figures from Kawachi and Berkman (2003)

In the image selection below from Kawachi et.,al(2003): Z = Percentage of poverty in the county, X = Number of years living in the US. Y = Having a bachelor’s degree or higher,

Selection of Multi level propositions

The sample includes data from 4,871 individuals born between 1978 and 1988 in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.

library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4) 
library(ggplot2)
library(sjstats)
library(lmerTest)
library(MuMIn)
library(tidycensus)
library(stringr)
library(utils)



#library(tigris)
#options(tigris_class = "sf")
#pumas<-pumas(state = "TX", year = 2014, cb = T)

testdata<-read_dta('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/usa_00001.dta')
load("C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")

Filtering Data`

names(testdata)<-tolower(names(testdata))
sub2 <- filter(testdata, birthyr %in% 1978:1988, year %in% 2014:2016,  gq != 3, gq != 4, gq!=6, citizen==3:5, statefip==48)

## Warning in citizen == 3:5: longer object length is not a multiple of
## shorter object length

save(sub2, file="C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
#rm(testdata)
#gc()

#set.seed(1115)
#samp <- sample(1:dim(sub)[1], replace = F, 100000)

Individual Level Variables

names(sub2) #print the column names

##  [1] "year"       "datanum"    "serial"     "hhwt"       "hhtype"    
##  [6] "stateicp"   "statefip"   "county"     "countyfips" "metro"     
## [11] "city"       "puma"       "gq"         "hhincome"   "foodstmp"  
## [16] "pernum"     "perwt"      "sex"        "age"        "birthyr"   
## [21] "fertyr"     "race"       "raced"      "hispan"     "hispand"   
## [26] "bpl"        "bpld"       "citizen"    "yrsusa1"    "language"  
## [31] "languaged"  "school"     "educ"       "educd"      "gradeatt"  
## [36] "gradeattd"  "degfield"   "degfieldd"  "empstat"    "empstatd"  
## [41] "labforce"   "incwage"    "poverty"    "trantime"

myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","hhtype","yrsusa1","bpl")
sub3<-sub2[,myvars]
#rm(sub2); gc()

#RACE/ETHNICITY
sub3$hispanic <- Recode(sub3$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
sub3$race_rec <- Recode(sub3$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
sub3$race_eth <- interaction(sub3$hispanic, sub3$race_rec, sep = "_")
sub3$race_eth  <- as.factor(ifelse(substr(as.character(sub3$race_eth),1,8) == "Hispanic", "Hispanic", as.character(sub3$race_eth)))
sub3$race_eth <- relevel(sub3$race_eth, ref = "NonHispanic_White")

#SEX
sub3$sex2<- Recode (sub3$sex, recodes= "1=1; 2=2; else=NA")
sub3$male<- Recode (sub3$sex, recodes= "1=1; 2=0; else=NA")
sub3$female<- Recode (sub3$sex, recodes= "2=1; 1=0; else=NA")

#CITIZEN STATUS (1=born in the US or to US citizens, 0=not US citizen)
#sub3$citizenstatus <- Recode(sub3$citizen, recodes = "1:2=1; 3:5=0; else=NA")
#sub3$usborn <-Recode(sub3$citizenstatus, recodes="1=1; 0=0")
#sub3$notusborn <-Recode(sub3$citizenstatus, recodes="0=1; 1=0")
#table(sub3$usborn)


#EDUCATION LEVEL 
sub3$educ_level<- Recode(sub3$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
sub3$educ_level<-as.factor(sub3$educ_level)
#College or more(1=college degree or higher and 0=Associates degree or lower)
sub3$collegeormore<- Recode(sub3$educd, recodes = "84:116=1; 2:83=0")


#USBORN
sub3$usborn <- Recode(sub3$bpl, recodes = "1:120=1; 121:900=0; else=NA")

#EMPLOYMENT
sub3$employedetailed <- Recode(sub3$empstatd, recodes = "10:12=1; 20:22=0; else=NA")

sub3$employed<-Recode(sub3$empstat, recodes = "1=1; else=NA")
sub3$unemployed<-Recode(sub3$empstat, recodes = "2=1; else=NA")
sub3$not_in_laborforce<-Recode(sub3$empstat, recodes = "3=1; else=NA")


#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
sub3$inc_adj <- ifelse(sub3$year == 2005, sub3$incwage*1.23,
                            ifelse(sub3$year == 2006, sub3$incwage*1.18,
                                   ifelse(sub3$year == 2007, sub3$incwage*1.16,
                                          ifelse(sub3$year %in% c(2008,2009), sub3$incwage*1.1, 
                                                 ifelse(sub3$year == 2010, sub3$incwage*1.1,
                                                        ifelse(sub3$year == 2011, sub3$incwage*1.07,
                                                               ifelse(sub3$year == 2012,sub3$incwage*1.05,
                                                                      ifelse(sub3$year == 2013, sub3$incwage*1.03, 
                                                                             ifelse(sub3$year == 2014, sub3$incwage*1.01,
                                                                                    ifelse(sub3$year == 2015, sub3$incwage*1.01, sub3$incwage))))))))))


#TRAVEL TIME TO WORK (does not include zero time commute)
sub3$traveltime<-ifelse(sub3$trantime==000, NA, sub3$trantime)

#IN HS/COLLEGE OR NOT IN HS/COLLEGE
sub3$inhs<- Recode(sub3$gradeatt, recodes = "5=1; else=0")
sub3$incollege <- Recode(sub3$gradeatt, recodes = "6:7=1; else=0")

#YEARS IN THE US (0 indicates less than 1 year or NA - I eliminated all NAs)
sub3$yrsusa2<-Recode(sub3$yrsusa1, recodes="0=NA")

Household level variables

#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
sub3$housetype<-Recode(sub3$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )

#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
sub3$metroarea<-Recode(sub3$metro,recodes="1=1; 2=2; 3=3; else='NA'")
sub3$urban<-Recode(sub3$metroarea, recodes="2:3=1; 1=0")
sub3$rural<-Recode(sub3$metroarea, recodes="1=1; 2:3=0")

Higher level (Macro) variables

library(tidycensus)
usacs<-get_acs(geography = "county", year = 2015,
                variables=c( "DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
                             "DP05_0001E","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
                             "DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
                             "DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
                             "DP03_0119PE","DP04_0046PE","DP05_0072PE","DP05_0073PE",
                             "DP05_0066PE","DP05_0033","DP05_0032", "DP05_0072PE", "DP02_0113PE",  "DP04_0003PE") ,
                summary_var = "B01001_001",
                geometry = F, output = "wide")

## Getting data from the 2011-2015 5-year ACS

## Using the ACS Data Profile
## Using the ACS Data Profile

usacs2<-mutate(usacs,totpop= DP05_0001E, fertrate = DP02_0040E,pwhite=DP05_0072PE/100, nwhite=DP05_0032E,nblack=DP05_0033E, pblack=DP05_0073PE/100 , phisp=DP05_0066PE/100, pfemhh=DP02_0008PE/100, phsormore=DP02_0066PE/100,punemp=DP03_0009PE/100, medhhinc=DP03_0062E,ppov=DP03_0119PE/100, pforn=DP02_0092PE/100,plep=DP02_0113PE/100,  pvacant=DP04_0003PE/100, GEOID=GEOID)


myv <- c("GEOID","NAME", "totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov", "pforn","plep")
acsvars <- usacs2[myv]
acsvars <-filter(acsvars,GEOID %in% 48001:48507)
acsvars$GEOID<- substring(acsvars$GEOID,3)
acsvars$GEOID<-as.numeric(acsvars$GEOID)
head(acsvars)

## # A tibble: 6 x 13
##   GEOID NAME  totpop pwhite pblack phisp pfemhh phsormore punemp medhhinc
##   <dbl> <chr>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>     <dbl>  <dbl>    <dbl>
## 1     1 Ande~  57915  0.602  0.211 0.168  0.123     0.798  0.043    41327
## 2     3 Andr~  16775  0.431  0.013 0.532  0.1       0.744  0.039    70423
## 3     5 Ange~  87748  0.619  0.149 0.209  0.165     0.792  0.091    44223
## 4     7 Aran~  24292  0.683  0.015 0.261  0.072     0.842  0.068    41690
## 5     9 Arch~   8779  0.888  0.006 0.081  0.048     0.888  0.03     60275
## 6    11 Arms~   1943  0.845  0.006 0.118  0.059     0.907  0.011    59737
## # ... with 3 more variables: ppov <dbl>, pforn <dbl>, plep <dbl>

Merge files by county FIPS codes and filter for complete cases

#I fix the error:: `x` and `labels` must be same type with the following:
sub3[] <- lapply(sub3, unclass)
joindata<-merge(acsvars, sub3, by.x="GEOID", by.y="countyfips", all.x=T)
#names(joindata)
jdvars<-c("GEOID","NAME","metro","city","puma","hhincome","incwage","educd","empstatd","race", "hispan", "usborn","raced","sex","age","birthyr","year","gq","hhtype","yrsusa2","totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov","male", "pforn","plep","educ_level","race_eth","collegeormore")
joindata2<-joindata[,jdvars]
joindata2<-joindata2%>%
filter(complete.cases(.))
#rm(joindata); gc()

Scale Variables

joindata2$yearsintheUSz<-scale(joindata2$yrsusa2, center = T, scale=T)
joindata2$agez<-scale(joindata2$age, center = T, scale=T)

Model 1: Individual Level Effects

However, we want to know whether *individuals with less years in the US have also lower levels of academic achievement, net of other demographic and socioeconomic factors?

fit1<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+(1|GEOID), data=joindata2)

## Warning in glmer(collegeormore ~ yearsintheUSz + male + agez + hispan
## + : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly

fit.mix.cont<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+(1|GEOID),
                   family=binomial, data=joindata2, 
                   control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

library(MuMIn)
r.squaredGLMM(fit.mix.cont)

## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help
## page.

## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.

##                   R2m       R2c
## theoretical 0.3467434 0.4142742
## delta       0.2666883 0.3186278

cor(fitted(fit.mix.cont),fit.mix.cont@frame$yearsintheUSz^2)

##            [,1]
## [1,] -0.1013087

#Tests of coefficitns
display(fit.mix.cont, detail=T, digits=4)

## glmer(formula = collegeormore ~ yearsintheUSz + male + agez + 
##     hispan + (1 | GEOID), data = joindata2, family = binomial, 
##     control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)))
##               coef.est coef.se  z value  Pr(>|z|)
## (Intercept)    -0.5753   0.1384  -4.1584   0.0000
## yearsintheUSz  -0.9206   0.0494 -18.6215   0.0000
## male           -0.0104   0.0756  -0.1377   0.8905
## agez            0.2099   0.0392   5.3532   0.0000
## hispan         -0.7567   0.0467 -16.1944   0.0000
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  GEOID    (Intercept) 0.6159  
##  Residual             1.0000  
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4408.8, DIC = 4244.2
## deviance = 4320.5

#confidence intervals for the Odds ratio
exp(confint(fit.mix.cont, method="Wald"))

##                   2.5 %    97.5 %
## .sig01               NA        NA
## (Intercept)   0.4289207 0.7377461
## yearsintheUSz 0.3615079 0.4388118
## male          0.8533970 1.1476567
## agez          1.1422776 1.3320186
## hispan        0.4281566 0.5142188

We can see that an increase in years in the US is associated with lower likelihood of having a college degree or higher. In terms of race/ethnicity, we see that Hispanics are also less likely to have a college degree or more in comparison to other non-US born individuals. Age has a positive correlation with earning a bachelor’s degree however being male shows very little impact.

Model 2: Individual Level Effects + Macro Level

The second model integrates a group level variable as a predictor which in this case is the percentage of povery by county.

fit.mix.bin.ml<-glmer(collegeormore~yearsintheUSz+male+agez+hispan+ppov+(1|GEOID),nAGQ=10,
                      family=binomial, data=joindata2,
                      control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))

r.squaredGLMM(fit.mix.bin.ml)

## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.

##                   R2m       R2c
## theoretical 0.3779277 0.4144762
## delta       0.2907714 0.3188912

display(fit.mix.bin.ml, detail=T, digits=4)

## glmer(formula = collegeormore ~ yearsintheUSz + male + agez + 
##     hispan + ppov + (1 | GEOID), data = joindata2, family = binomial, 
##     control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 4e+05)), 
##     nAGQ = 10)
##               coef.est coef.se  z value  Pr(>|z|)
## (Intercept)     0.1768   0.2416   0.7318   0.4643
## yearsintheUSz  -0.9205   0.0494 -18.6189   0.0000
## male           -0.0149   0.0756  -0.1974   0.8435
## agez            0.2106   0.0392   5.3737   0.0000
## hispan         -0.7510   0.0465 -16.1512   0.0000
## ppov           -5.3624   1.5623  -3.4323   0.0006
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  GEOID    (Intercept) 0.4532  
##  Residual             1.0000  
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4402, DIC = 4269.9
## deviance = 4328.9

#confidence intervals for the Odds ratio
exp(confint(fit.mix.bin.ml, method="Wald"))

##                      2.5 %    97.5 %
## .sig01                  NA        NA
## (Intercept)   0.7432580777 1.9161667
## yearsintheUSz 0.3615482085 0.4388621
## male          0.8495743337 1.1424684
## agez          1.1431342873 1.3329335
## hispan        0.4307724362 0.5169044
## ppov          0.0002194289 0.1002367

Comparison of Models

We can conclude by doing a comparison of the two models that the second one which includes the percentage of poverty by county has an impact on individuals earning a bachelor’s degree or higher.

anova(fit.mix.cont, fit.mix.bin.ml)

## Data: joindata2
## Models:
## fit.mix.cont: collegeormore ~ yearsintheUSz + male + agez + hispan + (1 | GEOID)
## fit.mix.bin.ml: collegeormore ~ yearsintheUSz + male + agez + hispan + ppov + 
## fit.mix.bin.ml:     (1 | GEOID)
##                Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit.mix.cont    6 4408.8 4447.8 -2198.4   4396.8                         
## fit.mix.bin.ml  7 4402.0 4447.4 -2194.0   4388.0 8.8466      1   0.002936
##                  
## fit.mix.cont     
## fit.mix.bin.ml **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses

We expect to see the following patterns:

H1: Hispanics in areas of poverty will be less likely to have a college degree or higher H2: Men in areas of poverty will be more likely to have a college degree or higher H3: An increase in the number of years will be positively associated to the likelihood of having a bachelor’s degree or higher.

Model 3: Cross-Level Interaction

fit.mix.bin.cl<-glmer(collegeormore~agez+(male+hispan+yearsintheUSz)*ppov+ (1|GEOID),nAGQ=10,
                      family=binomial, data=joindata2,
                      control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=4e5)))
display(fit.mix.bin.cl, detail=T)

## glmer(formula = collegeormore ~ agez + (male + hispan + yearsintheUSz) * 
##     ppov + (1 | GEOID), data = joindata2, family = binomial, 
##     control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 4e+05)), 
##     nAGQ = 10)
##                    coef.est coef.se z value Pr(>|z|)
## (Intercept)         0.43     0.30    1.45    0.15   
## agez                0.22     0.04    5.49    0.00   
## male               -0.42     0.22   -1.96    0.05   
## hispan             -0.91     0.19   -4.91    0.00   
## yearsintheUSz      -1.17     0.13   -8.73    0.00   
## ppov               -7.32     2.06   -3.54    0.00   
## male:ppov           2.96     1.45    2.03    0.04   
## hispan:ppov         1.21     1.33    0.91    0.36   
## yearsintheUSz:ppov  1.81     0.88    2.05    0.04   
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  GEOID    (Intercept) 0.45    
##  Residual             1.00    
## ---
## number of obs: 4871, groups: GEOID, 38
## AIC = 4397.9, DIC = 4261.2
## deviance = 4319.6

exp(confint(fit.mix.bin.cl, method="Wald"))

##                           2.5 %       97.5 %
## .sig01                       NA           NA
## (Intercept)        0.8566151628   2.78348661
## agez               1.1492833880   1.34104453
## male               0.4301617064   0.99965653
## hispan             0.2791552476   0.57827861
## yearsintheUSz      0.2381683227   0.40305273
## ppov               0.0000116185   0.03796622
## male:ppov          1.1105339832 332.78752773
## hispan:ppov        0.2471513810  45.83080606
## yearsintheUSz:ppov 1.0857024879  34.10576228

anova(fit.mix.bin.ml, fit.mix.bin.cl)

## Data: joindata2
## Models:
## fit.mix.bin.ml: collegeormore ~ yearsintheUSz + male + agez + hispan + ppov + 
## fit.mix.bin.ml:     (1 | GEOID)
## fit.mix.bin.cl: collegeormore ~ agez + (male + hispan + yearsintheUSz) * ppov + 
## fit.mix.bin.cl:     (1 | GEOID)
##                Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit.mix.bin.ml  7 4402.0 4447.4 -2194.0   4388.0                         
## fit.mix.bin.cl 10 4397.9 4462.8 -2188.9   4377.9 10.081      3    0.01789
##                 
## fit.mix.bin.ml  
## fit.mix.bin.cl *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 3: Cross-Level Interaction - Discussion

Findings show that Non-US born Hispanics in areas of poverty will slightly more likely to have a college degree or higher contrary to our expectations. A comparison with US-Born Hispacs as well as other groups by race/ethnicity may provide more insight.

Non-US born men living in areas of poverty are also more likely to have a college degree or higher when compared to women as expected.

Finally, we do see a positive relationship betweenthe number of years living in the US and the likelihood of having a bachelor’s degree or higher for individuals living in higher concentrations of poverty.

Fitted Values

Here I use the highest and the lowest intercept counties to illustrate the differences which are joindata2$NAME[41] and joindata2$NAME[329]accordingly .

which.max(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)

## [1] 4

which.min(ranef(fit.mix.bin.cl)$GEOID$`(Intercept)`)

## [1] 25

rownames(ranef(fit.mix.bin.cl)$GEOID)[c(4, 25)]

## [1] "41"  "329"

joindata2$NAME[41]

## [1] "Bexar County, Texas"

joindata2$NAME[329]

## [1] "Brazos County, Texas"

Assigment #3, DEM 7473 Hierarchical Models

Paulina Cano

October 8, 2018