In this assignment, I conduct a multilevel analysis estimating a random intercept model for the outcome of travel time to work.I also estimate a multilevel model including RANDOM SLOPES for the following individual predictors: gender, income, age, marital status and current enrollment in high school or college. In addition, I include higher level variables such as living in an urban vs rural area and the household receiving foodstamps.
The sample includes data from 58,594 individuals born between 1988 and 1998 in the State of Texas and filtered to include those individuals who are currently employed. Data was gathered from ACS-IPUMS 2014-2016.
library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4)
library(ggplot2)
library(sjstats)
library(lmerTest)
library(MuMIn)
load ('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/data_1.RData')
myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","foodstmp","hhtype","fertyr","marst")
data1.sub<-sub[,myvars]
rm(sub); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2172906 116.1 4445197 237.4 2529642 135.1
## Vcells 5364828 41.0 12175244 92.9 9759974 74.5
#Filter copied from sansonePAA.rmd file
data1.sub<-filter(data1.sub,birthyr %in% 1988:1998,year %in% 2014:2016, gq != 3, gq != 4, gq!=6,empstat== 1)
names(data1.sub) #print the column names
## [1] "statefip" "county" "countyfips" "metro" "city"
## [6] "puma" "hhincome" "incwage" "educd" "empstatd"
## [11] "gradeatt" "race" "hispan" "hispand" "raced"
## [16] "sex" "empstat" "age" "trantime" "poverty"
## [21] "birthyr" "year" "gq" "foodstmp" "hhtype"
## [26] "fertyr" "marst"
#MARRIAGE STATUS
data1.sub$marriage_status<- Recode (data1.sub$marst, recodes= "1:2='married'; 3:5='sep_div_wid'; 6='single'")
data1.sub$married<-recode(data1.sub$marst, recodes="1:2=1; else=0")
data1.sub$single<-recode(data1.sub$marst, recodes="6=1; else=0")
#RACE/ETHNICITY
data1.sub$hispanic <- Recode(data1.sub$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data1.sub$race_rec <- Recode(data1.sub$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data1.sub$race_eth <- interaction(data1.sub$hispanic, data1.sub$race_rec, sep = "_")
data1.sub$race_eth <- as.factor(ifelse(substr(as.character(data1.sub$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data1.sub$race_eth)))
data1.sub$race_eth <- relevel(data1.sub$race_eth, ref = "NonHispanic_White")
#FERTILITY (1=no, 2=yes)
data1.sub$fert<- Recode (data1.sub$fertyr, recodes= "1=0; 2=1; else=NA")
table(data1.sub$fert)
##
## 0 1
## 25925 1806
#SEX
data1.sub$sex2<- Recode (data1.sub$sex, recodes= "1=1; 2=2; else=NA")
data1.sub$male<- Recode (data1.sub$sex, recodes= "1=1; 2=0; else=NA")
#PLACE OF BIRTH
data1.sub$usborn <- Recode(data1.sub$bpl, recodes = "1:120=1; 121:900=0; else=NA")
## Warning: Unknown or uninitialised column: 'bpl'.
#EDUCATION LEVEL
data1.sub$educ_level<- Recode(data1.sub$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
#EMPLOYMENT
data1.sub$employedetailed <- Recode(data1.sub$empstatd, recodes = "10:12=1; 20:22=0; else=NA")
data1.sub$employed<-Recode(data1.sub$empstat, recodes = "1=1; else=NA")
data1.sub$unemployed<-Recode(data1.sub$empstat, recodes = "2=1; else=NA")
data1.sub$not_in_laborforce<-Recode(data1.sub$empstat, recodes = "3=1; else=NA")
#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
data1.sub$inc_adj <- ifelse(data1.sub$year == 2005, data1.sub$incwage*1.23,
ifelse(data1.sub$year == 2006, data1.sub$incwage*1.18,
ifelse(data1.sub$year == 2007, data1.sub$incwage*1.16,
ifelse(data1.sub$year %in% c(2008,2009), data1.sub$incwage*1.1,
ifelse(data1.sub$year == 2010, data1.sub$incwage*1.1,
ifelse(data1.sub$year == 2011, data1.sub$incwage*1.07,
ifelse(data1.sub$year == 2012,data1.sub$incwage*1.05,
ifelse(data1.sub$year == 2013, data1.sub$incwage*1.03,
ifelse(data1.sub$year == 2014, data1.sub$incwage*1.01,
ifelse(data1.sub$year == 2015, data1.sub$incwage*1.01, data1.sub$incwage))))))))))
#TRAVEL TIME TO WORK (does not include zero time commute)
data1.sub$traveltime<-ifelse(data1.sub$trantime==000, NA, data1.sub$trantime)
#IN HS/COLLEGE OR NOT IN HS/COLLEGE
data1.sub$inhs<- Recode(data1.sub$gradeatt, recodes = "5=1; else=0")
data1.sub$incollege <- Recode(data1.sub$gradeatt, recodes = "6:7=1; else=0")
#FOOD STAMPS (2=yes, 1=no)
data1.sub$foodstamps<-Recode(data1.sub$foodstmp, recodes = "2=2; 1=1; 0=NA")
#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
data1.sub$housetype<-Recode(data1.sub$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )
#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
data1.sub$metroarea<-Recode(data1.sub$metro,recodes="1=1; 2=2; 3=3; else='NA'")
#URBAN (includes suburbs) & RURAL
data1.sub$urban<-recode(data1.sub$metroarea, recodes="2:3=1; else=0")
data1.sub$rural<-recode(data1.sub$metroarea, recodes="1=1; else=0")
The First model will fit a model that considers the individual and family level variables and a random intercept only.
A simple analysis of variance shows that there is in fact a significant difference in commuting times and income.
data1.sub$travelz<-scale(data1.sub$traveltime, center = T, scale=T)
data1.sub$inc_adjz<-scale(data1.sub$inc_adj, center = T, scale=T)
fit0<-lm(travelz~inc_adjz*factor(county), data=data1.sub)
anova(fit0)
## Analysis of Variance Table
##
## Response: travelz
## Df Sum Sq Mean Sq F value Pr(>F)
## inc_adjz 1 421 421.47 434.2908 < 2.2e-16 ***
## factor(county) 38 998 26.25 27.0529 < 2.2e-16 ***
## inc_adjz:factor(county) 38 320 8.42 8.6748 < 2.2e-16 ***
## Residuals 56320 54658 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
However, we want to know whether individuals between the ages of 16-25 with higher incomes have longer commuting times to work, net of other demographic and socioeconomic factors?
fit1<-lmer(travelz~foodstamps+male+inc_adjz+age+married+single+urban+rural+inhs+incollege+(1|county), data=data1.sub)
summary(fit1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## travelz ~ foodstamps + male + inc_adjz + age + married + single +
## urban + rural + inhs + incollege + (1 | county)
## Data: data1.sub
##
## REML criterion at convergence: 158226.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8929 -0.5809 -0.2268 0.2822 7.3613
##
## Random effects:
## Groups Name Variance Std.Dev.
## county (Intercept) 0.03399 0.1844
## Residual 0.96479 0.9822
## Number of obs: 56398, groups: county, 39
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -4.198e-01 6.160e-02 6.232e+02 -6.816 2.22e-11 ***
## foodstamps 3.523e-02 1.243e-02 5.639e+04 2.835 0.00459 **
## male 1.079e-01 8.399e-03 5.635e+04 12.846 < 2e-16 ***
## inc_adjz 3.639e-02 4.722e-03 5.638e+04 7.705 1.33e-14 ***
## age 1.521e-02 1.785e-03 5.637e+04 8.520 < 2e-16 ***
## married -2.688e-02 2.699e-02 5.635e+04 -0.996 0.31930
## single -6.601e-02 2.615e-02 5.636e+04 -2.524 0.01160 *
## urban -3.895e-02 1.254e-02 3.631e+04 -3.107 0.00189 **
## rural 4.359e-02 2.374e-02 5.638e+04 1.837 0.06628 .
## inhs -2.131e-01 2.007e-02 5.636e+04 -10.614 < 2e-16 ***
## incollege -7.630e-02 1.023e-02 5.638e+04 -7.454 9.18e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) fdstmp male inc_dj age marrid single urban rural
## foodstamps -0.251
## male -0.081 0.033
## inc_adjz 0.178 0.121 -0.096
## age -0.724 0.000 0.033 -0.300
## married -0.409 0.036 -0.045 -0.037 0.024
## single -0.494 0.046 -0.061 -0.001 0.123 0.914
## urban -0.040 -0.021 -0.005 -0.018 -0.009 0.000 -0.002
## rural 0.006 -0.028 0.005 0.009 -0.002 0.000 -0.003 0.001
## inhs -0.325 0.013 0.024 0.048 0.431 0.004 0.007 -0.005 0.011
## incollege -0.197 0.095 0.089 0.126 0.191 0.005 -0.026 -0.002 -0.007
## inhs
## foodstamps
## male
## inc_adjz
## age
## married
## single
## urban
## rural
## inhs
## incollege 0.264
The random intercept model shows that those who receive foodstamps have longer commute times than those who do not receive this service. Males as well as those with higher incomes are more likely to commute longer than females and individuals with lower incomes. Those who live in urban settings seem to have a shorter commute to work when compared to those who live in rural areas. In terms of those, currently enrolled in high school or college, the latter group experiences less commuting times. Finally, those who are single seem to have also a shorter commute than those who are married.
The coefficient for income in the model is 0.04, versus the standard error which is 0.0047224, which gives a test of 5.637798210^{4}, which, if we use a z-statistic, yields a p-value of : < 1e-05.
This tells us that, yes, higher incomes experience a longer commute (travel) time to work than those in with lower incomes
The second model considers both random intercepts and random slopes, in this case, I’m only considering a random slope for income. This implies the proposition:
*Do individuals between the ages of 16-25 who have higher levels of income experience longer work commuting times EQUALLY by county?
fit2<-lmer(travelz~foodstamps+male+inc_adjz+age+married+single+employed+urban+rural+inhs+incollege+(1+inc_adjz|county), data=data1.sub)
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
summary(fit2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## travelz ~ foodstamps + male + inc_adjz + age + married + single +
## employed + urban + rural + inhs + incollege + (1 + inc_adjz |
## county)
## Data: data1.sub
##
## REML criterion at convergence: 158050.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0386 -0.5789 -0.2246 0.2822 7.3722
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## county (Intercept) 0.035067 0.18726
## inc_adjz 0.004849 0.06963 0.40
## Residual 0.960898 0.98025
## Number of obs: 56398, groups: county, 39
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -4.061e-01 6.182e-02 5.942e+02 -6.570 1.10e-10 ***
## foodstamps 3.448e-02 1.241e-02 5.634e+04 2.778 0.005469 **
## male 1.031e-01 8.400e-03 5.635e+04 12.275 < 2e-16 ***
## inc_adjz 5.834e-02 1.367e-02 3.516e+01 4.269 0.000141 ***
## age 1.470e-02 1.786e-03 5.632e+04 8.231 < 2e-16 ***
## married -3.098e-02 2.695e-02 5.635e+04 -1.150 0.250349
## single -6.364e-02 2.612e-02 5.635e+04 -2.437 0.014821 *
## urban -3.567e-02 1.253e-02 3.420e+04 -2.847 0.004417 **
## rural 4.941e-02 2.370e-02 5.630e+04 2.085 0.037065 *
## inhs -2.032e-01 2.005e-02 5.635e+04 -10.131 < 2e-16 ***
## incollege -7.485e-02 1.023e-02 5.629e+04 -7.318 2.56e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) fdstmp male inc_dj age marrid single urban rural
## foodstamps -0.249
## male -0.081 0.032
## inc_adjz 0.230 0.046 -0.044
## age -0.722 -0.001 0.034 -0.118
## married -0.408 0.036 -0.044 -0.014 0.025
## single -0.491 0.047 -0.063 0.006 0.122 0.914
## urban -0.038 -0.021 -0.007 0.004 -0.011 0.000 -0.002
## rural 0.006 -0.028 0.004 0.001 -0.002 0.000 -0.002 0.001
## inhs -0.322 0.013 0.022 0.025 0.428 0.004 0.007 -0.004 0.012
## incollege -0.194 0.096 0.088 0.054 0.188 0.004 -0.026 -0.001 -0.007
## inhs
## foodstamps
## male
## inc_adjz
## age
## married
## single
## urban
## rural
## inhs
## incollege 0.264
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
The coefficient for income in the second model is 0.06, versus the standard error which is 0.0136652, which gives a test of 35.1636086, which, if we use a z-statistic, yields a p-value of : < 1e-05.
After including income as a random slope, we see an increase in the variation of the estimates amongst the fixed effects particularly in marital status and income. In terms of the remaining variables,We continue to observe similar trends than those produced by the first model.
anova (fit1, fit2)
## refitting model(s) with ML (instead of REML)
## Data: data1.sub
## Models:
## fit1: travelz ~ foodstamps + male + inc_adjz + age + married + single +
## fit1: urban + rural + inhs + incollege + (1 | county)
## fit2: travelz ~ foodstamps + male + inc_adjz + age + married + single +
## fit2: employed + urban + rural + inhs + incollege + (1 + inc_adjz |
## fit2: county)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 13 158174 158290 -79074 158148
## fit2 15 158004 158138 -78987 157974 173.69 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
When comparing the two models, the second one provides slightly more information than the first as observed in the lower AIC of model 2 of 158,004 vs 158,174 of model 1.
Individuals with higher incomes experience longer commuting times, because of the fixed effect coefficient of 0.06, but the level of this finding varies by county. In other words, some counties may be better at providing more effective options of transportation to young working men and women.
Here is the ICC from fits 1 & 2
icc(fit1)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: travelz ~ foodstamps + male + inc_adjz + age + married + single + urban + rural + inhs + incollege + (1 | county)
##
## ICC (county): 0.0340
icc(fit2)
## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: travelz ~ foodstamps + male + inc_adjz + age + married + single + employed + urban + rural + inhs + incollege + (1 + inc_adjz | county)
##
## ICC (county): 0.0352
Including the random slope in model 2, improves slightly the correlation within counties.
Below is a plot of each county. The intercept of each line is the global intercept from fit2 + the random intercept component for each county. The slope for each line is the average slope, the fixed effect for income from fit2, plus the random slope component for each county.
#get the random effects
rancoefs2<-ranef(fit2)
par(mfrow=c(1,1))
fixef(fit2)
## (Intercept) foodstamps male inc_adjz age married
## -0.40612902 0.03447859 0.10310886 0.05833638 0.01470012 -0.03097990
## single urban rural inhs incollege
## -0.06363957 -0.03566485 0.04940821 -0.20316605 -0.07484938
summary(fixef(fit2)[1]+rancoefs2$county) # I do this to get values for the range for the y axis.
## (Intercept) inc_adjz
## Min. :-0.74751 Min. :-0.5000
## 1st Qu.:-0.56470 1st Qu.:-0.4522
## Median :-0.36832 Median :-0.4168
## Mean :-0.40613 Mean :-0.4061
## 3rd Qu.:-0.26615 3rd Qu.:-0.3747
## Max. :-0.08547 Max. :-0.2795
plot(NULL, ylim=c(-1, 2), xlim=c(0,1),ylab="Work Commute Time", xlab="Income") #You may have to change the ylim() for your outcome.
title (main="Regression Lines for each county from Random Slope and Intercept Model")
cols=sample(rainbow(n=39), size = dim(rancoefs2$county)[1], replace = T)
for (i in 1: dim(rancoefs2$county)[1]){
abline(a=fixef(fit2)[1]+rancoefs2$county[[1]][i], b=fixef(fit2)[4]+rancoefs2$county[[2]][i], col=cols[i],lwd=.5 )
}
#this line shows the "average" effect of income.
abline(a=fixef(fit2)[1], b=fixef(fit2)[4], col=1, lwd=3)
legend("topright", col=1, lwd=5, legend="Average Effect of Income")