In this assignment, I conduct a multilevel analysis estimating a random intercept model for the outcome of travel time to work.I also estimate a multilevel model including RANDOM SLOPES for the following individual predictors: gender, income, age, marital status and current enrollment in high school or college. In addition, I include higher level variables such as living in an urban vs rural area and the household receiving foodstamps.

The sample includes data from 58,594 individuals born between 1988 and 1998 in the State of Texas and filtered to include those individuals who are currently employed. Data was gathered from ACS-IPUMS 2014-2016.

library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4) 
library(ggplot2)
library(sjstats)
library(lmerTest)
library(MuMIn)

load ('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/data_1.RData')

myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","foodstmp","hhtype","fertyr","marst")
data1.sub<-sub[,myvars]
rm(sub); gc()

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2172906 116.1    4445197 237.4  2529642 135.1
## Vcells 5364828  41.0   12175244  92.9  9759974  74.5

Filtering Data

#Filter copied from sansonePAA.rmd file
data1.sub<-filter(data1.sub,birthyr %in% 1988:1998,year %in% 2014:2016,  gq != 3, gq != 4, gq!=6,empstat== 1)
names(data1.sub) #print the column names

##  [1] "statefip"   "county"     "countyfips" "metro"      "city"      
##  [6] "puma"       "hhincome"   "incwage"    "educd"      "empstatd"  
## [11] "gradeatt"   "race"       "hispan"     "hispand"    "raced"     
## [16] "sex"        "empstat"    "age"        "trantime"   "poverty"   
## [21] "birthyr"    "year"       "gq"         "foodstmp"   "hhtype"    
## [26] "fertyr"     "marst"

Individual Level Variables

#MARRIAGE STATUS
data1.sub$marriage_status<- Recode (data1.sub$marst, recodes= "1:2='married'; 3:5='sep_div_wid'; 6='single'")
data1.sub$married<-recode(data1.sub$marst, recodes="1:2=1; else=0")
data1.sub$single<-recode(data1.sub$marst, recodes="6=1; else=0")

#RACE/ETHNICITY
data1.sub$hispanic <- Recode(data1.sub$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data1.sub$race_rec <- Recode(data1.sub$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data1.sub$race_eth <- interaction(data1.sub$hispanic, data1.sub$race_rec, sep = "_")
data1.sub$race_eth  <- as.factor(ifelse(substr(as.character(data1.sub$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data1.sub$race_eth)))
data1.sub$race_eth <- relevel(data1.sub$race_eth, ref = "NonHispanic_White")

#FERTILITY (1=no, 2=yes)
data1.sub$fert<- Recode (data1.sub$fertyr, recodes= "1=0; 2=1; else=NA")
table(data1.sub$fert)

## 
##     0     1 
## 25925  1806

#SEX
data1.sub$sex2<- Recode (data1.sub$sex, recodes= "1=1; 2=2; else=NA")
data1.sub$male<- Recode (data1.sub$sex, recodes= "1=1; 2=0; else=NA")

#PLACE OF BIRTH
data1.sub$usborn <- Recode(data1.sub$bpl, recodes = "1:120=1; 121:900=0; else=NA")

## Warning: Unknown or uninitialised column: 'bpl'.

#EDUCATION LEVEL
data1.sub$educ_level<- Recode(data1.sub$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")

#EMPLOYMENT
data1.sub$employedetailed <- Recode(data1.sub$empstatd, recodes = "10:12=1; 20:22=0; else=NA")

data1.sub$employed<-Recode(data1.sub$empstat, recodes = "1=1; else=NA")
data1.sub$unemployed<-Recode(data1.sub$empstat, recodes = "2=1; else=NA")
data1.sub$not_in_laborforce<-Recode(data1.sub$empstat, recodes = "3=1; else=NA")


#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
data1.sub$inc_adj <- ifelse(data1.sub$year == 2005, data1.sub$incwage*1.23,
                      ifelse(data1.sub$year == 2006, data1.sub$incwage*1.18,
                            ifelse(data1.sub$year == 2007, data1.sub$incwage*1.16,
                                ifelse(data1.sub$year %in% c(2008,2009), data1.sub$incwage*1.1, 
                                     ifelse(data1.sub$year == 2010, data1.sub$incwage*1.1,
                                           ifelse(data1.sub$year == 2011, data1.sub$incwage*1.07,
                                               ifelse(data1.sub$year == 2012,data1.sub$incwage*1.05,
                                                   ifelse(data1.sub$year == 2013, data1.sub$incwage*1.03, 
                                                       ifelse(data1.sub$year == 2014, data1.sub$incwage*1.01,
                                                           ifelse(data1.sub$year == 2015, data1.sub$incwage*1.01, data1.sub$incwage))))))))))


#TRAVEL TIME TO WORK (does not include zero time commute)
data1.sub$traveltime<-ifelse(data1.sub$trantime==000, NA, data1.sub$trantime)


#IN HS/COLLEGE OR NOT IN HS/COLLEGE
data1.sub$inhs<- Recode(data1.sub$gradeatt, recodes = "5=1; else=0")
data1.sub$incollege <- Recode(data1.sub$gradeatt, recodes = "6:7=1; else=0")

Household level variables

#FOOD STAMPS (2=yes, 1=no)
data1.sub$foodstamps<-Recode(data1.sub$foodstmp, recodes = "2=2; 1=1; 0=NA")


#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
data1.sub$housetype<-Recode(data1.sub$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )


#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
data1.sub$metroarea<-Recode(data1.sub$metro,recodes="1=1; 2=2; 3=3; else='NA'")

#URBAN (includes suburbs) & RURAL
data1.sub$urban<-recode(data1.sub$metroarea, recodes="2:3=1; else=0")
data1.sub$rural<-recode(data1.sub$metroarea, recodes="1=1; else=0")

Modeling

Model 1

The First model will fit a model that considers the individual and family level variables and a random intercept only.

A simple analysis of variance shows that there is in fact a significant difference in commuting times and income.

data1.sub$travelz<-scale(data1.sub$traveltime, center = T, scale=T)
data1.sub$inc_adjz<-scale(data1.sub$inc_adj, center = T, scale=T)

fit0<-lm(travelz~inc_adjz*factor(county), data=data1.sub)
anova(fit0)

## Analysis of Variance Table
## 
## Response: travelz
##                            Df Sum Sq Mean Sq  F value    Pr(>F)    
## inc_adjz                    1    421  421.47 434.2908 < 2.2e-16 ***
## factor(county)             38    998   26.25  27.0529 < 2.2e-16 ***
## inc_adjz:factor(county)    38    320    8.42   8.6748 < 2.2e-16 ***
## Residuals               56320  54658    0.97                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

However, we want to know whether individuals between the ages of 16-25 with higher incomes have longer commuting times to work, net of other demographic and socioeconomic factors?

fit1<-lmer(travelz~foodstamps+male+inc_adjz+age+married+single+urban+rural+inhs+incollege+(1|county), data=data1.sub)
summary(fit1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## travelz ~ foodstamps + male + inc_adjz + age + married + single +  
##     urban + rural + inhs + incollege + (1 | county)
##    Data: data1.sub
## 
## REML criterion at convergence: 158226.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8929 -0.5809 -0.2268  0.2822  7.3613 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  county   (Intercept) 0.03399  0.1844  
##  Residual             0.96479  0.9822  
## Number of obs: 56398, groups:  county, 39
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept) -4.198e-01  6.160e-02  6.232e+02  -6.816 2.22e-11 ***
## foodstamps   3.523e-02  1.243e-02  5.639e+04   2.835  0.00459 ** 
## male         1.079e-01  8.399e-03  5.635e+04  12.846  < 2e-16 ***
## inc_adjz     3.639e-02  4.722e-03  5.638e+04   7.705 1.33e-14 ***
## age          1.521e-02  1.785e-03  5.637e+04   8.520  < 2e-16 ***
## married     -2.688e-02  2.699e-02  5.635e+04  -0.996  0.31930    
## single      -6.601e-02  2.615e-02  5.636e+04  -2.524  0.01160 *  
## urban       -3.895e-02  1.254e-02  3.631e+04  -3.107  0.00189 ** 
## rural        4.359e-02  2.374e-02  5.638e+04   1.837  0.06628 .  
## inhs        -2.131e-01  2.007e-02  5.636e+04 -10.614  < 2e-16 ***
## incollege   -7.630e-02  1.023e-02  5.638e+04  -7.454 9.18e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) fdstmp male   inc_dj age    marrid single urban  rural 
## foodstamps -0.251                                                        
## male       -0.081  0.033                                                 
## inc_adjz    0.178  0.121 -0.096                                          
## age        -0.724  0.000  0.033 -0.300                                   
## married    -0.409  0.036 -0.045 -0.037  0.024                            
## single     -0.494  0.046 -0.061 -0.001  0.123  0.914                     
## urban      -0.040 -0.021 -0.005 -0.018 -0.009  0.000 -0.002              
## rural       0.006 -0.028  0.005  0.009 -0.002  0.000 -0.003  0.001       
## inhs       -0.325  0.013  0.024  0.048  0.431  0.004  0.007 -0.005  0.011
## incollege  -0.197  0.095  0.089  0.126  0.191  0.005 -0.026 -0.002 -0.007
##            inhs  
## foodstamps       
## male             
## inc_adjz         
## age              
## married          
## single           
## urban            
## rural            
## inhs             
## incollege   0.264

The random intercept model shows that those who receive foodstamps have longer commute times than those who do not receive this service. Males as well as those with higher incomes are more likely to commute longer than females and individuals with lower incomes. Those who live in urban settings seem to have a shorter commute to work when compared to those who live in rural areas. In terms of those, currently enrolled in high school or college, the latter group experiences less commuting times. Finally, those who are single seem to have also a shorter commute than those who are married.

The coefficient for income in the model is 0.04, versus the standard error which is 0.0047224, which gives a test of 5.637798210^{4}, which, if we use a z-statistic, yields a p-value of : < 1e-05.

This tells us that, yes, higher incomes experience a longer commute (travel) time to work than those in with lower incomes

Model 2

The second model considers both random intercepts and random slopes, in this case, I’m only considering a random slope for income. This implies the proposition:

*Do individuals between the ages of 16-25 who have higher levels of income experience longer work commuting times EQUALLY by county?

fit2<-lmer(travelz~foodstamps+male+inc_adjz+age+married+single+employed+urban+rural+inhs+incollege+(1+inc_adjz|county), data=data1.sub)

## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

summary(fit2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## travelz ~ foodstamps + male + inc_adjz + age + married + single +  
##     employed + urban + rural + inhs + incollege + (1 + inc_adjz |  
##     county)
##    Data: data1.sub
## 
## REML criterion at convergence: 158050.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0386 -0.5789 -0.2246  0.2822  7.3722 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  county   (Intercept) 0.035067 0.18726      
##           inc_adjz    0.004849 0.06963  0.40
##  Residual             0.960898 0.98025      
## Number of obs: 56398, groups:  county, 39
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept) -4.061e-01  6.182e-02  5.942e+02  -6.570 1.10e-10 ***
## foodstamps   3.448e-02  1.241e-02  5.634e+04   2.778 0.005469 ** 
## male         1.031e-01  8.400e-03  5.635e+04  12.275  < 2e-16 ***
## inc_adjz     5.834e-02  1.367e-02  3.516e+01   4.269 0.000141 ***
## age          1.470e-02  1.786e-03  5.632e+04   8.231  < 2e-16 ***
## married     -3.098e-02  2.695e-02  5.635e+04  -1.150 0.250349    
## single      -6.364e-02  2.612e-02  5.635e+04  -2.437 0.014821 *  
## urban       -3.567e-02  1.253e-02  3.420e+04  -2.847 0.004417 ** 
## rural        4.941e-02  2.370e-02  5.630e+04   2.085 0.037065 *  
## inhs        -2.032e-01  2.005e-02  5.635e+04 -10.131  < 2e-16 ***
## incollege   -7.485e-02  1.023e-02  5.629e+04  -7.318 2.56e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) fdstmp male   inc_dj age    marrid single urban  rural 
## foodstamps -0.249                                                        
## male       -0.081  0.032                                                 
## inc_adjz    0.230  0.046 -0.044                                          
## age        -0.722 -0.001  0.034 -0.118                                   
## married    -0.408  0.036 -0.044 -0.014  0.025                            
## single     -0.491  0.047 -0.063  0.006  0.122  0.914                     
## urban      -0.038 -0.021 -0.007  0.004 -0.011  0.000 -0.002              
## rural       0.006 -0.028  0.004  0.001 -0.002  0.000 -0.002  0.001       
## inhs       -0.322  0.013  0.022  0.025  0.428  0.004  0.007 -0.004  0.012
## incollege  -0.194  0.096  0.088  0.054  0.188  0.004 -0.026 -0.001 -0.007
##            inhs  
## foodstamps       
## male             
## inc_adjz         
## age              
## married          
## single           
## urban            
## rural            
## inhs             
## incollege   0.264
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

The coefficient for income in the second model is 0.06, versus the standard error which is 0.0136652, which gives a test of 35.1636086, which, if we use a z-statistic, yields a p-value of : < 1e-05.

After including income as a random slope, we see an increase in the variation of the estimates amongst the fixed effects particularly in marital status and income. In terms of the remaining variables,We continue to observe similar trends than those produced by the first model.

Likelihood ratio between models

anova (fit1, fit2)

## refitting model(s) with ML (instead of REML)

## Data: data1.sub
## Models:
## fit1: travelz ~ foodstamps + male + inc_adjz + age + married + single + 
## fit1:     urban + rural + inhs + incollege + (1 | county)
## fit2: travelz ~ foodstamps + male + inc_adjz + age + married + single + 
## fit2:     employed + urban + rural + inhs + incollege + (1 + inc_adjz | 
## fit2:     county)
##      Df    AIC    BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit1 13 158174 158290 -79074   158148                             
## fit2 15 158004 158138 -78987   157974 173.69      2  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

When comparing the two models, the second one provides slightly more information than the first as observed in the lower AIC of model 2 of 158,004 vs 158,174 of model 1.

Individuals with higher incomes experience longer commuting times, because of the fixed effect coefficient of 0.06, but the level of this finding varies by county. In other words, some counties may be better at providing more effective options of transportation to young working men and women.

Intra-class correlations

Here is the ICC from fits 1 & 2

icc(fit1)

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: travelz ~ foodstamps + male + inc_adjz + age + married + single + urban + rural + inhs + incollege + (1 | county)
## 
##   ICC (county): 0.0340

icc(fit2)

## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: travelz ~ foodstamps + male + inc_adjz + age + married + single + employed + urban + rural + inhs + incollege + (1 + inc_adjz | county)
## 
##   ICC (county): 0.0352

Including the random slope in model 2, improves slightly the correlation within counties.

Visualizations of effects

Below is a plot of each county. The intercept of each line is the global intercept from fit2 + the random intercept component for each county. The slope for each line is the average slope, the fixed effect for income from fit2, plus the random slope component for each county.

#get the random effects
rancoefs2<-ranef(fit2)

par(mfrow=c(1,1))
fixef(fit2)

## (Intercept)  foodstamps        male    inc_adjz         age     married 
## -0.40612902  0.03447859  0.10310886  0.05833638  0.01470012 -0.03097990 
##      single       urban       rural        inhs   incollege 
## -0.06363957 -0.03566485  0.04940821 -0.20316605 -0.07484938

summary(fixef(fit2)[1]+rancoefs2$county) # I do this to get values for the range for the y axis.

##   (Intercept)          inc_adjz      
##  Min.   :-0.74751   Min.   :-0.5000  
##  1st Qu.:-0.56470   1st Qu.:-0.4522  
##  Median :-0.36832   Median :-0.4168  
##  Mean   :-0.40613   Mean   :-0.4061  
##  3rd Qu.:-0.26615   3rd Qu.:-0.3747  
##  Max.   :-0.08547   Max.   :-0.2795

plot(NULL, ylim=c(-1, 2), xlim=c(0,1),ylab="Work Commute Time", xlab="Income") #You may have to change the ylim() for your outcome.
title (main="Regression Lines for each county from Random Slope and Intercept Model")
cols=sample(rainbow(n=39), size = dim(rancoefs2$county)[1], replace = T)
for (i in 1: dim(rancoefs2$county)[1]){
  
  abline(a=fixef(fit2)[1]+rancoefs2$county[[1]][i], b=fixef(fit2)[4]+rancoefs2$county[[2]][i], col=cols[i],lwd=.5 )
}

#this line shows the "average" effect of income.
abline(a=fixef(fit2)[1], b=fixef(fit2)[4], col=1, lwd=3)
legend("topright", col=1, lwd=5, legend="Average Effect of Income")

Assigment #2, DEM 7473 Hierarchical Models

Paulina Cano

September 24, 2018