In this assignment, I estimate the random intercept model for my outcome of travel time by metro area. Two models are fitted, the first one is considered the “NULL”model, with no predictors, and the second uses both individual and group predictors including: race/ethnicity, age and sex of individuals as well as usage of foodstamps and household type.

The sample includes individuals born between 1988 and 1998 in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.

library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4) 
library(ggplot2)
load ('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/data_1.RData')

myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex", "age","trantime","poverty","birthyr","year","gq","foodstmp","hhtype")
data1.sub<-sub[,myvars]
rm(sub); gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1908273 102.0    3806738 203.4  2524842 134.9
## Vcells 4851792  37.1   12138420  92.7  9391402  71.7

Filtering Data

#Filter copied from sansonePAA.rmd file
data1.sub<-filter(data1.sub,birthyr %in% 1988:1998,year %in% 2014:2016,  gq != 3, gq != 4, gq!=6)
names(data1.sub) #print the column names
##  [1] "statefip"   "county"     "countyfips" "metro"      "city"      
##  [6] "puma"       "hhincome"   "incwage"    "educd"      "empstatd"  
## [11] "gradeatt"   "race"       "hispan"     "hispand"    "raced"     
## [16] "sex"        "age"        "trantime"   "poverty"    "birthyr"   
## [21] "year"       "gq"         "foodstmp"   "hhtype"

Recoding of Variables

#RACE/ETHNICITY
data1.sub$hispanic <- Recode(data1.sub$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data1.sub$race_rec <- Recode(data1.sub$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data1.sub$race_eth <- interaction(data1.sub$hispanic, data1.sub$race_rec, sep = "_")
data1.sub$race_eth  <- as.factor(ifelse(substr(as.character(data1.sub$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data1.sub$race_eth)))
data1.sub$race_eth <- relevel(data1.sub$race_eth, ref = "NonHispanic_White")

#SEX
data1.sub$sex2<- Recode (data1.sub$sex, recodes= "1=1; 2=2; else=NA")


#PLACE OF BIRTH
data1.sub$usborn <- Recode(data1.sub$bpl, recodes = "1:120=1; 121:900=0; else=NA")
## Warning: Unknown or uninitialised column: 'bpl'.
#EDUCATION LEVEL
data1.sub$educ_level<- Recode(data1.sub$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")

#EMPLOYMENT
data1.sub$employed <- Recode(data1.sub$empstatd, recodes = "10:12=1; 20:22=0; else=NA")
data1.sub$inhs<- Recode(data1.sub$gradeatt, recodes = "5=1; else=0")
data1.sub$incollege <- Recode(data1.sub$gradeatt, recodes = "6:7=1; else=0")

data1.sub<-data1.sub%>%
  mutate(hsgrad_coll = as.factor(case_when(.$educ_level=='1_HSD/GED' & .$incollege==0 ~ '0hsgrad_notincollege',
                                 .$educ_level=='1_HSD/GED' & .$incollege==1 ~ '1hsgrad_incollege')))

#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
data1.sub$inc_adj <- ifelse(data1.sub$year == 2005, data1.sub$incwage*1.23,
                      ifelse(data1.sub$year == 2006, data1.sub$incwage*1.18,
                            ifelse(data1.sub$year == 2007, data1.sub$incwage*1.16,
                                ifelse(data1.sub$year %in% c(2008,2009), data1.sub$incwage*1.1, 
                                     ifelse(data1.sub$year == 2010, data1.sub$incwage*1.1,
                                           ifelse(data1.sub$year == 2011, data1.sub$incwage*1.07,
                                               ifelse(data1.sub$year == 2012,data1.sub$incwage*1.05,
                                                   ifelse(data1.sub$year == 2013, data1.sub$incwage*1.03, 
                                                       ifelse(data1.sub$year == 2014, data1.sub$incwage*1.01,
                                                           ifelse(data1.sub$year == 2015, data1.sub$incwage*1.01, data1.sub$incwage))))))))))

table(data1.sub$metro)
## 
##     0     1     2     3     4 
## 11825  4144 11888  7068 61198
#TRAVEL TIME TO WORK (does not include zero time commute)
data1.sub$traveltime<-ifelse(data1.sub$trantime==000, NA, data1.sub$trantime)

Household level variables

#FOOD STAMPS (1=yes, 2=no)
data1.sub$foodstamps<-Recode(data1.sub$foodstmp, recodes = "2=1; 1=2; 0=NA")


#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
data1.sub$housetype<-Recode(data1.sub$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )


#METRO (1=not in metro, 2=in metro area principal city, 3=in metro area outside central principal city, 4=central/principal city unknown)
data1.sub$metroarea<-Recode(data1.sub$metro, recodes = "1=1; 2=2; 3=3; 4=4; 0=NA")

Testing the variation our outcome

In testing the variation accross metro areas, I find that my F value is higher than 1 at 12.691 which shows that there is fact difference in the means of the metro area groups.

fit0<-lm(traveltime~factor(metroarea), data=data1.sub)
#Global F test for equality of means across groups
anova(fit0)
## Analysis of Variance Table
## 
## Response: traveltime
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## factor(metroarea)     3    15024  5008.1  12.691 2.747e-08 ***
## Residuals         49712 19617282   394.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Null Model

The null model shows a difference between each type of metro areas as well as a group. Our T value is 50.76 which is statistically significant.

#was not able to subset complete cases, it would not work. 
fit1<-lmer(traveltime~1+(1|metroarea), data=data1.sub, REML=T)
summary(fit1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: traveltime ~ 1 + (1 | metroarea)
##    Data: data1.sub
## 
## REML criterion at convergence: 438295.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1641 -0.6759 -0.1725  0.3309  6.9399 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  metroarea (Intercept)   0.7545  0.8686 
##  Residual              394.6184 19.8650 
## Number of obs: 49716, groups:  metroarea, 4
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   23.199      0.457   50.76

Null Model with Household and Individual Level Covariates

The following model includes travel time, age, sex, race/ethnicity of individuals, usage of foodstamps by household, type of household and type of metro area.

Results from the more complex model using additional coveriates shows a very slight decrease in the variance between metro areas 19.8 to 19.7.Estimates between groups shows that the largest variation in travel time occurs between Non-Hispanic_White and Non-Hispanic_Blacks and that generally, there is a significant influence from age as observed in the T-value of 23.9. This last finding is expected as our sample ranges from the age 16 to 28. Commute most likely will increase with age as this age group enters the workforce.

fit2<-lmer(traveltime~1+race_eth+hhtype+foodstamps+age+sex+(1|metroarea), data=data1.sub, REML=T)
summary(fit2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex +  
##     (1 | metroarea)
##    Data: data1.sub
## 
## REML criterion at convergence: 437481.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4582 -0.5849 -0.2499  0.3070  7.2460 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  metroarea (Intercept)   1.075   1.037  
##  Residual              388.159  19.702  
## Number of obs: 49716, groups:  metroarea, 4
## 
## Fixed effects:
##                           Estimate Std. Error t value
## (Intercept)               10.98188    1.05687  10.391
## race_ethHispanic           0.71763    0.19870   3.612
## race_ethNonHispanic_Asian  0.80316    0.45208   1.777
## race_ethNonHispanic_Black  2.74905    0.31761   8.655
## race_ethNonHispanic_Other  0.42135    0.59243   0.711
## hhtype                    -0.24502    0.03288  -7.452
## foodstamps                -0.63433    0.26614  -2.383
## age                        0.72612    0.03026  23.993
## sex                       -2.09102    0.17791 -11.753
## 
## Correlation of Fixed Effects:
##             (Intr) rc_thH r_NH_A r_NH_B r_NH_O hhtype fdstmp age   
## rac_thHspnc -0.220                                                 
## rc_thNnHs_A -0.007  0.215                                          
## rc_thNnHs_B -0.116  0.327  0.134                                   
## rc_thNnHs_O -0.057  0.167  0.070  0.106                            
## hhtype      -0.018  0.097  0.037  0.003 -0.009                     
## foodstamps  -0.483  0.165 -0.003  0.133  0.028  0.017              
## age         -0.649  0.056 -0.043  0.013  0.021 -0.111 -0.016       
## sex         -0.273  0.022 -0.008 -0.030  0.004 -0.080  0.037  0.022

Comparison of Model Fit

Likelihood Ratio Test

The second model appears to be a much better fit than just the first one in which metro area is the only determinant on travel time.

anova(fit1, fit2)
## refitting model(s) with ML (instead of REML)
## Data: data1.sub
## Models:
## fit1: traveltime ~ 1 + (1 | metroarea)
## fit2: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex + 
## fit2:     (1 | metroarea)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit1  3 438302 438328 -219148   438296                             
## fit2 11 437491 437588 -218734   437469 827.15      8  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reporting ICC

The Intraclass Correlation shows that the values of the first model are slightly less reliable than those of the second model. Adding covariates for control such as age, sex and race/ethnicity among other resulted in the decrease of the variation between metro areas in terms of their travel time.

#install.packages("sjstats")
library(sjstats)
icc(fit1)
## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: traveltime ~ 1 + (1 | metroarea)
## 
##   ICC (metroarea): 0.0019
icc(fit2)
## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex + (1 | metroarea)
## 
##   ICC (metroarea): 0.0028