In this assignment, I estimate the random intercept model for my outcome of travel time by metro area. Two models are fitted, the first one is considered the “NULL”model, with no predictors, and the second uses both individual and group predictors including: race/ethnicity, age and sex of individuals as well as usage of foodstamps and household type.
The sample includes individuals born between 1988 and 1998 in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.
library(haven)
library(dplyr)
library (car)
library(lmtest)
library(arm)
library(lme4)
library(ggplot2)
load ('C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/data_1.RData')
myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex", "age","trantime","poverty","birthyr","year","gq","foodstmp","hhtype")
data1.sub<-sub[,myvars]
rm(sub); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1908273 102.0 3806738 203.4 2524842 134.9
## Vcells 4851792 37.1 12138420 92.7 9391402 71.7
#Filter copied from sansonePAA.rmd file
data1.sub<-filter(data1.sub,birthyr %in% 1988:1998,year %in% 2014:2016, gq != 3, gq != 4, gq!=6)
names(data1.sub) #print the column names
## [1] "statefip" "county" "countyfips" "metro" "city"
## [6] "puma" "hhincome" "incwage" "educd" "empstatd"
## [11] "gradeatt" "race" "hispan" "hispand" "raced"
## [16] "sex" "age" "trantime" "poverty" "birthyr"
## [21] "year" "gq" "foodstmp" "hhtype"
#RACE/ETHNICITY
data1.sub$hispanic <- Recode(data1.sub$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data1.sub$race_rec <- Recode(data1.sub$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data1.sub$race_eth <- interaction(data1.sub$hispanic, data1.sub$race_rec, sep = "_")
data1.sub$race_eth <- as.factor(ifelse(substr(as.character(data1.sub$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data1.sub$race_eth)))
data1.sub$race_eth <- relevel(data1.sub$race_eth, ref = "NonHispanic_White")
#SEX
data1.sub$sex2<- Recode (data1.sub$sex, recodes= "1=1; 2=2; else=NA")
#PLACE OF BIRTH
data1.sub$usborn <- Recode(data1.sub$bpl, recodes = "1:120=1; 121:900=0; else=NA")
## Warning: Unknown or uninitialised column: 'bpl'.
#EDUCATION LEVEL
data1.sub$educ_level<- Recode(data1.sub$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
#EMPLOYMENT
data1.sub$employed <- Recode(data1.sub$empstatd, recodes = "10:12=1; 20:22=0; else=NA")
data1.sub$inhs<- Recode(data1.sub$gradeatt, recodes = "5=1; else=0")
data1.sub$incollege <- Recode(data1.sub$gradeatt, recodes = "6:7=1; else=0")
data1.sub<-data1.sub%>%
mutate(hsgrad_coll = as.factor(case_when(.$educ_level=='1_HSD/GED' & .$incollege==0 ~ '0hsgrad_notincollege',
.$educ_level=='1_HSD/GED' & .$incollege==1 ~ '1hsgrad_incollege')))
#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
data1.sub$inc_adj <- ifelse(data1.sub$year == 2005, data1.sub$incwage*1.23,
ifelse(data1.sub$year == 2006, data1.sub$incwage*1.18,
ifelse(data1.sub$year == 2007, data1.sub$incwage*1.16,
ifelse(data1.sub$year %in% c(2008,2009), data1.sub$incwage*1.1,
ifelse(data1.sub$year == 2010, data1.sub$incwage*1.1,
ifelse(data1.sub$year == 2011, data1.sub$incwage*1.07,
ifelse(data1.sub$year == 2012,data1.sub$incwage*1.05,
ifelse(data1.sub$year == 2013, data1.sub$incwage*1.03,
ifelse(data1.sub$year == 2014, data1.sub$incwage*1.01,
ifelse(data1.sub$year == 2015, data1.sub$incwage*1.01, data1.sub$incwage))))))))))
table(data1.sub$metro)
##
## 0 1 2 3 4
## 11825 4144 11888 7068 61198
#TRAVEL TIME TO WORK (does not include zero time commute)
data1.sub$traveltime<-ifelse(data1.sub$trantime==000, NA, data1.sub$trantime)
#FOOD STAMPS (1=yes, 2=no)
data1.sub$foodstamps<-Recode(data1.sub$foodstmp, recodes = "2=1; 1=2; 0=NA")
#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
data1.sub$housetype<-Recode(data1.sub$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )
#METRO (1=not in metro, 2=in metro area principal city, 3=in metro area outside central principal city, 4=central/principal city unknown)
data1.sub$metroarea<-Recode(data1.sub$metro, recodes = "1=1; 2=2; 3=3; 4=4; 0=NA")
In testing the variation accross metro areas, I find that my F value is higher than 1 at 12.691 which shows that there is fact difference in the means of the metro area groups.
fit0<-lm(traveltime~factor(metroarea), data=data1.sub)
#Global F test for equality of means across groups
anova(fit0)
## Analysis of Variance Table
##
## Response: traveltime
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(metroarea) 3 15024 5008.1 12.691 2.747e-08 ***
## Residuals 49712 19617282 394.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The null model shows a difference between each type of metro areas as well as a group. Our T value is 50.76 which is statistically significant.
#was not able to subset complete cases, it would not work.
fit1<-lmer(traveltime~1+(1|metroarea), data=data1.sub, REML=T)
summary(fit1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: traveltime ~ 1 + (1 | metroarea)
## Data: data1.sub
##
## REML criterion at convergence: 438295.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1641 -0.6759 -0.1725 0.3309 6.9399
##
## Random effects:
## Groups Name Variance Std.Dev.
## metroarea (Intercept) 0.7545 0.8686
## Residual 394.6184 19.8650
## Number of obs: 49716, groups: metroarea, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 23.199 0.457 50.76
The following model includes travel time, age, sex, race/ethnicity of individuals, usage of foodstamps by household, type of household and type of metro area.
Results from the more complex model using additional coveriates shows a very slight decrease in the variance between metro areas 19.8 to 19.7.Estimates between groups shows that the largest variation in travel time occurs between Non-Hispanic_White and Non-Hispanic_Blacks and that generally, there is a significant influence from age as observed in the T-value of 23.9. This last finding is expected as our sample ranges from the age 16 to 28. Commute most likely will increase with age as this age group enters the workforce.
fit2<-lmer(traveltime~1+race_eth+hhtype+foodstamps+age+sex+(1|metroarea), data=data1.sub, REML=T)
summary(fit2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex +
## (1 | metroarea)
## Data: data1.sub
##
## REML criterion at convergence: 437481.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4582 -0.5849 -0.2499 0.3070 7.2460
##
## Random effects:
## Groups Name Variance Std.Dev.
## metroarea (Intercept) 1.075 1.037
## Residual 388.159 19.702
## Number of obs: 49716, groups: metroarea, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 10.98188 1.05687 10.391
## race_ethHispanic 0.71763 0.19870 3.612
## race_ethNonHispanic_Asian 0.80316 0.45208 1.777
## race_ethNonHispanic_Black 2.74905 0.31761 8.655
## race_ethNonHispanic_Other 0.42135 0.59243 0.711
## hhtype -0.24502 0.03288 -7.452
## foodstamps -0.63433 0.26614 -2.383
## age 0.72612 0.03026 23.993
## sex -2.09102 0.17791 -11.753
##
## Correlation of Fixed Effects:
## (Intr) rc_thH r_NH_A r_NH_B r_NH_O hhtype fdstmp age
## rac_thHspnc -0.220
## rc_thNnHs_A -0.007 0.215
## rc_thNnHs_B -0.116 0.327 0.134
## rc_thNnHs_O -0.057 0.167 0.070 0.106
## hhtype -0.018 0.097 0.037 0.003 -0.009
## foodstamps -0.483 0.165 -0.003 0.133 0.028 0.017
## age -0.649 0.056 -0.043 0.013 0.021 -0.111 -0.016
## sex -0.273 0.022 -0.008 -0.030 0.004 -0.080 0.037 0.022
The second model appears to be a much better fit than just the first one in which metro area is the only determinant on travel time.
anova(fit1, fit2)
## refitting model(s) with ML (instead of REML)
## Data: data1.sub
## Models:
## fit1: traveltime ~ 1 + (1 | metroarea)
## fit2: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex +
## fit2: (1 | metroarea)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 3 438302 438328 -219148 438296
## fit2 11 437491 437588 -218734 437469 827.15 8 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Intraclass Correlation shows that the values of the first model are slightly less reliable than those of the second model. Adding covariates for control such as age, sex and race/ethnicity among other resulted in the decrease of the variation between metro areas in terms of their travel time.
#install.packages("sjstats")
library(sjstats)
icc(fit1)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: traveltime ~ 1 + (1 | metroarea)
##
## ICC (metroarea): 0.0019
icc(fit2)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: traveltime ~ 1 + race_eth + hhtype + foodstamps + age + sex + (1 | metroarea)
##
## ICC (metroarea): 0.0028