library(readr)
library(dplyr) #to manipulate data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) #to visualize data
library(broom) #to make results printable
library(tidyr)
Descriptive Statistics and Plots to Summarize the Distribution
of the Transit Time for the Metro Areas in Texas
Descriptive Statistics
# Load the IPUMS extract (a Stata .dta file) straight from GitHub.
# haven supplies read_dta().
library(haven)
ipums <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
# Build the analysis data set: filter by age (18+), labor force status
# (labforce == 2), state (statefip == 48, Texas), residence in an identified
# metro area (met2013 != 0), and a positive travel time to work.
newpums <- ipums %>%
  filter(
    age >= 18,
    labforce == 2,
    statefip == 48,
    met2013 != 0,
    trantime > 0
  ) %>%
  # Recode the met2013 codes of the Texas metro areas into readable names.
  # Columns are referenced directly (no `.$` prefix) so the recode always
  # operates on the rows mutate() is working with. Codes not listed below
  # become NA because case_when() has no fallback branch.
  mutate(
    metarea = case_when(
      met2013 == "11100" ~ "Amarillo",
      met2013 == "12420" ~ "Austin-Round Rock",
      met2013 == "13140" ~ "Beaumont-Port Arthur",
      met2013 == "15180" ~ "Brownsville-Harlingen",
      met2013 == "17780" ~ "College Station-Bryan",
      met2013 == "18580" ~ "Corpus Christi",
      met2013 == "19100" ~ "Dallas-Ft Worth-Arlington",
      met2013 == "21340" ~ "El Paso",
      met2013 == "26420" ~ "Houston-The Woodlands-Sugar Land",
      met2013 == "28660" ~ "Killeen-Temple",
      met2013 == "29700" ~ "Laredo",
      met2013 == "31180" ~ "Lubbock",
      met2013 == "32580" ~ "McAllen-Edinburg-Mission",
      met2013 == "33260" ~ "Midland",
      met2013 == "36220" ~ "Odessa",
      met2013 == "41660" ~ "San Angelo",
      met2013 == "41700" ~ "San Antonio-New Braunfels",
      met2013 == "46340" ~ "Tyler",
      met2013 == "47380" ~ "Waco",
      met2013 == "48660" ~ "Wichita Falls"
    )
  ) # Codebook_DEM7273_IPUMS2015
# Summary statistics: mean, standard deviation, and number of observations
# of transit time for each metro area.
newpums %>%
  group_by(metarea) %>% # one summary row per metro area
  summarise(
    # TRUE is spelled out because the alias T can be reassigned.
    mean.transit = mean(trantime, na.rm = TRUE),
    sd.transit = sd(trantime, na.rm = TRUE),
    n = n()
  )
## # A tibble: 19 x 4
## metarea mean.transit sd.transit n
## <chr> <dbl> <dbl> <int>
## 1 Amarillo 18.42975 22.160335 121
## 2 Austin-Round Rock 25.89784 18.036476 832
## 3 Beaumont-Port Arthur 23.85714 22.312567 133
## 4 Brownsville-Harlingen 18.22034 10.647527 118
## 5 College Station-Bryan 21.43750 24.918314 96
## 6 Corpus Christi 21.36453 23.844486 203
## 7 Dallas-Ft Worth-Arlington 27.95233 22.545452 2874
## 8 El Paso 24.07719 25.014806 285
## 9 Houston-The Woodlands-Sugar Land 29.52089 21.717294 2202
## 10 Laredo 21.27778 20.361358 90
## 11 Lubbock 16.47727 17.345376 132
## 12 McAllen-Edinburg-Mission 23.05917 22.180994 169
## 13 Midland 20.23404 16.808658 47
## 14 Odessa 19.07895 12.101779 38
## 15 San Angelo 27.55000 33.576510 40
## 16 San Antonio-New Braunfels 27.54078 21.971427 797
## 17 Tyler 24.20225 27.622940 89
## 18 Waco 22.22314 18.555542 121
## 19 Wichita Falls 13.43939 9.907462 66
The Houston-The Woodlands-Sugar Land metropolitan area has the longest mean transit time to work among the metropolitan areas in Texas (29.5 minutes), and Wichita Falls has the shortest (13.4 minutes). The other metropolitan areas are fairly similar to each other in terms of transit time to work.
Boxplot
# Horizontal boxplots of transit time, one box per metro area.
ggplot(newpums, aes(x = metarea, y = trantime)) +
  geom_boxplot() +
  labs(
    title = "Travel Time to Work in Metropolitan Areas in Texas",
    x = "Metropolitan Areas in Texas",
    y = "Transit Time"
  ) +
  coord_flip() # flip the axes so the long area names are readable

The variation in travel time across the metropolitan areas is visible; however, this plot does not display the distributions in a visually appealing manner.
Histogram
# Histogram of transit time, one panel per metro area.
# The bar height is density * bin width, i.e. the proportion of observations
# in each bin, whatever bin width is in effect. A fixed multiplier such as
# 0.5 only gives a proportion when the bin width is exactly 0.5.
# The default binning is kept, so stat_bin() still reports `bins = 30`.
newpums %>%
  ggplot(aes(x = trantime)) +
  geom_histogram(aes(y = after_stat(density * width))) +
  facet_wrap(~metarea) +
  ggtitle(label = "Travel Time to Work in Metropolitan Areas in Texas") +
  ylab("Proportion of respondents")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ANOVA
# One-way ANOVA of transit time across metro areas.
# San Antonio-New Braunfels is made the reference level, so every model
# coefficient is a difference from that area's mean transit time.
newpums$metarea <- relevel(
  as.factor(newpums$metarea),
  ref = "San Antonio-New Braunfels"
)

# The data frame is passed by name rather than piped in as `data = .`, so the
# call stored in the model can be re-evaluated later (e.g. by update()).
commtime <- lm(trantime ~ metarea, data = newpums)

# F test for any difference in mean transit time among the metro areas.
anova(commtime)
## Analysis of Variance Table
##
## Response: trantime
## Df Sum Sq Mean Sq F value Pr(>F)
## metarea 18 88806 4933.7 10.458 < 2.2e-16 ***
## Residuals 8434 3978670 471.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F test gives an F value of 10.458 with a p-value below 2.2e-16. Hence, we can be fairly certain that mean transit time varies among the Texas metropolitan areas. However, this test does not show which metropolitan areas differ from one another.
Post-Hoc Test:
# Post-hoc comparison: t tests of transit time between every pair of metro
# areas, grouped by the numeric met2013 code, with Bonferroni-adjusted
# p-values.
pairwise.t.test(
  x = newpums$trantime,
  g = newpums$met2013,
  p.adjust.method = "bonferroni"
)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: newpums$trantime and newpums$met2013
##
## 11100 12420 13140 15180 17780 18580 19100 21340
## 12420 0.07037 - - - - - - -
## 13140 1.00000 1.00000 - - - - - -
## 15180 1.00000 0.05613 1.00000 - - - - -
## 17780 1.00000 1.00000 1.00000 1.00000 - - - -
## 18580 1.00000 1.00000 1.00000 1.00000 1.00000 - - -
## 19100 0.00040 1.00000 1.00000 0.00032 0.65830 0.00511 - -
## 21340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.69715 -
## 26420 8.0e-06 0.00716 0.59936 6.5e-06 0.06149 5.3e-05 1.00000 0.01181
## 29700 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.70198 1.00000
## 31180 1.00000 0.00064 0.97376 1.00000 1.00000 1.00000 5.2e-07 0.15265
## 32580 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.75829 1.00000
## 33260 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 36220 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41660 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41700 0.00296 1.00000 1.00000 0.00235 1.00000 0.05126 1.00000 1.00000
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.76760 1.00000
## 48660 1.00000 0.00126 0.24791 1.00000 1.00000 1.00000 1.4e-05 0.05787
## 26420 29700 31180 32580 33260 36220 41660 41700
## 12420 - - - - - - - -
## 13140 - - - - - - - -
## 15180 - - - - - - - -
## 17780 - - - - - - - -
## 18580 - - - - - - - -
## 19100 - - - - - - - -
## 21340 - - - - - - - -
## 26420 - - - - - - - -
## 29700 0.07169 - - - - - - -
## 31180 3.7e-09 1.00000 - - - - - -
## 32580 0.03333 1.00000 1.00000 - - - - -
## 33260 0.63865 1.00000 1.00000 1.00000 - - - -
## 36220 0.56574 1.00000 1.00000 1.00000 1.00000 - - -
## 41660 1.00000 1.00000 0.81144 1.00000 1.00000 1.00000 - -
## 41700 1.00000 1.00000 1.0e-05 1.00000 1.00000 1.00000 1.00000 -
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 0.05506 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 48660 5.5e-07 1.00000 1.00000 0.39075 1.00000 1.00000 0.20360 7.0e-05
## 46340 47380
## 12420 - -
## 13140 - -
## 15180 - -
## 17780 - -
## 18580 - -
## 19100 - -
## 21340 - -
## 26420 - -
## 29700 - -
## 31180 - -
## 32580 - -
## 33260 - -
## 36220 - -
## 41660 - -
## 41700 - -
## 46340 - -
## 47380 1.00000 -
## 48660 0.39183 1.00000
##
## P value adjustment method: bonferroni
From the post-hoc test, the Houston-The Woodlands-Sugar Land metropolitan area (code 26420) stands out compared to the other areas in Texas in terms of transit time.
# Coefficient table for the model: estimate, standard error, test statistic,
# and p-value for each term.
broom::tidy(x = commtime)
## term estimate std.error
## 1 (Intercept) 27.540777917 0.7693484
## 2 metareaAmarillo -9.111025851 2.1191011
## 3 metareaAustin-Round Rock -1.642941379 1.0765195
## 4 metareaBeaumont-Port Arthur -3.683635060 2.0344109
## 5 metareaBrownsville-Harlingen -9.320438934 2.1423606
## 6 metareaCollege Station-Bryan -6.103277917 2.3464599
## 7 metareaCorpus Christi -6.176245897 1.7075562
## 8 metareaDallas-Ft Worth-Arlington 0.411553328 0.8695045
## 9 metareaEl Paso -3.463584935 1.4990435
## 10 metareaHouston-The Woodlands-Sugar Land 1.980112183 0.8978476
## 11 metareaLaredo -6.263000139 2.4152583
## 12 metareaLubbock -11.063505190 2.0410043
## 13 metareaMcAllen-Edinburg-Mission -4.481606320 1.8393664
## 14 metareaMidland -7.306735364 3.2602081
## 15 metareaOdessa -8.461830549 3.6064048
## 16 metareaSan Angelo 0.009222083 3.5192957
## 17 metareaTyler -3.338530726 2.4274198
## 18 metareaWaco -5.317637421 2.1191011
## 19 metareaWichita Falls -14.101383978 2.7819957
## statistic p.value
## 1 35.797539420 2.130018e-261
## 2 -4.299476666 1.731281e-05
## 3 -1.526160334 1.270074e-01
## 4 -1.810664208 7.022841e-02
## 5 -4.350546299 1.373974e-05
## 6 -2.601057819 9.309905e-03
## 7 -3.617008840 2.997683e-04
## 8 0.473319373 6.359976e-01
## 9 -2.310530014 2.088284e-02
## 10 2.205399100 2.745298e-02
## 11 -2.593097455 9.528008e-03
## 12 -5.420618280 6.103387e-08
## 13 -2.436494671 1.485094e-02
## 14 -2.241186804 2.503989e-02
## 15 -2.346334128 1.898212e-02
## 16 0.002620434 9.979093e-01
## 17 -1.375341326 1.690622e-01
## 18 -2.509383508 1.211280e-02
## 19 -5.068801467 4.088349e-07
With San Antonio-New Braunfels as the reference group among the metropolitan areas in Texas, it is seen that Houston-The Woodlands-Sugar Land has the longest average commute time (29.5 minutes, from the descriptive statistics) and Wichita Falls has the shortest average commute time (13.4 minutes).
Normality checks
# Normal Q-Q plot of the model's studentized residuals with a reference line.
# The residuals are computed once and reused for both calls.
stud_resid <- rstudent(commtime)
qqnorm(stud_resid, main = "Q Plot for Model Residuals")
qqline(stud_resid, col = 2)
