library(readr)
library(dplyr) #to manipulate data
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) #to visualize data
library(broom) #to make results printable
library(tidyr)

Descriptive Statistics and Plots to Summarize the Distribution

of the Transit Time for the Metro Areas in Texas

Descriptive Statistics

# Using IPUMS data: the extract is read directly from GitHub, so this step
# needs network access.
library(haven)
ipums <- read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")

# Build the analysis dataset: filter by age (18+), labor force status
# (labforce == 2), Texas (statefip == 48), metro residence (met2013 != 0),
# and a positive travel time, then attach a readable metro-area name.
#
# The met2013 codes come from Codebook_DEM7273_IPUMS2015. Codes that are not
# listed below get NA for metarea (case_when() has no fallback branch).
# NOTE(review): the codes are compared as numbers, matching the numeric
# `met2013 != 0` test in the filter -- confirm met2013 is stored as numeric.
newpums <- ipums %>%
  filter(age >= 18, labforce == 2, statefip == 48, met2013 != 0, trantime > 0) %>%
  mutate(
    # Bare column names (not `.$met2013`) so the recode respects data masking.
    metarea = case_when(
      met2013 == 11100 ~ "Amarillo",
      met2013 == 12420 ~ "Austin-Round Rock",
      met2013 == 13140 ~ "Beaumont-Port Arthur",
      met2013 == 15180 ~ "Brownsville-Harlingen",
      met2013 == 17780 ~ "College Station-Bryan",
      met2013 == 18580 ~ "Corpus Christi",
      met2013 == 19100 ~ "Dallas-Ft Worth-Arlington",
      met2013 == 21340 ~ "El Paso",
      met2013 == 26420 ~ "Houston-The Woodlands-Sugar Land",
      met2013 == 28660 ~ "Killeen-Temple",
      met2013 == 29700 ~ "Laredo",
      met2013 == 31180 ~ "Lubbock",
      met2013 == 32580 ~ "McAllen-Edinburg-Mission",
      met2013 == 33260 ~ "Midland",
      met2013 == 36220 ~ "Odessa",
      met2013 == 41660 ~ "San Angelo",
      met2013 == 41700 ~ "San Antonio-New Braunfels",
      met2013 == 46340 ~ "Tyler",
      met2013 == 47380 ~ "Waco",
      met2013 == 48660 ~ "Wichita Falls"
    )
  )
# Summary statistics: mean and standard deviation of transit time, plus the
# number of respondents, for each metro area.
newpums %>%
  group_by(metarea) %>%
  summarise(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    mean.transit = mean(trantime, na.rm = TRUE),
    sd.transit = sd(trantime, na.rm = TRUE),
    n = n()
  )
## # A tibble: 19 x 4
##                             metarea mean.transit sd.transit     n
##                               <chr>        <dbl>      <dbl> <int>
##  1                         Amarillo     18.42975  22.160335   121
##  2                Austin-Round Rock     25.89784  18.036476   832
##  3             Beaumont-Port Arthur     23.85714  22.312567   133
##  4            Brownsville-Harlingen     18.22034  10.647527   118
##  5            College Station-Bryan     21.43750  24.918314    96
##  6                   Corpus Christi     21.36453  23.844486   203
##  7        Dallas-Ft Worth-Arlington     27.95233  22.545452  2874
##  8                          El Paso     24.07719  25.014806   285
##  9 Houston-The Woodlands-Sugar Land     29.52089  21.717294  2202
## 10                           Laredo     21.27778  20.361358    90
## 11                          Lubbock     16.47727  17.345376   132
## 12         McAllen-Edinburg-Mission     23.05917  22.180994   169
## 13                          Midland     20.23404  16.808658    47
## 14                           Odessa     19.07895  12.101779    38
## 15                       San Angelo     27.55000  33.576510    40
## 16        San Antonio-New Braunfels     27.54078  21.971427   797
## 17                            Tyler     24.20225  27.622940    89
## 18                             Waco     22.22314  18.555542   121
## 19                    Wichita Falls     13.43939   9.907462    66

The Houston-The Woodlands-Sugar Land area has the longest mean transit time to work among the metropolitan areas in Texas (29.5 minutes), and Wichita Falls has the shortest (13.4 minutes). The other areas are fairly similar to each other in terms of transit time to work.

Boxplot

# Boxplots of transit time, one box per metro area. coord_flip() puts the
# metro areas on the vertical axis so their names are readable.
ggplot(newpums, aes(x = metarea, y = trantime)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Travel Time to Work in Metropolitan Areas in Texas",
    x = "Metropolitan Areas in  Texas",
    y = "Transit Time"
  )

The variation in travel time across the metropolitan areas is visible; however, this plot does not present the output in a visually appealing manner.

Histogram

# Histograms of transit time, one panel per metro area.
# The bar height is the bin density multiplied by 0.5.
# NOTE(review): the reason for the 0.5 scaling is not evident here -- confirm.
ggplot(data = newpums, mapping = aes(x = trantime)) +
  geom_histogram(mapping = aes(y = 0.5 * ..density..)) +
  facet_wrap(~metarea) +
  labs(title = "Travel Time to Work in Metropolitan Areas in Texas")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ANOVA

# Make San Antonio-New Braunfels the baseline factor level, so the model
# coefficients are differences from that metro area.
newpums <- newpums %>%
  mutate(metarea = relevel(as.factor(metarea), ref = "San Antonio-New Braunfels"))

# One-way ANOVA: linear model of transit time on metro area, then the F test.
commtime <- lm(trantime ~ metarea, data = newpums)
anova(commtime)
## Analysis of Variance Table
## 
## Response: trantime
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## metarea     18   88806  4933.7  10.458 < 2.2e-16 ***
## Residuals 8434 3978670   471.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The F test gives an F value of 10.458 with a p-value below 2.2e-16. Hence, we can be fairly certain that mean transit times differ among the Texas metropolitan areas. However, this test does not show which metropolitan areas differ from one another.

Post-Hoc Test:

# Post-hoc pairwise t tests of transit time between metro areas, grouped by the
# met2013 code, with Bonferroni-adjusted p-values. The method name is spelled
# out in full rather than relying on partial matching of "bonf".
pairwise.t.test(
  x = newpums$trantime,
  g = newpums$met2013,
  p.adjust.method = "bonferroni"
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  newpums$trantime and newpums$met2013 
## 
##       11100   12420   13140   15180   17780   18580   19100   21340  
## 12420 0.07037 -       -       -       -       -       -       -      
## 13140 1.00000 1.00000 -       -       -       -       -       -      
## 15180 1.00000 0.05613 1.00000 -       -       -       -       -      
## 17780 1.00000 1.00000 1.00000 1.00000 -       -       -       -      
## 18580 1.00000 1.00000 1.00000 1.00000 1.00000 -       -       -      
## 19100 0.00040 1.00000 1.00000 0.00032 0.65830 0.00511 -       -      
## 21340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.69715 -      
## 26420 8.0e-06 0.00716 0.59936 6.5e-06 0.06149 5.3e-05 1.00000 0.01181
## 29700 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.70198 1.00000
## 31180 1.00000 0.00064 0.97376 1.00000 1.00000 1.00000 5.2e-07 0.15265
## 32580 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.75829 1.00000
## 33260 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 36220 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41660 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41700 0.00296 1.00000 1.00000 0.00235 1.00000 0.05126 1.00000 1.00000
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.76760 1.00000
## 48660 1.00000 0.00126 0.24791 1.00000 1.00000 1.00000 1.4e-05 0.05787
##       26420   29700   31180   32580   33260   36220   41660   41700  
## 12420 -       -       -       -       -       -       -       -      
## 13140 -       -       -       -       -       -       -       -      
## 15180 -       -       -       -       -       -       -       -      
## 17780 -       -       -       -       -       -       -       -      
## 18580 -       -       -       -       -       -       -       -      
## 19100 -       -       -       -       -       -       -       -      
## 21340 -       -       -       -       -       -       -       -      
## 26420 -       -       -       -       -       -       -       -      
## 29700 0.07169 -       -       -       -       -       -       -      
## 31180 3.7e-09 1.00000 -       -       -       -       -       -      
## 32580 0.03333 1.00000 1.00000 -       -       -       -       -      
## 33260 0.63865 1.00000 1.00000 1.00000 -       -       -       -      
## 36220 0.56574 1.00000 1.00000 1.00000 1.00000 -       -       -      
## 41660 1.00000 1.00000 0.81144 1.00000 1.00000 1.00000 -       -      
## 41700 1.00000 1.00000 1.0e-05 1.00000 1.00000 1.00000 1.00000 -      
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 0.05506 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 48660 5.5e-07 1.00000 1.00000 0.39075 1.00000 1.00000 0.20360 7.0e-05
##       46340   47380  
## 12420 -       -      
## 13140 -       -      
## 15180 -       -      
## 17780 -       -      
## 18580 -       -      
## 19100 -       -      
## 21340 -       -      
## 26420 -       -      
## 29700 -       -      
## 31180 -       -      
## 32580 -       -      
## 33260 -       -      
## 36220 -       -      
## 41660 -       -      
## 41700 -       -      
## 46340 -       -      
## 47380 1.00000 -      
## 48660 0.39183 1.00000
## 
## P value adjustment method: bonferroni

From the post-hoc test, the Houston-The Woodlands-Sugar Land metropolitan area stands out compared to the other areas in Texas in terms of transit time.

# Coefficient table for the ANOVA model: estimate, standard error, test
# statistic, and p-value for each term.
commtime %>% tidy()
##                                       term      estimate std.error
## 1                              (Intercept)  27.540777917 0.7693484
## 2                          metareaAmarillo  -9.111025851 2.1191011
## 3                 metareaAustin-Round Rock  -1.642941379 1.0765195
## 4              metareaBeaumont-Port Arthur  -3.683635060 2.0344109
## 5             metareaBrownsville-Harlingen  -9.320438934 2.1423606
## 6             metareaCollege Station-Bryan  -6.103277917 2.3464599
## 7                    metareaCorpus Christi  -6.176245897 1.7075562
## 8         metareaDallas-Ft Worth-Arlington   0.411553328 0.8695045
## 9                           metareaEl Paso  -3.463584935 1.4990435
## 10 metareaHouston-The Woodlands-Sugar Land   1.980112183 0.8978476
## 11                           metareaLaredo  -6.263000139 2.4152583
## 12                          metareaLubbock -11.063505190 2.0410043
## 13         metareaMcAllen-Edinburg-Mission  -4.481606320 1.8393664
## 14                          metareaMidland  -7.306735364 3.2602081
## 15                           metareaOdessa  -8.461830549 3.6064048
## 16                       metareaSan Angelo   0.009222083 3.5192957
## 17                            metareaTyler  -3.338530726 2.4274198
## 18                             metareaWaco  -5.317637421 2.1191011
## 19                    metareaWichita Falls -14.101383978 2.7819957
##       statistic       p.value
## 1  35.797539420 2.130018e-261
## 2  -4.299476666  1.731281e-05
## 3  -1.526160334  1.270074e-01
## 4  -1.810664208  7.022841e-02
## 5  -4.350546299  1.373974e-05
## 6  -2.601057819  9.309905e-03
## 7  -3.617008840  2.997683e-04
## 8   0.473319373  6.359976e-01
## 9  -2.310530014  2.088284e-02
## 10  2.205399100  2.745298e-02
## 11 -2.593097455  9.528008e-03
## 12 -5.420618280  6.103387e-08
## 13 -2.436494671  1.485094e-02
## 14 -2.241186804  2.503989e-02
## 15 -2.346334128  1.898212e-02
## 16  0.002620434  9.979093e-01
## 17 -1.375341326  1.690622e-01
## 18 -2.509383508  1.211280e-02
## 19 -5.068801467  4.088349e-07

With San Antonio-New Braunfels as the reference group among the metropolitan areas in Texas, it is seen that Houston-The Woodlands-Sugar Land has the longest average commute time (from the descriptive mean statistics, it is 29.5 minutes), and Wichita Falls has the shortest average commute time (13.4 minutes).

Normality checks

# Normal Q-Q plot of the studentized residuals from the ANOVA model, with a
# reference line drawn in colour 2. The residuals are computed once and reused;
# the title matches the "Q-Q Plot" wording used for the log-model plot.
commtime_resid <- rstudent(commtime)
qqnorm(commtime_resid, main = "Q-Q Plot for Model Residuals")
qqline(commtime_resid, col = 2)

Log Transformation

# Refit the model on log(transit time) and redraw the normal Q-Q plot of the
# studentized residuals, with a reference line drawn in colour 2.
commtimelog <- lm(log(trantime) ~ metarea, data = newpums)
log_resid <- rstudent(commtimelog)
qqnorm(log_resid, main = "Q-Q Plot for Model Residuals")
qqline(log_resid, col = 2)