Use IPUMS Data from GitHub

# Set the default chunk option echo = TRUE for the chunks in this document.
knitr::opts_chunk$set(echo = TRUE)
library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
# Read the IPUMS extract (Stata .dta file) directly from GitHub.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
names(ipums) # print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"

Filter trantime variable and recode missing values

# Build the analysis sample: rows with labforce == 2 and age >= 18 in one of
# the 20 listed met2013 metro codes, plus a readable metro name (citymetro).
# NOTE(review): labforce == 2 presumably means "in the labor force" -- confirm
# against the IPUMS codebook.
# NOTE(review): the later chunks summarise/plot/model `trantime`, not
# `mytrantime`, so the zero codes recoded here still enter those results.
ipums_new <- ipums %>%
  # Treat trantime == 0 as missing. `=` (not `<-`) is required inside mutate()
  # so that the new column is actually named `mytrantime`.
  mutate(mytrantime = ifelse(trantime == 0, NA, trantime)) %>%
  filter(
    labforce == 2,
    met2013 %in% c(
      11100, 12420, 13140, 15180, 17780, 18580, 19100, 21340, 26420, 28660,
      29700, 31180, 32580, 33260, 36220, 41660, 41700, 46340, 47380, 48660
    ),
    age >= 18
  ) %>%
  # Map each met2013 code to its metro name; codes not listed become NA.
  mutate(citymetro = case_when(
    met2013 == 11100 ~ "Amarillo",
    met2013 == 12420 ~ "Austin",
    met2013 == 13140 ~ "Beaumont",
    met2013 == 15180 ~ "Brownsville",
    met2013 == 17780 ~ "College Station",
    met2013 == 18580 ~ "Corpus Christi",
    met2013 == 19100 ~ "Dallas",
    met2013 == 21340 ~ "El Paso",
    met2013 == 26420 ~ "Houston",
    met2013 == 28660 ~ "Killeen",
    met2013 == 29700 ~ "Laredo",
    met2013 == 31180 ~ "Lubbock",
    met2013 == 32580 ~ "McAllen",
    met2013 == 33260 ~ "Midland",
    met2013 == 36220 ~ "Odessa",
    met2013 == 41660 ~ "San Angelo",
    met2013 == 41700 ~ "San Antonio",
    met2013 == 46340 ~ "Tyler",
    met2013 == 47380 ~ "Waco",
    met2013 == 48660 ~ "Wichita Falls"
  ))

Summary Statistics on trantime for Texas metro cities

# Per-metro mean and standard deviation of trantime (NAs dropped) and the
# number of rows in each metro group.
ipums_new %>%
  group_by(citymetro) %>%
  summarise(
    means = mean(trantime, na.rm = TRUE), # TRUE, not T: T can be reassigned
    sds = sd(trantime, na.rm = TRUE),
    n = n()
  )
## # A tibble: 19 x 4
##          citymetro    means      sds     n
##              <chr>    <dbl>    <dbl> <int>
##  1        Amarillo 16.76692 21.78368   133
##  2          Austin 22.80106 18.89597   945
##  3        Beaumont 21.01325 22.32188   151
##  4     Brownsville 16.53846 11.43919   130
##  5 College Station 20.37624 24.73291   101
##  6  Corpus Christi 19.36161 23.53661   224
##  7          Dallas 24.97202 22.98984  3217
##  8         El Paso 21.04908 24.71307   326
##  9         Houston 26.46783 22.44292  2456
## 10          Laredo 19.34343 20.35459    99
## 11         Lubbock 15.99265 17.31345   136
## 12         McAllen 19.38806 22.01837   201
## 13         Midland 19.40816 16.94466    49
## 14          Odessa 17.26190 12.81778    42
## 15      San Angelo 25.04545 32.96507    44
## 16     San Antonio 24.30786 22.46521   903
## 17           Tyler 21.75758 27.18271    99
## 18            Waco 20.06716 18.82136   134
## 19   Wichita Falls 11.98649 10.24962    74

Create boxplots to summarize trantime for Texas metro cities

# Boxplots of trantime for each metro area.
ipums_new %>%
  ggplot(aes(x = citymetro, y = trantime)) +
  geom_boxplot() +
  ggtitle(label = "Avg. Traveltime to Work in Texas Metro Cities") +
  # Axis labels are their own plot components: ggtitle()'s second argument is
  # the subtitle, so the label must be added separately. trantime is mapped
  # to the y axis, hence ylab().
  ylab("traveltime")

Perform ANOVA test

# Fit a linear model of trantime on metro area (citymetro enters as a
# categorical predictor), then print its analysis-of-variance table.
trantime_fit <- ipums_new %>%
  lm(formula = trantime ~ citymetro, data = .)
anova(trantime_fit)
## Analysis of Variance Table
## 
## Response: trantime
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## citymetro   18   75244  4180.2  8.5119 < 2.2e-16 ***
## Residuals 9445 4638523   491.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the model summary: residual quantiles, the coefficient table, and
# overall fit statistics.
summary(trantime_fit)
## 
## Call:
## lm(formula = trantime ~ citymetro, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.468 -14.308  -4.972   7.199 143.007 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               16.7669     1.9216   8.725  < 2e-16 ***
## citymetroAustin            6.0341     2.0524   2.940  0.00329 ** 
## citymetroBeaumont          4.2463     2.6353   1.611  0.10715    
## citymetroBrownsville      -0.2285     2.7332  -0.084  0.93339    
## citymetroCollege Station   3.6093     2.9249   1.234  0.21723    
## citymetroCorpus Christi    2.5947     2.4259   1.070  0.28484    
## citymetroDallas            8.2051     1.9609   4.184 2.89e-05 ***
## citymetroEl Paso           4.2822     2.2801   1.878  0.06041 .  
## citymetroHouston           9.7009     1.9729   4.917 8.94e-07 ***
## citymetroLaredo            2.5765     2.9416   0.876  0.38112    
## citymetroLubbock          -0.7743     2.7025  -0.286  0.77450    
## citymetroMcAllen           2.6211     2.4771   1.058  0.29001    
## citymetroMidland           2.6412     3.7034   0.713  0.47574    
## citymetroOdessa            0.4950     3.9225   0.126  0.89958    
## citymetroSan Angelo        8.2785     3.8541   2.148  0.03174 *  
## citymetroSan Antonio       7.5409     2.0583   3.664  0.00025 ***
## citymetroTyler             4.9907     2.9416   1.697  0.08981 .  
## citymetroWaco              3.3002     2.7125   1.217  0.22375    
## citymetroWichita Falls    -4.7804     3.2139  -1.487  0.13694    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.16 on 9445 degrees of freedom
## Multiple R-squared:  0.01596,    Adjusted R-squared:  0.01409 
## F-statistic: 8.512 on 18 and 9445 DF,  p-value: < 2.2e-16

The F statistic from the ANOVA is 8.5119 (df = 18 and 9445, p < 2.2e-16), indicating that mean trantime differs across the cities.

The city with the shortest average commute time is Wichita Falls, TX with an average of 12 minutes.

The city with the longest average commute time is Houston, TX with an average of 26.5 minutes.