Loading libraries and IPUMS data.

library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
# Read the IPUMS extract (Stata .dta file) directly from GitHub into `ipums`.
ipums <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
# Standard error of the mean of `x`: sd(x) / sqrt(n).
#
# Missing values are dropped from both the standard deviation and the count n,
# so NAs do not shrink the result. Returns NA when fewer than two non-missing
# values are available (the standard deviation is undefined there).
mySE <- function(x) {
  n <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n)
}

1 and 2) Clear up missing values for travel time and remove the population not in a metro area

# Keep respondents older than 17 who report a positive travel time and have a
# positive metro-area code. Rows where any of these conditions is NA are
# dropped, because subset() treats NA as FALSE.
myPositiveTravel <- subset(
  ipums,
  age > 17 & trantime > 0 & met2013 > 0
)

2) Mutating the database to reduce the observations to Texas metro areas and labeling the areas

The Metro List:

Code Metro Area

11100 Amarillo

12420 Austin-Round Rock

13140 Beaumont - Port Arthur

15180 Brownsville - Harlingen

17780 College Station - Bryan

18580 Corpus Christi

19100 Dallas - Fort Worth - Arlington

21340 El Paso

26420 Houston - The Woodlands - Sugar Land

28660 Killeen - Temple

29700 Laredo

31180 Lubbock

32580 McAllen - Edinburg - Mission

33260 Midland

36220 Odessa

41660 San Angelo

41700 San Antonio - New Braunfels

46340 Tyler

47380 Waco

48660 Wichita Falls

# Keep rows with state FIPS code 48 (the Texas subset described above) and add
# TXmet2013, a readable metro-area name for each 2013 metro code in the list.
# Codes that are not listed get NA. Columns are referenced directly (not via
# `.$`) so the conditions are evaluated through dplyr's data masking on the
# rows being mutated.
myTXMetros <- myPositiveTravel %>%
  filter(statefip == 48) %>%
  mutate(
    TXmet2013 = case_when(
      met2013 == 11100 ~ "Amarillo",
      met2013 == 12420 ~ "Austin-Round Rock",
      met2013 == 13140 ~ "Beaumont - Port Arthur",
      met2013 == 15180 ~ "Brownsville - Harlingen",
      met2013 == 17780 ~ "College Station - Bryan",
      met2013 == 18580 ~ "Corpus Christi",
      met2013 == 19100 ~ "Dallas - Fort Worth - Arlington",
      met2013 == 21340 ~ "El Paso",
      met2013 == 26420 ~ "Houston - The Woodlands - Sugar Land",
      met2013 == 28660 ~ "Killeen - Temple",
      met2013 == 29700 ~ "Laredo",
      met2013 == 31180 ~ "Lubbock",
      met2013 == 32580 ~ "McAllen - Edinburg - Mission",
      met2013 == 33260 ~ "Midland",
      met2013 == 36220 ~ "Odessa",
      met2013 == 41660 ~ "San Angelo",
      met2013 == 41700 ~ "San Antonio - New Braunfels",
      met2013 == 46340 ~ "Tyler",
      met2013 == 47380 ~ "Waco",
      met2013 == 48660 ~ "Wichita Falls"
    )
  )

The new database contains 14,863 observations.

3) Descriptive Statistics

General Summary:

# Mean, standard deviation and mySE() of commute time for each metro area.
summarise(
  group_by(myTXMetros, TXmet2013),
  mean(trantime),
  sd(trantime),
  mySE(trantime)
)
## # A tibble: 19 x 4
##                               TXmet2013 `mean(trantime)` `sd(trantime)`
##                                   <chr>            <dbl>          <dbl>
##  1                             Amarillo         18.42975      22.160335
##  2                    Austin-Round Rock         25.89784      18.036476
##  3               Beaumont - Port Arthur         23.85714      22.312567
##  4              Brownsville - Harlingen         18.22034      10.647527
##  5              College Station - Bryan         21.43750      24.918314
##  6                       Corpus Christi         21.36453      23.844486
##  7      Dallas - Fort Worth - Arlington         27.95233      22.545452
##  8                              El Paso         24.07719      25.014806
##  9 Houston - The Woodlands - Sugar Land         29.52089      21.717294
## 10                               Laredo         21.27778      20.361358
## 11                              Lubbock         16.47727      17.345376
## 12         McAllen - Edinburg - Mission         23.05917      22.180994
## 13                              Midland         20.23404      16.808658
## 14                               Odessa         19.07895      12.101779
## 15                           San Angelo         27.55000      33.576510
## 16          San Antonio - New Braunfels         27.54078      21.971427
## 17                                Tyler         24.20225      27.622940
## 18                                 Waco         22.22314      18.555542
## 19                        Wichita Falls         13.43939       9.907462
## # ... with 1 more variables: `mySE(trantime)` <dbl>

Summary using the Linear Model

# Linear model of commute time on metro area (one coefficient per metro,
# relative to the reference level), then a tidy coefficient table.
myPlot1 <- lm(trantime ~ TXmet2013, data = myTXMetros)
broom::tidy(myPlot1)
##                                             term   estimate std.error
## 1                                    (Intercept) 18.4297521  1.974511
## 2                     TXmet2013Austin-Round Rock  7.4680845  2.113218
## 3                TXmet2013Beaumont - Port Arthur  5.4273908  2.728667
## 4               TXmet2013Brownsville - Harlingen -0.2094131  2.810072
## 5               TXmet2013College Station - Bryan  3.0077479  2.968614
## 6                        TXmet2013Corpus Christi  2.9347800  2.494503
## 7       TXmet2013Dallas - Fort Worth - Arlington  9.5225792  2.015647
## 8                               TXmet2013El Paso  5.6474409  2.356677
## 9  TXmet2013Houston - The Woodlands - Sugar Land 11.0911380  2.028035
## 10                               TXmet2013Laredo  2.8480257  3.023288
## 11                              TXmet2013Lubbock -1.9524793  2.733586
## 12         TXmet2013McAllen - Edinburg - Mission  4.6294195  2.586516
## 13                              TXmet2013Midland  1.8042905  3.733062
## 14                               TXmet2013Odessa  0.6491953  4.038929
## 15                           TXmet2013San Angelo  9.1202479  3.961343
## 16          TXmet2013San Antonio - New Braunfels  9.1110259  2.119101
## 17                                TXmet2013Tyler  5.7724951  3.033012
## 18                                 TXmet2013Waco  3.7933884  2.792380
## 19                        TXmet2013Wichita Falls -4.9903581  3.323597
##      statistic      p.value
## 1   9.33383222 1.283309e-20
## 2   3.53398687 4.115336e-04
## 3   1.98902642 4.673054e-02
## 4  -0.07452232 9.405965e-01
## 5   1.01318251 3.110021e-01
## 6   1.17649898 2.394287e-01
## 7   4.72432818 2.346104e-06
## 8   2.39635749 1.658058e-02
## 9   5.46890853 4.657666e-08
## 10  0.94202934 3.462046e-01
## 11 -0.71425559 4.750889e-01
## 12  1.78982838 7.351737e-02
## 13  0.48332721 6.288760e-01
## 14  0.16073450 8.723064e-01
## 15  2.30231212 2.134181e-02
## 16  4.29947667 1.731281e-05
## 17  1.90322191 5.704561e-02
## 18  1.35847864 1.743482e-01
## 19 -1.50149322 1.332655e-01

Box Plot

# Box plot of commute time for each Texas metro area. The axes are flipped so
# the metro names run down the side of the plot.
myTXMetros %>%
  ggplot() +
  geom_boxplot(aes(TXmet2013, trantime)) +
  ggtitle("Box Plot for Commute Time in Texas Metro Areas") +
  coord_flip() +
  xlab("Metro Area") +
  ylab("Commute Time")

Graph (Linear Model) of Commute Times in Texas Metros

# Normal Q-Q plot of the studentized residuals from the untransformed model,
# with a reference line drawn in colour 2.
qqnorm(
  y = rstudent(myPlot1),
  main = "Q-Q Plot for Commute Times in Texas"
)
qqline(y = rstudent(myPlot1), col = 2)

Transformation using Square Root

# Same model as before, with the response on the square-root scale.
myPlot2 <- lm(sqrt(trantime) ~ TXmet2013, data = myTXMetros)
broom::tidy(myPlot2)
##                                             term   estimate std.error
## 1                                    (Intercept)  3.9106054 0.1622638
## 2                     TXmet2013Austin-Round Rock  0.9066938 0.1736627
## 3                TXmet2013Beaumont - Port Arthur  0.6255438 0.2242398
## 4               TXmet2013Brownsville - Harlingen  0.1821689 0.2309296
## 5               TXmet2013College Station - Bryan  0.3218004 0.2439585
## 6                        TXmet2013Corpus Christi  0.3252899 0.2049963
## 7       TXmet2013Dallas - Fort Worth - Arlington  1.0467210 0.1656444
## 8                               TXmet2013El Paso  0.6395462 0.1936699
## 9  TXmet2013Houston - The Woodlands - Sugar Land  1.2051111 0.1666624
## 10                               TXmet2013Laredo  0.4110962 0.2484515
## 11                              TXmet2013Lubbock -0.1202756 0.2246441
## 12         TXmet2013McAllen - Edinburg - Mission  0.5671799 0.2125579
## 13                              TXmet2013Midland  0.2600347 0.3067802
## 14                               TXmet2013Odessa  0.2291389 0.3319162
## 15                           TXmet2013San Angelo  0.8106906 0.3255402
## 16          TXmet2013San Antonio - New Braunfels  1.0427744 0.1741461
## 17                                TXmet2013Tyler  0.5974084 0.2492506
## 18                                 TXmet2013Waco  0.5251029 0.2294757
## 19                        TXmet2013Wichita Falls -0.4050950 0.2731307
##     statistic       p.value
## 1  24.1002949 3.651938e-124
## 2   5.2210060  1.821977e-07
## 3   2.7896198  5.288838e-03
## 4   0.7888505  4.302215e-01
## 5   1.3190787  1.871786e-01
## 6   1.5868082  1.125936e-01
## 7   6.3190862  2.764530e-10
## 8   3.3022478  9.631228e-04
## 9   7.2308525  5.219080e-13
## 10  1.6546336  9.803605e-02
## 11 -0.5354051  5.923839e-01
## 12  2.6683546  7.636956e-03
## 13  0.8476253  3.966708e-01
## 14  0.6903519  4.899919e-01
## 15  2.4902935  1.278286e-02
## 16  5.9879277  2.212265e-09
## 17  2.3968178  1.655978e-02
## 18  2.2882725  2.214628e-02
## 19 -1.4831544  1.380708e-01

Square Root Graph

# Normal Q-Q plot of the studentized residuals from the square-root model,
# with a reference line drawn in colour 2.
qqnorm(y = rstudent(myPlot2), main = "Q-Q Plot")
qqline(y = rstudent(myPlot2), col = 2)

Reciprocal Transformation

# Same model again, with the reciprocal of commute time as the response.
# I() makes the division plain arithmetic inside the formula.
myPlot3 <- lm(I(1 / trantime) ~ TXmet2013, data = myTXMetros)
broom::tidy(myPlot3)
##                                             term     estimate   std.error
## 1                                    (Intercept)  0.114731096 0.008114440
## 2                     TXmet2013Austin-Round Rock -0.047199567 0.008684471
## 3                TXmet2013Beaumont - Port Arthur -0.027517223 0.011213717
## 4               TXmet2013Brownsville - Harlingen -0.029813151 0.011548259
## 5               TXmet2013College Station - Bryan -0.030526691 0.012199803
## 6                        TXmet2013Corpus Christi -0.023422716 0.010251397
## 7       TXmet2013Dallas - Fort Worth - Arlington -0.047936517 0.008283495
## 8                               TXmet2013El Paso -0.043197705 0.009684989
## 9  TXmet2013Houston - The Woodlands - Sugar Land -0.051231649 0.008334403
## 10                               TXmet2013Laredo -0.042261031 0.012424489
## 11                              TXmet2013Lubbock -0.015041440 0.011233934
## 12         TXmet2013McAllen - Edinburg - Mission -0.039115915 0.010629534
## 13                              TXmet2013Midland -0.025465939 0.015341375
## 14                               TXmet2013Odessa -0.020480887 0.016598366
## 15                           TXmet2013San Angelo -0.041898073 0.016279517
## 16          TXmet2013San Antonio - New Braunfels -0.052562620 0.008708648
## 17                                TXmet2013Tyler -0.037161573 0.012464453
## 18                                 TXmet2013Waco -0.040610033 0.011475551
## 19                        TXmet2013Wichita Falls -0.009730824 0.013658638
##     statistic      p.value
## 1  14.1391263 7.079216e-45
## 2  -5.4349388 5.634515e-08
## 3  -2.4538895 1.415212e-02
## 4  -2.5816144 9.850657e-03
## 5  -2.5022282 1.236022e-02
## 6  -2.2848316 2.234742e-02
## 7  -5.7869919 7.421893e-09
## 8  -4.4602739 8.291528e-06
## 9  -6.1470087 8.254306e-10
## 10 -3.4014301 6.734553e-04
## 11 -1.3389290 1.806299e-01
## 12 -3.6799276 2.347544e-04
## 13 -1.6599515 9.696138e-02
## 14 -1.2339099 2.172709e-01
## 15 -2.5736681 1.007959e-02
## 16 -6.0356807 1.649776e-09
## 17 -2.9814043 2.877492e-03
## 18 -3.5388306 4.040670e-04
## 19 -0.7124301 4.762182e-01

Reciprocal Graph

# Normal Q-Q plot of the studentized residuals from the reciprocal model.
# The title names commute time (the modelled response), and the reference line
# is drawn so this plot is comparable with the other two Q-Q plots.
qqnorm(
  rstudent(myPlot3),
  main = "Q-Q Plot for Reciprocal Commute Times in Texas"
)
qqline(rstudent(myPlot3), col = 2)

Histogram

# Histogram of commute time, one panel per metro area (default 30 bins).
ggplot(myTXMetros) +
  geom_histogram(aes(trantime)) +
  facet_wrap(~TXmet2013) +
  ggtitle(label = "Histogram Texas Metros Commute Times")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4) ANOVA

The ANOVA itself:

# Analysis-of-variance table for the untransformed model (trantime ~ TXmet2013):
# tests whether mean commute time differs across metro areas.
anova(myPlot1)
## Analysis of Variance Table
## 
## Response: trantime
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## TXmet2013   18   88806  4933.7  10.458 < 2.2e-16 ***
## Residuals 8434 3978670   471.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

t-Test for lowest (Wichita Falls) and highest (Houston Area), finding a true difference:

# Pull out the two metros being compared, then run a two-sample t-test
# (t.test() defaults to the Welch unequal-variance form) on their commute times.
myWF <- filter(myTXMetros, TXmet2013 == "Wichita Falls")
myHou <- filter(
  myTXMetros,
  TXmet2013 == "Houston - The Woodlands - Sugar Land"
)

t.test(myHou$trantime, myWF$trantime)
## 
##  Welch Two Sample t-test
## 
## data:  myHou$trantime and myWF$trantime
## t = 12.329, df = 85.018, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  13.48803 18.67496
## sample estimates:
## mean of x mean of y 
##  29.52089  13.43939