Loading libraries and IPUMS data.
library(haven)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
# Read the IPUMS extract (Stata .dta) straight from the GitHub repository.
ipums <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
# Standard error of the mean of a numeric vector.
#
# x: numeric vector; NA values are ignored.
# Returns sd(x) / sqrt(n), where n is the number of non-missing values.
# (The earlier version divided by length(x), which is not a standard error
# and also counted NAs in the denominator while sd() dropped them.)
mySE <- function(x) {
  n_obs <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n_obs)
}
1 and 2) Clear up missing values for travel time and remove the population not in a metro area
# Keep adults (age > 17) with a positive travel time who live in an
# identified metro area (met2013 > 0). subset() drops rows where the
# combined condition is FALSE or NA, so one call with all three conditions
# selects the same rows as applying them one after another.
myPositiveTravel <- subset(
  ipums,
  age > 17 & trantime > 0 & met2013 > 0
)
2) Mutating the database to reduce the observations to Texas Metro Areas and labeling the areas
The Metro List:
Code Metro Area
11100 Amarillo
12420 Austin-Round Rock
13140 Beaumont - Port Arthur
15180 Brownsville - Harlingen
17780 College Station - Bryan
18580 Corpus Christi
19100 Dallas - Fort Worth - Arlington
21340 El Paso
26420 Houston - The Woodlands - Sugar Land
28660 Killeen - Temple
29700 Laredo
31180 Lubbock
32580 McAllen - Edinburg - Mission
33260 Midland
36220 Odessa
41660 San Angelo
41700 San Antonio - New Braunfels
46340 Tyler
47380 Waco
48660 Wichita Falls
# Restrict the sample to Texas (statefip 48) and add TXmet2013, a readable
# metro-area name derived from the met2013 code. Columns are referenced by
# bare name so that case_when() is evaluated through dplyr's data mask
# (the old `.$met2013` form bypassed it and would break on grouped data).
# Codes that match none of the conditions get NA (case_when's default).
myTXMetros <- myPositiveTravel %>%
  filter(statefip == 48) %>%
  mutate(
    TXmet2013 = case_when(
      met2013 == 11100 ~ "Amarillo",
      met2013 == 12420 ~ "Austin-Round Rock",
      met2013 == 13140 ~ "Beaumont - Port Arthur",
      met2013 == 15180 ~ "Brownsville - Harlingen",
      met2013 == 17780 ~ "College Station - Bryan",
      met2013 == 18580 ~ "Corpus Christi",
      met2013 == 19100 ~ "Dallas - Fort Worth - Arlington",
      met2013 == 21340 ~ "El Paso",
      met2013 == 26420 ~ "Houston - The Woodlands - Sugar Land",
      met2013 == 28660 ~ "Killeen - Temple",
      met2013 == 29700 ~ "Laredo",
      met2013 == 31180 ~ "Lubbock",
      met2013 == 32580 ~ "McAllen - Edinburg - Mission",
      met2013 == 33260 ~ "Midland",
      met2013 == 36220 ~ "Odessa",
      met2013 == 41660 ~ "San Angelo",
      met2013 == 41700 ~ "San Antonio - New Braunfels",
      met2013 == 46340 ~ "Tyler",
      met2013 == 47380 ~ "Waco",
      met2013 == 48660 ~ "Wichita Falls"
    )
  )
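The new database contains 14,863 observations. (Note: the ANOVA table in section 4 reports 18 + 8,434 degrees of freedom, which implies 8,453 observations in the fitted model, so this count should be re-checked.)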
3) Descriptive Statistics
General Summary:
# Per-metro mean, standard deviation and mySE() of commute time.
# The summaries are left unnamed, so the result columns are named after
# the expressions themselves.
myTXMetros %>%
  group_by(TXmet2013) %>%
  summarise(
    mean(trantime),
    sd(trantime),
    mySE(trantime)
  )
## # A tibble: 19 x 4
## TXmet2013 `mean(trantime)` `sd(trantime)`
## <chr> <dbl> <dbl>
## 1 Amarillo 18.42975 22.160335
## 2 Austin-Round Rock 25.89784 18.036476
## 3 Beaumont - Port Arthur 23.85714 22.312567
## 4 Brownsville - Harlingen 18.22034 10.647527
## 5 College Station - Bryan 21.43750 24.918314
## 6 Corpus Christi 21.36453 23.844486
## 7 Dallas - Fort Worth - Arlington 27.95233 22.545452
## 8 El Paso 24.07719 25.014806
## 9 Houston - The Woodlands - Sugar Land 29.52089 21.717294
## 10 Laredo 21.27778 20.361358
## 11 Lubbock 16.47727 17.345376
## 12 McAllen - Edinburg - Mission 23.05917 22.180994
## 13 Midland 20.23404 16.808658
## 14 Odessa 19.07895 12.101779
## 15 San Angelo 27.55000 33.576510
## 16 San Antonio - New Braunfels 27.54078 21.971427
## 17 Tyler 24.20225 27.622940
## 18 Waco 22.22314 18.555542
## 19 Wichita Falls 13.43939 9.907462
## # ... with 1 more variables: `mySE(trantime)` <dbl>
Summary using the Linear Model
# Linear model of commute time on metro area; tidy() prints the
# coefficient table as a data frame.
myPlot1 <- lm(
  formula = trantime ~ TXmet2013,
  data = myTXMetros
)
tidy(myPlot1)
## term estimate std.error
## 1 (Intercept) 18.4297521 1.974511
## 2 TXmet2013Austin-Round Rock 7.4680845 2.113218
## 3 TXmet2013Beaumont - Port Arthur 5.4273908 2.728667
## 4 TXmet2013Brownsville - Harlingen -0.2094131 2.810072
## 5 TXmet2013College Station - Bryan 3.0077479 2.968614
## 6 TXmet2013Corpus Christi 2.9347800 2.494503
## 7 TXmet2013Dallas - Fort Worth - Arlington 9.5225792 2.015647
## 8 TXmet2013El Paso 5.6474409 2.356677
## 9 TXmet2013Houston - The Woodlands - Sugar Land 11.0911380 2.028035
## 10 TXmet2013Laredo 2.8480257 3.023288
## 11 TXmet2013Lubbock -1.9524793 2.733586
## 12 TXmet2013McAllen - Edinburg - Mission 4.6294195 2.586516
## 13 TXmet2013Midland 1.8042905 3.733062
## 14 TXmet2013Odessa 0.6491953 4.038929
## 15 TXmet2013San Angelo 9.1202479 3.961343
## 16 TXmet2013San Antonio - New Braunfels 9.1110259 2.119101
## 17 TXmet2013Tyler 5.7724951 3.033012
## 18 TXmet2013Waco 3.7933884 2.792380
## 19 TXmet2013Wichita Falls -4.9903581 3.323597
## statistic p.value
## 1 9.33383222 1.283309e-20
## 2 3.53398687 4.115336e-04
## 3 1.98902642 4.673054e-02
## 4 -0.07452232 9.405965e-01
## 5 1.01318251 3.110021e-01
## 6 1.17649898 2.394287e-01
## 7 4.72432818 2.346104e-06
## 8 2.39635749 1.658058e-02
## 9 5.46890853 4.657666e-08
## 10 0.94202934 3.462046e-01
## 11 -0.71425559 4.750889e-01
## 12 1.78982838 7.351737e-02
## 13 0.48332721 6.288760e-01
## 14 0.16073450 8.723064e-01
## 15 2.30231212 2.134181e-02
## 16 4.29947667 1.731281e-05
## 17 1.90322191 5.704561e-02
## 18 1.35847864 1.743482e-01
## 19 -1.50149322 1.332655e-01
Box Plot
# Box plot of commute time for each Texas metro area. coord_flip() turns
# the boxes horizontal so the metro names are readable on the vertical
# axis. The title spelling ("Commute") is corrected here.
myTXMetros %>%
  ggplot(aes(x = TXmet2013, y = trantime)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Box Plot for Commute Time in Texas Metro Areas",
    x = "Metro Area",
    y = "Commute Time"
  )

Graph (Linear Model) of Commute Times in Texas Metros
# Normal Q-Q plot of the studentized residuals from myPlot1, with a
# reference line drawn in palette colour 2.
qqnorm(
  y = rstudent(myPlot1),
  main = "Q-Q Plot for Commute Times in Texas"
)
qqline(y = rstudent(myPlot1), col = 2)

Transformation using Square Root
# Same model as myPlot1, but with the response on the square-root scale;
# tidy() prints the coefficient table as a data frame.
myPlot2 <- lm(
  formula = sqrt(trantime) ~ TXmet2013,
  data = myTXMetros
)
tidy(myPlot2)
## term estimate std.error
## 1 (Intercept) 3.9106054 0.1622638
## 2 TXmet2013Austin-Round Rock 0.9066938 0.1736627
## 3 TXmet2013Beaumont - Port Arthur 0.6255438 0.2242398
## 4 TXmet2013Brownsville - Harlingen 0.1821689 0.2309296
## 5 TXmet2013College Station - Bryan 0.3218004 0.2439585
## 6 TXmet2013Corpus Christi 0.3252899 0.2049963
## 7 TXmet2013Dallas - Fort Worth - Arlington 1.0467210 0.1656444
## 8 TXmet2013El Paso 0.6395462 0.1936699
## 9 TXmet2013Houston - The Woodlands - Sugar Land 1.2051111 0.1666624
## 10 TXmet2013Laredo 0.4110962 0.2484515
## 11 TXmet2013Lubbock -0.1202756 0.2246441
## 12 TXmet2013McAllen - Edinburg - Mission 0.5671799 0.2125579
## 13 TXmet2013Midland 0.2600347 0.3067802
## 14 TXmet2013Odessa 0.2291389 0.3319162
## 15 TXmet2013San Angelo 0.8106906 0.3255402
## 16 TXmet2013San Antonio - New Braunfels 1.0427744 0.1741461
## 17 TXmet2013Tyler 0.5974084 0.2492506
## 18 TXmet2013Waco 0.5251029 0.2294757
## 19 TXmet2013Wichita Falls -0.4050950 0.2731307
## statistic p.value
## 1 24.1002949 3.651938e-124
## 2 5.2210060 1.821977e-07
## 3 2.7896198 5.288838e-03
## 4 0.7888505 4.302215e-01
## 5 1.3190787 1.871786e-01
## 6 1.5868082 1.125936e-01
## 7 6.3190862 2.764530e-10
## 8 3.3022478 9.631228e-04
## 9 7.2308525 5.219080e-13
## 10 1.6546336 9.803605e-02
## 11 -0.5354051 5.923839e-01
## 12 2.6683546 7.636956e-03
## 13 0.8476253 3.966708e-01
## 14 0.6903519 4.899919e-01
## 15 2.4902935 1.278286e-02
## 16 5.9879277 2.212265e-09
## 17 2.3968178 1.655978e-02
## 18 2.2882725 2.214628e-02
## 19 -1.4831544 1.380708e-01
Square Root Graph
# Normal Q-Q plot of the studentized residuals from the square-root
# model, with a reference line drawn in palette colour 2.
qqnorm(
  y = rstudent(myPlot2),
  main = "Q-Q Plot"
)
qqline(y = rstudent(myPlot2), col = 2)

Reciprocal Graph
# Normal Q-Q plot of the studentized residuals from the reciprocal model.
# NOTE(review): myPlot3 is not defined anywhere in the visible part of this
# file. It is fitted here only when it does not already exist, on the
# assumption that "reciprocal" means 1 / trantime, following the same
# pattern as the square-root model -- confirm against the original analysis.
if (!exists("myPlot3")) {
  myPlot3 <- lm(I(1 / trantime) ~ TXmet2013, data = myTXMetros)
}
# The title previously referred to "Family Size", which is not the variable
# analysed here; a reference line is added to match the other Q-Q plots.
qqnorm(
  rstudent(myPlot3),
  main = "Q-Q Plot for Reciprocal Commute Times in Texas"
)
qqline(rstudent(myPlot3), col = 2)

Histogram
# Histogram of commute time, one panel per Texas metro area. No bin
# settings are given, so geom_histogram() uses its defaults.
myTXMetros %>%
  ggplot(aes(x = trantime)) +
  geom_histogram() +
  facet_wrap(~TXmet2013) +
  ggtitle(label = "Histogram Texas Metros Commute Times")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4)ANOVA
The ANOVA itself:
# Analysis-of-variance table for the untransformed commute-time model.
anova(object = myPlot1)
## Analysis of Variance Table
##
## Response: trantime
## Df Sum Sq Mean Sq F value Pr(>F)
## TXmet2013 18 88806 4933.7 10.458 < 2.2e-16 ***
## Residuals 8434 3978670 471.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
t-Test for lowest (Wichita Falls) and highest (Houston Area), finding a true difference:
# Pull out the two metros being compared and run a two-sample t-test on
# their commute times (t.test() defaults to the Welch unequal-variance
# form).
myHou <- filter(
  myTXMetros,
  TXmet2013 == "Houston - The Woodlands - Sugar Land"
)
myWF <- filter(myTXMetros, TXmet2013 == "Wichita Falls")
t.test(myHou$trantime, myWF$trantime)
##
## Welch Two Sample t-test
##
## data: myHou$trantime and myWF$trantime
## t = 12.329, df = 85.018, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 13.48803 18.67496
## sample estimates:
## mean of x mean of y
## 29.52089 13.43939