Homework 6. ANOVA.
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
acs<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
Filter the data to keep info from adults (older than 18 years old) in the labor force on Travel time to work for Texan Metropolitan areas.
acstt<-acs%>%
filter(labforce==2, age>=18, statefip==48, met2013!=0, trantime>0)
Descriptive statistics and plots for Travel time to work.
acstt%>%
mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
group_by(metro, sext)%>%
summarise(means=mean(trantime), sds=sd(trantime), n=n())
## # A tibble: 38 x 5
## # Groups: metro [?]
## metro sext means sds n
## <chr> <chr> <dbl> <dbl> <int>
## 1 Amarillo Female 14.79310 9.560075 58
## 2 Amarillo Male 21.77778 29.029154 63
## 3 Austin-Round Rock Female 26.18421 18.708929 380
## 4 Austin-Round Rock Male 25.65708 17.468365 452
## 5 Beaumont-Port Arthur Female 21.30159 21.247114 63
## 6 Beaumont-Port Arthur Male 26.15714 23.139549 70
## 7 Brownsville-Harlingen Female 18.62264 10.381615 53
## 8 Brownsville-Harlingen Male 17.89231 10.928922 65
## 9 College Station-Bryan Female 17.67442 13.276763 43
## 10 College Station-Bryan Male 24.49057 31.157554 53
## # ... with 28 more rows
acstt%>%
mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
group_by(metro, sext)%>%
ggplot(.)+ geom_boxplot(aes(metro,acstt$trantime))+facet_wrap(~sext) +ggtitle("Boxplot. Travel time to work", "Metropolitan areas in Texas")+xlab("Metropolitan area in Texas")+ylab("Travel time")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))

acstt%>%
mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
ggplot(.)+geom_histogram(aes(acstt$trantime), bins =30, binwidth = 1.9)+ggtitle("Histogram. Travel time to work", "Metropolitan areas in Texas")+ facet_wrap(~sext)+xlab("Time")+ ylab("Frequency")+scale_x_discrete("Time", limits = seq(1,161, 10))+theme(axis.text.x = element_text(angle = 90, hjust = 1))

acstt%>%
mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
ggplot(.)+geom_histogram(aes(acstt$trantime), bins = 30, binwidth = 1.9)+ggtitle("Travel time to work", "Metropolitan areas in Texas")+ facet_wrap(~metro)+xlab("Time")+ ylab("Frequency")+scale_x_discrete("Time", limits = seq(1,160, 20))+theme(axis.text.x = element_text(angle = 90, hjust = 1))

ANOVA. Travel time to work ~ Metropolitan areas in Texas.
ttt<-acstt%>%
mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
lm(trantime~metro, data=.)
anova(ttt)
## Analysis of Variance Table
##
## Response: trantime
## Df Sum Sq Mean Sq F value Pr(>F)
## metro 18 88806 4933.7 10.458 < 2.2e-16 ***
## Residuals 8434 3978670 471.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The F-statistic is 10.458.
anova(ttt)
## Analysis of Variance Table
##
## Response: trantime
## Df Sum Sq Mean Sq F value Pr(>F)
## metro 18 88806 4933.7 10.458 < 2.2e-16 ***
## Residuals 8434 3978670 471.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In the lineal model the reference metro area is Amarillo.
summary(ttt)
##
## Call:
## lm(formula = trantime ~ metro, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.521 -12.952 -4.521 6.143 142.523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.4298 1.9745 9.334 < 2e-16
## metroAustin-Round Rock 7.4681 2.1132 3.534 0.000412
## metroBeaumont-Port Arthur 5.4274 2.7287 1.989 0.046731
## metroBrownsville-Harlingen -0.2094 2.8101 -0.075 0.940597
## metroCollege Station-Bryan 3.0077 2.9686 1.013 0.311002
## metroCorpus Christi 2.9348 2.4945 1.176 0.239429
## metroDallas-Fort Worth-Arlington 9.5226 2.0156 4.724 2.35e-06
## metroEl Paso 5.6474 2.3567 2.396 0.016581
## metroHouston-The Woodlands-Sugar Land 11.0911 2.0280 5.469 4.66e-08
## metroLaredo 2.8480 3.0233 0.942 0.346205
## metroLubbock -1.9525 2.7336 -0.714 0.475089
## metroMcAllen-Edinburg-Mission 4.6294 2.5865 1.790 0.073517
## metroMidland 1.8043 3.7331 0.483 0.628876
## metroOdessa 0.6492 4.0389 0.161 0.872306
## metroSan Angelo 9.1202 3.9613 2.302 0.021342
## metroSan Antonio-New Braunfels 9.1110 2.1191 4.299 1.73e-05
## metroTyler 5.7725 3.0330 1.903 0.057046
## metroWaco 3.7934 2.7924 1.358 0.174348
## metroWichita Falls -4.9904 3.3236 -1.501 0.133265
##
## (Intercept) ***
## metroAustin-Round Rock ***
## metroBeaumont-Port Arthur *
## metroBrownsville-Harlingen
## metroCollege Station-Bryan
## metroCorpus Christi
## metroDallas-Fort Worth-Arlington ***
## metroEl Paso *
## metroHouston-The Woodlands-Sugar Land ***
## metroLaredo
## metroLubbock
## metroMcAllen-Edinburg-Mission .
## metroMidland
## metroOdessa
## metroSan Angelo *
## metroSan Antonio-New Braunfels ***
## metroTyler .
## metroWaco
## metroWichita Falls
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.72 on 8434 degrees of freedom
## Multiple R-squared: 0.02183, Adjusted R-squared: 0.01975
## F-statistic: 10.46 on 18 and 8434 DF, p-value: < 2.2e-16
# (I try with tidy but I couldn't find why R cannot find the function)
# Amarillo metro area is the reference group. According to our table, Houston-The Woodlands-Sugar Land would be the metro area with the longest travel time to work (29.5 minutes on average); while Wichita Falls would be the metro area with the shortest travel time to work (13 minutos on average).
acstt%>%
mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
lm(trantime~metro, data=.)
##
## Call:
## lm(formula = trantime ~ metro, data = .)
##
## Coefficients:
## (Intercept)
## 18.4298
## metroAustin-Round Rock
## 7.4681
## metroBeaumont-Port Arthur
## 5.4274
## metroBrownsville-Harlingen
## -0.2094
## metroCollege Station-Bryan
## 3.0077
## metroCorpus Christi
## 2.9348
## metroDallas-Fort Worth-Arlington
## 9.5226
## metroEl Paso
## 5.6474
## metroHouston-The Woodlands-Sugar Land
## 11.0911
## metroLaredo
## 2.8480
## metroLubbock
## -1.9525
## metroMcAllen-Edinburg-Mission
## 4.6294
## metroMidland
## 1.8043
## metroOdessa
## 0.6492
## metroSan Angelo
## 9.1202
## metroSan Antonio-New Braunfels
## 9.1110
## metroTyler
## 5.7725
## metroWaco
## 3.7934
## metroWichita Falls
## -4.9904
pairwise.t.test(x = acstt$trantime, g = acstt$met2013, p.adjust.method = "bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: acstt$trantime and acstt$met2013
##
## 11100 12420 13140 15180 17780 18580 19100 21340
## 12420 0.07037 - - - - - - -
## 13140 1.00000 1.00000 - - - - - -
## 15180 1.00000 0.05613 1.00000 - - - - -
## 17780 1.00000 1.00000 1.00000 1.00000 - - - -
## 18580 1.00000 1.00000 1.00000 1.00000 1.00000 - - -
## 19100 0.00040 1.00000 1.00000 0.00032 0.65830 0.00511 - -
## 21340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.69715 -
## 26420 8.0e-06 0.00716 0.59936 6.5e-06 0.06149 5.3e-05 1.00000 0.01181
## 29700 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.70198 1.00000
## 31180 1.00000 0.00064 0.97376 1.00000 1.00000 1.00000 5.2e-07 0.15265
## 32580 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.75829 1.00000
## 33260 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 36220 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41660 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41700 0.00296 1.00000 1.00000 0.00235 1.00000 0.05126 1.00000 1.00000
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.76760 1.00000
## 48660 1.00000 0.00126 0.24791 1.00000 1.00000 1.00000 1.4e-05 0.05787
## 26420 29700 31180 32580 33260 36220 41660 41700
## 12420 - - - - - - - -
## 13140 - - - - - - - -
## 15180 - - - - - - - -
## 17780 - - - - - - - -
## 18580 - - - - - - - -
## 19100 - - - - - - - -
## 21340 - - - - - - - -
## 26420 - - - - - - - -
## 29700 0.07169 - - - - - - -
## 31180 3.7e-09 1.00000 - - - - - -
## 32580 0.03333 1.00000 1.00000 - - - - -
## 33260 0.63865 1.00000 1.00000 1.00000 - - - -
## 36220 0.56574 1.00000 1.00000 1.00000 1.00000 - - -
## 41660 1.00000 1.00000 0.81144 1.00000 1.00000 1.00000 - -
## 41700 1.00000 1.00000 1.0e-05 1.00000 1.00000 1.00000 1.00000 -
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 0.05506 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 48660 5.5e-07 1.00000 1.00000 0.39075 1.00000 1.00000 0.20360 7.0e-05
## 46340 47380
## 12420 - -
## 13140 - -
## 15180 - -
## 17780 - -
## 18580 - -
## 19100 - -
## 21340 - -
## 26420 - -
## 29700 - -
## 31180 - -
## 32580 - -
## 33260 - -
## 36220 - -
## 41660 - -
## 41700 - -
## 46340 - -
## 47380 1.00000 -
## 48660 0.39183 1.00000
##
## P value adjustment method: bonferroni
qqnorm(rstudent(ttt), main="Q-Q Plot for Residuals")
qqline(rstudent(ttt), col=2)
