Homework 6. ANOVA.

library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)

acs<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")

Filter the data to keep info from adults (older than 18 years old) in the labor force on Travel time to work for Texan Metropolitan areas.

acstt<-acs%>%
  filter(labforce==2, age>=18, statefip==48, met2013!=0, trantime>0)

Descriptive statistics and plots for Travel time to work.

acstt%>%
  mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
  mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
  group_by(metro, sext)%>%
  summarise(means=mean(trantime), sds=sd(trantime), n=n())
## # A tibble: 38 x 5
## # Groups:   metro [?]
##                    metro   sext    means       sds     n
##                    <chr>  <chr>    <dbl>     <dbl> <int>
##  1              Amarillo Female 14.79310  9.560075    58
##  2              Amarillo   Male 21.77778 29.029154    63
##  3     Austin-Round Rock Female 26.18421 18.708929   380
##  4     Austin-Round Rock   Male 25.65708 17.468365   452
##  5  Beaumont-Port Arthur Female 21.30159 21.247114    63
##  6  Beaumont-Port Arthur   Male 26.15714 23.139549    70
##  7 Brownsville-Harlingen Female 18.62264 10.381615    53
##  8 Brownsville-Harlingen   Male 17.89231 10.928922    65
##  9 College Station-Bryan Female 17.67442 13.276763    43
## 10 College Station-Bryan   Male 24.49057 31.157554    53
## # ... with 28 more rows
acstt%>%
  mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
  mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
  group_by(metro, sext)%>%
  ggplot(.)+ geom_boxplot(aes(metro,acstt$trantime))+facet_wrap(~sext) +ggtitle("Boxplot. Travel time to work", "Metropolitan areas in Texas")+xlab("Metropolitan area in Texas")+ylab("Travel time")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))

acstt%>%
  mutate(sext=ifelse(sex==1, "Male", "Female"))%>%
  ggplot(.)+geom_histogram(aes(acstt$trantime), bins =30, binwidth = 1.9)+ggtitle("Histogram. Travel time to work", "Metropolitan areas in Texas")+ facet_wrap(~sext)+xlab("Time")+ ylab("Frequency")+scale_x_discrete("Time", limits = seq(1,161, 10))+theme(axis.text.x = element_text(angle = 90, hjust = 1))

acstt%>%
   mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
  ggplot(.)+geom_histogram(aes(acstt$trantime), bins = 30, binwidth = 1.9)+ggtitle("Travel time to work", "Metropolitan areas in Texas")+ facet_wrap(~metro)+xlab("Time")+ ylab("Frequency")+scale_x_discrete("Time", limits = seq(1,160, 20))+theme(axis.text.x = element_text(angle = 90, hjust = 1))

ANOVA. Travel time to work ~ Metropolitan areas in Texas.

ttt<-acstt%>%
  mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
  lm(trantime~metro, data=.)
anova(ttt)
## Analysis of Variance Table
## 
## Response: trantime
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## metro       18   88806  4933.7  10.458 < 2.2e-16 ***
## Residuals 8434 3978670   471.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The F-statistic is 10.458. 

anova(ttt)
## Analysis of Variance Table
## 
## Response: trantime
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## metro       18   88806  4933.7  10.458 < 2.2e-16 ***
## Residuals 8434 3978670   471.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In the lineal model the reference metro area is Amarillo. 

summary(ttt)
## 
## Call:
## lm(formula = trantime ~ metro, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.521 -12.952  -4.521   6.143 142.523 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            18.4298     1.9745   9.334  < 2e-16
## metroAustin-Round Rock                  7.4681     2.1132   3.534 0.000412
## metroBeaumont-Port Arthur               5.4274     2.7287   1.989 0.046731
## metroBrownsville-Harlingen             -0.2094     2.8101  -0.075 0.940597
## metroCollege Station-Bryan              3.0077     2.9686   1.013 0.311002
## metroCorpus Christi                     2.9348     2.4945   1.176 0.239429
## metroDallas-Fort Worth-Arlington        9.5226     2.0156   4.724 2.35e-06
## metroEl Paso                            5.6474     2.3567   2.396 0.016581
## metroHouston-The Woodlands-Sugar Land  11.0911     2.0280   5.469 4.66e-08
## metroLaredo                             2.8480     3.0233   0.942 0.346205
## metroLubbock                           -1.9525     2.7336  -0.714 0.475089
## metroMcAllen-Edinburg-Mission           4.6294     2.5865   1.790 0.073517
## metroMidland                            1.8043     3.7331   0.483 0.628876
## metroOdessa                             0.6492     4.0389   0.161 0.872306
## metroSan Angelo                         9.1202     3.9613   2.302 0.021342
## metroSan Antonio-New Braunfels          9.1110     2.1191   4.299 1.73e-05
## metroTyler                              5.7725     3.0330   1.903 0.057046
## metroWaco                               3.7934     2.7924   1.358 0.174348
## metroWichita Falls                     -4.9904     3.3236  -1.501 0.133265
##                                          
## (Intercept)                           ***
## metroAustin-Round Rock                ***
## metroBeaumont-Port Arthur             *  
## metroBrownsville-Harlingen               
## metroCollege Station-Bryan               
## metroCorpus Christi                      
## metroDallas-Fort Worth-Arlington      ***
## metroEl Paso                          *  
## metroHouston-The Woodlands-Sugar Land ***
## metroLaredo                              
## metroLubbock                             
## metroMcAllen-Edinburg-Mission         .  
## metroMidland                             
## metroOdessa                              
## metroSan Angelo                       *  
## metroSan Antonio-New Braunfels        ***
## metroTyler                            .  
## metroWaco                                
## metroWichita Falls                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.72 on 8434 degrees of freedom
## Multiple R-squared:  0.02183,    Adjusted R-squared:  0.01975 
## F-statistic: 10.46 on 18 and 8434 DF,  p-value: < 2.2e-16
# (I try with tidy but I couldn't find why R cannot find the function)
# Amarillo metro area is the reference group. According to our table, Houston-The Woodlands-Sugar Land would be the metro area with the longest travel time to work (29.5 minutes on average); while Wichita Falls  would be the metro area with the shortest travel time to work (13 minutos on average).
acstt%>%
  mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
  lm(trantime~metro, data=.)
## 
## Call:
## lm(formula = trantime ~ metro, data = .)
## 
## Coefficients:
##                           (Intercept)  
##                               18.4298  
##                metroAustin-Round Rock  
##                                7.4681  
##             metroBeaumont-Port Arthur  
##                                5.4274  
##            metroBrownsville-Harlingen  
##                               -0.2094  
##            metroCollege Station-Bryan  
##                                3.0077  
##                   metroCorpus Christi  
##                                2.9348  
##      metroDallas-Fort Worth-Arlington  
##                                9.5226  
##                          metroEl Paso  
##                                5.6474  
## metroHouston-The Woodlands-Sugar Land  
##                               11.0911  
##                           metroLaredo  
##                                2.8480  
##                          metroLubbock  
##                               -1.9525  
##         metroMcAllen-Edinburg-Mission  
##                                4.6294  
##                          metroMidland  
##                                1.8043  
##                           metroOdessa  
##                                0.6492  
##                       metroSan Angelo  
##                                9.1202  
##        metroSan Antonio-New Braunfels  
##                                9.1110  
##                            metroTyler  
##                                5.7725  
##                             metroWaco  
##                                3.7934  
##                    metroWichita Falls  
##                               -4.9904
pairwise.t.test(x = acstt$trantime, g = acstt$met2013, p.adjust.method = "bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  acstt$trantime and acstt$met2013 
## 
##       11100   12420   13140   15180   17780   18580   19100   21340  
## 12420 0.07037 -       -       -       -       -       -       -      
## 13140 1.00000 1.00000 -       -       -       -       -       -      
## 15180 1.00000 0.05613 1.00000 -       -       -       -       -      
## 17780 1.00000 1.00000 1.00000 1.00000 -       -       -       -      
## 18580 1.00000 1.00000 1.00000 1.00000 1.00000 -       -       -      
## 19100 0.00040 1.00000 1.00000 0.00032 0.65830 0.00511 -       -      
## 21340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.69715 -      
## 26420 8.0e-06 0.00716 0.59936 6.5e-06 0.06149 5.3e-05 1.00000 0.01181
## 29700 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.70198 1.00000
## 31180 1.00000 0.00064 0.97376 1.00000 1.00000 1.00000 5.2e-07 0.15265
## 32580 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.75829 1.00000
## 33260 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 36220 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41660 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 41700 0.00296 1.00000 1.00000 0.00235 1.00000 0.05126 1.00000 1.00000
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.76760 1.00000
## 48660 1.00000 0.00126 0.24791 1.00000 1.00000 1.00000 1.4e-05 0.05787
##       26420   29700   31180   32580   33260   36220   41660   41700  
## 12420 -       -       -       -       -       -       -       -      
## 13140 -       -       -       -       -       -       -       -      
## 15180 -       -       -       -       -       -       -       -      
## 17780 -       -       -       -       -       -       -       -      
## 18580 -       -       -       -       -       -       -       -      
## 19100 -       -       -       -       -       -       -       -      
## 21340 -       -       -       -       -       -       -       -      
## 26420 -       -       -       -       -       -       -       -      
## 29700 0.07169 -       -       -       -       -       -       -      
## 31180 3.7e-09 1.00000 -       -       -       -       -       -      
## 32580 0.03333 1.00000 1.00000 -       -       -       -       -      
## 33260 0.63865 1.00000 1.00000 1.00000 -       -       -       -      
## 36220 0.56574 1.00000 1.00000 1.00000 1.00000 -       -       -      
## 41660 1.00000 1.00000 0.81144 1.00000 1.00000 1.00000 -       -      
## 41700 1.00000 1.00000 1.0e-05 1.00000 1.00000 1.00000 1.00000 -      
## 46340 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 47380 0.05506 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## 48660 5.5e-07 1.00000 1.00000 0.39075 1.00000 1.00000 0.20360 7.0e-05
##       46340   47380  
## 12420 -       -      
## 13140 -       -      
## 15180 -       -      
## 17780 -       -      
## 18580 -       -      
## 19100 -       -      
## 21340 -       -      
## 26420 -       -      
## 29700 -       -      
## 31180 -       -      
## 32580 -       -      
## 33260 -       -      
## 36220 -       -      
## 41660 -       -      
## 41700 -       -      
## 46340 -       -      
## 47380 1.00000 -      
## 48660 0.39183 1.00000
## 
## P value adjustment method: bonferroni
qqnorm(rstudent(ttt), main="Q-Q Plot for Residuals")
qqline(rstudent(ttt), col=2)

Log-Transformation for travel time to work.

tttlog<-acstt%>%
  mutate(metro=case_when(met2013==11100 ~ "Amarillo", met2013==12420 ~ "Austin-Round Rock", met2013==13140 ~ "Beaumont-Port Arthur", met2013==15180 ~ "Brownsville-Harlingen", met2013==17780 ~ "College Station-Bryan", met2013==18580 ~ "Corpus Christi", met2013==19100 ~ "Dallas-Fort Worth-Arlington", met2013==21340 ~ "El Paso", met2013==26420 ~ "Houston-The Woodlands-Sugar Land", met2013==28660 ~ "Killeen-Temple", met2013==29700 ~ "Laredo", met2013==31180 ~ "Lubbock", met2013==32580 ~ "McAllen-Edinburg-Mission", met2013==33260 ~ "Midland", met2013==36220 ~ "Odessa", met2013==41660 ~ "San Angelo", met2013==41700 ~ "San Antonio-New Braunfels", met2013==46340 ~ "Tyler", met2013==47380 ~ "Waco", met2013==48660 ~ "Wichita Falls"))%>%
lm(log(trantime)~metro, data=.)
qqnorm(rstudent(tttlog), main="Q-Q Plot for Model Residuals")
qqline(rstudent(tttlog), col=2)