library(haven)
# Read the Stata-format extract straight from the GitHub URL into `ipums`
ipums <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# Print the column names of the extract
print(names(ipums))
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"
library(broom)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

In this analysis, we focus on the population of Texas aged 18 and over who live in metro areas and participate in the labor force.

# Build the analysis sample: Texas residents aged 18 and over who are in the
# labor force (labforce == 2) and live in an identified metro area
# (met2013 >= 1), plus a readable metro-area label in `livingarea`.
dragonipums <- ipums %>%
  # A travel time of 0 is recoded to NA so it does not enter the means below
  mutate(trantime = ifelse(trantime %in% 0, NA, trantime)) %>%
  # statefip 48 is the FIPS code for Texas; without it the sample also keeps
  # every other state's metro residents, who all land in the NA label group
  filter(statefip == 48, labforce == 2, age >= 18, met2013 >= 1) %>%
  # Map met2013 codes to metro-area names; any code not listed becomes NA.
  # Columns are referenced directly rather than through the `.$` pronoun.
  mutate(
    livingarea = case_when(
      met2013 == 11100 ~ "Amarillo",
      met2013 == 12420 ~ "Austin-Round Rock",
      met2013 == 13140 ~ "Beaumont-Port Arthur",
      met2013 == 15180 ~ "Brownsville-Harlingen",
      met2013 == 17780 ~ "College Station-Bryan",
      met2013 == 18580 ~ "Corpus Christi",
      met2013 == 19100 ~ "Dallas-Fort Worth-Arlington",
      met2013 == 21340 ~ "El Paso",
      met2013 == 26420 ~ "Houston-The Woodland-Sugarland",
      met2013 == 28660 ~ "Killeen-Temple",
      met2013 == 29700 ~ "Laredo",
      met2013 == 31180 ~ "Lubbock",
      met2013 == 32580 ~ "McCallen-Edinburg-Mission",
      # Midland, TX is CBSA 33260 (this line previously tested 32260)
      met2013 == 33260 ~ "Midland",
      met2013 == 36220 ~ "Odessa",
      met2013 == 41660 ~ "San Angelo",
      met2013 == 41700 ~ "San Antonio-New Braunfels",
      met2013 == 46340 ~ "Tyler",
      met2013 == 47380 ~ "Waco",
      met2013 == 48660 ~ "Wichita Falls"
    )
  )

We review some of the descriptive statistics for the metro areas in Texas.

# Mean, standard deviation and row count of commute time for each metro area.
# Missing travel times are dropped from the mean and sd; n counts every row
# in the group, including rows whose travel time is NA.
dragonipums %>%
  group_by(livingarea) %>%
  summarise(
    meantime = mean(trantime, na.rm = TRUE),
    sds = sd(trantime, na.rm = TRUE),
    n = n()
  )
## # A tibble: 19 x 4
##                        livingarea meantime       sds     n
##                             <chr>    <dbl>     <dbl> <int>
##  1                       Amarillo 18.42975 22.160335   133
##  2              Austin-Round Rock 25.89784 18.036476   945
##  3           Beaumont-Port Arthur 23.85714 22.312567   151
##  4          Brownsville-Harlingen 18.22034 10.647527   130
##  5          College Station-Bryan 21.43750 24.918314   101
##  6                 Corpus Christi 21.36453 23.844486   224
##  7    Dallas-Fort Worth-Arlington 27.95233 22.545452  3217
##  8                        El Paso 24.07719 25.014806   326
##  9 Houston-The Woodland-Sugarland 29.52089 21.717294  2456
## 10                         Laredo 21.27778 20.361358    99
## 11                        Lubbock 16.47727 17.345376   136
## 12      McCallen-Edinburg-Mission 23.05917 22.180994   201
## 13                         Odessa 19.07895 12.101779    42
## 14                     San Angelo 27.55000 33.576510    44
## 15      San Antonio-New Braunfels 27.54078 21.971427   903
## 16                          Tyler 24.20225 27.622940    99
## 17                           Waco 22.22314 18.555542   134
## 18                  Wichita Falls 13.43939  9.907462    74
## 19                           <NA> 28.06333 22.823387 99649

We look at the distribution of commute time by metro area using different graphs:

# Horizontal boxplots of commute time, one box per metro area
ggplot(data = dragonipums, mapping = aes(x = livingarea, y = trantime)) +
  geom_boxplot() +
  coord_flip()
## Warning: Removed 13516 rows containing non-finite values (stat_boxplot).

# Histogram of commute time with one facet per metro area; bar height is the
# bin density multiplied by 0.5
ggplot(data = dragonipums, mapping = aes(x = trantime, fill = livingarea)) +
  geom_histogram(mapping = aes(y = 0.5 * ..density..)) +
  facet_wrap(~livingarea) +
  ggtitle(label = "Commute Time by Texas Metro Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13516 rows containing non-finite values (stat_bin).

We proceed with a one-way ANOVA. The results show significant variation in mean commute time across metro areas, with an F value of 10.8 and a p-value below 2.2e-16.

# Linear model of commute time on metro area, followed by its ANOVA table
trantime_fit <- lm(trantime ~ livingarea, data = dragonipums)
anova(trantime_fit)
## Analysis of Variance Table
## 
## Response: trantime
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## livingarea   17   86798  5105.8  10.799 < 2.2e-16 ***
## Residuals  8388 3965674   472.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# All pairwise metro-area comparisons of commute time, Bonferroni-adjusted
pairwise.t.test(
  x = dragonipums$trantime,
  g = dragonipums$livingarea,
  p.adjust.method = "bonferroni"
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  dragonipums$trantime and dragonipums$livingarea 
## 
##                                Amarillo Austin-Round Rock
## Austin-Round Rock              0.06390  -                
## Beaumont-Port Arthur           1.00000  1.00000          
## Brownsville-Harlingen          1.00000  0.05098          
## College Station-Bryan          1.00000  1.00000          
## Corpus Christi                 1.00000  1.00000          
## Dallas-Fort Worth-Arlington    0.00037  1.00000          
## El Paso                        1.00000  1.00000          
## Houston-The Woodland-Sugarland 7.4e-06  0.00653          
## Laredo                         1.00000  1.00000          
## Lubbock                        1.00000  0.00058          
## McCallen-Edinburg-Mission      1.00000  1.00000          
## Odessa                         1.00000  1.00000          
## San Angelo                     1.00000  1.00000          
## San Antonio-New Braunfels      0.00271  1.00000          
## Tyler                          1.00000  1.00000          
## Waco                           1.00000  1.00000          
## Wichita Falls                  1.00000  0.00115          
##                                Beaumont-Port Arthur Brownsville-Harlingen
## Austin-Round Rock              -                    -                    
## Beaumont-Port Arthur           -                    -                    
## Brownsville-Harlingen          1.00000              -                    
## College Station-Bryan          1.00000              1.00000              
## Corpus Christi                 1.00000              1.00000              
## Dallas-Fort Worth-Arlington    1.00000              0.00029              
## El Paso                        1.00000              1.00000              
## Houston-The Woodland-Sugarland 0.54182              6.0e-06              
## Laredo                         1.00000              1.00000              
## Lubbock                        0.87941              1.00000              
## McCallen-Edinburg-Mission      1.00000              1.00000              
## Odessa                         1.00000              1.00000              
## San Angelo                     1.00000              1.00000              
## San Antonio-New Braunfels      1.00000              0.00215              
## Tyler                          1.00000              1.00000              
## Waco                           1.00000              1.00000              
## Wichita Falls                  0.22451              1.00000              
##                                College Station-Bryan Corpus Christi
## Austin-Round Rock              -                     -             
## Beaumont-Port Arthur           -                     -             
## Brownsville-Harlingen          -                     -             
## College Station-Bryan          -                     -             
## Corpus Christi                 1.00000               -             
## Dallas-Fort Worth-Arlington    0.59498               0.00467       
## El Paso                        1.00000               1.00000       
## Houston-The Woodland-Sugarland 0.05585               4.9e-05       
## Laredo                         1.00000               1.00000       
## Lubbock                        1.00000               1.00000       
## McCallen-Edinburg-Mission      1.00000               1.00000       
## Odessa                         1.00000               1.00000       
## San Angelo                     1.00000               1.00000       
## San Antonio-New Braunfels      1.00000               0.04657       
## Tyler                          1.00000               1.00000       
## Waco                           1.00000               1.00000       
## Wichita Falls                  1.00000               1.00000       
##                                Dallas-Fort Worth-Arlington El Paso
## Austin-Round Rock              -                           -      
## Beaumont-Port Arthur           -                           -      
## Brownsville-Harlingen          -                           -      
## College Station-Bryan          -                           -      
## Corpus Christi                 -                           -      
## Dallas-Fort Worth-Arlington    -                           -      
## El Paso                        0.63003                     -      
## Houston-The Woodland-Sugarland 1.00000                     0.01077
## Laredo                         0.63438                     1.00000
## Lubbock                        4.9e-07                     0.13838
## McCallen-Edinburg-Mission      0.68516                     1.00000
## Odessa                         1.00000                     1.00000
## San Angelo                     1.00000                     1.00000
## San Antonio-New Braunfels      1.00000                     1.00000
## Tyler                          1.00000                     1.00000
## Waco                           0.69355                     1.00000
## Wichita Falls                  1.3e-05                     0.05257
##                                Houston-The Woodland-Sugarland Laredo 
## Austin-Round Rock              -                              -      
## Beaumont-Port Arthur           -                              -      
## Brownsville-Harlingen          -                              -      
## College Station-Bryan          -                              -      
## Corpus Christi                 -                              -      
## Dallas-Fort Worth-Arlington    -                              -      
## El Paso                        -                              -      
## Houston-The Woodland-Sugarland -                              -      
## Laredo                         0.06509                        -      
## Lubbock                        3.5e-09                        1.00000
## McCallen-Edinburg-Mission      0.03031                        1.00000
## Odessa                         0.51149                        1.00000
## San Angelo                     1.00000                        1.00000
## San Antonio-New Braunfels      1.00000                        1.00000
## Tyler                          1.00000                        1.00000
## Waco                           0.05002                        1.00000
## Wichita Falls                  5.1e-07                        1.00000
##                                Lubbock McCallen-Edinburg-Mission Odessa 
## Austin-Round Rock              -       -                         -      
## Beaumont-Port Arthur           -       -                         -      
## Brownsville-Harlingen          -       -                         -      
## College Station-Bryan          -       -                         -      
## Corpus Christi                 -       -                         -      
## Dallas-Fort Worth-Arlington    -       -                         -      
## El Paso                        -       -                         -      
## Houston-The Woodland-Sugarland -       -                         -      
## Laredo                         -       -                         -      
## Lubbock                        -       -                         -      
## McCallen-Edinburg-Mission      1.00000 -                         -      
## Odessa                         1.00000 1.00000                   -      
## San Angelo                     0.73308 1.00000                   1.00000
## San Antonio-New Braunfels      9.7e-06 1.00000                   1.00000
## Tyler                          1.00000 1.00000                   1.00000
## Waco                           1.00000 1.00000                   1.00000
## Wichita Falls                  1.00000 0.35355                   1.00000
##                                San Angelo San Antonio-New Braunfels
## Austin-Round Rock              -          -                        
## Beaumont-Port Arthur           -          -                        
## Brownsville-Harlingen          -          -                        
## College Station-Bryan          -          -                        
## Corpus Christi                 -          -                        
## Dallas-Fort Worth-Arlington    -          -                        
## El Paso                        -          -                        
## Houston-The Woodland-Sugarland -          -                        
## Laredo                         -          -                        
## Lubbock                        -          -                        
## McCallen-Edinburg-Mission      -          -                        
## Odessa                         -          -                        
## San Angelo                     -          -                        
## San Antonio-New Braunfels      1.00000    -                        
## Tyler                          1.00000    1.00000                  
## Waco                           1.00000    1.00000                  
## Wichita Falls                  0.18446    6.4e-05                  
##                                Tyler   Waco   
## Austin-Round Rock              -       -      
## Beaumont-Port Arthur           -       -      
## Brownsville-Harlingen          -       -      
## College Station-Bryan          -       -      
## Corpus Christi                 -       -      
## Dallas-Fort Worth-Arlington    -       -      
## El Paso                        -       -      
## Houston-The Woodland-Sugarland -       -      
## Laredo                         -       -      
## Lubbock                        -       -      
## McCallen-Edinburg-Mission      -       -      
## Odessa                         -       -      
## San Angelo                     -       -      
## San Antonio-New Braunfels      -       -      
## Tyler                          -       -      
## Waco                           1.00000 -      
## Wichita Falls                  0.35452 1.00000
## 
## P value adjustment method: bonferroni
# Coefficient table for the metro-area model as a data frame
trantime_fit %>%
  tidy()
##                                        term   estimate std.error
## 1                               (Intercept) 18.4297521  1.976681
## 2               livingareaAustin-Round Rock  7.4680845  2.115541
## 3            livingareaBeaumont-Port Arthur  5.4273908  2.731666
## 4           livingareaBrownsville-Harlingen -0.2094131  2.813161
## 5           livingareaCollege Station-Bryan  3.0077479  2.971877
## 6                  livingareaCorpus Christi  2.9347800  2.497245
## 7     livingareaDallas-Fort Worth-Arlington  9.5225792  2.017863
## 8                         livingareaEl Paso  5.6474409  2.359268
## 9  livingareaHouston-The Woodland-Sugarland 11.0911380  2.030264
## 10                         livingareaLaredo  2.8480257  3.026611
## 11                        livingareaLubbock -1.9524793  2.736591
## 12      livingareaMcCallen-Edinburg-Mission  4.6294195  2.589359
## 13                         livingareaOdessa  0.6491953  4.043369
## 14                     livingareaSan Angelo  9.1202479  3.965697
## 15      livingareaSan Antonio-New Braunfels  9.1110259  2.121430
## 16                          livingareaTyler  5.7724951  3.036346
## 17                           livingareaWaco  3.7933884  2.795449
## 18                  livingareaWichita Falls -4.9903581  3.327250
##      statistic      p.value
## 1   9.32358384 1.413929e-20
## 2   3.53010662 4.176198e-04
## 3   1.98684251 4.697233e-02
## 4  -0.07444049 9.406617e-01
## 5   1.01207005 3.115338e-01
## 6   1.17520721 2.399452e-01
## 7   4.71914095 2.406770e-06
## 8   2.39372633 1.670002e-02
## 9   5.46290377 4.818221e-08
## 10  0.94099501 3.467345e-01
## 11 -0.71347135 4.755740e-01
## 12  1.78786318 7.383417e-02
## 13  0.16055802 8.724454e-01
## 14  2.29978422 2.148489e-02
## 15  4.29475592 1.768559e-05
## 16  1.90113220 5.731894e-02
## 17  1.35698706 1.748218e-01
## 18 -1.49984461 1.336923e-01

We compare the rest of the metro areas in Texas using San Antonio as reference

# Make San Antonio-New Braunfels the reference level of the metro-area factor,
# refit the model, and show its coefficient table
dragonipums <- dragonipums %>%
  mutate(
    livingarea2 = relevel(
      as.factor(livingarea),
      ref = "San Antonio-New Braunfels"
    )
  )
trantime_fit2 <- lm(formula = trantime ~ livingarea2, data = dragonipums)
trantime_fit2 %>%
  tidy()
##                                         term      estimate std.error
## 1                                (Intercept)  27.540777917 0.7701940
## 2                        livingarea2Amarillo  -9.111025851 2.1214304
## 3               livingarea2Austin-Round Rock  -1.642941379 1.0777028
## 4            livingarea2Beaumont-Port Arthur  -3.683635060 2.0366471
## 5           livingarea2Brownsville-Harlingen  -9.320438934 2.1447154
## 6           livingarea2College Station-Bryan  -6.103277917 2.3490391
## 7                  livingarea2Corpus Christi  -6.176245897 1.7094331
## 8     livingarea2Dallas-Fort Worth-Arlington   0.411553328 0.8704603
## 9                         livingarea2El Paso  -3.463584935 1.5006912
## 10 livingarea2Houston-The Woodland-Sugarland   1.980112183 0.8988345
## 11                         livingarea2Laredo  -6.263000139 2.4179131
## 12                        livingarea2Lubbock -11.063505190 2.0432477
## 13      livingarea2McCallen-Edinburg-Mission  -4.481606320 1.8413882
## 14                         livingarea2Odessa  -8.461830549 3.6103689
## 15                     livingarea2San Angelo   0.009222083 3.5231641
## 16                          livingarea2Tyler  -3.338530726 2.4300880
## 17                           livingarea2Waco  -5.317637421 2.1214304
## 18                  livingarea2Wichita Falls -14.101383978 2.7850537
##       statistic       p.value
## 1  35.758234360 9.008903e-261
## 2  -4.294755917  1.768559e-05
## 3  -1.524484637  1.274254e-01
## 4  -1.808676131  7.053713e-02
## 5  -4.345769477  1.404242e-05
## 6  -2.598201904  9.387725e-03
## 7  -3.613037429  3.044014e-04
## 8   0.472799677  6.363684e-01
## 9  -2.307993093  2.102373e-02
## 10  2.202977611  2.762342e-02
## 11 -2.590250281  9.607206e-03
## 12 -5.414666538  6.310123e-08
## 13 -2.433819443  1.496118e-02
## 14 -2.343757895  1.911378e-02
## 15  0.002617557  9.979116e-01
## 16 -1.373831226  1.695308e-01
## 17 -2.506628249  1.220765e-02
## 18 -5.063236013  4.209684e-07

We do a second comparison of the metro areas in Texas, this time using Wichita Falls as the reference.

# Make Wichita Falls the reference level of the metro-area factor, refit the
# model, and show its coefficient table
dragonipums <- dragonipums %>%
  mutate(
    livingarea2 = relevel(as.factor(livingarea), ref = "Wichita Falls")
  )
trantime_fit2 <- lm(formula = trantime ~ livingarea2, data = dragonipums)
trantime_fit2 %>%
  tidy()
##                                         term  estimate std.error statistic
## 1                                (Intercept) 13.439394  2.676439 5.0213715
## 2                        livingarea2Amarillo  4.990358  3.327250 1.4998446
## 3               livingarea2Austin-Round Rock 12.458443  2.780570 4.4805355
## 4            livingarea2Beaumont-Port Arthur 10.417749  3.273844 3.1821148
## 5           livingarea2Brownsville-Harlingen  4.780945  3.342145 1.4305021
## 6           livingarea2College Station-Bryan  7.998106  3.476796 2.3004243
## 7                  livingarea2Corpus Christi  7.925138  3.080956 2.5722986
## 8     livingarea2Dallas-Fort Worth-Arlington 14.512937  2.706996 5.3612703
## 9                         livingarea2El Paso 10.637799  2.970219 3.5814865
## 10 livingarea2Houston-The Woodland-Sugarland 16.081496  2.716253 5.9204709
## 11                         livingarea2Laredo  7.838384  3.523696 2.2244782
## 12                        livingarea2Lubbock  3.037879  3.277955 0.9267604
## 13      livingarea2McCallen-Edinburg-Mission  9.619778  3.156079 3.0480152
## 14                         livingarea2Odessa  5.639553  4.427741 1.2736864
## 15                     livingarea2San Angelo 14.110606  4.356927 3.2386603
## 16      livingarea2San Antonio-New Braunfels 14.101384  2.785054 5.0632360
## 17                          livingarea2Tyler 10.762853  3.532061 3.0471880
## 18                           livingarea2Waco  8.783747  3.327250 2.6399418
##         p.value
## 1  5.236057e-07
## 2  1.336923e-01
## 3  7.544281e-06
## 4  1.467371e-03
## 5  1.526102e-01
## 6  2.144862e-02
## 7  1.011962e-02
## 8  8.483713e-08
## 9  3.435787e-04
## 10 3.336474e-09
## 11 2.614280e-02
## 12 3.540776e-01
## 13 2.310755e-03
## 14 2.028099e-01
## 15 1.205599e-03
## 16 4.209684e-07
## 17 2.317117e-03
## 18 8.307375e-03

We review the summary of the means:

# Per-metro mean, standard deviation and row count of commute time, plus each
# mean's difference from the reference metro area.
cont_means <- dragonipums %>%
  group_by(livingarea2) %>%
  summarise(
    means = mean(trantime, na.rm = TRUE),
    sds = sd(trantime, na.rm = TRUE),
    n = n()
  )
# Groups come out in factor-level order, so row 1 is the reference level set
# by relevel(); diff is each group's mean minus that reference mean
cont_means$diff <- cont_means$means - cont_means$means[1]
cont_means
## # A tibble: 19 x 5
##                       livingarea2    means       sds     n      diff
##                            <fctr>    <dbl>     <dbl> <int>     <dbl>
##  1                  Wichita Falls 13.43939  9.907462    74  0.000000
##  2                       Amarillo 18.42975 22.160335   133  4.990358
##  3              Austin-Round Rock 25.89784 18.036476   945 12.458443
##  4           Beaumont-Port Arthur 23.85714 22.312567   151 10.417749
##  5          Brownsville-Harlingen 18.22034 10.647527   130  4.780945
##  6          College Station-Bryan 21.43750 24.918314   101  7.998106
##  7                 Corpus Christi 21.36453 23.844486   224  7.925138
##  8    Dallas-Fort Worth-Arlington 27.95233 22.545452  3217 14.512937
##  9                        El Paso 24.07719 25.014806   326 10.637799
## 10 Houston-The Woodland-Sugarland 29.52089 21.717294  2456 16.081496
## 11                         Laredo 21.27778 20.361358    99  7.838384
## 12                        Lubbock 16.47727 17.345376   136  3.037879
## 13      McCallen-Edinburg-Mission 23.05917 22.180994   201  9.619778
## 14                         Odessa 19.07895 12.101779    42  5.639553
## 15                     San Angelo 27.55000 33.576510    44 14.110606
## 16      San Antonio-New Braunfels 27.54078 21.971427   903 14.101384
## 17                          Tyler 24.20225 27.622940    99 10.762853
## 18                           Waco 22.22314 18.555542   134  8.783747
## 19                           <NA> 28.06333 22.823387 99649 14.623939
# Normal Q-Q plot of the studentized residuals from trantime_fit2, with a
# maroon reference line
trantime_fit2 %>%
  rstudent() %>%
  qqnorm(main = "Q-Q Plot for Model Residuals")
trantime_fit2 %>%
  rstudent() %>%
  qqline(col = "maroon")

We apply a log transformation to the outcome to reduce the non-normality of the residuals. This corrects some of it; however, the Q-Q plot still shows a considerable departure from normality.

# Refit with log commute time as the outcome, then draw the normal Q-Q plot
# of its studentized residuals with a red reference line
trantime_fit3 <- lm(formula = log(trantime) ~ livingarea, data = dragonipums)
trantime_fit3 %>%
  rstudent() %>%
  qqnorm(main = "Q-Q Plot for Model Residuals")
trantime_fit3 %>%
  rstudent() %>%
  qqline(col = "red")

The data in this report show that the metro area with the longest mean commute time from home to work is Houston-The Woodlands-Sugar Land, and the shortest mean commute belongs to Wichita Falls.