library(haven)
# Read the IPUMS extract (Stata .dta file) directly from GitHub.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# Print the column names of the extract.
colnames(ipums)
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "pernum" "perwt"
## [11] "famsize" "nchild" "nchlt5" "eldch" "nsibs"
## [16] "relate" "related" "sex" "age" "marst"
## [21] "birthyr" "fertyr" "race" "raced" "hispan"
## [26] "hispand" "bpl" "bpld" "citizen" "yrsusa1"
## [31] "language" "languaged" "speakeng" "educ" "educd"
## [36] "empstat" "empstatd" "labforce" "occ" "ind"
## [41] "inctot" "incwage" "poverty" "hwsei" "migrate1"
## [46] "migrate1d" "carpool" "trantime"
library(broom)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
In this analysis, we focus on the population of Texas aged 18 and over who live in metro areas and participate in the labor force.
# Build the analysis sample: Texas residents (state FIPS 48) aged 18 and over
# who are in the labor force and live in an identified metro area, with the
# metro code recoded to a readable name.
dragonipums <- ipums %>%
  # A travel time of 0 is recoded to missing so it is excluded from the
  # means, plots and models below.
  mutate(trantime = ifelse(trantime %in% 0, NA, trantime)) %>%
  filter(statefip == 48, labforce == 2, age >= 18, met2013 >= 1) %>%
  # Codes not listed here fall through to NA. Columns are referenced by bare
  # name rather than through the `.$` pronoun, which is unnecessary inside
  # mutate() and would ignore any grouping.
  mutate(livingarea = case_when(
    met2013 == 11100 ~ "Amarillo",
    met2013 == 12420 ~ "Austin-Round Rock",
    met2013 == 13140 ~ "Beaumont-Port Arthur",
    met2013 == 15180 ~ "Brownsville-Harlingen",
    met2013 == 17780 ~ "College Station-Bryan",
    met2013 == 18580 ~ "Corpus Christi",
    met2013 == 19100 ~ "Dallas-Fort Worth-Arlington",
    met2013 == 21340 ~ "El Paso",
    met2013 == 26420 ~ "Houston-The Woodland-Sugarland",
    met2013 == 28660 ~ "Killeen-Temple",
    met2013 == 29700 ~ "Laredo",
    met2013 == 31180 ~ "Lubbock",
    met2013 == 32580 ~ "McCallen-Edinburg-Mission",
    # Midland, TX is CBSA 33260; the previous code (32260) matched no rows.
    met2013 == 33260 ~ "Midland",
    met2013 == 36220 ~ "Odessa",
    met2013 == 41660 ~ "San Angelo",
    met2013 == 41700 ~ "San Antonio-New Braunfels",
    met2013 == 46340 ~ "Tyler",
    met2013 == 47380 ~ "Waco",
    met2013 == 48660 ~ "Wichita Falls"
  ))
We review some of the descriptive statistics for the metro areas in Texas.
# Mean and standard deviation of commute time, plus the row count, for each
# metro area. Rows with a missing `trantime` are dropped from the mean and sd
# but are still counted in `n`.
dragonipums %>%
  group_by(livingarea) %>%
  summarise(
    meantime = mean(trantime, na.rm = TRUE),
    sds = sd(trantime, na.rm = TRUE),
    n = n()
  )
## # A tibble: 19 x 4
## livingarea meantime sds n
## <chr> <dbl> <dbl> <int>
## 1 Amarillo 18.42975 22.160335 133
## 2 Austin-Round Rock 25.89784 18.036476 945
## 3 Beaumont-Port Arthur 23.85714 22.312567 151
## 4 Brownsville-Harlingen 18.22034 10.647527 130
## 5 College Station-Bryan 21.43750 24.918314 101
## 6 Corpus Christi 21.36453 23.844486 224
## 7 Dallas-Fort Worth-Arlington 27.95233 22.545452 3217
## 8 El Paso 24.07719 25.014806 326
## 9 Houston-The Woodland-Sugarland 29.52089 21.717294 2456
## 10 Laredo 21.27778 20.361358 99
## 11 Lubbock 16.47727 17.345376 136
## 12 McCallen-Edinburg-Mission 23.05917 22.180994 201
## 13 Odessa 19.07895 12.101779 42
## 14 San Angelo 27.55000 33.576510 44
## 15 San Antonio-New Braunfels 27.54078 21.971427 903
## 16 Tyler 24.20225 27.622940 99
## 17 Waco 22.22314 18.555542 134
## 18 Wichita Falls 13.43939 9.907462 74
## 19 <NA> 28.06333 22.823387 99649
We look at the distribution of commute time by metro area using two graphs, a boxplot and a histogram:
# Horizontal boxplots of commute time, one box per metro area.
ggplot(dragonipums, aes(x = livingarea, y = trantime)) +
  geom_boxplot() +
  coord_flip()
## Warning: Removed 13516 rows containing non-finite values (stat_boxplot).
# Histograms of commute time, one facet per metro area; bar heights are the
# bin density multiplied by 0.5.
ggplot(dragonipums, aes(trantime, fill = livingarea)) +
  geom_histogram(aes(y = 0.5 * ..density..)) +
  facet_wrap(~livingarea) +
  ggtitle(label = "Commute Time by Texas Metro Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13516 rows containing non-finite values (stat_bin).
We proceed with a one-way ANOVA. The results show significant variation in mean commute time across metro areas: F = 10.80 on 17 and 8,388 degrees of freedom, with a p-value below 2.2e-16.
# One-way ANOVA: fit a linear model of commute time on metro area, then print
# its analysis-of-variance table.
trantime_fit <- lm(trantime ~ livingarea, data = dragonipums)
anova(trantime_fit)
## Analysis of Variance Table
##
## Response: trantime
## Df Sum Sq Mean Sq F value Pr(>F)
## livingarea 17 86798 5105.8 10.799 < 2.2e-16 ***
## Residuals 8388 3965674 472.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise t tests between every pair of metro areas, with
# Bonferroni-adjusted p-values.
pairwise.t.test(
  x = dragonipums$trantime,
  g = dragonipums$livingarea,
  p.adjust.method = "bonf"
)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: dragonipums$trantime and dragonipums$livingarea
##
## Amarillo Austin-Round Rock
## Austin-Round Rock 0.06390 -
## Beaumont-Port Arthur 1.00000 1.00000
## Brownsville-Harlingen 1.00000 0.05098
## College Station-Bryan 1.00000 1.00000
## Corpus Christi 1.00000 1.00000
## Dallas-Fort Worth-Arlington 0.00037 1.00000
## El Paso 1.00000 1.00000
## Houston-The Woodland-Sugarland 7.4e-06 0.00653
## Laredo 1.00000 1.00000
## Lubbock 1.00000 0.00058
## McCallen-Edinburg-Mission 1.00000 1.00000
## Odessa 1.00000 1.00000
## San Angelo 1.00000 1.00000
## San Antonio-New Braunfels 0.00271 1.00000
## Tyler 1.00000 1.00000
## Waco 1.00000 1.00000
## Wichita Falls 1.00000 0.00115
## Beaumont-Port Arthur Brownsville-Harlingen
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen 1.00000 -
## College Station-Bryan 1.00000 1.00000
## Corpus Christi 1.00000 1.00000
## Dallas-Fort Worth-Arlington 1.00000 0.00029
## El Paso 1.00000 1.00000
## Houston-The Woodland-Sugarland 0.54182 6.0e-06
## Laredo 1.00000 1.00000
## Lubbock 0.87941 1.00000
## McCallen-Edinburg-Mission 1.00000 1.00000
## Odessa 1.00000 1.00000
## San Angelo 1.00000 1.00000
## San Antonio-New Braunfels 1.00000 0.00215
## Tyler 1.00000 1.00000
## Waco 1.00000 1.00000
## Wichita Falls 0.22451 1.00000
## College Station-Bryan Corpus Christi
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen - -
## College Station-Bryan - -
## Corpus Christi 1.00000 -
## Dallas-Fort Worth-Arlington 0.59498 0.00467
## El Paso 1.00000 1.00000
## Houston-The Woodland-Sugarland 0.05585 4.9e-05
## Laredo 1.00000 1.00000
## Lubbock 1.00000 1.00000
## McCallen-Edinburg-Mission 1.00000 1.00000
## Odessa 1.00000 1.00000
## San Angelo 1.00000 1.00000
## San Antonio-New Braunfels 1.00000 0.04657
## Tyler 1.00000 1.00000
## Waco 1.00000 1.00000
## Wichita Falls 1.00000 1.00000
## Dallas-Fort Worth-Arlington El Paso
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen - -
## College Station-Bryan - -
## Corpus Christi - -
## Dallas-Fort Worth-Arlington - -
## El Paso 0.63003 -
## Houston-The Woodland-Sugarland 1.00000 0.01077
## Laredo 0.63438 1.00000
## Lubbock 4.9e-07 0.13838
## McCallen-Edinburg-Mission 0.68516 1.00000
## Odessa 1.00000 1.00000
## San Angelo 1.00000 1.00000
## San Antonio-New Braunfels 1.00000 1.00000
## Tyler 1.00000 1.00000
## Waco 0.69355 1.00000
## Wichita Falls 1.3e-05 0.05257
## Houston-The Woodland-Sugarland Laredo
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen - -
## College Station-Bryan - -
## Corpus Christi - -
## Dallas-Fort Worth-Arlington - -
## El Paso - -
## Houston-The Woodland-Sugarland - -
## Laredo 0.06509 -
## Lubbock 3.5e-09 1.00000
## McCallen-Edinburg-Mission 0.03031 1.00000
## Odessa 0.51149 1.00000
## San Angelo 1.00000 1.00000
## San Antonio-New Braunfels 1.00000 1.00000
## Tyler 1.00000 1.00000
## Waco 0.05002 1.00000
## Wichita Falls 5.1e-07 1.00000
## Lubbock McCallen-Edinburg-Mission Odessa
## Austin-Round Rock - - -
## Beaumont-Port Arthur - - -
## Brownsville-Harlingen - - -
## College Station-Bryan - - -
## Corpus Christi - - -
## Dallas-Fort Worth-Arlington - - -
## El Paso - - -
## Houston-The Woodland-Sugarland - - -
## Laredo - - -
## Lubbock - - -
## McCallen-Edinburg-Mission 1.00000 - -
## Odessa 1.00000 1.00000 -
## San Angelo 0.73308 1.00000 1.00000
## San Antonio-New Braunfels 9.7e-06 1.00000 1.00000
## Tyler 1.00000 1.00000 1.00000
## Waco 1.00000 1.00000 1.00000
## Wichita Falls 1.00000 0.35355 1.00000
## San Angelo San Antonio-New Braunfels
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen - -
## College Station-Bryan - -
## Corpus Christi - -
## Dallas-Fort Worth-Arlington - -
## El Paso - -
## Houston-The Woodland-Sugarland - -
## Laredo - -
## Lubbock - -
## McCallen-Edinburg-Mission - -
## Odessa - -
## San Angelo - -
## San Antonio-New Braunfels 1.00000 -
## Tyler 1.00000 1.00000
## Waco 1.00000 1.00000
## Wichita Falls 0.18446 6.4e-05
## Tyler Waco
## Austin-Round Rock - -
## Beaumont-Port Arthur - -
## Brownsville-Harlingen - -
## College Station-Bryan - -
## Corpus Christi - -
## Dallas-Fort Worth-Arlington - -
## El Paso - -
## Houston-The Woodland-Sugarland - -
## Laredo - -
## Lubbock - -
## McCallen-Edinburg-Mission - -
## Odessa - -
## San Angelo - -
## San Antonio-New Braunfels - -
## Tyler - -
## Waco 1.00000 -
## Wichita Falls 0.35452 1.00000
##
## P value adjustment method: bonferroni
# Coefficient table for the ANOVA model; each metro-area term is a difference
# from the model's reference level.
trantime_fit %>% tidy()
## term estimate std.error
## 1 (Intercept) 18.4297521 1.976681
## 2 livingareaAustin-Round Rock 7.4680845 2.115541
## 3 livingareaBeaumont-Port Arthur 5.4273908 2.731666
## 4 livingareaBrownsville-Harlingen -0.2094131 2.813161
## 5 livingareaCollege Station-Bryan 3.0077479 2.971877
## 6 livingareaCorpus Christi 2.9347800 2.497245
## 7 livingareaDallas-Fort Worth-Arlington 9.5225792 2.017863
## 8 livingareaEl Paso 5.6474409 2.359268
## 9 livingareaHouston-The Woodland-Sugarland 11.0911380 2.030264
## 10 livingareaLaredo 2.8480257 3.026611
## 11 livingareaLubbock -1.9524793 2.736591
## 12 livingareaMcCallen-Edinburg-Mission 4.6294195 2.589359
## 13 livingareaOdessa 0.6491953 4.043369
## 14 livingareaSan Angelo 9.1202479 3.965697
## 15 livingareaSan Antonio-New Braunfels 9.1110259 2.121430
## 16 livingareaTyler 5.7724951 3.036346
## 17 livingareaWaco 3.7933884 2.795449
## 18 livingareaWichita Falls -4.9903581 3.327250
## statistic p.value
## 1 9.32358384 1.413929e-20
## 2 3.53010662 4.176198e-04
## 3 1.98684251 4.697233e-02
## 4 -0.07444049 9.406617e-01
## 5 1.01207005 3.115338e-01
## 6 1.17520721 2.399452e-01
## 7 4.71914095 2.406770e-06
## 8 2.39372633 1.670002e-02
## 9 5.46290377 4.818221e-08
## 10 0.94099501 3.467345e-01
## 11 -0.71347135 4.755740e-01
## 12 1.78786318 7.383417e-02
## 13 0.16055802 8.724454e-01
## 14 2.29978422 2.148489e-02
## 15 4.29475592 1.768559e-05
## 16 1.90113220 5.731894e-02
## 17 1.35698706 1.748218e-01
## 18 -1.49984461 1.336923e-01
We compare the other Texas metro areas against San Antonio-New Braunfels as the reference category.
# Refit with San Antonio-New Braunfels as the reference level, so that each
# coefficient is a difference from San Antonio's mean commute time.
dragonipums <- dragonipums %>%
  mutate(
    livingarea2 = relevel(
      as.factor(livingarea),
      ref = "San Antonio-New Braunfels"
    )
  )
trantime_fit2 <- lm(trantime ~ livingarea2, data = dragonipums)
tidy(trantime_fit2)
## term estimate std.error
## 1 (Intercept) 27.540777917 0.7701940
## 2 livingarea2Amarillo -9.111025851 2.1214304
## 3 livingarea2Austin-Round Rock -1.642941379 1.0777028
## 4 livingarea2Beaumont-Port Arthur -3.683635060 2.0366471
## 5 livingarea2Brownsville-Harlingen -9.320438934 2.1447154
## 6 livingarea2College Station-Bryan -6.103277917 2.3490391
## 7 livingarea2Corpus Christi -6.176245897 1.7094331
## 8 livingarea2Dallas-Fort Worth-Arlington 0.411553328 0.8704603
## 9 livingarea2El Paso -3.463584935 1.5006912
## 10 livingarea2Houston-The Woodland-Sugarland 1.980112183 0.8988345
## 11 livingarea2Laredo -6.263000139 2.4179131
## 12 livingarea2Lubbock -11.063505190 2.0432477
## 13 livingarea2McCallen-Edinburg-Mission -4.481606320 1.8413882
## 14 livingarea2Odessa -8.461830549 3.6103689
## 15 livingarea2San Angelo 0.009222083 3.5231641
## 16 livingarea2Tyler -3.338530726 2.4300880
## 17 livingarea2Waco -5.317637421 2.1214304
## 18 livingarea2Wichita Falls -14.101383978 2.7850537
## statistic p.value
## 1 35.758234360 9.008903e-261
## 2 -4.294755917 1.768559e-05
## 3 -1.524484637 1.274254e-01
## 4 -1.808676131 7.053713e-02
## 5 -4.345769477 1.404242e-05
## 6 -2.598201904 9.387725e-03
## 7 -3.613037429 3.044014e-04
## 8 0.472799677 6.363684e-01
## 9 -2.307993093 2.102373e-02
## 10 2.202977611 2.762342e-02
## 11 -2.590250281 9.607206e-03
## 12 -5.414666538 6.310123e-08
## 13 -2.433819443 1.496118e-02
## 14 -2.343757895 1.911378e-02
## 15 0.002617557 9.979116e-01
## 16 -1.373831226 1.695308e-01
## 17 -2.506628249 1.220765e-02
## 18 -5.063236013 4.209684e-07
We do a second comparison of the Texas metro areas, this time using Wichita Falls (the metro area with the shortest mean commute) as the reference category.
# Refit with Wichita Falls as the reference level. This overwrites both
# `livingarea2` and `trantime_fit2` from the previous comparison.
dragonipums <- dragonipums %>%
  mutate(
    livingarea2 = relevel(as.factor(livingarea), ref = "Wichita Falls")
  )
trantime_fit2 <- lm(trantime ~ livingarea2, data = dragonipums)
tidy(trantime_fit2)
## term estimate std.error statistic
## 1 (Intercept) 13.439394 2.676439 5.0213715
## 2 livingarea2Amarillo 4.990358 3.327250 1.4998446
## 3 livingarea2Austin-Round Rock 12.458443 2.780570 4.4805355
## 4 livingarea2Beaumont-Port Arthur 10.417749 3.273844 3.1821148
## 5 livingarea2Brownsville-Harlingen 4.780945 3.342145 1.4305021
## 6 livingarea2College Station-Bryan 7.998106 3.476796 2.3004243
## 7 livingarea2Corpus Christi 7.925138 3.080956 2.5722986
## 8 livingarea2Dallas-Fort Worth-Arlington 14.512937 2.706996 5.3612703
## 9 livingarea2El Paso 10.637799 2.970219 3.5814865
## 10 livingarea2Houston-The Woodland-Sugarland 16.081496 2.716253 5.9204709
## 11 livingarea2Laredo 7.838384 3.523696 2.2244782
## 12 livingarea2Lubbock 3.037879 3.277955 0.9267604
## 13 livingarea2McCallen-Edinburg-Mission 9.619778 3.156079 3.0480152
## 14 livingarea2Odessa 5.639553 4.427741 1.2736864
## 15 livingarea2San Angelo 14.110606 4.356927 3.2386603
## 16 livingarea2San Antonio-New Braunfels 14.101384 2.785054 5.0632360
## 17 livingarea2Tyler 10.762853 3.532061 3.0471880
## 18 livingarea2Waco 8.783747 3.327250 2.6399418
## p.value
## 1 5.236057e-07
## 2 1.336923e-01
## 3 7.544281e-06
## 4 1.467371e-03
## 5 1.526102e-01
## 6 2.144862e-02
## 7 1.011962e-02
## 8 8.483713e-08
## 9 3.435787e-04
## 10 3.336474e-09
## 11 2.614280e-02
## 12 3.540776e-01
## 13 2.310755e-03
## 14 2.028099e-01
## 15 1.205599e-03
## 16 4.209684e-07
## 17 2.317117e-03
## 18 8.307375e-03
We review the summary of the means:
# Per-area mean, standard deviation and row count of commute time, plus each
# area's difference in mean from the first row. The first row is the
# reference level of `livingarea2`, since the summary rows follow the factor's
# level order.
cont_means <- dragonipums %>%
  group_by(livingarea2) %>%
  summarise(
    means = mean(trantime, na.rm = TRUE),
    sds = sd(trantime, na.rm = TRUE),
    n = n()
  )
cont_means$diff <- cont_means$means - cont_means$means[1]
cont_means
## # A tibble: 19 x 5
## livingarea2 means sds n diff
## <fctr> <dbl> <dbl> <int> <dbl>
## 1 Wichita Falls 13.43939 9.907462 74 0.000000
## 2 Amarillo 18.42975 22.160335 133 4.990358
## 3 Austin-Round Rock 25.89784 18.036476 945 12.458443
## 4 Beaumont-Port Arthur 23.85714 22.312567 151 10.417749
## 5 Brownsville-Harlingen 18.22034 10.647527 130 4.780945
## 6 College Station-Bryan 21.43750 24.918314 101 7.998106
## 7 Corpus Christi 21.36453 23.844486 224 7.925138
## 8 Dallas-Fort Worth-Arlington 27.95233 22.545452 3217 14.512937
## 9 El Paso 24.07719 25.014806 326 10.637799
## 10 Houston-The Woodland-Sugarland 29.52089 21.717294 2456 16.081496
## 11 Laredo 21.27778 20.361358 99 7.838384
## 12 Lubbock 16.47727 17.345376 136 3.037879
## 13 McCallen-Edinburg-Mission 23.05917 22.180994 201 9.619778
## 14 Odessa 19.07895 12.101779 42 5.639553
## 15 San Angelo 27.55000 33.576510 44 14.110606
## 16 San Antonio-New Braunfels 27.54078 21.971427 903 14.101384
## 17 Tyler 24.20225 27.622940 99 10.762853
## 18 Waco 22.22314 18.555542 134 8.783747
## 19 <NA> 28.06333 22.823387 99649 14.623939
# Normal Q-Q plot of the studentized residuals from the untransformed model,
# with a reference line.
trantime_fit2 %>%
  rstudent() %>%
  qqnorm(main = "Q-Q Plot for Model Residuals")
trantime_fit2 %>%
  rstudent() %>%
  qqline(col = "maroon")
We apply a log transformation to reduce the non-normality of the residuals. It helps somewhat, but the Q-Q plot still shows a substantial departure from normality.
# Refit the model on log commute time and re-check the studentized residuals
# with a normal Q-Q plot.
trantime_fit3 <- lm(log(trantime) ~ livingarea, data = dragonipums)
trantime_fit3 %>%
  rstudent() %>%
  qqnorm(main = "Q-Q Plot for Model Residuals")
trantime_fit3 %>%
  rstudent() %>%
  qqline(col = "red")
The data in this report show that the metro area with the longest mean commute time from home to work is Houston-The Woodlands-Sugarland, and the metro area with the shortest mean commute is Wichita Falls.