library(data.table)
## Warning: package 'data.table' was built under R version 4.2.3
library(foreign)
## Warning: package 'foreign' was built under R version 4.2.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Read the household travel survey (SPSS .sav) as a data.frame and convert it
# to a data.table. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
HTS <- data.table(
  read.spss("datasets/HTS.household.10regions.sav", to.data.frame = TRUE)
)
## re-encoding from CP1252
Run a t-test to show whether mean household income differs significantly between households living in single-family residences and all other households (use the whole sample). Produce two PDFs: one with a histogram plot, and another with the simulated hypothesis-testing plot showing where the t-statistic falls. Provide a short interpretation of your results.
# Step 1: number of households in each housing type (htype)
HTS[, .N, by = .(htype)]
## htype N
## 1: single family detached 10712
## 2: multi-family 1410
## 3: single family attached 934
## 4: <NA> 963
## 5: other 193
# Collapse htype into a two-level housing group. Households whose htype is
# missing stay NA instead of being counted as "not single family": `%in%`
# returns FALSE for NA, so they need an explicit is.na() branch.
single_family_types <- c("single family detached", "single family attached")
HTS$hhtype <- ifelse(
  is.na(HTS$htype),
  NA_character_,
  ifelse(
    HTS$htype %in% single_family_types,
    "single family",
    "not single family"
  )
)
# Income histograms stacked by housing group. Households with an unknown
# housing type are left out so that no NA facet is drawn. bins = 30 is the
# value stat_bin() would pick by default, stated explicitly here.
ggplot(data = HTS[!is.na(hhtype)], aes(x = hhincome)) +
  geom_histogram(bins = 30) +
  facet_grid(hhtype ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 857 rows containing non-finite values (`stat_bin()`).
# Horizontal box plots of household income, one box per housing group
ggplot(HTS, aes(y = hhtype, x = hhincome)) +
  geom_boxplot()
## Warning: Removed 857 rows containing non-finite values (`stat_boxplot()`).
# Descriptive statistics of household income by housing group.
# Obs counts every household in the group; Obs_valid counts only those with a
# non-missing income, i.e. the rows that actually enter mean and sd.
HTS[
  ,
  .(
    mean = mean(hhincome, na.rm = TRUE),
    sd = sd(hhincome, na.rm = TRUE),
    Obs = .N,
    Obs_valid = sum(!is.na(hhincome))
  ),
  by = .(hhtype)
]
## hhtype mean sd Obs
## 1: single family 73.31830 41.27469 11646
## 2: not single family 50.23185 38.40659 2566
# Step 2: check the storage class of the outcome and of the housing variable
# NOTE(review): the t-test below groups by hhtype, while this inspects the
# original htype column -- confirm which one was meant.
class(HTS$hhincome)
class(HTS$htype)
## [1] "numeric"
## [1] "factor"
# Step 3:
# Step 4: two-sided t-test of household income (dependent Y) by housing
# group (independent X); t.test() leaves var.equal at FALSE, i.e. Welch's test
t_test <- t.test(hhincome ~ hhtype, data = HTS)
t_test
##
## Welch Two Sample t-test
##
## data: hhincome by hhtype
## t = -26.463, df = 3799.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group not single family and group single family is not equal to 0
## 95 percent confidence interval:
## -24.79686 -21.37603
## sample estimates:
## mean in group not single family mean in group single family
## 50.23185 73.31830
# Step 5. Interpret results
## The t-statistic is -26.463 with 3799.8 degrees of freedom, and the p-value is below 2.2e-16, so we reject the null hypothesis of equal means. Mean household income differs significantly between the two housing groups: about 73.3 for single-family households versus 50.2 for all others (95% confidence interval for the difference: -24.80 to -21.38).
# Draw the Student t density for the degrees of freedom reported by the test
# itself (rather than a hand-copied number), then mark the observed statistic
# and the two-sided 5% critical values.
t_df <- unname(t_test$parameter)
curve(dt(x, df = t_df), from = -40, to = 40, xlab = "t", ylab = "Density")
abline(h = 0, col = "blue")
points(x = t_test$statistic, y = 0, col = "red")
# abline() has no `y` argument; supplying one only produces the
# '"y" is not a graphical parameter' warning, so it is not passed here.
upper975 <- qt(p = .975, df = t_df)
lower025 <- qt(p = .025, df = t_df)
abline(v = c(lower025, upper975), col = "red")
Filter the sample to select only the San Antonio region. Run a
t-test to show whether household vehicle miles traveled (in natural
logs, lnvmt) differ significantly between households living in
neighborhoods with a job-population index (variable jobpop)
above versus below the city median of the jobpop variable.
# Keep only the San Antonio households
SA_hts <- HTS[region %in% "San Antonio, TX"]
# Median of the job-population index in that sample, ignoring missing values
median_job_pop <- median(SA_hts[["jobpop"]], na.rm = TRUE)
# Flag each household as above the median or not (values equal to the median
# fall in "under_median") and store the flag as a factor
SA_hts[, jobpop_med := factor(
  ifelse(jobpop > median_job_pop, "over_median", "under_median")
)]
# t-test of lnvmt between the two job-population groups
SA_t_test <- t.test(lnvmt ~ jobpop_med, data = SA_hts)
SA_t_test
##
## Welch Two Sample t-test
##
## data: lnvmt by jobpop_med
## t = -2.2052, df = 1512.2, p-value = 0.02759
## alternative hypothesis: true difference in means between group over_median and group under_median is not equal to 0
## 95 percent confidence interval:
## -0.21709014 -0.01269755
## sample estimates:
## mean in group over_median mean in group under_median
## 2.989259 3.104152
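## Interpretation: t = -2.2052 with 1512.2 degrees of freedom and p-value = 0.02759, which is below 0.05, so we reject the null hypothesis of equal means. San Antonio households in neighborhoods with a job-population index above the city median have a lower mean lnvmt (2.99) than those at or below the median (3.10); the 95% confidence interval for the difference is -0.217 to -0.013.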
Using the same data set (San Antonio sample), run an ANOVA test to see whether household vehicle miles traveled differ significantly across income categories. Follow the same steps used in the ANOVA exercise done in class. Produce three PDFs: one checking the normality assumption of the dependent variable, a second one checking for the presence of outliers, and a third one showing the Tukey (post hoc) pairwise comparison plot.
# Step 1: histogram of lnvmt to eyeball the normality of the dependent variable
hist(SA_hts$lnvmt)
# Box plots of lnvmt per income category to look for outliers
ggplot(SA_hts, aes(x = income_cat, y = lnvmt)) +
  geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).
# Identify outliers: boxplot() returns the points it draws beyond the whiskers
# in its `out` component
hts_bp <- boxplot(SA_hts$lnvmt ~ SA_hts$income_cat)
outliers <- hts_bp[["out"]]
# List the households whose lnvmt equals one of those values
# NOTE(review): matching is by value only, not by income category.
SA_hts[lnvmt %in% outliers]
## rhhnum region geoid10 anyvmt
## 1: 150000034 San Antonio, TX 1
## 2: 150000075 San Antonio, TX 1
## 3: 150000094 San Antonio, TX 1
## 4: 150000113 San Antonio, TX 1
## 5: 150000123 San Antonio, TX 1
## 6: 150000162 San Antonio, TX 1
## 7: 150000164 San Antonio, TX 1
## 8: 150000201 San Antonio, TX 1
## 9: 150000205 San Antonio, TX 1
## 10: 150000242 San Antonio, TX 1
## 11: 150000359 San Antonio, TX 1
## 12: 150000400 San Antonio, TX 1
## 13: 150000488 San Antonio, TX 1
## 14: 150000529 San Antonio, TX 1
## 15: 150000585 San Antonio, TX 1
## 16: 150000656 San Antonio, TX 1
## 17: 150000669 San Antonio, TX 1
## 18: 150000728 San Antonio, TX 1
## 19: 150000740 San Antonio, TX 1
## 20: 150000759 San Antonio, TX 1
## 21: 150000766 San Antonio, TX 1
## 22: 150000768 San Antonio, TX 1
## 23: 150000828 San Antonio, TX 1
## 24: 150001040 San Antonio, TX 1
## 25: 150001060 San Antonio, TX 1
## 26: 150001072 San Antonio, TX 1
## 27: 150001122 San Antonio, TX 1
## 28: 150001169 San Antonio, TX 1
## 29: 150001265 San Antonio, TX 1
## 30: 150001374 San Antonio, TX 1
## 31: 150001387 San Antonio, TX 1
## 32: 155000211 San Antonio, TX 1
## 33: 155000223 San Antonio, TX 1
## 34: 155000323 San Antonio, TX 1
## 35: 155000334 San Antonio, TX 1
## rhhnum region geoid10 anyvmt
## lnvmt autotrips anywalk walktrips anybike biketrips anytransit
## 1: -1.617320225 2 0 NA 0 NA 0
## 2: 0.110661824 2 1 2 0 NA 0
## 3: -0.550564483 2 0 NA 0 NA 0
## 4: 1.356620908 10 0 NA 0 NA 0
## 5: 0.547420044 2 0 NA 0 NA 0
## 6: 0.971593758 2 0 NA 0 NA 0
## 7: 0.973079594 2 0 NA 0 NA 0
## 8: 0.962427632 2 0 NA 0 NA 0
## 9: -0.001480761 2 0 NA 0 NA 0
## 10: -2.155615546 2 0 NA 0 NA 0
## 11: -0.415248432 2 0 NA 0 NA 0
## 12: -0.500980007 2 0 NA 0 NA 0
## 13: -0.057901070 2 1 2 0 NA 0
## 14: -0.652877789 2 0 NA 0 NA 0
## 15: 0.934789108 3 0 NA 0 NA 0
## 16: 0.630968751 3 0 NA 0 NA 0
## 17: 5.654327814 12 0 NA 0 NA 0
## 18: 0.803340881 2 0 NA 0 NA 0
## 19: 0.786823621 2 0 NA 0 NA 0
## 20: -0.517029239 2 1 2 0 NA 0
## 21: -0.815960978 4 1 20 0 NA 0
## 22: 0.087722985 2 0 NA 0 NA 0
## 23: 0.820388367 3 0 NA 0 NA 0
## 24: 0.810984705 3 0 NA 0 NA 0
## 25: 0.673487713 6 0 NA 0 NA 0
## 26: -0.475510803 2 1 4 0 NA 1
## 27: 0.471652396 2 0 NA 0 NA 0
## 28: 0.761051704 2 0 NA 0 NA 0
## 29: 5.566984726 61 0 NA 0 NA 0
## 30: -1.098307547 4 0 NA 0 NA 0
## 31: 5.469960773 24 0 NA 0 NA 0
## 32: 0.816963848 4 0 NA 0 NA 0
## 33: 1.019976182 2 0 NA 0 NA 0
## 34: -1.613511253 2 0 NA 0 NA 0
## 35: 0.788930682 2 0 NA 0 NA 0
## lnvmt autotrips anywalk walktrips anybike biketrips anytransit
## transittrips veh hhsize hhworker htype
## 1: NA 1 2 0 single family detached
## 2: NA 0 1 0 single family detached
## 3: NA 0 1 0 single family attached
## 4: NA 2 4 0 single family detached
## 5: NA 1 1 0 single family detached
## 6: NA 1 2 2 single family detached
## 7: NA 2 2 0 multi-family
## 8: NA 1 1 0 single family detached
## 9: NA 1 1 1 multi-family
## 10: NA 1 1 0 multi-family
## 11: NA 1 2 0 single family detached
## 12: NA 1 1 0 single family detached
## 13: NA 1 2 1 single family detached
## 14: NA 1 1 0 single family detached
## 15: NA 1 2 0 single family detached
## 16: NA 2 2 1 single family detached
## 17: NA 2 2 0 multi-family
## 18: NA 1 1 0 multi-family
## 19: NA 1 2 1 single family detached
## 20: NA 1 1 1 single family detached
## 21: NA 0 10 0 single family attached
## 22: NA 1 1 0 single family detached
## 23: NA 2 1 0 single family detached
## 24: NA 1 1 0 multi-family
## 25: NA 2 3 3 single family detached
## 26: 2 1 4 2 single family detached
## 27: NA 3 3 2 single family detached
## 28: NA 1 1 0 single family detached
## 29: NA 3 8 3 single family detached
## 30: NA 2 3 0 single family detached
## 31: NA 2 4 2 single family detached
## 32: NA 1 2 0 single family detached
## 33: NA 2 2 0 single family detached
## 34: NA 1 1 0 single family attached
## 35: NA 1 1 0 single family detached
## transittrips veh hhsize hhworker htype
## sf hhincome lnhhincome income_cat actden
## 1: single family detached 36.093344 3.586108 middle (35K-75K) 9.2397270
## 2: single family detached 49.975400 3.911531 middle (35K-75K) 8.6468920
## 3: other 5.552822 1.714306 low (<35K) 3.0320845
## 4: single family detached 91.621560 4.517667 high (>75K) 8.1877985
## 5: single family detached 36.093344 3.586108 middle (35K-75K) 4.8896856
## 6: single family detached 74.963090 4.316996 middle (35K-75K) 14.3997310
## 7: other 91.621560 4.517667 high (>75K) 9.0961010
## 8: single family detached 49.975400 3.911531 middle (35K-75K) 9.9988560
## 9: other 36.093344 3.586108 middle (35K-75K) 5.9810605
## 10: other 24.987700 3.218384 low (<35K) 11.0650290
## 11: single family detached 74.963090 4.316996 middle (35K-75K) 8.8165430
## 12: single family detached 30.540521 3.419054 low (<35K) 8.8444410
## 13: single family detached 41.646164 3.729209 middle (35K-75K) 4.4840920
## 14: single family detached 5.552822 1.714306 low (<35K) 10.0952270
## 15: single family detached 41.646164 3.729209 middle (35K-75K) 7.7122264
## 16: single family detached 36.093344 3.586108 middle (35K-75K) 7.9214687
## 17: other 194.348770 5.269654 high (>75K) 9.6090740
## 18: other 74.963090 4.316996 middle (35K-75K) 7.3696694
## 19: single family detached 61.081043 4.112202 middle (35K-75K) 3.4951300
## 20: single family detached 24.987700 3.218384 low (<35K) 10.9680110
## 21: other 5.552822 1.714306 low (<35K) 12.0877430
## 22: single family detached 49.975400 3.911531 middle (35K-75K) 10.4318230
## 23: single family detached 49.975400 3.911531 middle (35K-75K) 7.5484366
## 24: other 91.621560 4.517667 high (>75K) 7.0588540
## 25: single family detached 61.081043 4.112202 middle (35K-75K) 0.1214534
## 26: single family detached 36.093344 3.586108 middle (35K-75K) 11.4529560
## 27: single family detached 49.975400 3.911531 middle (35K-75K) 8.0802355
## 28: single family detached 91.621560 4.517667 high (>75K) 5.4094615
## 29: single family detached 124.938490 4.827822 high (>75K) 5.6574220
## 30: single family detached 49.975400 3.911531 middle (35K-75K) 6.4791360
## 31: single family detached 49.975400 3.911531 middle (35K-75K) 0.9654419
## 32: single family detached 41.646164 3.729209 middle (35K-75K) 8.8580770
## 33: single family detached 91.621560 4.517667 high (>75K) 5.6731050
## 34: other 49.975400 3.911531 middle (35K-75K) 5.8626366
## 35: single family detached 49.975400 3.911531 middle (35K-75K) 7.3114360
## sf hhincome lnhhincome income_cat actden
## jobpop entropy intden pct4way stopden emp10a emp20a
## 1: 0.4185260 0.16244244 209.82292 69.90291 65.187706 13.86737109 57.308064
## 2: 0.7139923 0.29639672 195.75620 62.19512 62.069035 14.15221104 52.264787
## 3: 0.9751881 0.46053908 51.83287 17.64706 36.587906 3.86342048 34.935372
## 4: 0.9562107 0.48856271 163.87358 17.54386 132.248860 17.52706908 54.715649
## 5: 0.9392486 0.07615311 137.93326 50.00000 18.391100 10.98739150 47.230055
## 6: 0.3155761 0.52129104 115.73794 16.00000 4.629518 14.52770440 64.147319
## 7: 0.2757024 0.59759781 116.43728 26.08696 65.812380 11.83869142 60.151262
## 8: 0.5662457 0.70665044 167.30470 18.18182 106.466630 20.08555106 55.436170
## 9: 0.3022279 0.38942315 106.07134 25.00000 11.785705 10.62143408 58.532009
## 10: 0.9296499 0.60263764 233.90045 73.78641 86.293370 15.17862561 60.224454
## 11: 0.9929870 0.51493784 164.66924 21.31148 91.782850 8.67411957 47.455079
## 12: 0.5860302 0.56954311 111.70435 34.28571 82.980380 18.02256675 60.190521
## 13: 0.9506992 0.05914424 124.10874 37.77778 13.789861 12.05442672 59.864812
## 14: 0.6627539 0.37209211 252.25702 60.00000 73.383860 14.15221104 52.264787
## 15: 0.9333875 0.42519550 172.45209 43.05556 167.661760 17.60682427 62.487322
## 16: 0.9140198 0.54222122 135.42618 46.15385 45.836550 6.60915377 33.560958
## 17: 0.2794879 0.57826638 98.04865 23.68421 25.802277 11.83869142 60.151262
## 18: 0.9042735 0.59814906 126.50892 18.18182 80.505680 11.34368912 43.979908
## 19: 0.6501693 0.40846235 120.17170 31.11111 0.000000 4.82395033 27.595294
## 20: 0.4640113 0.36697353 254.67436 58.82353 59.923378 13.86737109 57.308064
## 21: 0.8113150 0.86108506 311.18470 62.04380 95.399690 15.83487209 59.136737
## 22: 0.7780097 0.37126910 231.49753 60.67416 33.814247 16.46263457 62.034178
## 23: 0.3808891 0.31534339 139.31204 14.28571 71.077570 13.74823368 60.723171
## 24: 0.4314074 0.50996051 139.85750 39.06250 74.299290 11.93268860 60.676606
## 25: 0.0000000 0.00000000 60.98489 0.00000 0.000000 0.09362565 1.253296
## 26: 0.8972445 0.53395276 236.78170 80.58252 68.965540 14.95124902 58.091250
## 27: 0.1477215 0.43741136 149.86797 47.72727 146.461880 17.99073899 62.373386
## 28: 0.8524599 0.43011995 129.08925 25.92593 19.124332 5.11262943 33.805920
## 29: 0.2674744 0.07052753 169.45107 10.52632 0.000000 3.81388310 33.332466
## 30: 0.9604064 0.52177640 141.30476 12.00000 45.217525 3.04568214 26.317106
## 31: 0.7904070 0.00000000 58.45176 18.18182 0.000000 0.58020660 6.606925
## 32: 0.4953511 0.38403628 144.59960 43.13725 39.694008 9.32590768 59.651306
## 33: 0.6559231 0.28286766 132.42319 23.80952 0.000000 11.66531058 47.301018
## 34: 0.8542333 0.38653741 112.23320 17.07317 10.949581 4.02615078 30.948851
## 35: 0.6401025 0.63938971 102.99034 54.05405 11.134090 3.65957415 31.873095
## jobpop entropy intden pct4way stopden emp10a emp20a
## emp30a emp30t intptlat10 intptlon10 hhtype jobpop_med
## 1: 83.01140 61.899065 NA NA single family under_median
## 2: 82.79096 61.309323 NA NA single family over_median
## 3: 67.87067 42.216129 NA NA single family over_median
## 4: 86.74590 60.671653 NA NA single family over_median
## 5: 80.05266 49.565495 NA NA single family over_median
## 6: 89.43900 58.747992 NA NA single family under_median
## 7: 86.38886 45.255248 NA NA not single family under_median
## 8: 86.01300 60.948567 NA NA single family under_median
## 9: 83.42281 50.670922 NA NA not single family under_median
## 10: 85.87503 66.208817 NA NA not single family over_median
## 11: 81.97929 56.125112 NA NA single family over_median
## 12: 88.17728 61.762466 NA NA single family under_median
## 13: 83.44027 48.867885 NA NA single family over_median
## 14: 82.79096 61.309323 NA NA single family over_median
## 15: 86.72918 63.452186 NA NA single family over_median
## 16: 72.11665 48.604842 NA NA single family over_median
## 17: 86.38886 45.255248 NA NA not single family under_median
## 18: 84.49009 25.250195 NA NA not single family over_median
## 19: 69.97539 0.000000 NA NA single family over_median
## 20: 83.01140 61.899065 NA NA single family under_median
## 21: 85.59019 62.466392 NA NA single family over_median
## 22: 86.35518 57.009230 NA NA single family over_median
## 23: 86.61166 55.180929 NA NA single family under_median
## 24: 84.54458 57.327136 NA NA not single family under_median
## 25: 19.07548 0.000000 NA NA single family under_median
## 26: 83.66579 66.097234 NA NA single family over_median
## 27: 86.78776 62.041733 NA NA single family under_median
## 28: 74.63203 2.848276 NA NA single family over_median
## 29: 72.47047 33.151036 NA NA single family under_median
## 30: 68.36939 3.703415 NA NA single family over_median
## 31: 32.78149 0.000000 NA NA single family over_median
## 32: 88.99626 60.086864 NA NA single family under_median
## 33: 78.23687 31.201864 NA NA single family over_median
## 34: 70.83189 5.107056 NA NA single family over_median
## 35: 69.11641 33.188560 NA NA single family under_median
## emp30a emp30t intptlat10 intptlon10 hhtype jobpop_med
# Drop the rows whose lnvmt matches an outlier value, then redraw the box
# plots before and after the removal for comparison
SA_hts2 <- SA_hts[!(lnvmt %in% outliers)]
boxplot(SA_hts$lnvmt ~ SA_hts$income_cat)
boxplot(SA_hts2$lnvmt ~ SA_hts2$income_cat)
# One-way ANOVA of lnvmt across income categories on the outlier-free sample
fit <- aov(formula = SA_hts2$lnvmt ~ SA_hts2$income_cat)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## SA_hts2$income_cat 2 139.1 69.54 93 <2e-16 ***
## Residuals 1482 1108.1 0.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 43 observations deleted due to missingness
# Post-hoc pairwise comparisons (Tukey HSD) at the 95% family-wise level
TukeyHSD(fit, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = SA_hts2$lnvmt ~ SA_hts2$income_cat)
##
## $`SA_hts2$income_cat`
## diff lwr upr p adj
## middle (35K-75K)-low (<35K) 0.5946113 0.46854317 0.7206794 0.0000000
## high (>75K)-low (<35K) 0.7968258 0.65009509 0.9435565 0.0000000
## high (>75K)-middle (35K-75K) 0.2022145 0.07144679 0.3329822 0.0008622
# Plot the Tukey confidence interval of each pairwise difference
plot(TukeyHSD(fit, conf.level = 0.95))
## Run a T-test and an ANOVA test in your data:
#T-test
#T-test 2 Categories: Region (Downtown blk 1, The Rim) + Stop_counts by hour
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(deweydatar)
## Loading required package: httr2
## Warning: package 'httr2' was built under R version 4.2.3
library(data.table)
library(ggplot2)
library(qs)
## Warning: package 'qs' was built under R version 4.2.3
## qs 0.25.7
# Load the stored patterns table
dt <- qs::qread(file = "../07-lab-SalomonMendoz/datasets/BEXAR_08_2023.qs")
# The two block groups being compared
block_groups <- c(
  "480291101001", # DT block 1
  "480291918072"  # The Rim
)
# Keep the block-group id and the hourly stops column for those two areas only
bexar_dt <- data.table(
  GEOID = dt$GEOID,
  stops_by_hour = dt$STOPS_BY_EACH_HOUR
)[GEOID %in% block_groups]
# Expand the stops_by_hour JSON column, indexed by "hour"
# (presumably one row per hour afterwards -- confirm against deweydatar docs)
bexar_dt <- expand_integer_json(
  bexar_dt,
  expand = "stops_by_hour",
  by = "GEOID",
  index = "hour"
)
# Replace the block-group ids with readable area names
bexar_dt$GEOID <- ifelse(
  bexar_dt$GEOID %in% "480291101001",
  "Downtown Blk 1",
  "The Rim"
)
# Box plot of hourly stops for each area
ggplot(bexar_dt, aes(x = stops_by_hour, y = GEOID)) +
  geom_boxplot()
# Two-sided t-test: hourly stops (dependent Y) by area (independent X)
t_test <- t.test(stops_by_hour ~ GEOID, data = bexar_dt)
t_test
##
## Welch Two Sample t-test
##
## data: stops_by_hour by GEOID
## t = 3.9596, df = 1484.8, p-value = 7.864e-05
## alternative hypothesis: true difference in means between group Downtown Blk 1 and group The Rim is not equal to 0
## 95 percent confidence interval:
## 29.56800 87.62555
## sample estimates:
## mean in group Downtown Blk 1 mean in group The Rim
## 445.6599 387.0632
# ANOVA: raw visitor counts compared across four place sub-categories
# (Malls, Amusement and Theme Parks, All Other Amusement and Recreation
# Industries, Full-Service Restaurants)
mp_dt <- qread(
  file = "../07-lab-SalomonMendoz/datasets/monthly_patterns_top10_2023_09_01.qs"
)
mp_dt <- mp_dt[SUB_CATEGORY %in% c(
  "Malls",
  "Amusement and Theme Parks",
  "All Other Amusement and Recreation Industries",
  "Full-Service Restaurants"
)]
# Grouping variable as a factor, outcome as numeric
mp_dt[, `:=`(
  SUB_CATEGORY = as.factor(SUB_CATEGORY),
  RAW_VISITOR_COUNTS = as.numeric(RAW_VISITOR_COUNTS)
)]
anova_test <- aov(formula = mp_dt$RAW_VISITOR_COUNTS ~ mp_dt$SUB_CATEGORY)
summary(anova_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## mp_dt$SUB_CATEGORY 3 6.102e+08 203414627 6.19 0.000626 ***
## Residuals 112 3.681e+09 32863075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 65 observations deleted due to missingness