library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)
# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)
# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]
# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1
## 43.34637 75.38215
#2. T-Test: Income by Housing Type
# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)
# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]
# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1
## 43.34637 75.38215
#2. T-Test: Income by Housing Type
# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)
# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]
# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1
## 43.34637 75.38215
#Histogram of Income by Housing Type
ggplot(hts_two_groups, aes(x = hhincome, fill = htype)) +
geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
labs(title = "Histogram of Household Income by Housing Type",
x = "Household Income", fill = "Housing Type") +
theme_minimal()
#3. T-Test: San Antonio
library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)
#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)
# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)
# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]
# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1
## 43.34637 75.38215
#2. T-Test: Income by Housing Type
library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)
#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)
# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)
# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]
# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1
## 43.34637 75.38215
#3. T-Test: San Antonio
# Filter San Antonio
hts_SA <- hts %>% filter(region == 15)
# Create jobpop group
jobpop_median <- median(hts_SA$jobpop, na.rm = TRUE)
hts_SA <- hts_SA %>%
mutate(jobpop_group = ifelse(jobpop > jobpop_median, "High", "Low")) %>%
filter(!is.na(lnvmt), !is.na(jobpop_group))
# Run t-test
t.test(lnvmt ~ jobpop_group, data = hts_SA)
##
## Welch Two Sample t-test
##
## data: lnvmt by jobpop_group
## t = -2.2052, df = 1512.2, p-value = 0.02759
## alternative hypothesis: true difference in means between group High and group Low is not equal to 0
## 95 percent confidence interval:
## -0.21709014 -0.01269755
## sample estimates:
## mean in group High mean in group Low
## 2.989259 3.104152
#4. ANOVA: lnvmt by Income Category
# Ensure categorical income
hts_SA$income_cat <- as.factor(hts_SA$income_cat)
anova_result <- aov(lnvmt ~ income_cat, data = hts_SA)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## income_cat 2 144.1 72.03 76.6 <2e-16 ***
## Residuals 1517 1426.5 0.94
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#QQ Plot (Normality)
qqnorm(hts_SA$lnvmt)
qqline(hts_SA$lnvmt)
#Boxplot of lnvmt by Income
boxplot(lnvmt ~ income_cat, data = hts_SA,
main = "Boxplot of Log Vehicle Miles Traveled",
xlab = "Income Category", ylab = "lnvmt")
#5. Tukey Post Hoc Test
tukey_result <- TukeyHSD(anova_result)
plot(tukey_result)
tukey_result
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = lnvmt ~ income_cat, data = hts_SA)
##
## $income_cat
## diff lwr upr p adj
## 2-1 0.5628495 0.4228997 0.7027994 0.00e+00
## 3-1 0.8266261 0.6634770 0.9897752 0.00e+00
## 3-2 0.2637766 0.1187965 0.4087567 6.21e-05
#6. Simulated T-Test Plot
# Create single_family var
hts[, single_family := ifelse(htype == 0, "Single Family", "Other")]
hts[, single_family := factor(single_family)]
t_test_income <- t.test(hhincome ~ single_family, data = hts)
# Plot
curve(dt(x, df = t_test_income$parameter),
from = -4, to = 4, col = "darkblue", lwd = 2,
main = "Simulated T-distribution (T-Test)",
xlab = "T-value", ylab = "Density")
abline(v = t_test_income$statistic, col = "red", lwd = 2)
text(t_test_income$statistic, 0.05,
labels = paste0("t = ", round(t_test_income$statistic, 2)),
pos = 4, col = "red")
#7. Summary of Results #T-test 1: There is a statistically significant difference in household income between housing types (t = …, p < 0.001).
#T-test 2: Households in high job-population areas drive less on average than those in low areas (p < 0.05).
#ANOVA: Significant variation in VMT by income category (p < 0.01); Tukey’s test confirms differences between low and high income groups.