library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)

# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)

# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]

# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1 
##        43.34637        75.38215

#2. T-Test: Income by Housing Type

# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)

# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]

# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1 
##        43.34637        75.38215
#2. T-Test: Income by Housing Type

# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)

# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]

# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1 
##        43.34637        75.38215
#Histogram of Income by Housing Type

ggplot(hts_two_groups, aes(x = hhincome, fill = htype)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
  labs(title = "Histogram of Household Income by Housing Type",
       x = "Household Income", fill = "Housing Type") +
  theme_minimal()

#3. T-Test: San Antonio

library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)

#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)

# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)

# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]

# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1 
##        43.34637        75.38215

#2. T-Test: Income by Housing Type

library(data.table)
library(haven)
library(foreign)
library(ggplot2)
library(dplyr)

#1. Load Household Dataset
hts <- read_sav("~/Desktop/Methods2_Labs/07-lab-assignment-2025-Green-range/datasets/HTS.household.10regions.sav")
hts <- as.data.table(hts)

# Convert and clean
hts$hhincome <- as.numeric(hts$hhincome)
hts_clean <- hts[!is.na(hhincome)]
hts_clean$htype <- as.factor(hts_clean$htype)

# Filter only two groups
hts_two_groups <- hts_clean[htype %in% c("0", "1")]

# Run t-test
t.test(hhincome ~ htype, data = hts_two_groups)
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = -15.389, df = 194.91, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -36.14148 -27.93007
## sample estimates:
## mean in group 0 mean in group 1 
##        43.34637        75.38215

#3. T-Test: San Antonio

# Filter San Antonio
hts_SA <- hts %>% filter(region == 15)

# Create jobpop group
jobpop_median <- median(hts_SA$jobpop, na.rm = TRUE)
hts_SA <- hts_SA %>%
  mutate(jobpop_group = ifelse(jobpop > jobpop_median, "High", "Low")) %>%
  filter(!is.na(lnvmt), !is.na(jobpop_group))

# Run t-test
t.test(lnvmt ~ jobpop_group, data = hts_SA)
## 
##  Welch Two Sample t-test
## 
## data:  lnvmt by jobpop_group
## t = -2.2052, df = 1512.2, p-value = 0.02759
## alternative hypothesis: true difference in means between group High and group Low is not equal to 0
## 95 percent confidence interval:
##  -0.21709014 -0.01269755
## sample estimates:
## mean in group High  mean in group Low 
##           2.989259           3.104152

#4. ANOVA: lnvmt by Income Category

# Ensure categorical income
hts_SA$income_cat <- as.factor(hts_SA$income_cat)
anova_result <- aov(lnvmt ~ income_cat, data = hts_SA)
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## income_cat     2  144.1   72.03    76.6 <2e-16 ***
## Residuals   1517 1426.5    0.94                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#QQ Plot (Normality)
qqnorm(hts_SA$lnvmt)
qqline(hts_SA$lnvmt)

#Boxplot of lnvmt by Income

boxplot(lnvmt ~ income_cat, data = hts_SA,
        main = "Boxplot of Log Vehicle Miles Traveled",
        xlab = "Income Category", ylab = "lnvmt")

#5. Tukey Post Hoc Test

tukey_result <- TukeyHSD(anova_result)
plot(tukey_result)

tukey_result
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lnvmt ~ income_cat, data = hts_SA)
## 
## $income_cat
##          diff       lwr       upr    p adj
## 2-1 0.5628495 0.4228997 0.7027994 0.00e+00
## 3-1 0.8266261 0.6634770 0.9897752 0.00e+00
## 3-2 0.2637766 0.1187965 0.4087567 6.21e-05

#6. Simulated T-Test Plot

# Create single_family var
hts[, single_family := ifelse(htype == 0, "Single Family", "Other")]
hts[, single_family := factor(single_family)]
t_test_income <- t.test(hhincome ~ single_family, data = hts)

# Plot
curve(dt(x, df = t_test_income$parameter),
      from = -4, to = 4, col = "darkblue", lwd = 2,
      main = "Simulated T-distribution (T-Test)",
      xlab = "T-value", ylab = "Density")
abline(v = t_test_income$statistic, col = "red", lwd = 2)
text(t_test_income$statistic, 0.05,
     labels = paste0("t = ", round(t_test_income$statistic, 2)),
     pos = 4, col = "red")

#7. Summary of Results #T-test 1: There is a statistically significant difference in household income between housing types (t = …, p < 0.001).

#T-test 2: Households in high job-population areas drive less on average than those in low areas (p < 0.05).

#ANOVA: Significant variation in VMT by income category (p < 0.01); Tukey’s test confirms differences between low and high income groups.