library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  1. Import dataset “HTS.household.10regions.csv”, and answer the following questions.
# Load the household travel survey extract that the analyses below read from.
household_data <- read.csv(file = "./data/HTS.household.10regions.csv")
  1. Researcher A hypothesizes that there is a difference in the number of household car trips (autotrips) between single-family-detached housing (htype==1) and multi-family housing (htype==3).
# Keep only single-family-detached (htype 1) and multi-family (htype 3)
# households, then narrow the data to the two columns the comparison needs.
ds_cleaned <- household_data %>%
  filter(htype %in% c(1, 3)) %>%
  select(autotrips, htype)

#Null Hypothesis: There is no difference in the number of household car trips (autotrips) between single-family-detached housing (htype == 1) and multi-family housing (htype == 3).

#Research Hypothesis: There is a difference in the number of household car trips (autotrips) between single-family-detached housing (htype==1) and multi-family housing (htype==3)

#No, because the sample size in each group is large (well over 30), so the central limit theorem applies

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Levene's test (car) for equal variance of autotrips across the two housing types.
leveneTest(autotrips ~ factor(htype), data = ds_cleaned)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     1  144.05 < 2.2e-16 ***
##       12120                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(stats)
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
# Store htype as a factor, then rerun Levene's test with rstatix::levene_test(),
# which returns the result as a tibble.
ds_cleaned[["htype"]] <- as.factor(ds_cleaned[["htype"]])
levene_test(ds_cleaned, autotrips ~ htype)
## # A tibble: 1 × 4
##     df1   df2 statistic        p
##   <int> <int>     <dbl>    <dbl>
## 1     1 12120      144. 5.32e-33

#Because of the large sample, we don't have to test for normality. Levene's test is significant (p < .05), so we reject its null hypothesis of equal variances and should use Welch's t-test.

# Welch's two-sample t-test of autotrips by housing type.
# var.equal = FALSE because Levene's test rejected equal variances; it is spelled
# out as FALSE rather than F, since F is an ordinary variable that can be reassigned.
# The rstatix:: prefix keeps the call unambiguous: sjstats, attached later in
# this file, also exports a t_test() that masks this one.
ds_cleaned %>%
  rstatix::t_test(autotrips ~ htype, var.equal = FALSE)
## # A tibble: 1 × 8
##   .y.       group1 group2    n1    n2 statistic    df         p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl> <dbl>     <dbl>
## 1 autotrips 1      3      10712  1410      30.4 2240. 3.53e-170

#Because the p-value is less than .05 we can reject the null hypothesis

  1. Researcher B hypothesizes that there is a relationship between household type (sf) and any household transit trips (anytransit).

#sf is nominal, because it is a categorical variable for household type
#anytransit is also nominal, as it is a variable that tells whether or not there are any transit trips for each household

#Null Hypothesis: there is no relationship between household type (sf) and any household transit trips (anytransit).

#Research Hypothesis: there is a relationship between household type (sf) and any household transit trips (anytransit).

#A Pearson chi-squared test should be used to test this hypothesis because it is the appropriate test when comparing two categorical variables.

# Pearson chi-squared test of independence between sf and anytransit.
chisq.test(
  x = household_data$sf,
  y = household_data$anytransit
)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  household_data$sf and household_data$anytransit
## X-squared = 253.64, df = 1, p-value < 2.2e-16

#Since the p-value is less than .05, we reject the null hypothesis

#install.packages("sjstats")
library(sjstats)
## Warning: package 'sjstats' was built under R version 4.4.2
## 
## Attaching package: 'sjstats'
## The following object is masked from 'package:rstatix':
## 
##     t_test
# Cross-tabulate sf against anytransit and report Cramer's V as the effect size
# for the chi-squared test. The contingency table gets a descriptive name so the
# object does not mask base::table().
sf_transit_table <- table(household_data$sf, household_data$anytransit)
cramer(sf_transit_table)
## [1] 0.1386222
  1. Researcher C hypothesizes that the household car trips (autotrips) vary by real household income (hhincome).

#Null Hypothesis: household car trips (autotrips) do not vary by real household income (hhincome).
#Research Hypothesis: household car trips (autotrips) vary by real household income (hhincome).

library(stats)
# Pearson correlation test between household income and household car trips.
cor.test(
  x = household_data$hhincome,
  y = household_data$autotrips
)
## 
##  Pearson's product-moment correlation
## 
## data:  household_data$hhincome and household_data$autotrips
## t = 30.147, df = 13353, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2364873 0.2682469
## sample estimates:
##       cor 
## 0.2524351
#pearsons product-moment correlation

#Since the p-value is far below .05, we reject the null hypothesis; the data support the research hypothesis that household car trips vary by real household income.