library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the household travel survey extract; the path is relative to the
# working directory the script is run from.
household_data <- utils::read.csv(
  file = "./data/HTS.household.10regions.csv"
)
What is the level of measurement of htype
and
autotrips
? #htype level of measurement is Nominal - the
number is representing a categorical variable, in this case housing type
#autotrips level of measurement is Ratio - it is numerical, with equal
intervals between values and a true zero
Use pipe to select autotrips
and htype
,
then keep rows whose value of htype
is 1 or 3. And name
your dataset ds_cleaned
.
# Build the two-group comparison set: keep only households whose htype is
# 1 or 3, then retain just the outcome (autotrips) and the grouping
# variable (htype), in that column order.
ds_cleaned <- household_data %>%
  dplyr::filter(htype %in% c(1, 3)) %>%
  dplyr::select(autotrips, htype)
#Null Hypothesis: There is no difference in the number of household car trips (autotrips) between single-family-detached housing (htype == 1) and multi-family housing (htype == 3).
#Research Hypothesis: There is a difference in the number of household car trips (autotrips) between single-family-detached housing (htype==1) and multi-family housing (htype==3)
#No, because the sample is large (well over 30 observations per group), so by the central limit theorem the sampling distribution of the mean is approximately normal
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Levene's test for homogeneity of variance of autotrips across the two
# htype groups; htype is converted to a factor inline for grouping.
car::leveneTest(autotrips ~ factor(htype), data = ds_cleaned)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 144.05 < 2.2e-16 ***
## 12120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(stats)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# Store htype as a factor so the downstream tests treat it as a grouping
# variable, then repeat the Levene check with the rstatix implementation.
ds_cleaned[["htype"]] <- as.factor(ds_cleaned[["htype"]])
rstatix::levene_test(ds_cleaned, autotrips ~ htype)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 12120 144. 5.32e-33
#Because the sample is large, we don't have to test for normality. Levene's test is significant, so we reject its null hypothesis of equal variances and should use Welch’s t-test
# Welch's two-sample t-test of autotrips by htype. Equal variances are not
# assumed (var.equal = FALSE). The logical is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned, whereas
# `FALSE` is a reserved word. The rstatix:: prefix pins the implementation
# even if another attached package also exports t_test().
ds_cleaned %>%
  rstatix::t_test(autotrips ~ htype, var.equal = FALSE)
## # A tibble: 1 × 8
## .y. group1 group2 n1 n2 statistic df p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 autotrips 1 3 10712 1410 30.4 2240. 3.53e-170
#Because the p-value is less than .05 we can reject the null hypothesis
sf
and
anytransit
?#sf is nominal, because it is a categorical variable for household type #anytransit is also nominal, as it is a variable that tells whether or not a household made any transit trips
#Null Hypothesis: there is no relationship between household type (sf) and any household transit trips (anytransit).
#Research Hypothesis: there is a relationship between household type (sf) and any household transit trips (anytransit).
#A Pearson chi-squared test should be used to test this hypothesis because it is the appropriate test for assessing the association between two categorical variables.
# Pearson chi-squared test of independence between sf and anytransit.
# correct = TRUE is the default, written out here so the continuity
# correction is visible at the call site.
chisq.test(
  x = household_data$sf,
  y = household_data$anytransit,
  correct = TRUE
)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: household_data$sf and household_data$anytransit
## X-squared = 253.64, df = 1, p-value < 2.2e-16
#since the p-value is less than .05, we reject null hypothesis
#install.packages("sjstats")
library(sjstats)
## Warning: package 'sjstats' was built under R version 4.4.2
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:rstatix':
##
## t_test
# Cross-tabulate sf against anytransit and pass the contingency table to
# sjstats::cramer() as an effect-size measure for the chi-squared test.
# NOTE(review): the object name `table` shadows base::table(); it is kept
# unchanged here in case later code refers to it. Consider renaming.
table <- base::table(household_data$sf, household_data$anytransit)
sjstats::cramer(table)
## [1] 0.1386222
What is the level of measurement of autotrips
and
hhincome
? #autotrips level of measurement is Ratio - it is
numerical, with equal intervals and a true zero #hhincome also has a ratio level of
measurement because it is a continuous variable with a meaningful zero and
equal increments (i.e., it’s currency)
Use an appropriate statistic to test this hypothesis and strength (State the null and research hypothesis, and the results for each test)
#Null Hypothesis: household car trips (autotrips) do not vary by real household income (hhincome). #Research Hypothesis: household car trips (autotrips) vary by real household income (hhincome).
library(stats)
# Pearson product-moment correlation between household income and the
# number of household car trips; "pearson" is the default method, stated
# explicitly here.
stats::cor.test(
  x = household_data$hhincome,
  y = household_data$autotrips,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: household_data$hhincome and household_data$autotrips
## t = 30.147, df = 13353, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2364873 0.2682469
## sample estimates:
## cor
## 0.2524351
#pearsons product-moment correlation
#Since the p-value is far below .05, we reject the null hypothesis; the data support the research hypothesis that household car trips vary with real household income. The correlation (r ≈ 0.25) is positive but weak.