#install.packages("ade4")
#library(ade4)
library(tidyverse)
# Print a preview of the txhousing tibble (one row per city, year and month).
txhousing
## # A tibble: 8,602 × 9
## city year month sales volume median listings inventory date
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abilene 2000 1 72 5380000 71400 701 6.3 2000
## 2 Abilene 2000 2 98 6505000 58700 746 6.6 2000.
## 3 Abilene 2000 3 130 9285000 58100 784 6.8 2000.
## 4 Abilene 2000 4 98 9730000 68600 785 6.9 2000.
## 5 Abilene 2000 5 141 10590000 67300 794 6.8 2000.
## 6 Abilene 2000 6 156 13910000 66900 780 6.6 2000.
## 7 Abilene 2000 7 152 12635000 73500 742 6.2 2000.
## 8 Abilene 2000 8 131 10710000 75000 765 6.4 2001.
## 9 Abilene 2000 9 104 7615000 64500 771 6.5 2001.
## 10 Abilene 2000 10 101 7040000 59300 764 6.6 2001.
## # ℹ 8,592 more rows
##
## Cohen's d
##
## d estimate: -0.1123257 (negligible)
## 95 percent confidence interval:
## lower upper
## -0.9596482 0.7349968
## [1] 136.1667
## [1] 132.5
As seen below, we have only 12 data points in each of the 2009 and 2010 subsets of our data, which I believe is too few.
# Power analysis for a two-sided, two-sample t-test: how many observations per
# group are needed to reach 80% power at alpha = 0.05, using the standard
# deviation of the 2009 sales as the assumed spread.
test <- pwrss.t.2means(
  mu1 = 1.5,
  sd1 = abeline_df |>
    filter(year == 2009) |>
    pull(sales) |>
    sd(),
  kappa = 1,
  power = .8,
  alpha = 0.05,
  alternative = "not equal"
)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 6023
## n2 = 6023
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 12044
## Non-centrality parameter = 2.802
## Type I error rate = 0.05
## Type II error rate = 0.2
As seen above, a sample size of n1 = n2 = 6023 per group is required to detect our desired minimum effect size with 80% power. This is likely because our Cohen’s d value is very low, so many more data points are needed before an effect of this size can be detected reliably.
Therefore, we cannot adequately conduct Fisher’s Sampling Test.
# Repeat the power analysis, this time using the standard deviation of sales
# across all rows of abeline_df as the assumed spread.
test <- pwrss.t.2means(
  mu1 = 1.5,
  sd1 = sd(abeline_df[["sales"]]),
  kappa = 1,
  power = .8,
  alpha = 0.05,
  alternative = "not equal"
)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 11171
## n2 = 11171
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 22340
## Non-centrality parameter = 2.802
## Type I error rate = 0.05
## Type II error rate = 0.2
# Plot the power-analysis result stored in `test`.
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
## Hypothesis Test 2:
We can redefine our null hypothesis as “Mean Listings in December - Mean Listings in June = 0”. Given that our null hypothesis concerns the difference in the means of two distributions, the appropriate test is a two-sample t-test, as we are concerned with the difference between the means.
Let us first visualise the difference in means:
# Keep only the June (6) and December (12) observations from 2010.
# filter() drops rows whose condition evaluates to NA, whereas base `[` with a
# logical mask would turn such rows into all-NA rows.
df_dec_jun <- txhousing |>
  filter(year == 2010, month %in% c(6, 12))
df_dec_jun
## # A tibble: 92 × 9
## city year month sales volume median listings inventory date
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abilene 2010 6 169 23216943 127900 932 6.7 2010.
## 2 Abilene 2010 12 116 15289470 118300 830 6.3 2011.
## 3 Amarillo 2010 6 272 42959136 139400 1449 6.1 2010.
## 4 Amarillo 2010 12 185 24975000 118900 1381 6.5 2011.
## 5 Arlington 2010 6 367 58389192 134600 2221 5.9 2010.
## 6 Arlington 2010 12 285 41382701 130000 1821 5.6 2011.
## 7 Austin 2010 6 2190 584250558 200500 13353 7.2 2010.
## 8 Austin 2010 12 1561 384045548 191200 9284 5.6 2011.
## 9 Bay Area 2010 6 520 101230779 162100 4627 10.2 2010.
## 10 Bay Area 2010 12 396 76002315 158200 3938 9.6 2011.
## # ℹ 82 more rows
# Side-by-side boxplots of the number of listings in June vs. December 2010.
# `month` is converted to a labelled factor so the axis shows month names
# instead of the integers 6 and 12.
df_dec_jun |>
  ggplot(
    mapping = aes(
      x = listings,
      y = factor(month, levels = c(6, 12), labels = c("June", "December"))
    )
  ) +
  geom_boxplot() +
  # The title now describes what is plotted; the previous one
  # ("Advertisement Effect on Revenue") belonged to a different analysis.
  labs(
    title = "Listings in June vs. December 2010",
    x = "Listings (# of Properties )",
    y = "Month"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
As seen above, the medians actually appear to be very similar, as do the general distributions of listings in the two months. However, we need to explore this further by calculating a p-value before we can draw any conclusions.
# Extract the listings for each month as plain numeric vectors, then compare
# their means with t.test() (reported in the output as a Welch two-sample test).
jun_2010 <- df_dec_jun |>
  filter(month == 6) |>
  pull(listings)
dec_2010 <- df_dec_jun |>
  filter(month == 12) |>
  pull(listings)
t.test(x = jun_2010, y = dec_2010)
##
## Welch Two Sample t-test
##
## data: jun_2010 and dec_2010
## t = 0.38315, df = 86.196, p-value = 0.7025
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2246.164 3318.786
## sample estimates:
## mean of x mean of y
## 3747.756 3211.444
Our calculated p-value is 0.7025, which is well above our significance level of 0.05. This p-value gives us the probability, assuming the null hypothesis is true, of obtaining a result that is equal to or more extreme than our observed difference in means. In this case the difference in means is 3747.756 - 3211.444 = 536.312, and the probability of obtaining a result that is equal to or even more extreme than this is 0.7025.
This implies that we fail to reject the null hypothesis that there is no difference between the average number of listings in December and June of 2010.