#The primary concern with the "Hypothesis Testing" Module lies in the exclusive consideration of the correlation between the amount of revenue generated and the presence or absence of displayed ads. We advocate for a more comprehensive approach that incorporates additional factors before drawing conclusions about the dependency of ad revenue on display status.
#In our revised version of this module, we have expanded the analysis to include the number of clicks on the ad. By examining the combination of the number of clicks and the presence or absence of display, we aim to gain a more nuanced understanding of how display impacts the revenue generated by the ad.
#It is inaccurate to assert a direct influence of display on ads without accounting for the number of clicks. We contend that a thorough assessment should encompass both variables to better discern the actual impact of display on ad revenue.
# Discuss why you don't agree with the concept of hypothesis testing
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
## Warning: package 'effsize' was built under R version 4.3.2
library(pwrss)
## Warning: package 'pwrss' was built under R version 4.3.2
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
# Location of the marketing data set (a CSV file hosted on GitHub)
url_ <- "https://raw.githubusercontent.com/leontoddjohnson/i590/main/data/marketing/marketing.csv"
# Parse the comma-delimited file into a tibble
marketing <- url_ %>%
  read_delim(delim = ",")
## Rows: 40 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the loaded data
print(marketing)
## # A tibble: 40 × 8
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22.6 165 8672 0 2 58.9 1.9 1.21
## 2 37.3 228 11875 0 2 44.9 1.92 0.88
## 3 55.6 291 14631 0 3 142. 1.99 1.03
## 4 45.4 247 11709 0 2 210. 2.11 0.81
## 5 50.2 290 14768 0 3 198. 1.96 1.03
## 6 33.0 172 8698 0 2 204. 1.98 1.16
## 7 12.9 68 2924 0 1 117. 2.33 1.47
## 8 23.9 112 5919 0 1 72.0 1.89 0.89
## 9 58.4 306 14789 0 3 290. 2.07 0.98
## 10 48.2 300 14818 0 3 246. 2.02 1
## # ℹ 30 more rows
# Order the rows by number of clicks, smallest first, and display the result
marketing_sorted <- arrange(marketing, clicks)
print(marketing_sorted)
## # A tibble: 40 × 8
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.12 48 1862 1 1 17.4 2.58 2.08
## 2 12.9 68 2924 0 1 117. 2.33 1.47
## 3 13.0 106 4056 1 2 232. 2.61 1.89
## 4 23.9 112 5919 0 1 72.0 1.89 0.89
## 5 28.7 117 6043 0 1 16.2 1.94 0.85
## 6 23.0 126 3828 1 2 235. 3.29 1.59
## 7 22.6 165 8672 0 2 58.9 1.9 1.21
## 8 39.7 169 6048 1 3 289. 2.79 1.78
## 9 33.0 172 8698 0 2 204. 1.98 1.16
## 10 25.0 175 5708 1 3 185. 3.07 1.71
## # ℹ 30 more rows
# Mean number of clicks across all rows of the marketing data
average_clicks <- marketing %>%
  pull(clicks) %>%
  mean()
print(average_clicks)
## [1] 284.225
# Label each row by combining click volume with display status:
#   group 1 - fewer clicks than average and display == 0
#   group 2 - at least the average number of clicks and display == 1
# Every other combination (e.g. many clicks with display == 0) is labelled NA
# so that it can be dropped before the two groups are compared.
# The threshold is the computed `average_clicks` rather than a hand-copied,
# truncated constant, so it stays in sync with the data.
data_grouped <- marketing_sorted %>%
  mutate(group_number = case_when(
    clicks < average_clicks & display == 0 ~ 1,  # First group criteria
    clicks >= average_clicks & display == 1 ~ 2, # Second group criteria
    # NA_real_ matches the double-typed labels above; mixing in NA_integer_
    # is a type error on dplyr versions before 1.1.0.
    TRUE ~ NA_real_                              # Mixed cases, excluded later
  )) %>%
  group_by(group_number)
# View the modified data with group numbers
print(data_grouped)
## # A tibble: 40 × 9
## # Groups: group_number [3]
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.12 48 1862 1 1 17.4 2.58 2.08
## 2 12.9 68 2924 0 1 117. 2.33 1.47
## 3 13.0 106 4056 1 2 232. 2.61 1.89
## 4 23.9 112 5919 0 1 72.0 1.89 0.89
## 5 28.7 117 6043 0 1 16.2 1.94 0.85
## 6 23.0 126 3828 1 2 235. 3.29 1.59
## 7 22.6 165 8672 0 2 58.9 1.9 1.21
## 8 39.7 169 6048 1 3 289. 2.79 1.78
## 9 33.0 172 8698 0 2 204. 1.98 1.16
## 10 25.0 175 5708 1 3 185. 3.07 1.71
## # ℹ 30 more rows
## # ℹ 1 more variable: group_number <dbl>
# Keep only the rows that were assigned to one of the two groups
# (i.e. discard rows whose group_number is NA)
Filtered <- filter(data_grouped, !is.na(group_number))
# Show the rows that remain
print(Filtered)
## # A tibble: 19 × 9
## # Groups: group_number [2]
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12.9 68 2924 0 1 117. 2.33 1.47
## 2 23.9 112 5919 0 1 72.0 1.89 0.89
## 3 28.7 117 6043 0 1 16.2 1.94 0.85
## 4 22.6 165 8672 0 2 58.9 1.9 1.21
## 5 33.0 172 8698 0 2 204. 1.98 1.16
## 6 28.3 177 9011 0 2 66.7 1.96 1.13
## 7 37.3 228 11875 0 2 44.9 1.92 0.88
## 8 36.3 238 11855 0 2 280. 2.01 0.84
## 9 45.4 247 11709 0 2 210. 2.11 0.81
## 10 50.2 286 9797 1 4 411. 2.92 1.4
## 11 42.6 288 9748 1 4 355. 2.95 1.39
## 12 48.3 300 9751 1 4 305. 3.08 1.33
## 13 54.9 303 9820 1 4 404. 3.09 1.32
## 14 55.6 360 11879 1 5 309. 3.03 1.39
## 15 65.3 362 11727 1 5 317. 3.09 1.38
## 16 65.3 412 13534 1 6 413. 3.04 1.46
## 17 85.1 479 15500 1 7 411. 3.09 1.46
## 18 91.1 541 17571 1 8 534. 3.08 1.48
## 19 91.5 592 19433 1 9 707. 3.05 1.52
## # ℹ 1 more variable: group_number <dbl>
# Null hypothesis: ads with fewer clicks and no display have the same average
# revenue as ads with more clicks and display.
# Average revenue per group, one row per group_number
by_group <- group_by(Filtered, group_number)
grouped_data <- summarize(by_group, mean_value = mean(revenue))
print(grouped_data)
## # A tibble: 2 × 2
## group_number mean_value
## <dbl> <dbl>
## 1 1 119.
## 2 2 417.
# Lagged differences of a numeric vector: element i + 1 minus element i.
# For two group means c(a, b) this returns b - a.
#   mean_values: numeric vector of means
#   returns:     numeric vector one element shorter than `mean_values`
calculate_difference <- function(mean_values) {
  diff(mean_values)
}
# Group means taken directly from the summary table (group 1 first, then
# group 2), so the values are not hand-copied from rounded printed output.
mean_values <- grouped_data$mean_value
# Observed difference in average revenue: group 2 minus group 1
difference <- calculate_difference(mean_values)
# View the difference
difference
## [1] 297.7178
#Now using the bootstrap function
# Bootstrap a statistic by resampling `x` with replacement.
#   x:      a vector (elements are resampled) or a data frame (rows are
#           resampled); each resample has as many observations as `x`
#   func:   function applied to every resample (default: mean)
#   n_iter: number of resamples to draw
#   returns: the values of `func`, concatenated in iteration order
#            (NULL when n_iter is 0)
# Note: rows are drawn from the whole table, ignoring any dplyr grouping, so
# a resample can by chance leave a group of interest empty.
bootstrap <- function (x, func=mean, n_iter=10^4) {
  is_table <- is.data.frame(x)
  # length() of a data frame is its number of COLUMNS, so the number of
  # observations has to come from nrow() in that case
  n_obs <- if (is_table) nrow(x) else length(x)

  resample_once <- function(i) {
    # draw positions rather than values so vectors and data frames share one
    # code path (and a length-one numeric `x` is not expanded by sample())
    idx <- sample.int(n_obs, size = n_obs, replace = TRUE)
    x_sample <- if (is_table) x[idx, , drop = FALSE] else x[idx]
    func(x_sample)
  }

  # seq_len() gives zero iterations for n_iter = 0; collecting in a list and
  # concatenating once avoids regrowing the result on every iteration
  do.call(c, lapply(seq_len(n_iter), resample_once))
}
# Difference in average revenue between the two display levels.
#   x_data:  data frame with `display` and `revenue` columns
#   returns: mean revenue of the second display level (after sorting by
#            `display`) minus mean revenue of the first
diff_in_avg <- function (x_data) {
  by_display <- group_by(x_data, display)
  avg_table <- summarize(by_display, avg_revenue = mean(revenue))
  avg_table <- arrange(avg_table, display)
  avgs <- avg_table$avg_revenue
  # difference = revenue_with - revenue_without
  avgs[2] - avgs[1]
}
# Bootstrap the difference in average revenue over 100 resamples of the
# filtered data, then display the resulting values
diffs_in_avgs <- bootstrap(x = Filtered, func = diff_in_avg, n_iter = 100)
print(diffs_in_avgs)
## [1] 371.5022 311.9778 269.3022 261.0889 359.7822 338.7289 255.6622 265.0222
## [9] 322.6089 249.1111 225.7022 279.1422 252.6356 283.3867 388.1956 248.1600
## [17] 286.3156 263.0978 166.1200 370.0267 294.0133 232.3156 357.4400 314.3378
## [25] 273.6444 243.9333 271.8267 308.2044 411.2311 330.4089 278.8444 284.7956
## [33] 310.9689 270.6400 209.5467 332.8489 217.0444 239.0444 382.8533 239.9200
## [41] 250.8267 260.5289 243.6667 295.6889 267.4978 294.5733 309.8933 289.9867
## [49] 241.5156 242.8267 367.8622 317.9956 257.4933 341.0978 288.4978 341.9067
## [57] 307.6133 391.0667 363.6667 313.6044 295.5244 341.6800 344.5911 297.6089
## [65] 376.3289 331.3244 272.7378 290.8756 422.0800 236.8311 374.6489 299.7067
## [73] 299.8044 349.7200 389.5733 241.5778 233.5556 243.3333 319.9111 309.1244
## [81] 290.5111 359.7956 349.7156 347.1556 358.0400 308.3244 233.9822 329.0489
## [89] 261.1067 266.4222 336.0178 313.1156 302.3867 230.6533 215.0756 362.0311
## [97] 315.6356 259.5778 312.9689 316.3911
# Draw a normal curve centred on zero whose spread is the standard deviation
# of the bootstrapped differences, and mark the observed difference with a
# vertical line.
ggplot() +
  geom_function(
    fun = function(q) dnorm(q, mean = 0, sd = sd(diffs_in_avgs)),
    xlim = c(-600, 600)
  ) +
  geom_vline(
    aes(
      xintercept = difference,
      color = paste("observed: ", round(difference))
    )
  ) +
  scale_x_continuous(breaks = seq(-600, 600, 100)) +
  theme_minimal() +
  labs(
    title = "Bootstrapped Sampling Distribution of Revenue Differences",
    x = "Difference in Revenue Calculated",
    y = "Probability Density",
    color = ""
  )
#Sampling Distribution
#differences
# Hypothesis testing is a great tool. However, there are many assumptions and issues with hypothesis testing. The main issue is the concept of random sampling.