# The main issue with the "Hypothesis Testing" module is that you considered only the correlation between the amount of revenue generated and whether the ads were displayed or not. We believe more factors should have been taken into consideration before concluding that the revenue generated by an ad depends on whether it has a display or not.
# In our version of this module, we have taken the number of clicks on the ad into consideration as well. Combining the number of clicks on an ad with the presence/absence of a display helps us better determine the impact the display has on the revenue generated by the ad.
# It's incorrect to assume that the display has an impact on the ads without taking into consideration whether the ads with the display had enough clicks.
# Discuss how you don't agree with the concept of hypothesis testing.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
# Location of the marketing data set (one row per ad campaign)
url_ <- "https://raw.githubusercontent.com/leontoddjohnson/i590/main/data/marketing/marketing.csv"
# Read the comma-delimited file into a tibble
marketing <- read_delim(file = url_, delim = ",")
## Rows: 40 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the first rows of the raw data
print(marketing)
## # A tibble: 40 × 8
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22.6 165 8672 0 2 58.9 1.9 1.21
## 2 37.3 228 11875 0 2 44.9 1.92 0.88
## 3 55.6 291 14631 0 3 142. 1.99 1.03
## 4 45.4 247 11709 0 2 210. 2.11 0.81
## 5 50.2 290 14768 0 3 198. 1.96 1.03
## 6 33.0 172 8698 0 2 204. 1.98 1.16
## 7 12.9 68 2924 0 1 117. 2.33 1.47
## 8 23.9 112 5919 0 1 72.0 1.89 0.89
## 9 58.4 306 14789 0 3 290. 2.07 0.98
## 10 48.2 300 14818 0 3 246. 2.02 1
## # ℹ 30 more rows
# Order the ads from fewest to most clicks, then display the result
marketing_sorted <- arrange(marketing, clicks)
print(marketing_sorted)
## # A tibble: 40 × 8
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.12 48 1862 1 1 17.4 2.58 2.08
## 2 12.9 68 2924 0 1 117. 2.33 1.47
## 3 13.0 106 4056 1 2 232. 2.61 1.89
## 4 23.9 112 5919 0 1 72.0 1.89 0.89
## 5 28.7 117 6043 0 1 16.2 1.94 0.85
## 6 23.0 126 3828 1 2 235. 3.29 1.59
## 7 22.6 165 8672 0 2 58.9 1.9 1.21
## 8 39.7 169 6048 1 3 289. 2.79 1.78
## 9 33.0 172 8698 0 2 204. 1.98 1.16
## 10 25.0 175 5708 1 3 185. 3.07 1.71
## # ℹ 30 more rows
# Mean number of clicks across all ads
average_clicks <- mean(marketing[["clicks"]])
print(average_clicks)
## [1] 284.225
# Arranging the data so that both display and clicks are taken into consideration.
# We put the ads into 2 groups: one where the clicks were less than the average and the ad had no display, and one where the clicks were more than the average and the ad had a display.
# Label the two groups that are compared in the hypothesis test:
#   1 = fewer clicks than average and no display
#   2 = at least the average number of clicks and a display
# Ads that match neither rule (e.g. many clicks but no display) get NA.
# The threshold is the computed `average_clicks`, not a hand-typed copy of it,
# so the grouping stays correct if the data changes.
data_grouped <- marketing_sorted %>%
  mutate(group_number = case_when(
    clicks < average_clicks & display == 0 ~ 1, # First group criteria
    clicks >= average_clicks & display == 1 ~ 2, # Second group criteria
    # NA_real_ matches the double type of the labels above
    TRUE ~ NA_real_
  )) %>%
  # Grouping is kept on purpose: later steps summarise and resample per group
  group_by(group_number)
# View the modified data with group numbers
print(data_grouped)
## # A tibble: 40 × 9
## # Groups: group_number [3]
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.12 48 1862 1 1 17.4 2.58 2.08
## 2 12.9 68 2924 0 1 117. 2.33 1.47
## 3 13.0 106 4056 1 2 232. 2.61 1.89
## 4 23.9 112 5919 0 1 72.0 1.89 0.89
## 5 28.7 117 6043 0 1 16.2 1.94 0.85
## 6 23.0 126 3828 1 2 235. 3.29 1.59
## 7 22.6 165 8672 0 2 58.9 1.9 1.21
## 8 39.7 169 6048 1 3 289. 2.79 1.78
## 9 33.0 172 8698 0 2 204. 1.98 1.16
## 10 25.0 175 5708 1 3 185. 3.07 1.71
## # ℹ 30 more rows
## # ℹ 1 more variable: group_number <dbl>
# Keep only the ads that were assigned to one of the two comparison groups
Filtered <- filter(data_grouped, !is.na(group_number))
# Display the remaining rows
print(Filtered)
## # A tibble: 19 × 9
## # Groups: group_number [2]
## spend clicks impressions display transactions revenue ctr con_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12.9 68 2924 0 1 117. 2.33 1.47
## 2 23.9 112 5919 0 1 72.0 1.89 0.89
## 3 28.7 117 6043 0 1 16.2 1.94 0.85
## 4 22.6 165 8672 0 2 58.9 1.9 1.21
## 5 33.0 172 8698 0 2 204. 1.98 1.16
## 6 28.3 177 9011 0 2 66.7 1.96 1.13
## 7 37.3 228 11875 0 2 44.9 1.92 0.88
## 8 36.3 238 11855 0 2 280. 2.01 0.84
## 9 45.4 247 11709 0 2 210. 2.11 0.81
## 10 50.2 286 9797 1 4 411. 2.92 1.4
## 11 42.6 288 9748 1 4 355. 2.95 1.39
## 12 48.3 300 9751 1 4 305. 3.08 1.33
## 13 54.9 303 9820 1 4 404. 3.09 1.32
## 14 55.6 360 11879 1 5 309. 3.03 1.39
## 15 65.3 362 11727 1 5 317. 3.09 1.38
## 16 65.3 412 13534 1 6 413. 3.04 1.46
## 17 85.1 479 15500 1 7 411. 3.09 1.46
## 18 91.1 541 17571 1 8 534. 3.08 1.48
## 19 91.5 592 19433 1 9 707. 3.05 1.52
## # ℹ 1 more variable: group_number <dbl>
# Null hypothesis: ads with fewer clicks and no display have the same average revenue as those with more clicks and a display (the difference in revenue is 0).
# Average revenue within each comparison group
grouped_data <- Filtered |>
  group_by(group_number) |>
  summarize(mean_value = mean(revenue))
print(grouped_data)
## # A tibble: 2 × 2
## group_number mean_value
## <dbl> <dbl>
## 1 1 119.
## 2 2 417.
# Successive differences of a vector of means.
#
# mean_values: numeric vector of group means, in group order.
# Returns each element minus the one before it (one value for two groups).
calculate_difference <- function(mean_values) {
  diff(mean_values, lag = 1L, differences = 1L)
}
# Group means taken straight from the summary table (ordered by group_number)
# rather than retyped from printed output, so they are exact and cannot go
# stale when the data changes.
mean_values <- grouped_data$mean_value
# Observed difference in mean revenue: group 2 minus group 1
difference <- calculate_difference(mean_values)
# View the difference
difference
## [1] 297.7178
# Now using the bootstrap function
# Bootstrap the sampling distribution of a statistic.
#
# Arguments:
#   x      - a data frame (its rows are resampled) or a vector (its elements
#            are resampled).
#   func   - function applied to every resample; defaults to `mean`.
#   n_iter - number of bootstrap resamples to draw.
#
# Returns the values of `func` over all resamples combined with `c()`
# (NULL when `n_iter` is 0).
bootstrap <- function(x, func = mean, n_iter = 10^4) {
  # Draw one resample, with replacement, of the same size as the data
  resample <- function() {
    if (is.data.frame(x)) {
      # `length(x)` is the number of COLUMNS of a data frame, so it must not
      # be used as the sample size. `prop = 1` resamples as many rows as the
      # data has (within each group when `x` is grouped).
      dplyr::slice_sample(x, prop = 1, replace = TRUE)
    } else {
      # Index-based so a length-one numeric `x` is not expanded to 1:x,
      # which is what `sample(x, ...)` would do
      x[sample.int(length(x), replace = TRUE)]
    }
  }
  # seq_len() yields no iterations for n_iter = 0 (1:0 would yield two), and
  # collecting into a list avoids re-copying the result on every iteration
  func_values <- lapply(seq_len(n_iter), function(i) func(resample()))
  do.call(c, func_values)
}
# Difference in average revenue: ads with a display minus ads without.
#
# x_data: data frame with a numeric `display` column (1 = display,
#         0 = no display) and a numeric `revenue` column.
# Returns a single number, or NA when either kind of ad is absent from
# `x_data` (which can happen in a resample).
diff_in_avg <- function(x_data) {
  # Select each group by its display VALUE rather than by row position in a
  # sorted summary, so an unexpected or missing display value cannot
  # silently change which groups are compared. which() ignores NA displays.
  revenue_with <- x_data$revenue[which(x_data$display == 1)]
  revenue_without <- x_data$revenue[which(x_data$display == 0)]
  if (length(revenue_with) == 0 || length(revenue_without) == 0) {
    return(NA_real_)
  }
  # difference = revenue_with - revenue_without
  mean(revenue_with) - mean(revenue_without)
}
# Bootstrap the revenue difference between the two groups (100 resamples)
diffs_in_avgs <- Filtered |>
  bootstrap(func = diff_in_avg, n_iter = 100)
print(diffs_in_avgs)
## [1] 294.0711 268.3333 237.3289 315.5244 307.3689 289.3244 299.6000 281.1467
## [9] 296.4578 286.2133 392.3556 379.0578 337.2222 346.4578 264.4933 340.7600
## [17] 367.5111 295.5822 254.9289 424.3467 270.0622 330.5289 332.3289 253.2444
## [25] 283.5778 303.4133 305.3244 217.0844 329.3644 347.6400 384.9644 303.2267
## [33] 350.1156 376.2489 260.2756 234.6933 304.2044 308.4133 372.8267 271.2667
## [41] 285.7378 332.9067 477.2489 298.9111 325.6444 235.6756 256.8978 354.6267
## [49] 253.2533 314.2933 333.0356 214.9556 225.0178 235.6489 333.8133 223.3911
## [57] 277.1111 197.7689 302.4622 265.7111 288.3867 302.1911 255.8622 304.5200
## [65] 288.5022 306.1422 272.1200 328.6622 269.8578 297.5289 311.3200 310.0800
## [73] 386.2711 267.7244 301.4000 339.3156 281.8533 281.6800 322.2489 453.0800
## [81] 352.4000 295.8756 293.4844 298.5111 238.1733 287.5422 325.6978 399.5422
## [89] 200.0267 268.2889 207.2133 314.1111 303.3067 285.4267 232.1289 284.7467
## [97] 259.5467 278.4533 324.8533 289.9689
# Sampling distribution under the null hypothesis: a normal curve centred on 0
# whose spread is the standard deviation of the bootstrapped differences.
# The observed difference is overlaid as a vertical line.
ggplot() +
  geom_function(
    fun = \(x) dnorm(x, mean = 0, sd = sd(diffs_in_avgs)),
    xlim = c(-600, 600)
  ) +
  geom_vline(
    mapping = aes(
      xintercept = difference,
      color = paste("observed: ", round(difference))
    )
  ) +
  labs(
    title = "Bootstrapped Sampling Distribution of Revenue Differences",
    x = "Difference in Revenue Calculated",
    y = "Probability Density",
    color = ""
  ) +
  scale_x_continuous(breaks = seq(-600, 600, by = 100)) +
  theme_minimal()
# Sampling distribution: the null hypothesis does not appear to be true. The
# observed difference does not lie in the bulk of the sampling distribution,
# and it certainly does not lie in the center, where the average difference is 0.
#Conclusion:
# Hypothesis testing is a great tool. However, there are many assumptions and issues with hypothesis testing. The main issue is the concept of random sampling: each time around you get different results, so making inferences can get tricky at times.
# There are also multiple ethical issues with capturing data and analysing it to predict consumer behaviour. It plays on the psychology of human beings, and profiting from it is not always ethical.