#The main issue with the "Hypothesis Testing" module is that it considered only the relationship between the amount of revenue generated and whether the ads were displayed. We believe more factors should have been taken into consideration before concluding that the revenue generated by an ad depends on whether it has a display or not. 

#In our version of this module, we have taken the number of clicks on the ad into consideration as well. Combining the number of clicks on an ad with the presence/absence of display will help us better determine the impact display has on the revenue generated by the ad.

#It's incorrect to assume that the display has an impact on the ads without taking into consideration whether the ads with the display had enough clicks.

#Discuss how you don't agree with the concept of hypothesis testing

#Importing modules

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

#Importing Data

# Location of the marketing dataset (a comma-separated text file).
url_ <- "https://raw.githubusercontent.com/leontoddjohnson/i590/main/data/marketing/marketing.csv"
# Read the remote file into a tibble, splitting fields on commas.
marketing <- url_ %>%
  read_delim(delim = ",")
## Rows: 40 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Viewing the data

# Show the first rows of the raw table.
print(marketing)
## # A tibble: 40 × 8
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  22.6    165        8672       0            2    58.9  1.9      1.21
##  2  37.3    228       11875       0            2    44.9  1.92     0.88
##  3  55.6    291       14631       0            3   142.   1.99     1.03
##  4  45.4    247       11709       0            2   210.   2.11     0.81
##  5  50.2    290       14768       0            3   198.   1.96     1.03
##  6  33.0    172        8698       0            2   204.   1.98     1.16
##  7  12.9     68        2924       0            1   117.   2.33     1.47
##  8  23.9    112        5919       0            1    72.0  1.89     0.89
##  9  58.4    306       14789       0            3   290.   2.07     0.98
## 10  48.2    300       14818       0            3   246.   2.02     1   
## # ℹ 30 more rows

##Arranging the data according to the number of clicks

# Order the ads from fewest to most clicks, then display the result.
marketing_sorted <- arrange(marketing, clicks)
print(marketing_sorted)
## # A tibble: 40 × 8
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  1.12     48        1862       1            1    17.4  2.58     2.08
##  2 12.9      68        2924       0            1   117.   2.33     1.47
##  3 13.0     106        4056       1            2   232.   2.61     1.89
##  4 23.9     112        5919       0            1    72.0  1.89     0.89
##  5 28.7     117        6043       0            1    16.2  1.94     0.85
##  6 23.0     126        3828       1            2   235.   3.29     1.59
##  7 22.6     165        8672       0            2    58.9  1.9      1.21
##  8 39.7     169        6048       1            3   289.   2.79     1.78
##  9 33.0     172        8698       0            2   204.   1.98     1.16
## 10 25.0     175        5708       1            3   185.   3.07     1.71
## # ℹ 30 more rows

#finding average number of clicks

# Mean number of clicks across every ad in the dataset.
average_clicks <- mean(marketing[["clicks"]])

print(x = average_clicks)
## [1] 284.225

#Arranging in a manner where display and clicks are taken into consideration. #We put the ads into 2 groups: one where the clicks were less than the average and the ad had no display, and one where the clicks were more than the average and the ad had display.

# Label each ad with the comparison group it belongs to:
#   1 = below-average clicks and no display,
#   2 = at-or-above-average clicks and display.
# The cut-off is the computed `average_clicks` rather than a hand-typed 284,
# so the split always matches the stated "less than / more than the average"
# rule (an ad with exactly 284 clicks is below the 284.225 average).
data_grouped <- marketing_sorted %>%
  mutate(group_number = case_when(
    clicks < average_clicks & display == 0 ~ 1,  # First group criteria
    clicks >= average_clicks & display == 1 ~ 2, # Second group criteria
    # Mixed cases (low clicks with display, high clicks without display)
    # belong to neither group; they are marked NA so they can be dropped.
    # NA_real_ keeps the column type consistent with the double codes above.
    TRUE ~ NA_real_
  )) %>%
  group_by(group_number)

# View the modified data with group numbers
print(data_grouped)
## # A tibble: 40 × 9
## # Groups:   group_number [3]
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  1.12     48        1862       1            1    17.4  2.58     2.08
##  2 12.9      68        2924       0            1   117.   2.33     1.47
##  3 13.0     106        4056       1            2   232.   2.61     1.89
##  4 23.9     112        5919       0            1    72.0  1.89     0.89
##  5 28.7     117        6043       0            1    16.2  1.94     0.85
##  6 23.0     126        3828       1            2   235.   3.29     1.59
##  7 22.6     165        8672       0            2    58.9  1.9      1.21
##  8 39.7     169        6048       1            3   289.   2.79     1.78
##  9 33.0     172        8698       0            2   204.   1.98     1.16
## 10 25.0     175        5708       1            3   185.   3.07     1.71
## # ℹ 30 more rows
## # ℹ 1 more variable: group_number <dbl>

#remove values where the group number is NA

# Keep only the ads that were assigned to one of the two comparison groups.
Filtered <- filter(data_grouped, !is.na(group_number))

# Show the remaining rows.
print(Filtered)
## # A tibble: 19 × 9
## # Groups:   group_number [2]
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  12.9     68        2924       0            1   117.   2.33     1.47
##  2  23.9    112        5919       0            1    72.0  1.89     0.89
##  3  28.7    117        6043       0            1    16.2  1.94     0.85
##  4  22.6    165        8672       0            2    58.9  1.9      1.21
##  5  33.0    172        8698       0            2   204.   1.98     1.16
##  6  28.3    177        9011       0            2    66.7  1.96     1.13
##  7  37.3    228       11875       0            2    44.9  1.92     0.88
##  8  36.3    238       11855       0            2   280.   2.01     0.84
##  9  45.4    247       11709       0            2   210.   2.11     0.81
## 10  50.2    286        9797       1            4   411.   2.92     1.4 
## 11  42.6    288        9748       1            4   355.   2.95     1.39
## 12  48.3    300        9751       1            4   305.   3.08     1.33
## 13  54.9    303        9820       1            4   404.   3.09     1.32
## 14  55.6    360       11879       1            5   309.   3.03     1.39
## 15  65.3    362       11727       1            5   317.   3.09     1.38
## 16  65.3    412       13534       1            6   413.   3.04     1.46
## 17  85.1    479       15500       1            7   411.   3.09     1.46
## 18  91.1    541       17571       1            8   534.   3.08     1.48
## 19  91.5    592       19433       1            9   707.   3.05     1.52
## # ℹ 1 more variable: group_number <dbl>

#NULL Hypothesis: ads with fewer clicks and no display have the same average revenue as those with more clicks and display (the difference in revenue is 0)

# Average revenue within each comparison group.
grouped_data <- summarize(
  group_by(Filtered, group_number),
  mean_value = mean(revenue)
)

print(grouped_data)
## # A tibble: 2 × 2
##   group_number mean_value
##          <dbl>      <dbl>
## 1            1       119.
## 2            2       417.
# Return the lagged differences of `mean_values` (each element minus the one
# before it); for a pair of means this is the second minus the first.
calculate_difference <- function(mean_values) {
  diff(mean_values)
}

# Group means taken straight from `grouped_data` (ordered by group number)
# instead of re-typing rounded values by hand, so the observed difference
# stays in sync with the data.
mean_values <- grouped_data$mean_value[order(grouped_data$group_number)]

# Calculate the difference between mean values (group 2 minus group 1)
difference <- calculate_difference(mean_values)

# View the difference
difference
## [1] 297.7178

#Now using the bootstrap function

# Bootstrap the sampling distribution of a statistic.
#
# Arguments:
#   x      - a data frame (whole rows are resampled) or a vector (elements
#            are resampled).
#   func   - function applied to each resample; defaults to `mean`.
#   n_iter - number of resamples to draw.
#
# Returns the values of `func` over all resamples combined into one vector
# (NULL when `n_iter` is 0).
bootstrap <- function (x, func=mean, n_iter=10^4) {
  by_row <- is.data.frame(x)

  # A bootstrap resample has as many observations as the original data.
  # `length()` of a data frame is its number of COLUMNS, so rows must be
  # counted with `nrow()`.
  n_obs <- if (by_row) nrow(x) else length(x)

  one_value <- function(i) {
    # draw observation indices with replacement, then pull those observations
    picked <- sample.int(n_obs, size = n_obs, replace = TRUE)
    x_sample <- if (by_row) x[picked, , drop = FALSE] else x[picked]
    func(x_sample)
  }

  # `seq_len()` gives zero iterations for n_iter = 0 (unlike `1:n_iter`), and
  # collecting through `lapply()` avoids regrowing the result on every pass.
  unlist(lapply(seq_len(n_iter), one_value))
}
# Difference in mean revenue between the two `display` groups of `x_data`:
# the mean for the second-lowest `display` value minus the mean for the
# lowest one (with 0/1 coding: revenue with display minus revenue without).
diff_in_avg <- function (x_data) {
  avg_by_display <- x_data %>%
    group_by(display) %>%
    summarise(avg_revenue = mean(revenue)) %>%
    arrange(display) %>%
    pull(avg_revenue)

  # after sorting, position 1 holds the lowest display value and position 2
  # the next one
  avg_by_display[2] - avg_by_display[1]
}

# Draw 100 bootstrap resamples of the filtered ads and record the difference
# in average revenue for each one.
diffs_in_avgs <- bootstrap(x = Filtered, func = diff_in_avg, n_iter = 100)

print(diffs_in_avgs)
##   [1] 294.0711 268.3333 237.3289 315.5244 307.3689 289.3244 299.6000 281.1467
##   [9] 296.4578 286.2133 392.3556 379.0578 337.2222 346.4578 264.4933 340.7600
##  [17] 367.5111 295.5822 254.9289 424.3467 270.0622 330.5289 332.3289 253.2444
##  [25] 283.5778 303.4133 305.3244 217.0844 329.3644 347.6400 384.9644 303.2267
##  [33] 350.1156 376.2489 260.2756 234.6933 304.2044 308.4133 372.8267 271.2667
##  [41] 285.7378 332.9067 477.2489 298.9111 325.6444 235.6756 256.8978 354.6267
##  [49] 253.2533 314.2933 333.0356 214.9556 225.0178 235.6489 333.8133 223.3911
##  [57] 277.1111 197.7689 302.4622 265.7111 288.3867 302.1911 255.8622 304.5200
##  [65] 288.5022 306.1422 272.1200 328.6622 269.8578 297.5289 311.3200 310.0800
##  [73] 386.2711 267.7244 301.4000 339.3156 281.8533 281.6800 322.2489 453.0800
##  [81] 352.4000 295.8756 293.4844 298.5111 238.1733 287.5422 325.6978 399.5422
##  [89] 200.0267 268.2889 207.2133 314.1111 303.3067 285.4267 232.1289 284.7467
##  [97] 259.5467 278.4533 324.8533 289.9689
# Plot a normal curve centred at 0 (a difference of zero) whose spread is the
# standard deviation of the bootstrapped differences, and mark the observed
# difference on it with a vertical line.
ggplot() +
  # density curve drawn over the range -600 to 600
  geom_function(xlim = c(-600, 600), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  # vertical line at the observed difference; mapping `color` to a label
  # string makes the rounded value appear in the legend
  geom_vline(mapping = aes(xintercept = difference,
                           color = paste("observed: ",
                                         round(difference)))) +
  labs(title = "Bootstrapped Sampling Distribution of Revenue Differences",
       x = "Difference in Revenue Calculated",
       y = "Probability Density",
       color = "") +
  # x-axis tick marks every 100 units
  scale_x_continuous(breaks = seq(-600, 600, 100)) +
  theme_minimal()

#Sampling Distribution #The null hypothesis does not appear to hold: the observed average difference does not lie in the bulk of the sampling distribution, and it certainly does not lie at the center, where the average difference is 0.

#Conclusion:
#Hypothesis testing is a great tool. However, there are many assumptions and issues with hypothesis testing. The main issue is the concept of random sampling. Each time around you get different results, so making inferences can get tricky at times.

#There are also multiple ethical issues with capturing data and analysing it to predict consumer behaviour. It plays with the psychology of human beings, and profiting off it is not always ethical.