#The primary concern with the "Hypothesis Testing" Module lies in the exclusive consideration of the correlation between the amount of revenue generated and the presence or absence of displayed ads. We advocate for a more comprehensive approach that incorporates additional factors before drawing conclusions about the dependency of ad revenue on display status.

#In our revised version of this module, we have expanded the analysis to include the number of clicks on the ad. By examining the combination of the number of clicks and the presence or absence of display, we aim to gain a more nuanced understanding of how display impacts the revenue generated by the ad.

#It is inaccurate to assert a direct influence of display on ads without accounting for the number of clicks. We contend that a thorough assessment should encompass both variables to better discern the actual impact of display on ad revenue.

#Discuss how you don't agree with the concept of hypothesis testing

#Importing packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
## Warning: package 'effsize' was built under R version 4.3.2
library(pwrss)
## Warning: package 'pwrss' was built under R version 4.3.2
## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

#Importing data

# Location of the marketing dataset (a comma-delimited file)
url_ <- "https://raw.githubusercontent.com/leontoddjohnson/i590/main/data/marketing/marketing.csv"
# Parse the file into a tibble
marketing <- url_ |>
  read_delim(delim = ",")
## Rows: 40 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Viewing the data

marketing
## # A tibble: 40 × 8
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  22.6    165        8672       0            2    58.9  1.9      1.21
##  2  37.3    228       11875       0            2    44.9  1.92     0.88
##  3  55.6    291       14631       0            3   142.   1.99     1.03
##  4  45.4    247       11709       0            2   210.   2.11     0.81
##  5  50.2    290       14768       0            3   198.   1.96     1.03
##  6  33.0    172        8698       0            2   204.   1.98     1.16
##  7  12.9     68        2924       0            1   117.   2.33     1.47
##  8  23.9    112        5919       0            1    72.0  1.89     0.89
##  9  58.4    306       14789       0            3   290.   2.07     0.98
## 10  48.2    300       14818       0            3   246.   2.02     1   
## # ℹ 30 more rows

##Arranging the data according to the number of clicks

# Order the rows by ascending click count, then display the result
marketing_sorted <- arrange(marketing, clicks)
marketing_sorted
## # A tibble: 40 × 8
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  1.12     48        1862       1            1    17.4  2.58     2.08
##  2 12.9      68        2924       0            1   117.   2.33     1.47
##  3 13.0     106        4056       1            2   232.   2.61     1.89
##  4 23.9     112        5919       0            1    72.0  1.89     0.89
##  5 28.7     117        6043       0            1    16.2  1.94     0.85
##  6 23.0     126        3828       1            2   235.   3.29     1.59
##  7 22.6     165        8672       0            2    58.9  1.9      1.21
##  8 39.7     169        6048       1            3   289.   2.79     1.78
##  9 33.0     172        8698       0            2   204.   1.98     1.16
## 10 25.0     175        5708       1            3   185.   3.07     1.71
## # ℹ 30 more rows

#finding average number of clicks

# Mean of the clicks column across all rows
average_clicks <- marketing |>
  pull(clicks) |>
  mean()

print(average_clicks)
## [1] 284.225

#Arranging in a manner where display and clicks are taken into consideration

# Click cutoff: the mean click count rounded to a whole number, replacing
# the previously hard-coded 284
click_threshold <- round(average_clicks)

# Label the two comparison groups; rows matching neither rule get NA:
#   1 = clicks below the threshold and no display
#   2 = clicks at or above the threshold and display
data_grouped <- marketing_sorted %>%
  mutate(group_number = case_when(
    clicks < click_threshold & display == 0 ~ 1,
    clicks >= click_threshold & display == 1 ~ 2,
    # NA_real_ matches the double type of the labels above
    TRUE ~ NA_real_
  )) %>%
  group_by(group_number)

# View the modified data with group numbers
print(data_grouped)
## # A tibble: 40 × 9
## # Groups:   group_number [3]
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  1.12     48        1862       1            1    17.4  2.58     2.08
##  2 12.9      68        2924       0            1   117.   2.33     1.47
##  3 13.0     106        4056       1            2   232.   2.61     1.89
##  4 23.9     112        5919       0            1    72.0  1.89     0.89
##  5 28.7     117        6043       0            1    16.2  1.94     0.85
##  6 23.0     126        3828       1            2   235.   3.29     1.59
##  7 22.6     165        8672       0            2    58.9  1.9      1.21
##  8 39.7     169        6048       1            3   289.   2.79     1.78
##  9 33.0     172        8698       0            2   204.   1.98     1.16
## 10 25.0     175        5708       1            3   185.   3.07     1.71
## # ℹ 30 more rows
## # ℹ 1 more variable: group_number <dbl>

#remove values where the group number is NA

# Keep only the rows that were assigned to one of the two groups
Filtered <- filter(data_grouped, !is.na(group_number))

# View the filtered dataframe
print(Filtered)
## # A tibble: 19 × 9
## # Groups:   group_number [2]
##    spend clicks impressions display transactions revenue   ctr con_rate
##    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>   <dbl> <dbl>    <dbl>
##  1  12.9     68        2924       0            1   117.   2.33     1.47
##  2  23.9    112        5919       0            1    72.0  1.89     0.89
##  3  28.7    117        6043       0            1    16.2  1.94     0.85
##  4  22.6    165        8672       0            2    58.9  1.9      1.21
##  5  33.0    172        8698       0            2   204.   1.98     1.16
##  6  28.3    177        9011       0            2    66.7  1.96     1.13
##  7  37.3    228       11875       0            2    44.9  1.92     0.88
##  8  36.3    238       11855       0            2   280.   2.01     0.84
##  9  45.4    247       11709       0            2   210.   2.11     0.81
## 10  50.2    286        9797       1            4   411.   2.92     1.4 
## 11  42.6    288        9748       1            4   355.   2.95     1.39
## 12  48.3    300        9751       1            4   305.   3.08     1.33
## 13  54.9    303        9820       1            4   404.   3.09     1.32
## 14  55.6    360       11879       1            5   309.   3.03     1.39
## 15  65.3    362       11727       1            5   317.   3.09     1.38
## 16  65.3    412       13534       1            6   413.   3.04     1.46
## 17  85.1    479       15500       1            7   411.   3.09     1.46
## 18  91.1    541       17571       1            8   534.   3.08     1.48
## 19  91.5    592       19433       1            9   707.   3.05     1.52
## # ℹ 1 more variable: group_number <dbl>

#Null hypothesis: ads with fewer clicks and no display have the same average revenue as those with more clicks and display

# Mean revenue within each of the two groups
grouped_data <- Filtered |>
  group_by(group_number) |>
  summarise(mean_value = mean(revenue))

grouped_data
## # A tibble: 2 × 2
##   group_number mean_value
##          <dbl>      <dbl>
## 1            1       119.
## 2            2       417.
#' Successive differences of a vector of means
#'
#' @param mean_values Vector of mean values.
#' @return `diff(mean_values)`: each element minus the one before it.
calculate_difference <- function(mean_values) {
  diff(mean_values)
}

# Group means taken from the summary table instead of being retyped by hand
# (rows are ordered by group_number, so this is group 1 then group 2)
mean_values <- grouped_data$mean_value

# Observed difference: group 2 mean minus group 1 mean
difference <- calculate_difference(mean_values)

# View the difference
difference
## [1] 297.7178

#Now using the bootstrap function

#' Bootstrap the sampling distribution of a statistic
#'
#' Repeatedly resamples `x` with replacement and applies `func` to each
#' resample.
#'
#' @param x A data frame (rows are resampled) or a vector (elements are
#'   resampled). A grouped tibble is resampled within each group.
#' @param func Function applied to each resample; defaults to `mean`.
#' @param n_iter Number of bootstrap resamples to draw.
#' @return The values returned by `func`, combined with `c()` in iteration
#'   order; `NULL` when `n_iter` is 0.
bootstrap <- function (x, func=mean, n_iter=10^4) {
  # draws one resample that is the same size as the original data
  draw_sample <- function() {
    if (inherits(x, "grouped_df")) {
      # grouped tibble: resample every group at that group's own size
      return(dplyr::slice_sample(x, prop = 1, replace = TRUE))
    }

    if (is.data.frame(x)) {
      # length() of a data frame is its column count, so size by nrow()
      n_rows <- nrow(x)
      row_ids <- sample.int(n_rows, size = n_rows, replace = TRUE)
      return(x[row_ids, , drop = FALSE])
    }

    # index-based draw avoids sample()'s special case for a single number
    n_values <- length(x)
    x[sample.int(n_values, size = n_values, replace = TRUE)]
  }

  # seq_len() runs zero times for n_iter = 0 (1:0 would run twice), and
  # collecting in a list avoids re-copying the result on every iteration
  func_values <- lapply(seq_len(n_iter), function(i) func(draw_sample()))

  do.call(c, func_values)
}
#' Difference in mean revenue between display groups
#'
#' @param x_data Data frame with `display` and `revenue` columns.
#' @return Mean revenue of the second `display` group minus that of the
#'   first, after ordering the groups by `display`.
diff_in_avg <- function (x_data) {
  revenue_by_display <- arrange(
    summarize(group_by(x_data, display), avg_revenue = mean(revenue)),
    display
  )
  avg_revenue <- revenue_by_display$avg_revenue

  # difference = revenue_with - revenue_without
  avg_revenue[2] - avg_revenue[1]
}

# 100 bootstrap replicates of the difference in average revenue
diffs_in_avgs <- bootstrap(x = Filtered, func = diff_in_avg, n_iter = 100)

diffs_in_avgs
##   [1] 371.5022 311.9778 269.3022 261.0889 359.7822 338.7289 255.6622 265.0222
##   [9] 322.6089 249.1111 225.7022 279.1422 252.6356 283.3867 388.1956 248.1600
##  [17] 286.3156 263.0978 166.1200 370.0267 294.0133 232.3156 357.4400 314.3378
##  [25] 273.6444 243.9333 271.8267 308.2044 411.2311 330.4089 278.8444 284.7956
##  [33] 310.9689 270.6400 209.5467 332.8489 217.0444 239.0444 382.8533 239.9200
##  [41] 250.8267 260.5289 243.6667 295.6889 267.4978 294.5733 309.8933 289.9867
##  [49] 241.5156 242.8267 367.8622 317.9956 257.4933 341.0978 288.4978 341.9067
##  [57] 307.6133 391.0667 363.6667 313.6044 295.5244 341.6800 344.5911 297.6089
##  [65] 376.3289 331.3244 272.7378 290.8756 422.0800 236.8311 374.6489 299.7067
##  [73] 299.8044 349.7200 389.5733 241.5778 233.5556 243.3333 319.9111 309.1244
##  [81] 290.5111 359.7956 349.7156 347.1556 358.0400 308.3244 233.9822 329.0489
##  [89] 261.1067 266.4222 336.0178 313.1156 302.3867 230.6533 215.0756 362.0311
##  [97] 315.6356 259.5778 312.9689 316.3911
# Plot a normal curve centred at 0 whose spread is the standard deviation of
# the bootstrapped differences, and mark the observed difference on it.
ggplot() +
  # density curve: normal with mean 0 and sd taken from diffs_in_avgs
  geom_function(xlim = c(-600, 600), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  # vertical line at the observed difference; the colour aesthetic is used
  # only to produce a legend entry showing the rounded value
  geom_vline(mapping = aes(xintercept = difference,
                           color = paste("observed: ",
                                         round(difference)))) +
  labs(title = "Bootstrapped Sampling Distribution of Revenue Differences",
       x = "Difference in Revenue Calculated",
       y = "Probability Density",
       color = "") +
  # tick marks every 100 units across the plotted range
  scale_x_continuous(breaks = seq(-600, 600, 100)) +
  theme_minimal()

#Sampling Distribution

#differences
#Hypothesis testing is a great tool. However, there are many assumptions and issues with hypothesis testing. The main issue is the concept of random sampling.