03/05/25
This analysis examines the effectiveness of different advertising campaigns on purchase behavior. I will analyze two datasets:
ab_testing.csv
- Contains 4 advertising campaigns (0,
1, 2, 3)abtesting.csv
- Contains 2 advertising campaigns (0,
1)The null hypothesis is that there is no relationship between advertising exposure and product purchases. The alternative hypothesis is that there is a relationship between advertising exposures and product purchases.
# Load required libraries
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Import the dataset
ab_data <- read_csv("ab_testing.csv")
## Rows: 80 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Ads, Purchase
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display first few rows
head(ab_data)
## # A tibble: 6 × 2
## Ads Purchase
## <dbl> <dbl>
## 1 1 152
## 2 0 21
## 3 3 77
## 4 0 65
## 5 1 183
## 6 1 87
# Check data structure
str(ab_data)
## spc_tbl_ [80 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Ads : num [1:80] 1 0 3 0 1 1 2 2 2 0 ...
## $ Purchase: num [1:80] 152 21 77 65 183 87 121 104 116 82 ...
## - attr(*, "spec")=
## .. cols(
## .. Ads = col_double(),
## .. Purchase = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert Ads to factor
ab_data$Ads <- factor(ab_data$Ads)
# Examine the distribution of campaigns
table(ab_data$Ads)
##
## 0 1 2 3
## 21 21 16 22
# Calculate summary statistics by group
group_stats <- ab_data %>%
group_by(Ads) %>%
summarise(
mean_purchase = mean(Purchase),
median_purchase = median(Purchase),
sd_purchase = sd(Purchase),
count = n()
)
print(group_stats)
## # A tibble: 4 × 5
## Ads mean_purchase median_purchase sd_purchase count
## <fct> <dbl> <dbl> <dbl> <int>
## 1 0 55.4 60 27.2 21
## 2 1 131. 124 36.8 21
## 3 2 91.9 102. 27.0 16
## 4 3 52.7 52.5 25.8 22
# Visualize the distributions
ggplot(ab_data, aes(x = Ads, y = Purchase, fill = Ads)) +
geom_boxplot() +
labs(title = "Purchase Amount by Ad Campaign",
x = "Ad Campaign",
y = "Purchase Amount") +
theme_minimal()
# Distribution of purchase values
ggplot(ab_data, aes(x = Purchase, fill = Ads)) +
geom_histogram(binwidth = 10, position = "dodge") +
labs(title = "Distribution of Purchase Amounts",
x = "Purchase Amount",
y = "Count") +
theme_minimal()
# Run regression model
model1 <- lm(Purchase ~ Ads, data = ab_data)
# Display results
summary(model1)
##
## Call:
## lm(formula = Purchase ~ Ads, data = ab_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.095 -27.891 -0.227 24.773 65.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.381 6.472 8.557 9.41e-13 ***
## Ads1 75.714 9.152 8.273 3.31e-12 ***
## Ads2 36.557 9.842 3.715 0.000386 ***
## Ads3 -2.654 9.048 -0.293 0.770096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.66 on 76 degrees of freedom
## Multiple R-squared: 0.5624, Adjusted R-squared: 0.5452
## F-statistic: 32.56 on 3 and 76 DF, p-value: 1.216e-13
# Calculate and display ANOVA
anova(model1)
## Analysis of Variance Table
##
## Response: Purchase
## Df Sum Sq Mean Sq F value Pr(>F)
## Ads 3 85927 28642.3 32.565 1.216e-13 ***
## Residuals 76 66846 879.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the regression analysis above, we can see that:
The intercept (β₀) is 55.3809524, which represents the average purchase amount for the control group (Ads = 0).
Compared to the control group (Ads = 0):
The R-squared value is 0.5624481, indicating that approximately 56.2% of the variation in purchase amounts can be explained by the different advertising campaigns.
The F-statistic is 32.5645638 with a p-value of 1.216267^{-13}, which is less than 0.05, indicating that the model as a whole is statistically significant.
# Install and load the multcomp package if needed
# install.packages("multcomp")
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
# Perform Tukey's HSD test for pairwise comparisons
tukey <- glht(model1, linfct = mcp(Ads = "Tukey"))
tukey_summary <- summary(tukey)
print(tukey_summary)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: lm(formula = Purchase ~ Ads, data = ab_data)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## 1 - 0 == 0 75.714 9.152 8.273 < 0.001 ***
## 2 - 0 == 0 36.557 9.842 3.715 0.00209 **
## 3 - 0 == 0 -2.654 9.048 -0.293 0.99113
## 2 - 1 == 0 -39.158 9.842 -3.979 < 0.001 ***
## 3 - 1 == 0 -78.368 9.048 -8.662 < 0.001 ***
## 3 - 2 == 0 -39.210 9.744 -4.024 < 0.001 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
# Plot confidence intervals for the comparisons
plot(tukey)
# Check residuals
par(mfrow = c(2, 2))
plot(model1)
# Import the second dataset
ab_data2 <- read_csv("abtesting.csv")
## Rows: 38 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Ads, Purchase
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display first few rows
head(ab_data2)
## # A tibble: 6 × 2
## Ads Purchase
## <dbl> <dbl>
## 1 1 113
## 2 0 83
## 3 0 52
## 4 1 119
## 5 1 188
## 6 0 99
# Check data structure
str(ab_data2)
## spc_tbl_ [38 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Ads : num [1:38] 1 0 0 1 1 0 0 1 1 1 ...
## $ Purchase: num [1:38] 113 83 52 119 188 99 71 181 120 111 ...
## - attr(*, "spec")=
## .. cols(
## .. Ads = col_double(),
## .. Purchase = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert Ads to factor
ab_data2$Ads <- factor(ab_data2$Ads)
# Examine the distribution of campaigns
table(ab_data2$Ads)
##
## 0 1
## 21 17
# Calculate summary statistics by group
group_stats2 <- ab_data2 %>%
group_by(Ads) %>%
summarise(
mean_purchase = mean(Purchase),
median_purchase = median(Purchase),
sd_purchase = sd(Purchase),
count = n()
)
print(group_stats2)
## # A tibble: 2 × 5
## Ads mean_purchase median_purchase sd_purchase count
## <fct> <dbl> <dbl> <dbl> <int>
## 1 0 95.4 99 24.5 21
## 2 1 137 135 34.8 17
# Visualize the distributions
ggplot(ab_data2, aes(x = Ads, y = Purchase, fill = Ads)) +
geom_boxplot() +
labs(title = "Purchase Amount by Ad Campaign",
x = "Ad Campaign",
y = "Purchase Amount") +
theme_minimal()
# Distribution of purchase values
ggplot(ab_data2, aes(x = Purchase, fill = Ads)) +
geom_histogram(binwidth = 10, position = "dodge") +
labs(title = "Distribution of Purchase Amounts",
x = "Purchase Amount",
y = "Count") +
theme_minimal()
# Run regression model
model2 <- lm(Purchase ~ Ads, data = ab_data2)
# Display results
summary(model2)
##
## Call:
## lm(formula = Purchase ~ Ads, data = ab_data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.000 -23.250 3.071 22.643 51.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.429 6.441 14.816 < 2e-16 ***
## Ads1 41.571 9.630 4.317 0.000118 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.52 on 36 degrees of freedom
## Multiple R-squared: 0.3411, Adjusted R-squared: 0.3228
## F-statistic: 18.64 on 1 and 36 DF, p-value: 0.0001184
# Calculate and display ANOVA
anova(model2)
## Analysis of Variance Table
##
## Response: Purchase
## Df Sum Sq Mean Sq F value Pr(>F)
## Ads 1 16236 16235.8 18.636 0.0001184 ***
## Residuals 36 31363 871.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1