Load required libraries

cat("Loading required libraries...\n")
## Loading required libraries...
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the cleaned data exported from CSV

# Announce the load step, then read the cleaned marketing export. The path is
# relative, so it is resolved against the current working directory.
cat("Reading cleaned marketing data from CSV...\n")
## Reading cleaned marketing data from CSV...
cleaned_df <- read.csv(file = "marketing_cleaned_data.csv")
# dim() gives c(rows, columns); report both as a quick sanity check.
data_dims <- dim(cleaned_df)
cat("Data loaded. Rows:", data_dims[1], "Columns:", data_dims[2], "\n\n")
## Data loaded. Rows: 100 Columns: 5

Remove NAs and outliers in conversion_rate

cat("Cleaning data: removing NAs and outliers in conversion_rate...\n")
## Cleaning data: removing NAs and outliers in conversion_rate...
# na.omit() drops every row that has a missing value in any column.
data_r <- na.omit(cleaned_df)
# Keep only rows whose conversion_rate lies within the 1st-99th percentile
# band (both bounds inclusive).
q_low <- quantile(data_r$conversion_rate, probs = 0.01)
q_high <- quantile(data_r$conversion_rate, probs = 0.99)
in_band <- data_r$conversion_rate >= q_low & data_r$conversion_rate <= q_high
# drop = FALSE keeps the result a data frame whatever the column count.
data_r <- data_r[in_band, , drop = FALSE]
cat("Data cleaned. Remaining rows:", nrow(data_r), "\n\n")
## Data cleaned. Remaining rows: 98

CHANNEL-WISE MEAN CONVERSION RATES

# Per-channel average conversion rate, with the number of rows behind each
# average.
group_means <- data_r %>%
  group_by(channel) %>%
  summarise(
    mean_conversion = mean(conversion_rate),
    n = n()
  )
group_means
## # A tibble: 4 × 3
##   channel mean_conversion     n
##   <chr>             <dbl> <int>
## 1 Digital          0.0106    33
## 2 Print            0.0144    19
## 3 Radio            0.0131    26
## 4 TV               0.0137    20
cat("\nInterpretation: The table above shows the average conversion rate and sample size for each marketing channel.\n\n")
## 
## Interpretation: The table above shows the average conversion rate and sample size for each marketing channel.

ANOVA: Are conversion rates significantly different across channels?

# One-way ANOVA of conversion_rate with channel as the only factor.
anova_res <- aov(conversion_rate ~ channel, data = data_r)
# Print the ANOVA table (Df, Sum Sq, Mean Sq, F value, Pr(>F)).
summary(anova_res)
##             Df   Sum Sq   Mean Sq F value Pr(>F)
## channel      3 0.000225 7.500e-05   1.082  0.361
## Residuals   94 0.006519 6.935e-05
# Generic reading guide: the text is fixed and does not depend on the p-value
# printed above.
cat("\nInterpretation: If the p-value is < 0.05, there is a statistically significant difference in conversion rates between at least two channels.\n\n")
## 
## Interpretation: If the p-value is < 0.05, there is a statistically significant difference in conversion rates between at least two channels.

POST-HOC TUKEY HSD TEST (if ANOVA is significant)

# Run Tukey's HSD only when the overall ANOVA F-test is significant at the 5%
# level. The p-value is read once from the first row (channel) of the summary
# table.
anova_p <- summary(anova_res)[[1]][["Pr(>F)"]][1]
if (length(anova_p) == 0 || is.na(anova_p)) {
  # No usable p-value, e.g. the summary table carries no Pr(>F) column. Without
  # this guard `if` stops with an error on a zero-length or NA condition.
  cat("\nANOVA p-value is not available; skipping the post-hoc test.\n\n")
} else if (anova_p < 0.05) {
  tukey <- TukeyHSD(anova_res)
  print(tukey)
  cat("\nInterpretation: The Tukey HSD test identifies which specific channels differ from each other.\n\n")
} else {
  cat("\nNo significant differences found between channels (ANOVA p >= 0.05).\n\n")
}
## 
## No significant differences found between channels (ANOVA p >= 0.05).

VISUALIZATION: Conversion Rate by Channel

# One box per channel with the individual observations jittered on top.
ggplot(
  data = data_r,
  mapping = aes(x = channel, y = conversion_rate, fill = channel)
) +
  # Boxplot outlier markers are hidden (outlier.shape = NA); the jitter layer
  # below draws the observations.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.4, color = "black") +
  labs(
    title = "Conversion Rate by Channel",
    x = "Channel",
    y = "Conversion Rate"
  ) +
  theme_minimal()

cat("\nInterpretation: The boxplot above visualizes the spread and central tendency of conversion rates for each channel.\n\n")
## 
## Interpretation: The boxplot above visualizes the spread and central tendency of conversion rates for each channel.

VISUALIZATION: Density Plot of Conversion Rates

# Overlaid, semi-transparent density curves of conversion_rate, one per channel.
ggplot(
  data = data_r,
  mapping = aes(x = conversion_rate, fill = channel)
) +
  geom_density(alpha = 0.4) +
  labs(
    title = "Density of Conversion Rate by Channel",
    x = "Conversion Rate",
    y = "Density"
  ) +
  theme_minimal()

cat("\nInterpretation: The density plot shows the distribution and overlap of conversion rates across channels.\n\n")
## 
## Interpretation: The density plot shows the distribution and overlap of conversion rates across channels.

SUMMARY STATISTICS BY CHANNEL

# Location, spread, range and sample size of conversion_rate for each channel.
sum_stats <- data_r %>%
  group_by(channel) %>%
  summarise(
    mean = mean(conversion_rate),
    median = median(conversion_rate),
    sd = sd(conversion_rate),
    min = min(conversion_rate),
    max = max(conversion_rate),
    n = n()
  )
sum_stats
## # A tibble: 4 × 7
##   channel   mean  median      sd     min    max     n
##   <chr>    <dbl>   <dbl>   <dbl>   <dbl>  <dbl> <int>
## 1 Digital 0.0106 0.00958 0.00704 0.00215 0.0334    33
## 2 Print   0.0144 0.0124  0.00850 0.00265 0.0336    19
## 3 Radio   0.0131 0.0105  0.00925 0.00155 0.0387    26
## 4 TV      0.0137 0.0113  0.00887 0.00385 0.0362    20
cat("\nInterpretation: Use these statistics to compare the performance and consistency of each channel.\n\n")
## 
## Interpretation: Use these statistics to compare the performance and consistency of each channel.