Mid-Term Exam by Monica Fernandez

Question 1: Display Data Analysis

# Import the dataset (relative path: the CSV must be in the working directory)
display_data <- read.csv("Display_data.csv")

# Take a look at the data structure: column names, types and first few values
str(display_data)

## 'data.frame':    29 obs. of  8 variables:
##  $ spend       : num  22.6 37.3 55.6 45.4 50.2 ...
##  $ clicks      : int  165 228 291 247 290 172 68 112 306 300 ...
##  $ impressions : int  8672 11875 14631 11709 14768 8698 2924 5919 14789 14818 ...
##  $ display     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ transactions: int  2 2 3 2 3 2 1 1 3 3 ...
##  $ revenue     : num  58.9 44.9 141.6 209.8 197.7 ...
##  $ ctr         : num  1.9 1.92 1.99 2.11 1.96 1.98 2.33 1.89 2.07 2.02 ...
##  $ con_rate    : num  1.21 0.88 1.03 0.81 1.03 1.16 1.47 0.89 0.98 1 ...

# Per-column summary statistics (min, quartiles, mean, max)
summary(display_data)

##      spend           clicks       impressions       display      
##  Min.   : 1.12   Min.   : 48.0   Min.   : 1862   Min.   :0.0000  
##  1st Qu.:28.73   1st Qu.:172.0   1st Qu.: 6048   1st Qu.:0.0000  
##  Median :39.68   Median :241.0   Median : 9934   Median :0.0000  
##  Mean   :44.22   Mean   :257.1   Mean   :11858   Mean   :0.3103  
##  3rd Qu.:55.57   3rd Qu.:303.0   3rd Qu.:14789   3rd Qu.:1.0000  
##  Max.   :91.28   Max.   :593.0   Max.   :29324   Max.   :1.0000  
##   transactions      revenue            ctr           con_rate    
##  Min.   :1.000   Min.   : 16.16   Min.   :1.890   Min.   :0.810  
##  1st Qu.:2.000   1st Qu.:117.32   1st Qu.:1.970   1st Qu.:0.990  
##  Median :3.000   Median :235.16   Median :2.020   Median :1.130  
##  Mean   :2.966   Mean   :223.50   Mean   :2.306   Mean   :1.227  
##  3rd Qu.:4.000   3rd Qu.:298.92   3rd Qu.:2.790   3rd Qu.:1.470  
##  Max.   :6.000   Max.   :522.00   Max.   :3.290   Max.   :2.080

# Check for missing values: total number of NA cells across all columns
sum(is.na(display_data))

## [1] 0

# Scatter plot of revenue against ad spend.
# The colour aesthetic is set in the top-level aes(), so it applies to both
# layers: points are coloured by `display`, and geom_smooth() draws a separate
# linear fit (with its confidence band) for each value of `display`.
ggplot(
  data = display_data,
  mapping = aes(spend, revenue, color = factor(display))
) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Relationship between Ad Spend and Revenue",
    x = "Ad Spend ($)",
    y = "Revenue ($)",
    color = "Display Campaign"
  ) +
  theme_minimal()

# Simple Regression (Revenue ~ Spend): linear model with spend as the only
# predictor; `display` is not part of this model
model1 <- lm(revenue ~ spend, data = display_data)
# Print the coefficient table, residual summary, R-squared and F-test
summary(model1)

## 
## Call:
## lm(formula = revenue ~ spend, data = display_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.210  -54.647    1.117   67.780  149.476 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.9397    37.9668   0.288    0.775    
## spend         4.8066     0.7775   6.182 1.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 86.71 on 27 degrees of freedom
## Multiple R-squared:  0.586,  Adjusted R-squared:  0.5707 
## F-statistic: 38.22 on 1 and 27 DF,  p-value: 1.311e-06

Question 2: AB Testing Analysis

# Import the dataset (relative path: the CSV must be in the working directory)
ab_data <- read.csv("ab_testing1.csv")

# Check the data structure: column names, types and first few values
str(ab_data)

## 'data.frame':    29 obs. of  2 variables:
##  $ Ads     : int  1 0 2 0 1 1 2 2 2 0 ...
##  $ Purchase: int  152 21 77 65 183 87 121 104 116 82 ...

# Per-column summary statistics; Ads has not been converted to a factor yet at
# this point, so it is summarised as a number
summary(ab_data)

##       Ads           Purchase     
##  Min.   :0.000   Min.   : 14.00  
##  1st Qu.:0.000   1st Qu.: 51.00  
##  Median :1.000   Median : 77.00  
##  Mean   :1.069   Mean   : 76.07  
##  3rd Qu.:2.000   3rd Qu.:104.00  
##  Max.   :2.000   Max.   :183.00

# Convert Ads to a factor variable so that later grouping, plotting and
# modelling steps treat the ad version as a category, not as a numeric score
ab_data$Ads <- factor(ab_data$Ads)

# Descriptive statistics of Purchase for each ad version:
# group size, mean, standard deviation, minimum and maximum
group_stats <- summarize(
  group_by(ab_data, Ads),
  count = n(),
  mean = mean(Purchase),
  sd = sd(Purchase),
  min = min(Purchase),
  max = max(Purchase)
)

print(group_stats)

## # A tibble: 3 × 6
##   Ads   count  mean    sd   min   max
##   <fct> <int> <dbl> <dbl> <int> <int>
## 1 0        10  49    27.2    21    85
## 2 1         7 119.   40.3    61   183
## 3 2        12  73.8  31.1    14   121

# Boxplot of Purchase for each ad version, one filled box per level of Ads
ggplot(data = ab_data, mapping = aes(Ads, Purchase, fill = Ads)) +
  geom_boxplot() +
  labs(
    title = "Purchase Amount by Advertisement Version",
    x = "Advertisement Version (0 = Control, 1 = Version 1, 2 = Version 2)",
    y = "Purchase Amount ($)"
  ) +
  theme_minimal()

# Run regression with Ads as a categorical predictor.
# With Ads stored as a factor and R's default treatment contrasts, the first
# level (0) is the baseline: the intercept is the mean Purchase of group 0 and
# the Ads1 / Ads2 coefficients are differences from that baseline.
model_ab <- lm(Purchase ~ Ads, data = ab_data)
summary(model_ab)

## 
## Call:
## lm(formula = Purchase ~ Ads, data = ab_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.75 -22.75  -3.75  30.25  64.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    49.00      10.21   4.800 5.69e-05 ***
## Ads1           69.71      15.91   4.383 0.000171 ***
## Ads2           24.75      13.82   1.791 0.084982 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.28 on 26 degrees of freedom
## Multiple R-squared:  0.4262, Adjusted R-squared:  0.3821 
## F-statistic: 9.656 on 2 and 26 DF,  p-value: 0.0007308

# ANOVA to test if there are significant differences between groups
# (one-way ANOVA: a single overall F-test for the Ads factor)
anova_result <- aov(Purchase ~ Ads, data = ab_data)
summary(anova_result)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Ads          2  20122   10061   9.656 0.000731 ***
## Residuals   26  27090    1042                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc comparison to see if Ad1 and Ad2 differ significantly:
# pairwise t tests between all Ads groups, with Bonferroni-adjusted p-values.
# The two columns are passed as ab_data$... expressions; these expressions are
# what the printed "data:" line of the result shows.
pairwise_comparison <- pairwise.t.test(ab_data$Purchase, ab_data$Ads, p.adjust.method = "bonferroni")
print(pairwise_comparison)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  ab_data$Purchase and ab_data$Ads 
## 
##   0       1      
## 1 0.00051 -      
## 2 0.25495 0.02096
## 
## P value adjustment method: bonferroni

# Mean and standard-error summary, for use with stat_summary(fun.data = ...).
# NOTE: ggplot2 already exports a function named mean_se(); when ggplot2 is
# attached, this definition masks it while returning the same columns
# (y, ymin, ymax).
#
# Args:
#   x:    numeric vector; NA values are dropped before summarising.
#   mult: multiplier applied to the standard error (default 1 = +/- 1 SE).
# Returns:
#   A one-row data.frame with columns y (the mean), ymin (mean - mult * SE)
#   and ymax (mean + mult * SE).
mean_se <- function(x, mult = 1) {
  # Drop missing values so a single NA does not turn every column into NA
  x <- x[!is.na(x)]
  centre <- mean(x)
  # Standard error of the mean: sd / sqrt(n), scaled by mult
  half_width <- mult * sd(x) / sqrt(length(x))
  data.frame(
    y = centre,
    ymin = centre - half_width,
    ymax = centre + half_width
  )
}

# Bar chart of the mean Purchase per ad version.
# The first stat_summary() layer draws the group means as bars; the second one
# adds error bars whose limits come from mean_se().
ggplot(data = ab_data, mapping = aes(Ads, Purchase, fill = Ads)) +
  stat_summary(geom = "bar", fun = mean) +
  stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.2) +
  labs(
    title = "Mean Purchase Amount by Advertisement Version",
    x = "Advertisement Version (0 = Control, 1 = Version 1, 2 = Version 2)",
    y = "Mean Purchase Amount ($)"
  ) +
  theme_minimal()

#Q1 Describe your hypothesis (or hypotheses if you have more than one).
  H0: Higher advertising spend does not lead to higher revenue.
  H1: Higher advertising spend leads to higher revenue.

#Q3  Explain your outcome and make managerial recommendations.
  From the simple regression (revenue ~ spend), the "spend" variable is highly significant (estimate = 4.81, p < 0.001). This means that spending more on ads is associated with higher revenue, so ad spend has a positive impact.

#Q1 Describe your hypothesis (or hypotheses if you have more than one).
  H0: There won't be a significant difference in purchases between the three groups.
  H1: One version will lead to higher purchases.
  
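#Q2 Explain your outcome and make managerial recommendations
  From the results, Ad 1 had a significant positive effect on purchases compared with the control group (estimate = 69.71, p < 0.001).
  Ad 2 showed no significant increase in purchases compared with the control group (estimate = 24.75, p = 0.085).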

Mid-Term Exam by Monica Fernandez

2025-03-21

Question 1: Display Data Analysis

Question 2: AB Testing Analysis