# Ryann Garcia: Midterm Exam

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

# Read Display_data.csv into `data` (readr reports the parsed column types).
data <- read_csv("Display_data.csv")

## Rows: 29 Columns: 8

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Lower-case the column names so the formulas below can refer to `spend` and
# `revenue` whatever casing the CSV header uses.
colnames(data) <- tolower(colnames(data))

# Drop every row that has a missing value in any column. Both this step and
# the renaming above are idempotent, so each is performed exactly once.
data <- na.omit(data)

# Write the per-column summary to disk. capture.output() restores the console
# output stream on exit, even if printing fails part-way through.
capture.output(print(summary(data)), file = "summary_statistics.txt")

# Simple linear regression of revenue on spend.
simple_model <- lm(revenue ~ spend, data = data)

# Save the printed model summary. capture.output() handles the redirection
# and always hands the console back, even if print() raises an error.
capture.output(
  print(summary(simple_model)),
  file = "simple_regression_results.txt"
)

# Scatter plot of revenue against spend, overlaid with a blue least-squares
# line (geom_smooth's default confidence band is left switched on).
ggplot(data = data, mapping = aes(x = spend, y = revenue)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(
    title = "Simple Regression: Revenue vs. Spend",
    x = "Spend",
    y = "Revenue"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Pull the headline statistics out of the fitted simple regression.
# summary() is evaluated once and reused for every statistic.
model_summary <- summary(simple_model)
coefficients <- model_summary$coefficients
r_squared <- model_summary$r.squared

# Look the cells up by row/column name rather than by position, so the
# values are guaranteed to belong to the `spend` term.
p_value <- coefficients["spend", "Pr(>|t|)"]
slope <- coefficients["spend", "Estimate"]

cat("Regression Coefficient (Slope): ", round(slope, 4), "\n")

## Regression Coefficient (Slope):  4.8066

cat("R-squared Value: ", round(r_squared, 4), "\n")

## R-squared Value:  0.586

# Rounding a very small p-value to four decimals would print a literal 0,
# which reads as "exactly zero"; report it as an upper bound instead.
p_value_label <- if (p_value < 1e-4) "< 0.0001" else round(p_value, 4)
cat("P-value for 'Spend': ", p_value_label, "\n")

## P-value for 'Spend':  < 0.0001

library(ggplot2)
library(dplyr)
library(readr)
library(stargazer)

# Please use the dataset “Display_data.csv” for this question.
# Read Display_data.csv again; this replaces the `data` object built earlier
# in the script.
data <- read_csv("Display_data.csv")

## Rows: 29 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): spend, clicks, impressions, display, transactions, revenue, ctr, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the column headers as read from the file.
print(names(data))

## [1] "spend"        "clicks"       "impressions"  "display"      "transactions"
## [6] "revenue"      "ctr"          "con_rate"

# Lower-case the headers, then store `display` as a factor so that lm() and
# ggplot treat it as categorical rather than numeric.
names(data) <- tolower(names(data))
data[["display"]] <- as.factor(data[["display"]])

# Drop incomplete rows, announcing it only when there is something to drop.
if (anyNA(data)) {
  cat("Missing values found! Removing them...\n")
  data <- na.omit(data)
}

cat("\n--- Data Summary ---\n")

## 
## --- Data Summary ---

# Compute the summary once; the same object is shown on the console and
# written to disk.
data_summary <- summary(data)
data_summary

##      spend           clicks       impressions    display  transactions  
##  Min.   : 1.12   Min.   : 48.0   Min.   : 1862   0:20    Min.   :1.000  
##  1st Qu.:28.73   1st Qu.:172.0   1st Qu.: 6048   1: 9    1st Qu.:2.000  
##  Median :39.68   Median :241.0   Median : 9934           Median :3.000  
##  Mean   :44.22   Mean   :257.1   Mean   :11858           Mean   :2.966  
##  3rd Qu.:55.57   3rd Qu.:303.0   3rd Qu.:14789           3rd Qu.:4.000  
##  Max.   :91.28   Max.   :593.0   Max.   :29324           Max.   :6.000  
##     revenue            ctr           con_rate    
##  Min.   : 16.16   Min.   :1.890   Min.   :0.810  
##  1st Qu.:117.32   1st Qu.:1.970   1st Qu.:0.990  
##  Median :235.16   Median :2.020   Median :1.130  
##  Mean   :223.50   Mean   :2.306   Mean   :1.227  
##  3rd Qu.:298.92   3rd Qu.:2.790   3rd Qu.:1.470  
##  Max.   :522.00   Max.   :3.290   Max.   :2.080

# capture.output() restores the console stream on exit even if printing
# fails. NOTE: the earlier summary step in this script writes to the same
# path, so that file's contents are replaced here.
capture.output(print(data_summary), file = "summary_statistics.txt")

cat("\n--- Running Multiple Regression Model: revenue ~ spend + display ---\n")

## 
## --- Running Multiple Regression Model: revenue ~ spend + display ---

# Regress revenue on spend plus the `display` factor (reference level 0, so
# the `display1` coefficient is the shift for display == 1).
multiple_model <- lm(revenue ~ spend + display, data = data)

# Summarise once and reuse the result for the console and for the file.
multiple_summary <- summary(multiple_model)
print(multiple_summary)

## 
## Call:
## lm(formula = revenue ~ spend + display, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -176.730  -35.020    8.661   56.440  129.231 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -50.8612    40.3336  -1.261  0.21850    
## spend         5.5473     0.7415   7.482 6.07e-08 ***
## display1     93.5856    33.1910   2.820  0.00908 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77.33 on 26 degrees of freedom
## Multiple R-squared:  0.6829, Adjusted R-squared:  0.6586 
## F-statistic:    28 on 2 and 26 DF,  p-value: 3.271e-07

# capture.output() performs the redirection and always returns the console
# stream, even if print() raises an error.
capture.output(
  print(multiple_summary),
  file = "multiple_regression_results.txt"
)


cat("\n--- Generating Visualization ---\n")

## 
## --- Generating Visualization ---

# Revenue against spend, coloured by `display`. Because colour maps to a
# factor, geom_smooth() draws one least-squares line per display level;
# se = FALSE hides the confidence bands.
ggplot(
  data = data,
  mapping = aes(x = spend, y = revenue, colour = display)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Multiple Regression: Revenue vs. Spend & Display",
    x = "Spend",
    y = "Revenue",
    colour = "Display Campaign (0 = No, 1 = Yes)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Please use the dataset “ab_testing1.csv” for this question.
library(ggplot2)
library(dplyr)
library(readr)
library(stargazer)

library(ggplot2)
library(dplyr)
library(readr)
library(stargazer)

# Read the A/B test data; this replaces the `data` object used above.
data <- read_csv("ab_testing1.csv")

## Rows: 29 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Ads, Purchase
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print(colnames(data))

## [1] "Ads"      "Purchase"

# Both columns are used by the model below, so stop early if either is
# absent. `if` needs a single TRUE/FALSE, hence the membership test is
# collapsed with all() instead of combining two checks elementwise.
required_columns <- c("Ads", "Purchase")
if (!all(required_columns %in% colnames(data))) {
  stop("Error: 'Ads' or 'Purchase' column not found in the dataset.")
}

# Store the advertisement version as a factor so it is modelled and plotted
# as a categorical variable.
data$Ads <- as.factor(data$Ads)

# Drop every row that has a missing value in any column.
data <- na.omit(data)

# Regress Purchase on the Ads factor.
model <- lm(Purchase ~ Ads, data = data)

# Save the printed model summary. capture.output() restores the console
# output stream on exit, even if print() raises an error.
capture.output(print(summary(model)), file = "regression_results.txt")

# Box plot of Purchase for each Ads level, with boxes filled by that level.
ggplot(data = data, mapping = aes(x = Ads, y = Purchase, fill = Ads)) +
  geom_boxplot() +
  labs(
    title = "Impact of Ads on Product Purchase",
    x = "Advertisement Version",
    y = "Number of Purchases"
  ) +
  theme_minimal()

# Ryann Garcia: Midterm Exam

# 2025-03-20