##########################################################
# Step 1. Load the data
##########################################################
# Make the project folder the working directory so the relative file name
# below resolves.
# NOTE(review): this path is specific to one machine; adjust before running
# elsewhere.
setwd("/Users/whinton/src/rstudio/tim8521")
# Import the comma-separated file. The first row supplies the column names
# and character columns are converted to factors.
data_df <- read.csv(
  file = "synthetic_data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
###########################################################
# Assess the Data Frame, Variable Structure, and Types
###########################################################
# Data dictionary: one row per column of the data set, giving the column
# name, its storage type, and a short description. The helper vectors are
# built inside local() so only `data_definition` is created at top level.
data_definition <- local({
  column_names <- c("PageViews", "TimeSpent", "Conversion", "Version")
  column_types <- c("Integer", "Float", "Integer (Binary)", "Factor")
  column_notes <- c(
    "Number of pages viewed during session",
    "Time spent on website (minutes)",
    "Whether the user made a purchase (1 = Yes, 0 = No)",
    "Version of the algorithm (A or B)"
  )
  data.frame(
    Column = column_names,
    DataType = column_types,
    Description = column_notes
  )
})
print(data_definition)
## Column DataType
## 1 PageViews Integer
## 2 TimeSpent Float
## 3 Conversion Integer (Binary)
## 4 Version Factor
## Description
## 1 Number of pages viewed during session
## 2 Time spent on website (minutes)
## 3 Whether the user made a purchase (1 = Yes, 0 = No)
## 4 Version of the algorithm (A or B)
# Check for missing values
# Count the NA cells in each column. The result is printed explicitly so it
# is also shown when the script is run through source(), which does not
# auto-print top-level values by default.
print(colSums(is.na(data_df)))
## PageViews TimeSpent Conversion Version
## 0 0 0 0
# Plot a missingness map of the data frame.
# NOTE(review): missmap() is not defined in this file; presumably it comes
# from the Amelia package -- confirm that package is attached before this line.
missmap(data_df)
# Basic descriptive statistics (printed explicitly for the same reason as
# the NA counts above).
print(summary(data_df))
## PageViews TimeSpent Conversion Version
## Min. : 0.000 Min. : 0.00578 Min. :0.00 A:250
## 1st Qu.: 3.000 1st Qu.: 2.04942 1st Qu.:0.00 B:250
## Median : 5.000 Median : 4.87187 Median :0.00
## Mean : 5.006 Mean : 7.16872 Mean :0.13
## 3rd Qu.: 6.000 3rd Qu.: 9.79244 3rd Qu.:0.00
## Max. :14.000 Max. :44.95319 Max. :1.00
# Compact display of the column types and the first few values.
str(data_df)
## 'data.frame': 500 obs. of 4 variables:
## $ PageViews : int 4 7 4 8 9 2 5 8 5 5 ...
## $ TimeSpent : num 7.75 8.11 5.89 16.51 8.08 ...
## $ Conversion: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Version : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
# The same structural overview in dplyr's format.
dplyr::glimpse(data_df)
## Rows: 500
## Columns: 4
## $ PageViews <int> 4, 7, 4, 8, 9, 2, 5, 8, 5, 5, 9, 5, 6, 5, 2, 8, 3, 2, 4, 9,…
## $ TimeSpent <dbl> 7.75300039, 8.11239072, 5.89083392, 16.51305697, 8.08474981…
## $ Conversion <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Version <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,…
#View(data_df) # view data in spreadsheet form.
# Visualize distributions
# A ggplot object is only drawn when it is printed. Both histograms are
# wrapped in print() so they render when the script is run through source()
# as well as interactively, matching the explicit print() calls used for the
# later plots in this script.

# Histogram of pages viewed per session, one bar per integer count.
print(
  ggplot(data_df, aes(x = PageViews)) +
    geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
    labs(title = "Histogram of PageViews", x = "Page Views", y = "Count")
)
# Histogram of session duration in one-minute bins.
print(
  ggplot(data_df, aes(x = TimeSpent)) +
    geom_histogram(binwidth = 1, fill = "lightgreen", color = "black") +
    labs(title = "Histogram of Time Spent", x = "Time Spent (minutes)", y = "Count")
)
###########################################################
# Summarize the conversion counts for hypothesis testing
# Perform Two-Proportion Z-Test
###########################################################
# Per-version totals: number of users, number of conversions, and the
# conversion rate (the mean of the 0/1 Conversion column).
conversion_summary <- dplyr::summarise(
  group_by(data_df, Version),
  Total_Users = n(),
  Conversions = sum(Conversion),
  Conversion_Rate = mean(Conversion)
)
print(conversion_summary)
## # A tibble: 2 × 4
## Version Total_Users Conversions Conversion_Rate
## <fct> <int> <int> <dbl>
## 1 A 250 20 0.08
## 2 B 250 45 0.18
#cat("Z-Test Complete.")
#cat("\n###########################################################\n")
# Contingency table: rows are the versions, columns are Conversion "0"/"1".
# Conversion is turned into a factor with fixed levels so the "1" column
# exists even if the data contain no conversions at all.
summary_table <- table(
  data_df$Version,
  factor(data_df$Conversion, levels = c(0, 1))
)
##print(summary_table)
# Two-Proportion Z-Test
# Calculate sample sizes and observed proportions for each version
n_A <- sum(summary_table["A", ])
n_B <- sum(summary_table["B", ])
prop_A <- summary_table["A", "1"] / n_A
prop_B <- summary_table["B", "1"] / n_B
# Perform the test.
# prop.test() compares the groups in the order given, here (A, B), so the
# one-sided alternative describes p_A relative to p_B. The question being
# asked is whether version B converts better than version A, i.e.
# H1: p_A < p_B, which is alternative = "less". Using "greater" here would
# test H1: p_A > p_B instead.
prop_test <- prop.test(
  x = c(summary_table["A", "1"], summary_table["B", "1"]),
  n = c(n_A, n_B),
  alternative = "less"
)
print(prop_test)
# NOTE: the output previously recorded at this point (X-squared = 10.186,
# p-value = 0.9993, "alternative hypothesis: greater") was produced with the
# reversed alternative. For a one-sided prop.test the two directions have
# complementary p-values, so on the same data (A: 20/250, B: 45/250) the
# corrected test gives a p-value of about 0.0007. Re-run the script to
# regenerate the full printout.
##########################################################
# Confidence intervals for Conversion Rates
##########################################################
# Append a 95% normal-approximation interval to each version's conversion
# rate: rate -/+ z * sqrt(rate * (1 - rate) / n), with z = qnorm(0.975).
conversion_summary <- mutate(
  conversion_summary,
  CI_Lower = Conversion_Rate -
    qnorm(0.975) * sqrt((Conversion_Rate * (1 - Conversion_Rate)) / Total_Users),
  CI_Upper = Conversion_Rate +
    qnorm(0.975) * sqrt((Conversion_Rate * (1 - Conversion_Rate)) / Total_Users)
)
print(conversion_summary)
## # A tibble: 2 × 6
## Version Total_Users Conversions Conversion_Rate CI_Lower CI_Upper
## <fct> <int> <int> <dbl> <dbl> <dbl>
## 1 A 250 20 0.08 0.0464 0.114
## 2 B 250 45 0.18 0.132 0.228
##########################################################
# Confidence intervals for mean TimeSpent
##########################################################
# Per-version mean, standard deviation and group size of TimeSpent, together
# with a 95% t-interval for the mean: mean -/+ t(0.975, n - 1) * sd / sqrt(n).
# The interval bounds are computed in the same summarise() call; they refer
# to the summary columns (mean_time, sd_time, n) defined just before them.
time_ci <- dplyr::summarise(
  group_by(data_df, Version),
  mean_time = mean(TimeSpent),
  sd_time = sd(TimeSpent),
  n = n(),
  ci_95_lower = mean_time - qt(0.975, df = n - 1) * (sd_time / sqrt(n)),
  ci_95_upper = mean_time + qt(0.975, df = n - 1) * (sd_time / sqrt(n))
)
print(time_ci)
## # A tibble: 2 × 6
## Version mean_time sd_time n ci_95_lower ci_95_upper
## <fct> <dbl> <dbl> <int> <dbl> <dbl>
## 1 A 6.90 6.80 250 6.05 7.74
## 2 B 7.44 7.57 250 6.50 8.38
##########################################################
# Visualizations
##########################################################
# 1. Histogram of TimeSpent by Conversion Status
# Bars for converted / non-converted users are placed side by side, with one
# panel per version.
hist_plot <- ggplot(
  data_df,
  aes(x = TimeSpent, fill = as.factor(Conversion))
) +
  geom_histogram(position = "dodge", bins = 30, color = "black") +
  facet_wrap(~ Version) +
  theme_minimal() +
  labs(
    title = "Histogram of Time Spent by Conversion Status",
    x = "Time Spent (Minutes)",
    y = "Count",
    fill = "Conversion"
  )
print(hist_plot)
# 2. Boxplot of TimeSpent by Version
# Outliers are drawn as red asterisks (shape 8).
boxplot <- ggplot(
  data_df,
  aes(x = Version, y = TimeSpent, fill = Version)
) +
  geom_boxplot(outlier.color = "red", outlier.shape = 8) +
  theme_minimal() +
  labs(
    title = "Boxplot of Time Spent by Version",
    x = "Version",
    y = "Time Spent (Minutes)"
  )
print(boxplot)
# 3. Confidence intervals for TimeSpent
# Blue point = group mean; red bar = the 95% interval stored in time_ci.
ci_plot <- ggplot(time_ci, aes(x = Version, y = mean_time)) +
  geom_point(size = 3, color = "blue") +
  geom_errorbar(
    aes(ymin = ci_95_lower, ymax = ci_95_upper),
    width = 0.2,
    color = "red"
  ) +
  theme_minimal() +
  labs(
    title = "95% Confidence Interval for Time Spent by Version",
    x = "Version",
    y = "Mean Time Spent (Minutes)"
  )
print(ci_plot)
##
## Difference of proportion power calculation for binomial distribution (arcsine transformation)
##
## h = -0.302785
## n = 250
## sig.level = 0.05
## power = 2.451218e-07
## alternative = greater
##
## NOTE: same sample sizes
# This study was performed by Will Hinton.