A Practical Application of Hypothesis Testing


Step 1: Acquire and Load the Data


# ---- Step 1: load the data --------------------------------
# Make the project folder the working directory so that the
# relative file name used below resolves.
setwd("/Users/whinton/src/rstudio/tim8521")

# Import the comma-separated file: the first row supplies the
# column names and text columns are stored as factors.
data_df <- read.csv(
  file = "synthetic_data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

Step 2: Assess the Data Frame, Variable Structure, and Types


# ---- Step 2: data frame, variable structure, and types ----
# Data definition table, written one row per variable in the
# order: column name, data type, description.
data_definition <- as.data.frame(
  matrix(
    c(
      "PageViews", "Integer",
      "Number of pages viewed during session",
      "TimeSpent", "Float",
      "Time spent on website (minutes)",
      "Conversion", "Integer (Binary)",
      "Whether the user made a purchase (1 = Yes, 0 = No)",
      "Version", "Factor",
      "Version of the algorithm (A or B)"
    ),
    ncol = 3,
    byrow = TRUE,
    dimnames = list(NULL, c("Column", "DataType", "Description"))
  )
)
print(data_definition)
##       Column         DataType
## 1  PageViews          Integer
## 2  TimeSpent            Float
## 3 Conversion Integer (Binary)
## 4    Version           Factor
##                                          Description
## 1              Number of pages viewed during session
## 2                    Time spent on website (minutes)
## 3 Whether the user made a purchase (1 = Yes, 0 = No)
## 4                  Version of the algorithm (A or B)
# Missing-value audit: count the NA entries in every column ...
data_df %>%
  is.na() %>%
  colSums()
##  PageViews  TimeSpent Conversion    Version 
##          0          0          0          0
# ... and draw a missingness map of the whole data frame.
missmap(data_df)

# Descriptive statistics: per-column summary first, then two
# views of the column types (base str() and dplyr's glimpse()).
summary(data_df)
##    PageViews        TimeSpent          Conversion   Version
##  Min.   : 0.000   Min.   : 0.00578   Min.   :0.00   A:250  
##  1st Qu.: 3.000   1st Qu.: 2.04942   1st Qu.:0.00   B:250  
##  Median : 5.000   Median : 4.87187   Median :0.00          
##  Mean   : 5.006   Mean   : 7.16872   Mean   :0.13          
##  3rd Qu.: 6.000   3rd Qu.: 9.79244   3rd Qu.:0.00          
##  Max.   :14.000   Max.   :44.95319   Max.   :1.00
str(data_df)
## 'data.frame':    500 obs. of  4 variables:
##  $ PageViews : int  4 7 4 8 9 2 5 8 5 5 ...
##  $ TimeSpent : num  7.75 8.11 5.89 16.51 8.08 ...
##  $ Conversion: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Version   : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
dplyr::glimpse(data_df)
## Rows: 500
## Columns: 4
## $ PageViews  <int> 4, 7, 4, 8, 9, 2, 5, 8, 5, 5, 9, 5, 6, 5, 2, 8, 3, 2, 4, 9,…
## $ TimeSpent  <dbl> 7.75300039, 8.11239072, 5.89083392, 16.51305697, 8.08474981…
## $ Conversion <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Version    <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,…
# View(data_df)  # uncomment to browse the data in spreadsheet form


# Distribution plots: one-unit-wide bins for each numeric variable.

# Page views per session
ggplot(data = data_df, mapping = aes(x = PageViews)) +
  labs(title = "Histogram of PageViews", x = "Page Views", y = "Count") +
  geom_histogram(binwidth = 1, color = "black", fill = "skyblue")

# Minutes spent on the site
ggplot(data = data_df, mapping = aes(x = TimeSpent)) +
  labs(
    title = "Histogram of Time Spent",
    x = "Time Spent (minutes)",
    y = "Count"
  ) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightgreen")


Step 3: Hypothesis Test - Two-Proportion Z-Test


# ---- Step 3: conversion counts for the hypothesis test ----
# Per version: number of users, number of conversions, and the
# conversion rate (mean of the 0/1 Conversion flag).
users_by_version <- group_by(data_df, Version)
conversion_summary <- dplyr::summarise(
  users_by_version,
  Total_Users = n(),
  Conversions = sum(Conversion),
  Conversion_Rate = mean(Conversion)
)
print(conversion_summary)
## # A tibble: 2 × 4
##   Version Total_Users Conversions Conversion_Rate
##   <fct>         <int>       <int>           <dbl>
## 1 A               250          20            0.08
## 2 B               250          45            0.18
#cat("Z-Test Complete.")
#cat("\n###########################################################\n")

# Cross-tabulate Version (rows "A"/"B") against Conversion
# (columns "0"/"1").
summary_table <- table(data_df$Version, data_df$Conversion)
##print(summary_table)

# Two-Proportion Z-Test
# Sample sizes and observed conversion proportions per version.
n_A <- sum(summary_table["A", ])
n_B <- sum(summary_table["B", ])

prop_A <- summary_table["A", "1"] / n_A
prop_B <- summary_table["B", "1"] / n_B

# Perform the test.
# prop.test() states the one-sided alternative about the FIRST
# group relative to the SECOND. With the counts supplied in the
# order A, B:
#   alternative = "greater"  tests H1: p_A > p_B
#   alternative = "less"     tests H1: p_A < p_B
# The observed rates (A = 0.08, B = 0.18) and the A/B framing point
# to H1: p_A < p_B, i.e. Version B converts better, so "less" is
# used. The earlier "greater" tested the opposite direction, which
# is why it reported p-value = 0.9993.
# NOTE(review): confirm the intended direction with the study owner.
# prop.test() applies a continuity correction by default and reports
# X-squared rather than z.
prop_test <- prop.test(c(summary_table["A", "1"], summary_table["B", "1"]),
                       c(n_A, n_B), alternative = "less")
print(prop_test)
# NOTE(review): the output recorded below was generated with
# alternative = "greater" and has not been refreshed. Re-run to
# update it: the test statistic and sample estimates are unchanged,
# while the one-sided p-value becomes the complement of 0.9993
# (about 0.0007) and the confidence interval becomes an upper bound.
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(summary_table["A", "1"], summary_table["B", "1"]) out of c(n_A, n_B)
## X-squared = 10.186, df = 1, p-value = 0.9993
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.1529271  1.0000000
## sample estimates:
## prop 1 prop 2 
##   0.08   0.18

Step 4: Confidence Intervals and Visualizations


# ---- Step 4a: 95% confidence intervals for conversion rates ----
# Half-width of the normal-approximation interval for a proportion
# `rate` observed on `users` sessions.
wald_margin <- function(rate, users) {
  qnorm(0.975) * sqrt((rate * (1 - rate)) / users)
}

# Append the lower and upper bounds to the per-version summary.
conversion_summary <- conversion_summary %>%
  mutate(
    CI_Lower = Conversion_Rate - wald_margin(Conversion_Rate, Total_Users),
    CI_Upper = Conversion_Rate + wald_margin(Conversion_Rate, Total_Users)
  )
print(conversion_summary)
## # A tibble: 2 × 6
##   Version Total_Users Conversions Conversion_Rate CI_Lower CI_Upper
##   <fct>         <int>       <int>           <dbl>    <dbl>    <dbl>
## 1 A               250          20            0.08   0.0464    0.114
## 2 B               250          45            0.18   0.132     0.228
# ---- Step 4b: 95% confidence intervals for mean TimeSpent ----
# Half-width of the t-based interval for a mean, given the sample
# standard deviation `s` and the sample size `size`.
t_margin <- function(s, size) {
  qt(0.975, df = size - 1) * (s / sqrt(size))
}

# Per-version mean, standard deviation and count, followed by the
# interval bounds around each mean.
time_ci <- data_df %>%
  group_by(Version) %>%
  dplyr::summarise(
    mean_time = mean(TimeSpent),
    sd_time = sd(TimeSpent),
    n = n()
  ) %>%
  mutate(
    ci_95_lower = mean_time - t_margin(sd_time, n),
    ci_95_upper = mean_time + t_margin(sd_time, n)
  )
print(time_ci)
## # A tibble: 2 × 6
##   Version mean_time sd_time     n ci_95_lower ci_95_upper
##   <fct>       <dbl>   <dbl> <int>       <dbl>       <dbl>
## 1 A            6.90    6.80   250        6.05        7.74
## 2 B            7.44    7.57   250        6.50        8.38
# ---- Step 4c: visualizations ----
# 1. Time spent, coloured by conversion status, with one panel per
#    version; bars for the two statuses are placed side by side.
hist_plot <- ggplot(
  data = data_df,
  mapping = aes(x = TimeSpent, fill = as.factor(Conversion))
) +
  geom_histogram(bins = 30, color = "black", position = "dodge") +
  facet_wrap(~ Version) +
  theme_minimal() +
  labs(
    title = "Histogram of Time Spent by Conversion Status",
    x = "Time Spent (Minutes)",
    y = "Count",
    fill = "Conversion"
  )
print(hist_plot)

# 2. Spread of time spent for each version; outliers are drawn as
#    red asterisks (shape 8).
boxplot <- ggplot(
  data = data_df,
  mapping = aes(x = Version, y = TimeSpent, fill = Version)
) +
  geom_boxplot(outlier.shape = 8, outlier.color = "red") +
  theme_minimal() +
  labs(
    title = "Boxplot of Time Spent by Version",
    x = "Version",
    y = "Time Spent (Minutes)"
  )
print(boxplot)

# 3. Mean time spent per version (blue points) with the 95%
#    confidence bounds from time_ci drawn as red error bars.
ci_plot <- ggplot(data = time_ci, mapping = aes(x = Version, y = mean_time)) +
  geom_point(color = "blue", size = 3) +
  geom_errorbar(
    mapping = aes(ymin = ci_95_lower, ymax = ci_95_upper),
    color = "red",
    width = 0.2
  ) +
  theme_minimal() +
  labs(
    title = "95% Confidence Interval for Time Spent by Version",
    x = "Version",
    y = "Mean Time Spent (Minutes)"
  )
print(ci_plot)


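Step 5: Power Analysis

Note: the effect size reported below is negative (h = -0.302785, consistent with Version A's rate of 0.08 measured against Version B's 0.18), while the alternative is "greater". The near-zero power therefore reflects a direction mismatch rather than a lack of sensitivity. For the direction actually observed (Version B converting better than Version A), the same magnitude of h with n = 250 per group and sig.level = 0.05 corresponds to a one-sided power of roughly 0.96.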


## 
##      Difference of proportion power calculation for binomial distribution (arcsine transformation) 
## 
##               h = -0.302785
##               n = 250
##       sig.level = 0.05
##           power = 2.451218e-07
##     alternative = greater
## 
## NOTE: same sample sizes


This study was performed by Will Hinton.