# Reading dataset
# Load the A/B test export; the first row of the CSV holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
data <- read.csv("ab_data.csv", header = TRUE)
# Per-column overview: quantiles for numeric columns, counts for categorical
summary(data)
## user_id timestamp group
## Min. :630000 2017-01-02 13:42:05.378582: 1 control :147202
## 1st Qu.:709032 2017-01-02 13:42:15.234051: 1 treatment:147276
## Median :787934 2017-01-02 13:42:21.786186: 1
## Mean :787974 2017-01-02 13:42:26.640581: 1
## 3rd Qu.:866912 2017-01-02 13:42:27.851110: 1
## Max. :945999 2017-01-02 13:42:28.522322: 1
## (Other) :294472
## landing_page converted
## new_page:147239 Min. :0.0000
## old_page:147239 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1197
## 3rd Qu.:0.0000
## Max. :1.0000
##
# Show each column's storage type together with a preview of its values
str(object = data)
## 'data.frame': 294478 obs. of 5 variables:
## $ user_id : int 851104 804228 661590 853541 864975 936923 679687 719014 817355 839785 ...
## $ timestamp : Factor w/ 294478 levels "2017-01-02 13:42:05.378582",..: 258966 131019 122512 82871 247589 108211 221819 194300 29089 176584 ...
## $ group : Factor w/ 2 levels "control","treatment": 1 1 2 2 1 1 2 1 2 2 ...
## $ landing_page: Factor w/ 2 levels "new_page","old_page": 2 2 1 1 2 2 1 2 1 1 ...
## $ converted : int 0 0 0 0 1 0 1 0 1 1 ...
# Preview the first ten rows of the raw data
head(data, n = 10)
## user_id timestamp group landing_page converted
## 1 851104 2017-01-21 22:11:48.556739 control old_page 0
## 2 804228 2017-01-12 08:01:45.159739 control old_page 0
## 3 661590 2017-01-11 16:55:06.154213 treatment new_page 0
## 4 853541 2017-01-08 18:28:03.143765 treatment new_page 0
## 5 864975 2017-01-21 01:52:26.210827 control old_page 1
## 6 936923 2017-01-10 15:20:49.083499 control old_page 0
## 7 679687 2017-01-19 03:26:46.940749 treatment new_page 1
## 8 719014 2017-01-17 01:48:29.539573 control old_page 0
## 9 817355 2017-01-04 17:58:08.979471 treatment new_page 1
## 10 839785 2017-01-15 18:11:06.610965 treatment new_page 1
# Formatting timestamp
# Parse the complete date-time, not just the date: "%OS" reads seconds
# including the fractional part, so the time of day is kept and `hours`
# below is meaningful. as.character() makes the parse work whether read.csv()
# delivered the column as character or as factor. The raw strings carry no
# zone information, so a fixed zone (UTC) is used to keep the derived columns
# identical on every machine.
data$timestamp <- as.POSIXct(
  as.character(data$timestamp),
  format = "%Y-%m-%d %H:%M:%OS",
  tz = "UTC"
)
# Derive the display columns straight from the parsed timestamp. Going through
# as.Date(<POSIXct>, "<format>") would pass the format string as the `tz`
# argument, and re-parsing the "dd/mm/YYYY" strings with as.Date() defaults
# would misread the day as the year.
data$hours <- format(data$timestamp, format = "%H:%M")
data$dates <- format(data$timestamp, format = "%d/%m/%Y")
data$month <- format(data$timestamp, format = "%m")
head(data, 10)
## user_id timestamp group landing_page converted hours dates
## 1 851104 2017-01-21 22:11:48 control old_page 0 22:11 21/01/2017
## 2 804228 2017-01-12 08:01:45 control old_page 0 08:01 12/01/2017
## 3 661590 2017-01-11 16:55:06 treatment new_page 0 16:55 11/01/2017
## 4 853541 2017-01-08 18:28:03 treatment new_page 0 18:28 08/01/2017
## 5 864975 2017-01-21 01:52:26 control old_page 1 01:52 21/01/2017
## 6 936923 2017-01-10 15:20:49 control old_page 0 15:20 10/01/2017
## 7 679687 2017-01-19 03:26:46 treatment new_page 1 03:26 19/01/2017
## 8 719014 2017-01-17 01:48:29 control old_page 0 01:48 17/01/2017
## 9 817355 2017-01-04 17:58:08 treatment new_page 1 17:58 04/01/2017
## 10 839785 2017-01-15 18:11:06 treatment new_page 1 18:11 15/01/2017
## month
## 1 01
## 2 01
## 3 01
## 4 01
## 5 01
## 6 01
## 7 01
## 8 01
## 9 01
## 10 01
#conversion rate
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Overall conversion rate: the mean of the 0/1 `converted` flag over all rows
summarise(data, conv_rate = mean(converted))
## conv_rate
## 1 0.1196592
# Checking the effect of seasonality: conversion rate within each month
summarise(
  group_by(data, month),
  conv_rate = mean(converted)
)
## # A tibble: 1 x 2
## month conv_rate
## <chr> <dbl>
## 1 01 0.120
# Sample Size Calculation
#install.packages("powerMediation")
library(powerMediation)
## Warning: package 'powerMediation' was built under R version 3.6.2
# Total sample size for a logistic regression on a binary predictor
# (here presumably: control vs. treatment page).
tot_sample_size <- SSizeLogisticBin(
  p1 = 0.11,    # event rate when the predictor is 0
  p2 = 0.20,    # event rate when the predictor is 1
  B = 0.5,      # proportion of the sample with predictor equal to 1
  alpha = 0.05, # type I error rate
  power = 0.8   # desired power
)
tot_sample_size
## [1] 506
#SSizeLogisticBin returns the total sample size, so we need 506 people overall, i.e. about 253 in each of the control and treatment groups (B = 0.5 is an even split)
# After Running the Experiment
# Conversion rate per experiment arm (control vs. treatment).
# NOTE(review): the summary() output near the top of this file reports
# different counts for `group` (147202 / 147276) and `landing_page`
# (147239 / 147239), so some rows appear to pair a group with the other arm's
# page -- confirm whether those rows should be filtered out before comparing.
summarise(
  group_by(data, group),
  conv_rate = mean(converted)
)
## # A tibble: 2 x 2
## group conv_rate
## <fct> <dbl>
## 1 control 0.120
## 2 treatment 0.119
#The control group (the group with the old page) converted at a higher rate than the treatment group (the group with the new page). However, the magnitude of this difference is very small: roughly 0.2 percentage points.
library(broom)
# Logistic regression of the conversion flag on the experiment arm; tidy()
# returns the fitted coefficients as a tibble with one row per term.
tidy(
  glm(
    converted ~ group,
    family = "binomial",
    data = data
  )
)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.99 0.00801 -248. 0
## 2 grouptreatment -0.0140 0.0114 -1.24 0.216