# Reading dataset ----
# Load the A/B-test export; the file is expected in the working directory.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well
# (the read.csv default changed in 4.0.0), which is what the recorded
# summary()/str() output of this script shows.
data <- read.csv("ab_data.csv", header = TRUE, stringsAsFactors = TRUE)
# Per-column overview: quartiles for numeric columns, level counts for factors.
summary(data)
##     user_id                            timestamp            group       
##  Min.   :630000   2017-01-02 13:42:05.378582:     1   control  :147202  
##  1st Qu.:709032   2017-01-02 13:42:15.234051:     1   treatment:147276  
##  Median :787934   2017-01-02 13:42:21.786186:     1                     
##  Mean   :787974   2017-01-02 13:42:26.640581:     1                     
##  3rd Qu.:866912   2017-01-02 13:42:27.851110:     1                     
##  Max.   :945999   2017-01-02 13:42:28.522322:     1                     
##                   (Other)                   :294472                     
##    landing_page      converted     
##  new_page:147239   Min.   :0.0000  
##  old_page:147239   1st Qu.:0.0000  
##                    Median :0.0000  
##                    Mean   :0.1197  
##                    3rd Qu.:0.0000  
##                    Max.   :1.0000  
## 
# Compact structure dump: one line per column with its type and first values.
str(object = data)
## 'data.frame':    294478 obs. of  5 variables:
##  $ user_id     : int  851104 804228 661590 853541 864975 936923 679687 719014 817355 839785 ...
##  $ timestamp   : Factor w/ 294478 levels "2017-01-02 13:42:05.378582",..: 258966 131019 122512 82871 247589 108211 221819 194300 29089 176584 ...
##  $ group       : Factor w/ 2 levels "control","treatment": 1 1 2 2 1 1 2 1 2 2 ...
##  $ landing_page: Factor w/ 2 levels "new_page","old_page": 2 2 1 1 2 2 1 2 1 1 ...
##  $ converted   : int  0 0 0 0 1 0 1 0 1 1 ...
# Preview the first ten rows of the raw data.
head(data, n = 10L)
##    user_id                  timestamp     group landing_page converted
## 1   851104 2017-01-21 22:11:48.556739   control     old_page         0
## 2   804228 2017-01-12 08:01:45.159739   control     old_page         0
## 3   661590 2017-01-11 16:55:06.154213 treatment     new_page         0
## 4   853541 2017-01-08 18:28:03.143765 treatment     new_page         0
## 5   864975 2017-01-21 01:52:26.210827   control     old_page         1
## 6   936923 2017-01-10 15:20:49.083499   control     old_page         0
## 7   679687 2017-01-19 03:26:46.940749 treatment     new_page         1
## 8   719014 2017-01-17 01:48:29.539573   control     old_page         0
## 9   817355 2017-01-04 17:58:08.979471 treatment     new_page         1
## 10  839785 2017-01-15 18:11:06.610965 treatment     new_page         1
# Formatting timestamp ----
# Parse the complete "YYYY-MM-DD HH:MM:SS.ffffff" string. A date-only format
# ("%Y-%m-%d") silently drops the time of day, which turns every `hours`
# value into "00:00". On input, %OS accepts seconds with a fractional part.
# The raw strings carry no zone information, so a fixed zone (UTC) is used to
# keep the derived wall-clock fields independent of the machine's locale.
data$timestamp <- as.POSIXct(
  as.character(data$timestamp),
  format = "%Y-%m-%d %H:%M:%OS",
  tz = "UTC"
)
# Fail fast if any row did not match the expected format (unparsed -> NA).
stopifnot(!anyNA(data$timestamp))
# Derive the display fields straight from the POSIXct column; format() uses
# the column's own time zone. Two pitfalls this avoids:
#   * as.Date(<POSIXct>, "<format>") treats the second argument as `tz`;
#   * as.Date("21/01/2017") with default formats reads the day as the year.
data$hours <- format(data$timestamp, format = "%H:%M")
data$dates <- format(data$timestamp, format = "%d/%m/%Y")
data$month <- format(data$timestamp, format = "%m")
head(data, 10)
##    user_id           timestamp     group landing_page converted hours      dates
## 1   851104 2017-01-21 22:11:48   control     old_page         0 22:11 21/01/2017
## 2   804228 2017-01-12 08:01:45   control     old_page         0 08:01 12/01/2017
## 3   661590 2017-01-11 16:55:06 treatment     new_page         0 16:55 11/01/2017
## 4   853541 2017-01-08 18:28:03 treatment     new_page         0 18:28 08/01/2017
## 5   864975 2017-01-21 01:52:26   control     old_page         1 01:52 21/01/2017
## 6   936923 2017-01-10 15:20:49   control     old_page         0 15:20 10/01/2017
## 7   679687 2017-01-19 03:26:46 treatment     new_page         1 03:26 19/01/2017
## 8   719014 2017-01-17 01:48:29   control     old_page         0 01:48 17/01/2017
## 9   817355 2017-01-04 17:58:08 treatment     new_page         1 17:58 04/01/2017
## 10  839785 2017-01-15 18:11:06 treatment     new_page         1 18:11 15/01/2017
##    month
## 1     01
## 2     01
## 3     01
## 4     01
## 5     01
## 6     01
## 7     01
## 8     01
## 9     01
## 10    01
#conversion rate
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Overall conversion rate: the mean of the 0/1 `converted` flag over all rows.
summarise(data, conv_rate = mean(converted))
##   conv_rate
## 1 0.1196592
# Checking for seasonality: conversion rate per calendar month.
# NOTE: the recorded result contains a single month ("01"), so this split
# cannot reveal a seasonal pattern in this dataset.
summarise(group_by(data, month), conv_rate = mean(converted))
## # A tibble: 1 x 2
##   month conv_rate
##   <chr>     <dbl>
## 1 01        0.120

# Sample Size Calculation ----

#install.packages("powerMediation")
library(powerMediation)
## Warning: package 'powerMediation' was built under R version 3.6.2
# Sample size for a simple logistic regression with a binary predictor.
#   p1 = 0.11  event rate when the predictor is 0 (presumably control)
#   p2 = 0.20  event rate when the predictor is 1 (presumably treatment)
#   B  = 0.5   proportion of observations with predictor = 1 (even split)
tot_sample_size <- SSizeLogisticBin(
  p1 = 0.11,
  p2 = 0.20,
  B = 0.5,
  alpha = 0.05,
  power = 0.8
)
tot_sample_size
## [1] 506
# SSizeLogisticBin returns the total sample size: 506 users are needed overall,
# i.e. about 253 in each of the control and treatment groups (B = 0.5).

# After Running the Experiment ----

# Conversion rate within each experiment arm (control vs. treatment).
summarise(group_by(data, group), conv_rate = mean(converted))
## # A tibble: 2 x 2
##   group     conv_rate
##   <fct>         <dbl>
## 1 control       0.120
## 2 treatment     0.119
# The control group (old page) converted at a slightly higher rate than the
# treatment group (new page). The difference is very small, however: roughly
# 0.1-0.2 percentage points.

library(broom)
# Logistic regression of conversion on experiment arm, returned as a tidy
# coefficient table; the `grouptreatment` row is the treatment effect on the
# log-odds scale relative to control.
tidy(glm(formula = converted ~ group, family = "binomial", data = data))
## # A tibble: 2 x 5
##   term           estimate std.error statistic p.value
##   <chr>             <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)     -1.99     0.00801   -248.     0    
## 2 grouptreatment  -0.0140   0.0114      -1.24   0.216