# Echo the source code alongside its output in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

# Load the A/B test data set from the local CSV file.
data <- read.csv("C:\\Users\\Admin\\Desktop\\statistical computing\\data\\abtest.csv")

# View() opens a GUI data viewer, which only makes sense in an interactive
# session; skip it when the script is run or rendered non-interactively.
if (interactive()) {
  View(data)
}
# ---- Data overview ----
# Number of rows, then number of columns.
dim(data)[1L]
## [1] 100
dim(data)[2L]
## [1] 6
# Per-column summary statistics.
print(summary(data))
## user_id group landing_page time_spent_on_the_page
## Min. :546443 Length:100 Length:100 Min. : 0.190
## 1st Qu.:546468 Class :character Class :character 1st Qu.: 3.880
## Median :546493 Mode :character Mode :character Median : 5.415
## Mean :546517 Mean : 5.378
## 3rd Qu.:546567 3rd Qu.: 7.022
## Max. :546592 Max. :10.710
## converted language_preferred
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# First six and last six observations.
head(data, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 1 546592 control old 3.48 no
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 4 546567 control old 3.02 no
## 5 546459 treatment new 4.75 yes
## 6 546558 control old 5.28 yes
## language_preferred
## 1 Spanish
## 2 English
## 3 Spanish
## 4 French
## 5 Spanish
## 6 English
tail(data, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 95 546550 control old 3.05 no
## 96 546446 treatment new 5.15 no
## 97 546544 control old 6.52 yes
## 98 546472 treatment new 7.07 yes
## 99 546481 treatment new 6.20 yes
## 100 546483 treatment new 5.86 yes
## language_preferred
## 95 English
## 96 Spanish
## 97 English
## 98 Spanish
## 99 Spanish
## 100 English
# Total count of missing cells across all columns.
sum(colSums(is.na(data)))
## [1] 0
# Count of rows that exactly repeat an earlier row.
length(which(duplicated(data)))
## [1] 0
# ---- Univariate analysis of time spent on the page ----
print("##Univariate analysis")
## [1] "##Univariate analysis"
# Central tendency and spread.
mean(data[["time_spent_on_the_page"]])
## [1] 5.3778
sd(data[["time_spent_on_the_page"]])
## [1] 2.378166
median(data[["time_spent_on_the_page"]])
## [1] 5.415
# Five-number summary (default quartile probabilities) and interquartile range.
quantile(data[["time_spent_on_the_page"]])
## 0% 25% 50% 75% 100%
## 0.1900 3.8800 5.4150 7.0225 10.7100
IQR(data[["time_spent_on_the_page"]])
## [1] 3.1425
# Plots: the argument expressions are kept as-is because hist() derives its
# default title and axis label from the expression passed to it.
hist(x = data$time_spent_on_the_page)

barplot(height = data$time_spent_on_the_page)

boxplot(x = data$time_spent_on_the_page)

### Bivariate analysis
### Question one: is more time spent on the new landing page than on the old?
print("Question one")
## [1] "Question one"
print("DEFINING HYPOTHESIS")
## [1] "DEFINING HYPOTHESIS"
print("Null :time_spent_on_new = time_spent_on_exixting")
## [1] "Null :time_spent_on_new = time_spent_on_exixting"
print("Alternate:time_spent_on_new> time_spent_on_existing")
## [1] "Alternate:time_spent_on_new> time_spent_on_existing"
print("APPROPRIATE TEST")
## [1] "APPROPRIATE TEST"
print("two sampled t-test")
## [1] "two sampled t-test"
print("SIGNIFICANCE LEVEL")
## [1] "SIGNIFICANCE LEVEL"
alpha <- 0.05
print("COLLECTING AND PREPARING DATA")
## [1] "COLLECTING AND PREPARING DATA"
print("Visual analysis")
## [1] "Visual analysis"
boxplot(time_spent_on_the_page ~ landing_page, data = data)

print("Collecting data")
## [1] "Collecting data"
# Split the time-on-page measurements by landing page version.
new_time_spent_on_page <- data$time_spent_on_the_page[data$landing_page == "new"]
old_time_spent_on_page <- data$time_spent_on_the_page[data$landing_page == "old"]
print("Calculating p-value")
## [1] "Calculating p-value"
# One-sided Welch test (unequal variances): H1 is mean(new) > mean(old).
t_test <- t.test(
  new_time_spent_on_page,
  old_time_spent_on_page,
  alternative = "greater",
  var.equal = FALSE
)
print(t_test)
##
## Welch Two Sample t-test
##
## data: new_time_spent_on_page and old_time_spent_on_page
## t = 3.7868, df = 87.975, p-value = 0.0001392
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.9485536 Inf
## sample estimates:
## mean of x mean of y
## 6.2232 4.5324
t_test$p.value
## [1] 0.0001392381
print("Comparing the p-value with alpha")
## [1] "Comparing the p-value with alpha"
# alpha was already set above for this question; no need to assign it again.
t_test$p.value < alpha
## [1] TRUE
### Drawing inference
# The conclusion is printed only when the test actually supports it. Failing
# to reject the null does not show that the old page performs better, only
# that there is no evidence the new page performs better.
if (t_test$p.value < alpha) {
  print("Reject Null,users spend more time on the new landing page than on the old landing page")
  print("users spend more time on the new landing page than on the old landing page")
} else {
  print("Fail to reject Null,no evidence that users spend more time on the new landing page than on the old landing page")
}
## [1] "Reject Null,users spend more time on the new landing page than on the old landing page"
## [1] "users spend more time on the new landing page than on the old landing page"
### Question Two: is the conversion rate higher on the new page than on the old?
print("Defining hypothesis")
## [1] "Defining hypothesis"
print("Null: conversion_rate_on_new = conversion_rate_on_old")
## [1] "Null: conversion_rate_on_new = conversion_rate_on_old"
print("Alternate: conversion_rate_on_new > conversion_rate_on_old")
## [1] "Alternate: conversion_rate_on_new > conversion_rate_on_old"
### Appropriate test
print("Two-proportion z-test")
## [1] "Two-proportion z-test"
### Significance level
alpha <- 0.05
### Preparing and collecting data
print("visual analysis")
## [1] "visual analysis"
# Contingency table: landing page (rows) by conversion outcome (columns).
conv_table <- table(data$landing_page, data$converted)
# View() opens a GUI viewer; only call it in an interactive session.
if (interactive()) {
  View(conv_table)
}
### collecting data
# Conversions and group sizes, ordered new page first, then old page.
success <- c(conv_table["new", "yes"], conv_table["old", "yes"])
total <- c(sum(conv_table["new", ]), sum(conv_table["old", ]))
### calculating p-value
# The result is stored as prop_test so the prop.test() function itself is not
# masked. As the printed header shows, the test is run with the continuity
# correction, so the reported statistic is a corrected chi-squared value.
prop_test <- prop.test(success, total, alternative = "greater")
print(prop_test)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: success out of total
## X-squared = 4.8712, df = 1, p-value = 0.01365
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.06086519 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.66 0.42
prop_test$p.value
## [1] 0.01365445
### comparing p-value
# Same comparison operator as the decision rule below.
prop_test$p.value < alpha
## [1] TRUE
### Drawing inference
# Failing to reject the null does not show the old page converts better, only
# that there is no evidence the new page converts better.
if (prop_test$p.value < alpha) {
  print("Reject Null,the conversion page on new page is greater than on old page")
} else {
  print("Fail to reject Null,no evidence that the conversion rate on new page is greater than on old page")
}
## [1] "Reject Null,the conversion page on new page is greater than on old page"
### Question Three: does conversion depend on the preferred language?
print("Defining hypothesis")
## [1] "Defining hypothesis"
print("Null: Conversion and preffered language are independent")
## [1] "Null: Conversion and preffered language are independent"
print("Alternate: Conversion and preffered language are dependent")
## [1] "Alternate: Conversion and preffered language are dependent"
### Appropriate test
print("Chi-square test of independence")
## [1] "Chi-square test of independence"
### Significance level
alpha <- 0.05
### Preparing and collecting data
print("visual analysis")
## [1] "visual analysis"
# Contingency table: conversion outcome (rows) by preferred language (columns).
conv_table <- table(data$converted, data$language_preferred)
# View() opens a GUI viewer; only call it in an interactive session.
if (interactive()) {
  View(conv_table)
}
### Calculating p-value
# The result is stored as chisq_test so the chisq.test() function itself is
# not masked. The printed result below (df = 2, plain Pearson test) shows no
# continuity correction was applied to this table despite correct = TRUE.
chisq_test <- chisq.test(conv_table, correct = TRUE)
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: conv_table
## X-squared = 3.093, df = 2, p-value = 0.213
chisq_test$p.value
## [1] 0.2129889
### comparing p-value
# Same comparison operator as the decision rule below.
chisq_test$p.value < alpha
## [1] FALSE
### Drawing inference
if (chisq_test$p.value < alpha) {
  print("Reject Null,conversion and preffered language are dependent")
} else {
  print("Fail to reject Null,conversion and prefferd language are independent")
}
## [1] "Fail to reject Null,conversion and prefferd language are independent"
### Question Four: does time spent on the new page differ by preferred language?
print("Defining hypothesis")
## [1] "Defining hypothesis"
print("Null: the time spent on new page is same in all languages")
## [1] "Null: the time spent on new page is same in all languages"
# The one-way ANOVA alternative is that at least one group mean differs, not
# that every group differs from every other.
print("Alternate: time spent on new page is different for at least one language")
## [1] "Alternate: time spent on new page is different for at least one language"
### Appropriate test
print("One way ANOVA test")
## [1] "One way ANOVA test"
### Significance level
alpha <- 0.05
### Preparing and collecting data
# Keep only the rows for the new landing page. Note that this rebinds
# new_time_spent_on_page, which held a numeric vector in Question one, to a
# data frame.
new_time_spent_on_page <- subset(data, landing_page == "new")
print(new_time_spent_on_page)
## user_id group landing_page time_spent_on_the_page converted
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 5 546459 treatment new 4.75 yes
## 7 546448 treatment new 5.25 yes
## 9 546461 treatment new 10.71 yes
## 13 546491 treatment new 5.86 yes
## 14 546478 treatment new 6.03 yes
## 16 546466 treatment new 6.27 yes
## 17 546443 treatment new 8.73 no
## 23 546450 treatment new 3.65 no
## 24 546475 treatment new 7.02 yes
## 25 546456 treatment new 6.18 no
## 26 546455 treatment new 4.39 no
## 27 546469 treatment new 9.49 yes
## 29 546471 treatment new 7.81 yes
## 31 546464 treatment new 5.41 yes
## 39 546487 treatment new 1.65 no
## 43 546489 treatment new 7.16 yes
## 44 546453 treatment new 7.16 yes
## 45 546488 treatment new 3.91 no
## 47 546460 treatment new 5.37 yes
## 48 546458 treatment new 7.23 yes
## 49 546492 treatment new 8.08 yes
## 50 546473 treatment new 10.50 yes
## 52 546457 treatment new 5.65 no
## 53 546479 treatment new 6.47 yes
## 55 546482 treatment new 6.41 yes
## 58 546454 treatment new 8.30 yes
## 61 546470 treatment new 6.01 yes
## 62 546467 treatment new 6.79 yes
## 66 546445 treatment new 7.27 yes
## 69 546484 treatment new 6.70 no
## 72 546476 treatment new 5.42 yes
## 73 546452 treatment new 5.08 yes
## 74 546444 treatment new 7.46 yes
## 78 546485 treatment new 3.88 no
## 79 546486 treatment new 9.12 yes
## 81 546490 treatment new 4.68 no
## 82 546449 treatment new 5.26 yes
## 83 546463 treatment new 5.74 yes
## 87 546465 treatment new 6.71 no
## 88 546480 treatment new 3.68 no
## 89 546447 treatment new 3.30 no
## 91 546477 treatment new 5.40 no
## 92 546451 treatment new 8.47 yes
## 94 546474 treatment new 4.94 no
## 96 546446 treatment new 5.15 no
## 98 546472 treatment new 7.07 yes
## 99 546481 treatment new 6.20 yes
## 100 546483 treatment new 5.86 yes
## language_preferred
## 2 English
## 3 Spanish
## 5 Spanish
## 7 French
## 9 French
## 13 Spanish
## 14 French
## 16 Spanish
## 17 English
## 23 English
## 24 English
## 25 Spanish
## 26 English
## 27 English
## 29 French
## 31 English
## 39 Spanish
## 43 Spanish
## 44 English
## 45 English
## 47 French
## 48 Spanish
## 49 Spanish
## 50 English
## 52 English
## 53 Spanish
## 55 Spanish
## 58 French
## 61 French
## 62 French
## 66 French
## 69 Spanish
## 72 French
## 73 English
## 74 English
## 78 Spanish
## 79 French
## 81 French
## 82 French
## 83 Spanish
## 87 English
## 88 French
## 89 French
## 91 French
## 92 English
## 94 French
## 96 Spanish
## 98 Spanish
## 99 Spanish
## 100 English
# View() opens a GUI viewer; only call it in an interactive session.
if (interactive()) {
  View(new_time_spent_on_page)
}
### calculating p-value
# One-way ANOVA of time on the new page across preferred languages.
anova_test <- aov(time_spent_on_the_page ~ language_preferred,
                  data = new_time_spent_on_page)
print(anova_test)
## Call:
## aov(formula = time_spent_on_the_page ~ language_preferred, data = new_time_spent_on_page)
##
## Terms:
## language_preferred Residuals
## Sum of Squares 5.6755 156.1030
## Deg. of Freedom 2 47
##
## Residual standard error: 1.822454
## Estimated effects may be unbalanced
# Compute the ANOVA table once and reuse it for display and p-value extraction.
anova_summary <- summary(anova_test)
anova_summary
## Df Sum Sq Mean Sq F value Pr(>F)
## language_preferred 2 5.68 2.838 0.854 0.432
## Residuals 47 156.10 3.321
### comparing p_value
# First row of the table is the language_preferred term.
p_value <- anova_summary[[1]][["Pr(>F)"]][1]
print(p_value)
## [1] 0.4320414
p_value < alpha
## [1] FALSE
### Drawing inference
# Rejecting the null means at least one language group's mean differs; it does
# not mean all groups differ from each other.
if (p_value < alpha) {
  print("Reject Null,the time spent on new page is different for at least one language")
} else {
  print("Fail to reject Null,time spent on new page is same in all languages")
}
## [1] "Fail to reject Null,time spent on new page is same in all languages"