# LOADING THE DATA
# dplyr provides %>%, group_by(), summarise(), n() and filter(), all of which
# are used later in this script; attach it before anything depends on it.
library(dplyr)

# Read the A/B test export and preview the first rows
data <- read.csv("abtest.csv")
head(data)
## user_id group landing_page time_spent_on_the_page converted
## 1 546592 control old 3.48 no
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 4 546567 control old 3.02 no
## 5 546459 treatment new 4.75 yes
## 6 546558 control old 5.28 yes
## language_preferred
## 1 Spanish
## 2 English
## 3 Spanish
## 4 French
## 5 Spanish
## 6 English
# Compact structure listing: column names, types and leading values
str(object = data)
## 'data.frame': 100 obs. of 6 variables:
## $ user_id : int 546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
## $ group : chr "control" "treatment" "treatment" "control" ...
## $ landing_page : chr "old" "new" "new" "old" ...
## $ time_spent_on_the_page: num 3.48 7.13 4.4 3.02 4.75 ...
## $ converted : chr "no" "yes" "no" "no" ...
## $ language_preferred : chr "Spanish" "English" "Spanish" "French" ...
# DATA OVERVIEW
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns
summary(object = data)
## user_id group landing_page time_spent_on_the_page
## Min. :546443 Length:100 Length:100 Min. : 0.190
## 1st Qu.:546468 Class :character Class :character 1st Qu.: 3.880
## Median :546493 Mode :character Mode :character Median : 5.415
## Mean :546517 Mean : 5.378
## 3rd Qu.:546567 3rd Qu.: 7.022
## Max. :546592 Max. :10.710
## converted language_preferred
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Row count
dim(data)[1L]
## [1] 100
# Count of missing values in each column
vapply(data, function(column) sum(is.na(column)), integer(1))
## user_id group landing_page
## 0 0 0
## time_spent_on_the_page converted language_preferred
## 0 0 0
# Count of fully duplicated rows
length(which(duplicated(data)))
## [1] 0
# UNIVARIATE ANALYSIS
## (numerical variables, first)
# Summary statistics for time spent on the page. Every call drops NAs
# explicitly so that a missing value cannot turn a statistic into NA or,
# in the case of quantile(), raise an error.
# minimum
min_time <- min(data$time_spent_on_the_page, na.rm = TRUE)
# maximum
max_time <- max(data$time_spent_on_the_page, na.rm = TRUE)
# mean
mean_time <- mean(data$time_spent_on_the_page, na.rm = TRUE)
# median
median_time <- median(data$time_spent_on_the_page, na.rm = TRUE)
# standard deviation
sd_time <- sd(data$time_spent_on_the_page, na.rm = TRUE)
# variance
var_time <- var(data$time_spent_on_the_page, na.rm = TRUE)
# quartiles
quantile(
  data$time_spent_on_the_page,
  probs = c(0.25, 0.50, 0.75),
  na.rm = TRUE
)
## 25% 50% 75%
## 3.8800 5.4150 7.0225
# interquartile range
IQR(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 3.1425
# Histogram of the time-on-page distribution
hist(
  x = data[["time_spent_on_the_page"]],
  main = "Histogram of Time Spent on Page",
  xlab = "Time (minutes)",
  col = "lightblue",
  border = "white",
  breaks = 10
)

# Boxplot of time spent on the page.
# `breaks` is a hist() argument; boxplot() forwards unknown arguments to the
# low-level plotting calls as graphical parameters, which only triggers
# '"breaks" is not a graphical parameter' warnings, so it is not passed here.
boxplot(
  data$time_spent_on_the_page,
  main = "Boxplot of Time Spent on Page",
  ylab = "Time (minutes)",
  col = "lightgreen"
)

# Time spent is moderately spread out (IQR = 3.14). The median and mean are similar, suggesting an approximately symmetric distribution with no extreme skew.
## (categorical variables, second)
# Landing page: number of users per page version
table(data[["landing_page"]])
##
## new old
## 50 50
# Landing page: share of users per page version
prop.table(table(data[["landing_page"]]))
##
## new old
## 0.5 0.5
# Experiment group: number of users per group
table(data[["group"]])
##
## control treatment
## 50 50
# Experiment group: share of users per group
prop.table(table(data[["group"]]))
##
## control treatment
## 0.5 0.5
# Conversion outcome: number of users per outcome
table(data[["converted"]])
##
## no yes
## 46 54
# Conversion outcome: share of users per outcome
prop.table(table(data[["converted"]]))
##
## no yes
## 0.46 0.54
# Preferred language: number of users per language
table(data[["language_preferred"]])
##
## English French Spanish
## 32 34 34
# Preferred language: share of users per language
prop.table(table(data[["language_preferred"]]))
##
## English French Spanish
## 0.32 0.34 0.34
# BIVARIATE ANALYSIS (two variables together)
# Time spent vs landing page
boxplot(
  time_spent_on_the_page ~ landing_page,
  data = data,
  col = "lightblue",
  xlab = "Landing Page",
  ylab = "Time Spent (minutes)",
  main = "Time Spent by Landing Page"
)

# Per-page summary of time spent: group size, mean, median and
# standard deviation
summarise(
  group_by(data, landing_page),
  count = n(),
  mean_time = mean(time_spent_on_the_page),
  median_time = median(time_spent_on_the_page),
  sd_time = sd(time_spent_on_the_page)
)
## # A tibble: 2 × 5
## landing_page count mean_time median_time sd_time
## <chr> <int> <dbl> <dbl> <dbl>
## 1 new 50 6.22 6.10 1.82
## 2 old 50 4.53 4.38 2.58
# Conversion vs landing page (did more users convert on the new page?)
table(data[["landing_page"]], data[["converted"]])
##
## no yes
## new 17 33
## old 29 21
# Conversion rate within each landing page (row proportions)
prop.table(table(data[["landing_page"]], data[["converted"]]), margin = 1)
##
## no yes
## new 0.34 0.66
## old 0.58 0.42
# Time spent vs preferred language
boxplot(
  time_spent_on_the_page ~ language_preferred,
  data = data,
  col = "lightgreen",
  xlab = "Language",
  ylab = "Time Spent (minutes)",
  main = "Time Spent by Language Preferred"
)

# Mean, median and standard deviation of time spent per preferred language
aggregate(
  time_spent_on_the_page ~ language_preferred,
  data = data,
  FUN = function(times) {
    c(mean = mean(times), median = median(times), sd = sd(times))
  }
)
## language_preferred time_spent_on_the_page.mean time_spent_on_the_page.median
## 1 English 5.559063 5.755000
## 2 French 5.253235 5.315000
## 3 Spanish 5.331765 5.605000
## time_spent_on_the_page.sd
## 1 2.621079
## 2 2.675413
## 3 1.818095
# Conversion vs preferred language: counts
table(data[["language_preferred"]], data[["converted"]])
##
## no yes
## English 11 21
## French 19 15
## Spanish 16 18
# Conversion rate within each language (row proportions)
prop.table(
  table(data[["language_preferred"]], data[["converted"]]),
  margin = 1
)
##
## no yes
## English 0.3437500 0.6562500
## French 0.5588235 0.4411765
## Spanish 0.4705882 0.5294118
## Users on the new page spent more time on average (6.22 minutes) than those on the old page (4.53 minutes). The boxplot confirms that the new page's distribution is shifted upward.
##HYPOTHESIS TEST 1
#a one tailed t test
# HYPOTHESIS TEST 1
# One-tailed t-test
# H0: μ_new <= μ_old (New page mean time is LESS than or equal to old)
# H1: μ_new > μ_old (New page mean time is GREATER)
# Split the data by landing page and check the number of users in each group
new_page <- filter(data, landing_page == "new")
old_page <- filter(data, landing_page == "old")
nrow(new_page)
## [1] 50
nrow(old_page)
## [1] 50
# Welch two-sample t-test, one-sided: is the mean time on the new page
# greater than on the old page? var.equal = FALSE avoids assuming equal
# variances in the two groups.
t_test_result <- t.test(
  new_page$time_spent_on_the_page,
  old_page$time_spent_on_the_page,
  alternative = "greater",
  var.equal = FALSE
)
# Display the statistic, degrees of freedom and p-value; without this the
# p-value that the interpretation relies on never appears in the output.
t_test_result
## Since p = 0.0001392 < 0.05, we reject the null hypothesis.
## Users spend significantly more time on the new landing page than on the old one.
##HYPOTHESIS TEST 2(chi-square test)
#H0 (Null): Conversion is independent of preferred language.
#Language choice does NOT affect conversion.
#H1 (Alternative): Conversion depends on preferred language.
# Contingency table: preferred language (rows) x converted (columns)
lang_conv_table <- table(data$language_preferred, data$converted)
# Pearson chi-square test of independence
chi_result <- chisq.test(lang_conv_table)
# Show the statistic and p-value that the conclusion relies on
chi_result
# Expected counts under independence; the chi-square approximation is
# usually considered adequate when these are all at least 5
chi_result$expected
## p > 0.05, meaning conversion is likely independent of language preference.
##HYPOTHESIS TEST 3,ANOVA (time spent vs language)
#H0 (Null): All language groups on the new landing page have the same mean time spent.
#μ1 = μ2 = μ3 = ...
#H1 (Alternative): At least one language group has a different mean time spent.
# Keep only the users who saw the new landing page, then preview the subset
new_only <- filter(data, landing_page == "new")
head(new_only, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 1 546468 treatment new 7.13 yes
## 2 546462 treatment new 4.40 no
## 3 546459 treatment new 4.75 yes
## 4 546448 treatment new 5.25 yes
## 5 546461 treatment new 10.71 yes
## 6 546491 treatment new 5.86 yes
## language_preferred
## 1 English
## 2 Spanish
## 3 Spanish
## 4 French
## 5 French
## 6 Spanish
# Visual check (base R boxplot) of time spent per language on the new page
boxplot(
  time_spent_on_the_page ~ language_preferred,
  data = new_only,
  main = "Time Spent on NEW Page by Language",
  xlab = "Language",
  ylab = "Time Spent (minutes)",
  las = 2, # draw axis labels perpendicular to the axes for readability
  col = "lightblue",
  border = "darkblue"
)

# One-way ANOVA: time spent on the new page across preferred languages
anova_result <- aov(
  time_spent_on_the_page ~ language_preferred,
  data = new_only
)
# aov() only fits the model; summary() produces the ANOVA table with the
# F statistic and p-value that the interpretation relies on.
summary(anova_result)
## Since p > 0.05, the test indicates no significant difference.
## There is no significant difference in time spent across language groups on the new page.
## CONCLUSION
# Engagement improved - users spent significantly more time on the new page.
# Conversion improved - the new page's conversion rate is 66% vs 42% for the old page.
# Language does not affect conversion - conversion is independent of language.
# Time spent across languages is similar - no evidence of language differences on the new page.
## BUSINESS RECOMMENDATIONS
# Proceed with a full rollout of the new landing page, as it improves both engagement and conversion.
# Improve elements for old-page users (A/B test CTA placement and wording).
# Track long-term effects to confirm sustained improvement.