knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
## Warning: package 'reticulate' was built under R version 4.5.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(readr)
library(stats)
## Read the dataset
# Location of the A/B-test CSV; the path is absolute and machine-specific.
abtest_path <- "C:\\Users\\ADMIN\\Desktop\\y2s1\\abtest.csv"
df <- read_csv(file = abtest_path)
## Rows: 100 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): group, landing_page, converted, language_preferred
## dbl (2): user_id, time_spent_on_the_page
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the freshly loaded data.
df |> head()
## # A tibble: 6 × 6
## user_id group landing_page time_spent_on_the_page converted language_preferred
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 546592 cont… old 3.48 no Spanish
## 2 546468 trea… new 7.13 yes English
## 3 546462 trea… new 4.4 no Spanish
## 4 546567 cont… old 3.02 no French
## 5 546459 trea… new 4.75 yes Spanish
## 6 546558 cont… old 5.28 yes English
## Exploratory Data Analysis
# First rows of the data set (head() defaults to six rows).
head(x = df)
## # A tibble: 6 × 6
## user_id group landing_page time_spent_on_the_page converted language_preferred
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 546592 cont… old 3.48 no Spanish
## 2 546468 trea… new 7.13 yes English
## 3 546462 trea… new 4.4 no Spanish
## 4 546567 cont… old 3.02 no French
## 5 546459 trea… new 4.75 yes Spanish
## 6 546558 cont… old 5.28 yes English
# Last rows of the data set, to check the file was read to the end.
df |> tail()
## # A tibble: 6 × 6
## user_id group landing_page time_spent_on_the_page converted language_preferred
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 546550 cont… old 3.05 no English
## 2 546446 trea… new 5.15 no Spanish
## 3 546544 cont… old 6.52 yes English
## 4 546472 trea… new 7.07 yes Spanish
## 5 546481 trea… new 6.2 yes Spanish
## 6 546483 trea… new 5.86 yes English
# Number of rows and columns.
df |> dim()
## [1] 100 6
# Per-column summary: quantiles for numeric columns, length/class otherwise.
df |> summary()
## user_id group landing_page time_spent_on_the_page
## Min. :546443 Length:100 Length:100 Min. : 0.190
## 1st Qu.:546468 Class :character Class :character 1st Qu.: 3.880
## Median :546493 Mode :character Mode :character Median : 5.415
## Mean :546517 Mean : 5.378
## 3rd Qu.:546567 3rd Qu.: 7.022
## Max. :546592 Max. :10.710
## converted language_preferred
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Count missing values in each column.
df |>
  is.na() |>
  colSums()
## user_id group landing_page
## 0 0 0
## time_spent_on_the_page converted language_preferred
## 0 0 0
# Number of rows that exactly repeat an earlier row.
df |>
  duplicated() |>
  sum()
## [1] 0
# Keep only the first occurrence of each distinct row (a no-op when the
# count above is zero).
df <- distinct(df)
## Time spent
# Histogram of time on the page; `breaks = 20` is a suggested bin count.
hist(
  x = df$time_spent_on_the_page,
  breaks = 20,
  col = "lightblue",
  main = "Distribution of Time Spent",
  xlab = "Minutes"
)
### Group distribution
# Tabulate once, then print and plot the same counts.
group_counts <- table(df$group)
group_counts
##
## control treatment
## 50 50
barplot(height = group_counts, main = "Group Distribution")
### Conversion distribution
# Tabulate once, then print and plot the same counts.
conversion_counts <- table(df$converted)
conversion_counts
##
## no yes
## 46 54
barplot(height = conversion_counts, main = "Conversion Distribution")
### Language preference
# Tabulate once, then print and plot the same counts.
language_counts <- table(df$language_preferred)
language_counts
##
## English French Spanish
## 32 34 34
barplot(height = language_counts, main = "Language Distribution")
## Bivariate Analysis
### Time spent by group
# Box plot of time on the page for each experiment group.
ggplot(data = df, mapping = aes(x = group, y = time_spent_on_the_page)) +
  geom_boxplot() +
  ggtitle("Time Spent by Group")
### Conversion rate by landing page
# Row-wise percentages: each landing page's row sums to 100.
page_by_conversion <- table(df$landing_page, df$converted)
100 * prop.table(page_by_conversion, margin = 1)
##
## no yes
## new 34 66
## old 58 42
### Conversion vs language
# Row-wise percentages: each language's row sums to 100.
language_by_conversion <- table(df$language_preferred, df$converted)
100 * prop.table(language_by_conversion, margin = 1)
##
## no yes
## English 34.37500 65.62500
## French 55.88235 44.11765
## Spanish 47.05882 52.94118
### Data splitting
# Time on the page for users shown the old and the new landing page.
old_time <- df |>
  filter(landing_page == "old") |>
  pull(time_spent_on_the_page)
new_time <- df |>
  filter(landing_page == "new") |>
  pull(time_spent_on_the_page)
### One-tailed Welch two-sample t-test
# alternative = "greater" makes this a one-sided test (not two-tailed):
# H1 is that the true difference in means (new - old) is greater than 0.
# t.test() defaults to var.equal = FALSE, which gives the Welch variant.
t.test(new_time, old_time, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: new_time and old_time
## t = 3.7868, df = 87.975, p-value = 0.0001392
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.9485536 Inf
## sample estimates:
## mean of x mean of y
## 6.2232 4.5324
#### Interpretation
# If p-value < 0.05 → Users spend more time on the new page.
### Conversion table
# Counts of landing page (rows) by converted status (columns).
tbl <- table(df[["landing_page"]], df[["converted"]])
tbl
##
## no yes
## new 17 33
## old 29 21
### Two-proportion z-test
# A "success" is a user who converted, so read the "yes" column by name.
# Column 1 of `tbl` is "no": indexing by position passed the counts of
# NON-converted users and therefore tested the wrong proportions.
old_success <- tbl["old", "yes"]
old_total <- sum(tbl["old", ])
new_success <- tbl["new", "yes"]
new_total <- sum(tbl["new", ])
# One-sided test; H1: conversion rate (new) > conversion rate (old).
prop.test(
  x = c(new_success, old_success),
  n = c(new_total, old_total),
  alternative = "greater"
)
# NOTE: the captured output that follows this call (p-value = 0.9863,
# prop 1 = 0.34, prop 2 = 0.58) was produced from the "no" counts and is
# stale; re-run the script to refresh it.
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(new_success, old_success) out of c(new_total, old_total)
## X-squared = 4.8712, df = 1, p-value = 0.9863
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.4191348 1.0000000
## sample estimates:
## prop 1 prop 2
## 0.34 0.58
### Interpretation
# p < 0.05 → New page has higher conversion.
### Chi-square test of independence
# Contingency table: preferred language (rows) by converted (columns).
tbl2 <- table(df[["language_preferred"]], df[["converted"]])
chisq.test(x = tbl2)
##
## Pearson's Chi-squared test
##
## data: tbl2
## X-squared = 3.093, df = 2, p-value = 0.213
### Interpretation
# p < 0.05 → Reject independence: conversion is associated with language
# p ≥ 0.05 → No significant evidence that conversion depends on language
#            (this does not prove the two are independent)
### Keep only the rows for the new landing page
newdf <- filter(df, landing_page == "new")
### One-way ANOVA: time on the page by preferred language
anova_result <- aov(
  formula = time_spent_on_the_page ~ language_preferred,
  data = newdf
)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## language_preferred 2 5.68 2.838 0.854 0.432
## Residuals 47 156.10 3.321
### Interpretation
# p < 0.05 → Mean time spent differs for at least one language
# p ≥ 0.05 → No significant evidence that mean time differs across languages
#            (this does not prove the means are equal)
```