# LOADING THE DATA
# dplyr provides %>%, group_by(), summarise(), n() and filter(), all of which
# are used further down; without it the script stops at the first pipe.
library(dplyr)
data <- read.csv("abtest.csv")
# Preview the first six rows.
head(data)
##   user_id     group landing_page time_spent_on_the_page converted
## 1  546592   control          old                   3.48        no
## 2  546468 treatment          new                   7.13       yes
## 3  546462 treatment          new                   4.40        no
## 4  546567   control          old                   3.02        no
## 5  546459 treatment          new                   4.75       yes
## 6  546558   control          old                   5.28       yes
##   language_preferred
## 1            Spanish
## 2            English
## 3            Spanish
## 4             French
## 5            Spanish
## 6            English
# Structure: dimensions, column types, and the first few values of each column
str(data)
## 'data.frame':    100 obs. of  6 variables:
##  $ user_id               : int  546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
##  $ group                 : chr  "control" "treatment" "treatment" "control" ...
##  $ landing_page          : chr  "old" "new" "new" "old" ...
##  $ time_spent_on_the_page: num  3.48 7.13 4.4 3.02 4.75 ...
##  $ converted             : chr  "no" "yes" "no" "no" ...
##  $ language_preferred    : chr  "Spanish" "English" "Spanish" "French" ...
# DATA OVERVIEW
# Summary statistics: quartiles, mean and range for numeric columns;
# character columns report only length, class and mode
summary(data)
##     user_id          group           landing_page       time_spent_on_the_page
##  Min.   :546443   Length:100         Length:100         Min.   : 0.190        
##  1st Qu.:546468   Class :character   Class :character   1st Qu.: 3.880        
##  Median :546493   Mode  :character   Mode  :character   Median : 5.415        
##  Mean   :546517                                         Mean   : 5.378        
##  3rd Qu.:546567                                         3rd Qu.: 7.022        
##  Max.   :546592                                         Max.   :10.710        
##   converted         language_preferred
##  Length:100         Length:100        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Number of rows (observations) in the data set
nrow(data)
## [1] 100
# Check missing values: count of NA entries in each column
colSums(is.na(data))
##                user_id                  group           landing_page 
##                      0                      0                      0 
## time_spent_on_the_page              converted     language_preferred 
##                      0                      0                      0
# Check duplicates: number of rows that exactly repeat an earlier row
sum(duplicated(data))
## [1] 0
# UNIVARIATE ANALYSIS
## (numerical variables first)
# Descriptive statistics for time spent on the page. na.rm = TRUE is passed to
# every call so that all of them treat missing values the same way.
# minimum
min_time <- min(data$time_spent_on_the_page, na.rm = TRUE)
# maximum
max_time <- max(data$time_spent_on_the_page, na.rm = TRUE)
# mean
mean_time <- mean(data$time_spent_on_the_page, na.rm = TRUE)
# median
median_time <- median(data$time_spent_on_the_page, na.rm = TRUE)
# standard deviation
sd_time <- sd(data$time_spent_on_the_page, na.rm = TRUE)
# variance
var_time <- var(data$time_spent_on_the_page, na.rm = TRUE)
# Show the stored statistics; the assignments above print nothing by themselves.
c(min = min_time, max = max_time, mean = mean_time,
  median = median_time, sd = sd_time, var = var_time)
# quartiles (quantile() stops with an error on NA unless na.rm = TRUE)
quantile(data$time_spent_on_the_page,
         probs = c(0.25, 0.50, 0.75),
         na.rm = TRUE)
##    25%    50%    75% 
## 3.8800 5.4150 7.0225
# interquartile range
IQR(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 3.1425
# Histogram of time on page; breaks = 10 is the suggested number of bins
hist(
  x = data$time_spent_on_the_page,
  main = "Histogram of Time Spent on Page",
  xlab = "Time (minutes)",
  col = "lightblue",
  border = "white",
  breaks = 10
)

# Boxplot of time on page.
# No bin count is passed here: breaks is a hist() argument that boxplot()
# does not use.
boxplot(data$time_spent_on_the_page,
        main = "Boxplot of Time Spent on Page",
        ylab = "Time (minutes)",
        col = "lightgreen")

# Time spent is moderately spread out (IQR = 3.14). The median and mean are similar, suggesting an approximately symmetric distribution with no extreme skew.


## (categorical variables, second)
# landing_page: frequency counts
table(data[["landing_page"]])
## 
## new old 
##  50  50
# landing_page: proportions of the total
prop.table(table(data[["landing_page"]]))
## 
## new old 
## 0.5 0.5
# group: frequency counts
table(data[["group"]])
## 
##   control treatment 
##        50        50
# group: proportions of the total
prop.table(table(data[["group"]]))
## 
##   control treatment 
##       0.5       0.5
# converted: frequency counts
table(data[["converted"]])
## 
##  no yes 
##  46  54
# converted: proportions of the total
prop.table(table(data[["converted"]]))
## 
##   no  yes 
## 0.46 0.54
# language_preferred: frequency counts
table(data[["language_preferred"]])
## 
## English  French Spanish 
##      32      34      34
# language_preferred: proportions of the total
prop.table(table(data[["language_preferred"]]))
## 
## English  French Spanish 
##    0.32    0.34    0.34
# BIVARIATE ANALYSIS (two variables together)
# Time spent vs landing page: side-by-side boxplots, one per page version
boxplot(
  time_spent_on_the_page ~ landing_page,
  data = data,
  col = "lightblue",
  xlab = "Landing Page",
  ylab = "Time Spent (minutes)",
  main = "Time Spent by Landing Page"
)

# Per-page summary of time spent: group size, mean, median, standard deviation
summarise(
  group_by(data, landing_page),
  count = n(),
  mean_time = mean(time_spent_on_the_page),
  median_time = median(time_spent_on_the_page),
  sd_time = sd(time_spent_on_the_page)
)
## # A tibble: 2 × 5
##   landing_page count mean_time median_time sd_time
##   <chr>        <int>     <dbl>       <dbl>   <dbl>
## 1 new             50      6.22        6.10    1.82
## 2 old             50      4.53        4.38    2.58
# Conversion vs landing page: did more users convert on the new page?
# Rows are landing pages, columns are conversion outcomes.
table(data[["landing_page"]], data[["converted"]])
##      
##       no yes
##   new 17  33
##   old 29  21
# Conversion rate within each landing page (margin = 1: each row sums to 1)
prop.table(table(data[["landing_page"]], data[["converted"]]), margin = 1)
##      
##         no  yes
##   new 0.34 0.66
##   old 0.58 0.42
# Time spent vs preferred language: one box per language
boxplot(
  time_spent_on_the_page ~ language_preferred,
  data = data,
  col = "lightgreen",
  xlab = "Language",
  ylab = "Time Spent (minutes)",
  main = "Time Spent by Language Preferred"
)

# Mean, median and standard deviation of time spent for each preferred language
aggregate(
  time_spent_on_the_page ~ language_preferred,
  data = data,
  FUN = function(times) {
    c(mean = mean(times), median = median(times), sd = sd(times))
  }
)
##   language_preferred time_spent_on_the_page.mean time_spent_on_the_page.median
## 1            English                    5.559063                      5.755000
## 2             French                    5.253235                      5.315000
## 3            Spanish                    5.331765                      5.605000
##   time_spent_on_the_page.sd
## 1                  2.621079
## 2                  2.675413
## 3                  1.818095
# Conversion vs preferred language.
# Rows are languages, columns are conversion outcomes.
table(data[["language_preferred"]], data[["converted"]])
##          
##           no yes
##   English 11  21
##   French  19  15
##   Spanish 16  18
# Conversion rate within each language (margin = 1: each row sums to 1)
prop.table(table(data[["language_preferred"]], data[["converted"]]), margin = 1)
##          
##                  no       yes
##   English 0.3437500 0.6562500
##   French  0.5588235 0.4411765
##   Spanish 0.4705882 0.5294118
## Users on the new page spent more time on average (6.22 minutes) than those on the old page (4.53 minutes); Hypothesis Test 1 below checks whether this difference is significant. The boxplot confirmed that the new page's distribution is shifted upward.

## HYPOTHESIS TEST 1
# One-tailed two-sample t-test
# H0: μ_new <= μ_old  (New page mean time is LESS than or equal to old)
# H1: μ_new > μ_old   (New page mean time is GREATER)

# Preparing the data: split users by landing page and confirm the group sizes
new_page <- filter(data, landing_page == "new")
old_page <- filter(data, landing_page == "old")
nrow(new_page)
## [1] 50
nrow(old_page)
## [1] 50
# Welch two-sample t-test (var.equal = FALSE), one-sided.
# The new page is the first sample, so alternative = "greater" tests
# H1: mean(new) > mean(old).
t_test_result <- t.test(
  new_page$time_spent_on_the_page,
  old_page$time_spent_on_the_page,
  alternative = "greater",
  var.equal = FALSE
)
# Print the result (t statistic, degrees of freedom, p-value); the assignment
# alone displays nothing.
t_test_result
## Since p = 0.0001392 < 0.05, we reject the null hypothesis.
## Users spend significantly more time on the new landing page than on the old one.

##HYPOTHESIS TEST 2(chi-square test)
#H0 (Null): Conversion is independent of preferred language.
#Language choice does NOT affect conversion.
#H1 (Alternative): Conversion depends on preferred language.
# Contingency table: preferred language (rows) by conversion outcome (columns)
lang_conv_table <- table(data$language_preferred, data$converted)
# Pearson chi-square test of independence on that table
chi_result <- chisq.test(lang_conv_table)
# Print the result (statistic, degrees of freedom, p-value); the assignment
# alone displays nothing.
chi_result
## p > 0.05, meaning conversion is likely independent of language preference

##HYPOTHESIS TEST 3,ANOVA (time spent vs language)
#H0 (Null): All language groups on the new landing page have the same mean time spent.
#μ1 = μ2 = μ3 = ...  
#H1 (Alternative): At least one language group has a different mean time spent.

# Keep only the users who were shown the new landing page, then preview them
new_only <- filter(data, landing_page == "new")
head(new_only, n = 6L)
##   user_id     group landing_page time_spent_on_the_page converted
## 1  546468 treatment          new                   7.13       yes
## 2  546462 treatment          new                   4.40        no
## 3  546459 treatment          new                   4.75       yes
## 4  546448 treatment          new                   5.25       yes
## 5  546461 treatment          new                  10.71       yes
## 6  546491 treatment          new                   5.86       yes
##   language_preferred
## 1            English
## 2            Spanish
## 3            Spanish
## 4             French
## 5             French
## 6            Spanish
# Visual check with a base R boxplot: time on the new page, split by language
boxplot(
  time_spent_on_the_page ~ language_preferred,
  data = new_only,
  main = "Time Spent on NEW Page by Language",
  xlab = "Language",
  ylab = "Time Spent (minutes)",
  border = "darkblue",
  col = "lightblue",
  las = 2  # las = 2 draws the axis labels perpendicular to the axis
)

# The test: one-way ANOVA of time spent by preferred language (new page only)
anova_result <- aov(time_spent_on_the_page ~ language_preferred, data = new_only)
# summary() prints the ANOVA table with the F statistic and p-value; printing
# the aov object by itself does not include them.
summary(anova_result)
## Since p > 0.05, the result is not statistically significant:
## there is no significant difference in time spent across language groups on the new page.

## CONCLUSION
# Engagement improved: users spent significantly more time on the new page.
# Conversion improved: new page conversion is 66% vs 42% on the old page.
# Language does not affect conversion: conversion is independent of language.
# Time spent across languages is similar: no evidence of language differences on the new page.

## BUSINESS RECOMMENDATIONS
# Proceed with a full rollout of the new landing page, as it improves both engagement and conversion.
# Improve elements for old-page users (A/B test CTA placement, wording).
# Track long-term effects to confirm sustained improvement.