# Import necessary libraries

library(stats)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# PROJECT: BUSINESS STATISTICS - E-News Express

# Problem Statement
# E-News Express is experiencing a decline in the number of new monthly
# subscribers compared to the previous year. Management suspects that the
# design and content structure of the current landing page are not engaging
# visitors effectively, leading to low user retention and poor conversion
# into paid subscribers.
# To address this challenge, the design team introduced a newly redesigned
# landing page with an improved layout and more relevant content. An A/B
# experiment was conducted by showing the old page to one group of users
# (control group) and the new page to another group (treatment group).
# The organization now needs a statistical evaluation to determine whether
# the new landing page improves user engagement and increases conversion
# rates. Data on user behavior has been collected, including time spent,
# conversion status, and preferred language, and must be analyzed to draw
# evidence-based business conclusions.
#
# Research Objectives
#
# Primary Objective
# To statistically evaluate whether the new landing page design leads to
# better user engagement and higher subscription conversion rates compared
# to the old landing page.
#
# Secondary Objectives
# 1. User Engagement Analysis
#    To determine whether users spend significantly more time on the new
#    landing page than on the old landing page.
# 2. Conversion Rate Evaluation
#    To assess whether the proportion of users who subscribe is higher for
#    the new landing page than for the existing page.
# 3. Language Impact on Conversion
#    To examine whether user conversion depends on the preferred language
#    selected during the visit.
# 4. Behavior Across Language Groups
#    To evaluate whether the average time spent on the new page differs
#    significantly across different language users.
# 5. Business Decision Support

# Reading the data into a data frame
# The A/B test records are loaded from a CSV file at a fixed local path.
abtest_path <- "C:\\Users\\olond\\OneDrive\\r practice\\abtest.csv"
data <- read.csv(file = abtest_path)

# Exploring the data frame

# Preview the first six rows of the data set
print(head(x = data, n = 6L))
##   user_id     group landing_page time_spent_on_the_page converted
## 1  546592   control          old                   3.48        no
## 2  546468 treatment          new                   7.13       yes
## 3  546462 treatment          new                   4.40        no
## 4  546567   control          old                   3.02        no
## 5  546459 treatment          new                   4.75       yes
## 6  546558   control          old                   5.28       yes
##   language_preferred
## 1            Spanish
## 2            English
## 3            Spanish
## 4             French
## 5            Spanish
## 6            English

# Preview the last six rows of the data set
print(tail(x = data, n = 6L))
##     user_id     group landing_page time_spent_on_the_page converted
## 95   546550   control          old                   3.05        no
## 96   546446 treatment          new                   5.15        no
## 97   546544   control          old                   6.52       yes
## 98   546472 treatment          new                   7.07       yes
## 99   546481 treatment          new                   6.20       yes
## 100  546483 treatment          new                   5.86       yes
##     language_preferred
## 95             English
## 96             Spanish
## 97             English
## 98             Spanish
## 99             Spanish
## 100            English

# Check the shape of the data set: number of rows, then number of columns
c(nrow(data), ncol(data))
## [1] 100   6

# The data set has 100 rows and 6 columns.
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
data_summary <- summary(data)
print(data_summary)
##     user_id          group           landing_page       time_spent_on_the_page
##  Min.   :546443   Length:100         Length:100         Min.   : 0.190        
##  1st Qu.:546468   Class :character   Class :character   1st Qu.: 3.880        
##  Median :546493   Mode  :character   Mode  :character   Median : 5.415        
##  Mean   :546517                                         Mean   : 5.378        
##  3rd Qu.:546567                                         3rd Qu.: 7.022        
##  Max.   :546592                                         Max.   :10.710        
##   converted         language_preferred
##  Length:100         Length:100        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

# The summary shows that the mean (5.378) and median (5.415) of
# time_spent_on_the_page are very close to each other.
# Count missing values per column by summing the TRUE flags from is.na().
na_flags <- is.na(data)
print(colSums(na_flags))
##                user_id                  group           landing_page 
##                      0                      0                      0 
## time_spent_on_the_page              converted     language_preferred 
##                      0                      0                      0

# Number of rows that exactly duplicate an earlier row
length(which(duplicated(data)))
## [1] 0

# There are no missing values and no duplicated rows in the data set.

# Univariate Analysis ----

# Numeric summary for time_spent_on_the_page.
# Every statistic, including range(), passes na.rm = TRUE so that a missing
# value cannot turn one result into NA while the others still report a number.
time_spent <- data$time_spent_on_the_page
mean(time_spent, na.rm = TRUE)
## [1] 5.3778
median(time_spent, na.rm = TRUE)
## [1] 5.415
sd(time_spent, na.rm = TRUE)
## [1] 2.378166
min(time_spent, na.rm = TRUE)
## [1] 0.19
max(time_spent, na.rm = TRUE)
## [1] 10.71
range(time_spent, na.rm = TRUE)
## [1]  0.19 10.71
IQR(time_spent, na.rm = TRUE)
## [1] 3.1425

# Distribution of 'time_spent_on_the_page' (quantitative variable)

#visualization

library(ggplot2)

# Create the box plot

# Box plot of time spent on the page, one box per landing page version.
# Boxes are filled light green with black outlines; outliers are drawn in red.
ggplot(data = data) +
  geom_boxplot(
    mapping = aes(x = landing_page, y = time_spent_on_the_page),
    fill = "lightgreen",
    color = "black",
    outlier.colour = "red"
  ) +
  ggtitle("Distribution of Time Spent by Landing Page") +
  xlab("Landing Page Type") +
  ylab("Time Spent (minutes)") +
  theme_minimal()

# Density plot of time spent on the page, drawn as a single curve over all
# users (the aesthetic maps only x, so no grouping variable is applied).
ggplot(data, aes(x = time_spent_on_the_page)) +
  geom_density() +
  ggtitle("Density Plot of Time Spent") +
  xlab("Time Spent (minutes)") +
  ylab("Density") +
  theme_minimal()

# Bivariate Analysis ----
# Analysis involving two variables at a time.
# 1. Statistical test: one-way ANOVA of time spent on the page, with
#    landing page version as the single factor.
anova_model1 <- aov(
  formula = time_spent_on_the_page ~ landing_page,
  data = data
)
# Display the ANOVA table
summary(anova_model1)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## landing_page  1   71.5   71.47   14.34 0.000263 ***
## Residuals    98  488.4    4.98                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Contingency table of conversion status (rows) by preferred language
# (columns). Both variables are categorical, so each cell counts the users
# with that combination of conversion outcome and language. Using the
# continuous time_spent_on_the_page here would give one row per distinct
# time value instead of a conversion-by-language cross-tabulation.
cross_tab <- table(
  converted = data$converted,
  language_preferred = data$language_preferred
)
cross_tab

# Chi-square test of independence on the table above.
# H0: conversion status is independent of preferred language.
# (Console output is not pasted here; re-run the script to see the counts
# and the test result.)
chisq.test(cross_tab)

# Two-sample t-test of time spent on the page between the two landing pages.
# t.test() is called with its defaults, i.e. a two-sided Welch test that does
# not assume equal variances. The test is computed once and its p-value reused
# for both the printout and the decision below.
p_value <- t.test(time_spent_on_the_page ~ landing_page, data = data)$p.value
print(p_value)
## [1] 0.0002784762

# Decision rule at the 5% significance level
alpha <- 0.05
if (p_value < alpha) {
  print("Reject the null hypothesis: significant difference.")
} else {
  print("Fail to reject the null hypothesis: no significant difference.")
}
## [1] "Reject the null hypothesis: significant difference."

# Results and Interpretation
# The mean time spent by users on the landing page is 5.38 minutes, while the
# median time is 5.42 minutes. The closeness of these two values indicates
# that the data is approximately symmetric with no severe skewness.
# The standard deviation is 2.38 minutes, showing moderate variability in how
# long users stay on the page. This suggests that while many users spend
# around the average time, there are notable differences in engagement levels
# across users.
# The minimum time spent on the page is 0.19 minutes, indicating that some
# users exited the site almost immediately. The maximum time is 10.71 minutes,
# showing that certain users remained engaged for an extended duration.
# The range of the data extends from 0.19 to 10.71 minutes, giving a total
# spread of:
#   10.71 - 0.19 = 10.52 minutes
# This shows a large difference between the shortest and longest visits.
# The interquartile range (IQR) is 3.14 minutes, meaning the middle 50% of
# users spent time within a roughly 3-minute window. This indicates that the
# bulk of visitors have fairly consistent behavior and that extreme values do
# not strongly distort the analysis.
# Conversion and preferred language are both categorical (character)
# variables; whether they are actually related has to be assessed with a test
# of independence, because sharing a data type does not by itself imply a
# relationship.
# 1. The time spent on the new page is not the same for the different language
#    users, as shown by the data set above.
# 2. Spanish is the most preferred language among them.

# Business Recommendations
# Immediate Implementation: Replace the existing landing page with the new version across all platforms.

#Monitor Post-Implementation: Track subscription rates for 1-3 months to confirm the test results translate to real-world performance.

#Consider Additional Testing: While the new page is successful, consider testing minor variations (different CTAs, images, or headlines) to potentially improve performance further.

# Resource Allocation: Since language isn't a differentiating factor, focus design and content resources on other segmentation strategies (like topic interest or device type).