# Import necessary libraries
library(stats)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#PROJECT BUSINESS STATISTICS:E-news Express:
#Problem Statement #E-News Express is experiencing a decline in the number of new monthly subscribers compared to the previous year. Management suspects that the design and content structure of the current landing page are not engaging visitors effectively, leading to low user retention and poor conversion into paid subscribers. #To address this challenge, the design team introduced a newly redesigned landing page with improved layout and more relevant content. An A/B experiment was conducted by showing the old page to one group of users (control group) and the new page to another group (treatment group). #The organization now needs a statistical evaluation to determine whether the new landing page improves user engagement and increases conversion rates. Data on user behavior has been collected, including time spent, conversion status, and preferred language, and must be analyzed to draw evidence-based business conclusions. #Research Objectives #Primary Objective #To statistically evaluate whether the new landing page design leads to better user engagement and higher subscription conversion rates compared to the old landing page. #Secondary Objectives #1. User Engagement Analysis #To determine whether users spend significantly more time on the new landing page than on the old landing page. #2. Conversion Rate Evaluation #To assess whether the proportion of users who subscribe is higher for the new landing page than for the existing page. #3. Language Impact on Conversion #To examine whether user conversion depends on the preferred language selected during the visit. #4. Behavior Across Language Groups #To evaluate whether the average time spent on the new page differs significantly across different language users. #5. Business Decision Support
# Reading the data into a data frame ----
# The CSV location defaults to the author's local copy; set the ABTEST_CSV
# environment variable to point the script at the file on another machine.
data_path <- Sys.getenv(
  "ABTEST_CSV",
  unset = "C:\\Users\\olond\\OneDrive\\r practice\\abtest.csv"
)
# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists(data_path)) {
  stop("A/B test data file not found: ", data_path, call. = FALSE)
}
# Load the data set
data <- read.csv(data_path)
# Exploring the dataset held in the data frame
# Show the first 6 rows of the dataset
head(data, n = 6L) |> print()
## user_id group landing_page time_spent_on_the_page converted
## 1 546592 control old 3.48 no
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 4 546567 control old 3.02 no
## 5 546459 treatment new 4.75 yes
## 6 546558 control old 5.28 yes
## language_preferred
## 1 Spanish
## 2 English
## 3 Spanish
## 4 French
## 5 Spanish
## 6 English
# Show the last 6 rows of the dataset
tail(data, n = 6L) |> print()
## user_id group landing_page time_spent_on_the_page converted
## 95 546550 control old 3.05 no
## 96 546446 treatment new 5.15 no
## 97 546544 control old 6.52 yes
## 98 546472 treatment new 7.07 yes
## 99 546481 treatment new 6.20 yes
## 100 546483 treatment new 5.86 yes
## language_preferred
## 95 English
## 96 Spanish
## 97 English
## 98 Spanish
## 99 Spanish
## 100 English
# Check the shape of the dataset (number of rows, then number of columns)
c(nrow(data), ncol(data))
## [1] 100 6
# The dataset has 100 rows and 6 columns.
# Summarise every column to get an overview of what the dataset contains
data |> summary() |> print()
## user_id group landing_page time_spent_on_the_page
## Min. :546443 Length:100 Length:100 Min. : 0.190
## 1st Qu.:546468 Class :character Class :character 1st Qu.: 3.880
## Median :546493 Mode :character Mode :character Median : 5.415
## Mean :546517 Mean : 5.378
## 3rd Qu.:546567 3rd Qu.: 7.022
## Max. :546592 Max. :10.710
## converted language_preferred
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# The summary shows that the mean and median of time spent are very close.
# Count missing values per column: is.na() flags them, colSums() totals the
# TRUE values in each column.
is.na(data) |> colSums() |> print()
## user_id group landing_page
## 0 0 0
## time_spent_on_the_page converted language_preferred
## 0 0 0
# Count the rows that are exact duplicates of an earlier row
length(which(duplicated(data)))
## [1] 0
# The checks above show that the dataset has no missing values and no
# duplicated rows.
# Univariate Analysis ----
# Numeric summary of time spent on the page (minutes).
# Every statistic passes na.rm = TRUE so the results stay consistent with one
# another if missing values ever appear in the column; range() previously
# omitted it and would have returned NA while its siblings returned numbers.
mean(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.3778
median(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.415
sd(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 2.378166
min(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 0.19
max(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 10.71
range(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 0.19 10.71
IQR(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 3.1425
## [1] 3.1425
# Visualization ----
library(ggplot2)
# Box plot of time spent, one box per landing page type; outliers in red
ggplot(data, aes(x = landing_page, y = time_spent_on_the_page)) +
  geom_boxplot(
    fill = "lightgreen",
    colour = "black",
    outlier.colour = "red"
  ) +
  ggtitle("Distribution of Time Spent by Landing Page") +
  xlab("Landing Page Type") +
  ylab("Time Spent (minutes)") +
  theme_minimal()
# Density plot of time spent on the page, all users together
ggplot(data, aes(x = time_spent_on_the_page)) +
  geom_density() +
  ggtitle("Density Plot of Time Spent") +
  xlab("Time Spent (minutes)") +
  ylab("Density") +
  theme_minimal()
# BIVARIATE ANALYSIS ----
# This involves the analysis of two variables.
# 1. Statistical test: one-way ANOVA of time spent by landing page.
# (The words "test: one way Anova" used to sit on their own uncommented line,
# which is a parse error and stopped the script from being sourced.)
anova_model1 <- aov(time_spent_on_the_page ~ landing_page, data = data)
# Display the ANOVA test result
summary(anova_model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## landing_page 1 71.5 71.47 14.34 0.000263 ***
## Residuals 98 488.4 4.98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create a contingency table for conversion vs language.
# The row variable must be `converted`; the previous version tabulated
# time_spent_on_the_page, which produces one row per distinct duration instead
# of a converted-by-language table.
cross_tab <- table(data$converted, data$language_preferred)
cross_tab
# NOTE: the captured output that follows in this file was produced by the
# earlier time-spent table and must be regenerated by re-running the script.
# Chi-square test of independence between conversion and preferred language
chisq.test(cross_tab)
##
## English French Spanish
## 0.19 0 0 1
## 0.22 1 0 0
## 0.4 1 1 0
## 0.91 0 1 0
## 0.93 0 1 0
## 1.44 0 1 0
## 1.65 0 0 1
## 1.81 0 1 0
## 1.92 0 1 0
## 2.08 1 0 0
## 2.23 0 0 1
## 2.58 1 0 0
## 2.66 0 1 0
## 2.9 0 0 1
## 3.02 0 1 0
## 3.05 1 0 0
## 3.13 1 0 0
## 3.21 1 0 0
## 3.3 0 1 0
## 3.48 0 0 1
## 3.52 1 0 0
## 3.65 1 0 0
## 3.68 0 1 0
## 3.88 1 0 1
## 3.91 1 0 0
## 4.05 0 0 1
## 4.18 0 1 0
## 4.28 0 1 0
## 4.3 0 1 0
## 4.39 1 0 0
## 4.4 0 0 1
## 4.46 0 0 1
## 4.52 0 0 1
## 4.68 0 1 0
## 4.71 0 0 1
## 4.75 0 0 2
## 4.87 0 0 1
## 4.94 0 1 0
## 5.08 1 0 0
## 5.15 0 0 1
## 5.25 0 1 0
## 5.26 0 1 0
## 5.28 1 0 0
## 5.37 0 1 0
## 5.39 0 0 1
## 5.4 0 1 0
## 5.41 1 0 0
## 5.42 0 1 0
## 5.47 0 0 1
## 5.65 1 0 0
## 5.74 0 0 1
## 5.86 1 0 1
## 5.96 1 0 0
## 6.01 0 1 0
## 6.03 0 1 0
## 6.04 1 1 0
## 6.18 0 0 1
## 6.2 0 0 1
## 6.21 0 0 1
## 6.27 0 0 1
## 6.41 0 0 1
## 6.47 0 0 1
## 6.52 1 0 0
## 6.53 0 0 1
## 6.57 0 0 1
## 6.6 1 0 0
## 6.7 0 0 1
## 6.71 1 0 0
## 6.79 0 1 0
## 7.02 1 0 0
## 7.03 0 0 1
## 7.07 0 0 1
## 7.13 1 0 0
## 7.16 1 0 1
## 7.23 0 0 1
## 7.27 0 1 0
## 7.4 0 1 0
## 7.46 1 0 0
## 7.81 0 1 0
## 8.02 0 1 0
## 8.08 0 0 1
## 8.3 0 1 0
## 8.35 0 1 0
## 8.46 0 1 0
## 8.47 1 0 0
## 8.5 1 0 0
## 8.72 0 0 1
## 8.73 1 0 0
## 9.12 0 1 0
## 9.15 0 1 0
## 9.49 1 0 0
## 10.3 1 0 0
## 10.5 1 0 0
## 10.71 0 1 0
# Compute the two-sample t-test of time spent by landing page.
# t.test() is called with its defaults (two-sided, Welch / unequal variances).
# The test is run once and the result object reused; it was previously computed
# twice with identical arguments.
# NOTE(review): the stated objective is directional (more time on the new
# page); consider whether a one-sided `alternative` is intended - confirm.
time_by_page_test <- t.test(time_spent_on_the_page ~ landing_page, data = data)
p_value <- time_by_page_test$p.value
p_value
## [1] 0.0002784762
# Significance level for the decision rule
alpha <- 0.05
if (p_value < alpha) {
  print("Reject the null hypothesis: significant difference.")
} else {
  print("Fail to reject the null hypothesis: no significant difference.")
}
## [1] "Reject the null hypothesis: significant difference."
# Report the p-value extracted above (the test is not re-run)
print(p_value)
## [1] 0.0002784762
## Results and Interpretation
# The mean time spent by users on the landing page is 5.38 minutes, while the median time is 5.42 minutes. The closeness of these two values indicates that the data is approximately symmetric with no severe skewness.
# The standard deviation is 2.38 minutes, showing moderate variability in how long users stay on the page. This suggests that while many users spend around the average time, there are notable differences in engagement levels across users.
# The minimum time spent on the page is 0.19 minutes, indicating that some users exited the site almost immediately. The maximum time is 10.71 minutes, showing that certain users remained engaged for an extended duration.
# The range of the data extends from 0.19 to 10.71 minutes, giving a total spread of:
# 10.71 - 0.19 = 10.52 minutes
# This shows a large difference between the shortest and longest visits.
# The interquartile range (IQR) is 3.14 minutes, meaning the middle 50% of users spent time within a roughly 3-minute window. This indicates that the bulk of visitors have fairly consistent behavior and that extreme values do not strongly distort the analysis.
# Conversion status and preferred language are both character (categorical) variables, so their relationship is assessed with a contingency table.
# The time spent on the new page is not the same for users of different languages, as shown by the output above.
# 1. The time spent on the new page is not the same for users of different languages, as shown by the output above.
# 2. Spanish is the most preferred language among them.
#Business Recommendations #Immediate Implementation: Replace the existing landing page with the new version across all platforms.
#Monitor Post-Implementation: Track subscription rates for 1-3 months to confirm the test results translate to real-world performance.
#Consider Additional Testing: While the new page is successful, consider testing minor variations (different CTAs, images, or headlines) to potentially improve performance further.
#Resource Allocation: Since language isn’t a differentiating factor, focus design and content resources on other segmentation strategies (like topic interest or device type).