#THE BUSINESS PROJECT
# Load the A/B test records from the local CSV export into `data`.
data <- read.csv(file = "C:/Users/hp/Documents/abtest.csv")
# Exploratory data analysis starts here.
# Preview the first six rows of the dataset.
head(x = data, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 1 546592 control old 3.48 no
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 4 546567 control old 3.02 no
## 5 546459 treatment new 4.75 yes
## 6 546558 control old 5.28 yes
## language_preferred
## 1 Spanish
## 2 English
## 3 Spanish
## 4 French
## 5 Spanish
## 6 English
# Read the dataset (already loaded into `data` above)
# Look at both ends of the data: the first six rows, then the last six.
head(x = data, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 1 546592 control old 3.48 no
## 2 546468 treatment new 7.13 yes
## 3 546462 treatment new 4.40 no
## 4 546567 control old 3.02 no
## 5 546459 treatment new 4.75 yes
## 6 546558 control old 5.28 yes
## language_preferred
## 1 Spanish
## 2 English
## 3 Spanish
## 4 French
## 5 Spanish
## 6 English
tail(x = data, n = 6L)
## user_id group landing_page time_spent_on_the_page converted
## 95 546550 control old 3.05 no
## 96 546446 treatment new 5.15 no
## 97 546544 control old 6.52 yes
## 98 546472 treatment new 7.07 yes
## 99 546481 treatment new 6.20 yes
## 100 546483 treatment new 5.86 yes
## language_preferred
## 95 English
## 96 Spanish
## 97 English
## 98 Spanish
## 99 Spanish
## 100 English
# Inspect each column's type together with a sample of its values.
str(object = data)
## 'data.frame': 100 obs. of 6 variables:
## $ user_id : int 546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
## $ group : chr "control" "treatment" "treatment" "control" ...
## $ landing_page : chr "old" "new" "new" "old" ...
## $ time_spent_on_the_page: num 3.48 7.13 4.4 3.02 4.75 ...
## $ converted : chr "no" "yes" "no" "no" ...
## $ language_preferred : chr "Spanish" "English" "Spanish" "French" ...
# Check shape/dimensions
# Number of rows followed by number of columns.
dim(x = data)
## [1] 100 6
# Statistical summary
# Per-column summary: quantiles and mean for numeric columns, length and class
# for character columns.
summary(object = data)
## user_id group landing_page time_spent_on_the_page
## Min. :546443 Length:100 Length:100 Min. : 0.190
## 1st Qu.:546468 Class :character Class :character 1st Qu.: 3.880
## Median :546493 Mode :character Mode :character Median : 5.415
## Mean :546517 Mean : 5.378
## 3rd Qu.:546567 3rd Qu.: 7.022
## Max. :546592 Max. :10.710
## converted language_preferred
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Count the missing values in every column.
colSums(x = is.na(data))
## user_id group landing_page
## 0 0 0
## time_spent_on_the_page converted language_preferred
## 0 0 0
# Check for duplicates
# Number of rows that are exact repeats of an earlier row.
length(which(duplicated(data)))
## [1] 0
# Univariate analysis ----
# Numeric summary of time_spent_on_the_page. Every statistic is called with
# na.rm = TRUE so that a missing value cannot turn the result into NA.
mean(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.3778
median(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.415
sd(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 2.378166
min(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 0.19
max(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 10.71
# range() returns c(min, max); na.rm = TRUE keeps it consistent with the
# min()/max() calls above.
range(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 0.19 10.71
IQR(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 3.1425
# Re-check the column types.
str(data)
## 'data.frame': 100 obs. of 6 variables:
## $ user_id : int 546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
## $ group : chr "control" "treatment" "treatment" "control" ...
## $ landing_page : chr "old" "new" "new" "old" ...
## $ time_spent_on_the_page: num 3.48 7.13 4.4 3.02 4.75 ...
## $ converted : chr "no" "yes" "no" "no" ...
## $ language_preferred : chr "Spanish" "English" "Spanish" "French" ...
# Plots ----
# ggplot2 provides ggplot(), the geoms, labs() and theme_minimal() used below;
# attach it here so the plotting code does not depend on session state.
library(ggplot2)

# Histogram of the time spent on the page.
ggplot(data, aes(x = time_spent_on_the_page)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Time Spent on Page",
    x = "Time Spent (minutes)",
    y = "Frequency"
  ) +
  theme_minimal()

# Bar plots for categorical variables
# Map fill to `group` and assign colours by name through the scale, so each
# colour follows its group regardless of bar order or number of groups.
# The legend is hidden because the x axis already labels the groups.
ggplot(data, aes(x = group, fill = group)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(
    values = c("control" = "tomato", "treatment" = "steelblue")
  ) +
  labs(title = "Distribution of Groups", x = "Group", y = "Count") +
  theme_minimal()

# Count of users per preferred language.
ggplot(data, aes(x = language_preferred)) +
  geom_bar(fill = "lightblue") +
  labs(
    title = "Distribution of Preferred Languages",
    x = "Language",
    y = "Count"
  ) +
  theme_minimal()
## Results and Interpretation
# The mean time spent by users on the landing page is 5.38 minutes, while the
# median time is 5.42 minutes. The closeness of these two values indicates
# that the data is approximately symmetric with no severe skewness.
# The standard deviation is 2.38 minutes, showing moderate variability in how
# long users stay on the page. This suggests that while many users spend
# around the average time, there are notable differences in engagement levels
# across users.
# The minimum time spent on the page is 0.19 minutes, indicating that some
# users exited the site almost immediately. The maximum time is 10.71 minutes,
# showing that certain users remained engaged for an extended duration.
# The range of the data extends from 0.19 to 10.71 minutes, giving a total
# spread of 10.71 - 0.19 = 10.52 minutes. This shows a large difference
# between the shortest and longest visits.
# The interquartile range (IQR) is 3.14 minutes, meaning the middle 50% of
# users spent time within a window of about 3 minutes. This indicates that
# the bulk of visitors have fairly consistent behavior and that extreme
# values do not strongly distort the analysis.
# Conversion and preferred language are related in that both are stored as
# character (categorical) variables.
# The time spent on the new page is not the same for users of different
# preferred languages, as shown by the dataset above.
# Key observations:
# 1. The time spent on the new page differs between the language groups, as
#    shown by the dataset above.
# 2. Spanish is the most preferred language among the users.
# Business Recommendations
# Immediate Implementation: Replace the existing landing page with the new
# version across all platforms.
# Monitor Post-Implementation: Track subscription rates for 1-3 months to
# confirm the test results translate to real-world performance.
# Consider Additional Testing: While the new page is successful, consider
# testing minor variations (different CTAs, images, or headlines) to
# potentially improve performance further.
# Resource Allocation: Since language isn't a differentiating factor, focus
# design and content resources on other segmentation strategies (like topic
# interest or device type).