#THE BUSINESS PROJECT

library(ggplot2)  # plotting functions (ggplot, aes, geom_*, labs, theme_minimal)

# First, load the data

# Read the A/B test CSV into a data frame named `data`.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
data <- read.csv(file = "C:/Users/hp/Documents/abtest.csv")

#Explore the dataset and extract insights using Exploratory Data Analysis

# Preview the first six rows of the dataset
head(data, n = 6L)
##   user_id     group landing_page time_spent_on_the_page converted
## 1  546592   control          old                   3.48        no
## 2  546468 treatment          new                   7.13       yes
## 3  546462 treatment          new                   4.40        no
## 4  546567   control          old                   3.02        no
## 5  546459 treatment          new                   4.75       yes
## 6  546558   control          old                   5.28       yes
##   language_preferred
## 1            Spanish
## 2            English
## 3            Spanish
## 4             French
## 5            Spanish
## 6            English

# Read the dataset

# View first and last few rows
   # First rows of the data frame (head() shows six rows by default).
   head(x = data)
##   user_id     group landing_page time_spent_on_the_page converted
## 1  546592   control          old                   3.48        no
## 2  546468 treatment          new                   7.13       yes
## 3  546462 treatment          new                   4.40        no
## 4  546567   control          old                   3.02        no
## 5  546459 treatment          new                   4.75       yes
## 6  546558   control          old                   5.28       yes
##   language_preferred
## 1            Spanish
## 2            English
## 3            Spanish
## 4             French
## 5            Spanish
## 6            English
   # Last rows of the data frame (tail() shows six rows by default).
   tail(x = data)
##     user_id     group landing_page time_spent_on_the_page converted
## 95   546550   control          old                   3.05        no
## 96   546446 treatment          new                   5.15        no
## 97   546544   control          old                   6.52       yes
## 98   546472 treatment          new                   7.07       yes
## 99   546481 treatment          new                   6.20       yes
## 100  546483 treatment          new                   5.86       yes
##     language_preferred
## 95             English
## 96             Spanish
## 97             English
## 98             Spanish
## 99             Spanish
## 100            English

# Check dataset structure

# Compact overview: number of rows/columns, column types and leading values.
str(object = data)
## 'data.frame':    100 obs. of  6 variables:
##  $ user_id               : int  546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
##  $ group                 : chr  "control" "treatment" "treatment" "control" ...
##  $ landing_page          : chr  "old" "new" "new" "old" ...
##  $ time_spent_on_the_page: num  3.48 7.13 4.4 3.02 4.75 ...
##  $ converted             : chr  "no" "yes" "no" "no" ...
##  $ language_preferred    : chr  "Spanish" "English" "Spanish" "French" ...

# Check shape/dimensions

 # Number of rows followed by number of columns.
 c(nrow(data), ncol(data))
## [1] 100   6

# Statistical summary

# Per-column summary: quartiles and mean for numeric columns, length/class/mode
# for character columns.
summary(object = data)
##     user_id          group           landing_page       time_spent_on_the_page
##  Min.   :546443   Length:100         Length:100         Min.   : 0.190        
##  1st Qu.:546468   Class :character   Class :character   1st Qu.: 3.880        
##  Median :546493   Mode  :character   Mode  :character   Median : 5.415        
##  Mean   :546517                                         Mean   : 5.378        
##  3rd Qu.:546567                                         3rd Qu.: 7.022        
##  Max.   :546592                                         Max.   :10.710        
##   converted         language_preferred
##  Length:100         Length:100        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

# Check for missing values

# Count missing values per column: build the logical NA mask first, then sum
# each column of the mask.
missing_mask <- is.na(data)
colSums(missing_mask)
##                user_id                  group           landing_page 
##                      0                      0                      0 
## time_spent_on_the_page              converted     language_preferred 
##                      0                      0                      0

# Check for duplicates

# Number of rows that are exact repeats of an earlier row.
length(which(duplicated(data)))
## [1] 0

# Univariate analysis

 # Univariate numeric summary of time_spent_on_the_page.
 # Every statistic is computed with na.rm = TRUE so that all of them ignore
 # missing values in the same way.
   mean(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.3778
   median(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 5.415
   sd(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 2.378166
   min(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 0.19
   max(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 10.71
   # range() needs na.rm = TRUE as well; without it a single NA in the column
   # would yield NA NA while min()/max() above still return numbers.
   range(data$time_spent_on_the_page, na.rm = TRUE)
## [1]  0.19 10.71
   IQR(data$time_spent_on_the_page, na.rm = TRUE)
## [1] 3.1425

# Check conversions

  # Structure check: `converted` is a character column (values such as
  # "yes"/"no" appear in the output below).
  str(data)
## 'data.frame':    100 obs. of  6 variables:
##  $ user_id               : int  546592 546468 546462 546567 546459 546558 546448 546581 546461 546548 ...
##  $ group                 : chr  "control" "treatment" "treatment" "control" ...
##  $ landing_page          : chr  "old" "new" "new" "old" ...
##  $ time_spent_on_the_page: num  3.48 7.13 4.4 3.02 4.75 ...
##  $ converted             : chr  "no" "yes" "no" "no" ...
##  $ language_preferred    : chr  "Spanish" "English" "Spanish" "French" ...

  # The structure alone does not show how many users converted, so tabulate
  # the conversions: overall counts first, then counts split by landing page.
  table(converted = data$converted)
  table(landing_page = data$landing_page, converted = data$converted)

# Plot for time spent

   # Histogram of time spent on the page, drawn with 30 bins.
   ggplot(data = data, mapping = aes(x = time_spent_on_the_page)) +
     geom_histogram(color = "black", fill = "steelblue", bins = 30) +
     labs(
       y = "Frequency",
       x = "Time Spent (minutes)",
       title = "Distribution of Time Spent on Page"
     ) +
     theme_minimal()

# Bar plots for categorical variables

# Bar plot of the number of users in each experiment group.
# The fill colour is mapped to `group` and resolved by name through
# scale_fill_manual(), so each group always gets its intended colour. Passing
# a named vector straight to geom_bar(fill = ...) ignores the names and
# assigns colours purely by bar position.
# The legend is hidden because the x axis already labels the groups.
ggplot(data, aes(x = group, fill = group)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(
    values = c("control" = "tomato", "treatment" = "steelblue")
  ) +
  labs(title = "Distribution of Groups", x = "Group", y = "Count") +
  theme_minimal()

# Bar plot: number of users per preferred language.
ggplot(data = data, mapping = aes(x = language_preferred)) +
  geom_bar(fill = "lightblue") +
  labs(
    y = "Count",
    x = "Language",
    title = "Distribution of Preferred Languages"
  ) +
  theme_minimal()

# Bar plot: number of users per conversion outcome.
# The preferred-language plot is already drawn once above; repeating it here
# added nothing, whereas `converted` was not plotted anywhere.
# NOTE(review): confirm `converted` is the variable intended for this slot.
ggplot(data, aes(x = converted)) +
  geom_bar(fill = "lightblue") +
  labs(title = "Distribution of Conversions", x = "Converted", y = "Count") +
  theme_minimal()

## Results and Interpretation
# The mean time spent by users on the landing page is 5.38 minutes, while the
# median time is 5.42 minutes. The closeness of these two values indicates
# that the data is approximately symmetric, with no severe skewness.
# The standard deviation is 2.38 minutes, showing moderate variability in how
# long users stay on the page. This suggests that while many users spend
# around the average time, there are notable differences in engagement levels
# across users.
# The minimum time spent on the page is 0.19 minutes, indicating that some
# users exited the site almost immediately. The maximum time is 10.71 minutes,
# showing that certain users remained engaged for an extended duration.
# The range of the data extends from 0.19 to 10.71 minutes, giving a total
# spread of:
# 10.71 - 0.19 = 10.52 minutes
# This shows a large difference between the shortest and longest visits.
# The interquartile range (IQR) is 3.14 minutes, meaning the times of the
# middle 50% of users fall within a window of roughly 3 minutes. This
# indicates that the bulk of visitors behave fairly consistently and that
# extreme values do not strongly distort the analysis.
# Conversion and language are related in that both are stored as character
# (categorical) variables.
# The time spent on the new page is not the same for users of different
# languages, as shown by the dataset above.
# 1. The time spent on the new page is not the same for users of different
#    languages, as shown by the dataset above.
# 2. Spanish is the most preferred language among them.

# Business Recommendations
# Immediate Implementation: Replace the existing landing page with the new version across all platforms.

#Monitor Post-Implementation: Track subscription rates for 1-3 months to confirm the test results translate to real-world performance.

#Consider Additional Testing: While the new page is successful, consider testing minor variations (different CTAs, images, or headlines) to potentially improve performance further.

# Resource Allocation: Since language isn't a differentiating factor, focus design and content resources on other segmentation strategies (like topic interest or device type).