# Import the relevant libraries
import pandas as pd # for data manipulation
import numpy as np # for numerical analysis
from scipy import stats
import matplotlib.pyplot as plt # for data visulization
import seaborn as sns # for advanced ploting
# Load the A/B test results from the local CSV export.
csv_path = r"C:\Users\njugu\Downloads\New folder\abtest.csv"
data = pd.read_csv(csv_path)
# Preview the first rows of the frame.
data.head()
## user_id group ... converted language_preferred
## 0 546592 control ... no Spanish
## 1 546468 treatment ... yes English
## 2 546462 treatment ... no Spanish
## 3 546567 control ... no French
## 4 546459 treatment ... yes Spanish
##
## [5 rows x 6 columns]
# Preview the last rows of the frame.
data.tail()
## user_id group ... converted language_preferred
## 95 546446 treatment ... no Spanish
## 96 546544 control ... yes English
## 97 546472 treatment ... yes Spanish
## 98 546481 treatment ... yes Spanish
## 99 546483 treatment ... yes English
##
## [5 rows x 6 columns]
# Summary statistics for the numeric columns.
data.describe()
## user_id time_spent_on_the_page
## count 100.000000 100.000000
## mean 546517.000000 5.377800
## std 52.295779 2.378166
## min 546443.000000 0.190000
## 25% 546467.750000 3.880000
## 50% 546492.500000 5.415000
## 75% 546567.250000 7.022500
## max 546592.000000 10.710000
# Dimensions as (rows, columns).
data.shape
## (100, 6)
# Missing values per column, largest count first.
missing_per_column = data.isnull().sum()
missing_per_column.sort_values(ascending=False)
## user_id 0
## group 0
## landing_page 0
## time_spent_on_the_page 0
## converted 0
## language_preferred 0
## dtype: int64
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()
## np.int64(0)
# Separate the numerical columns from the categorical columns by dtype,
# so each kind can be summarised on its own.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object", "category"]
numerical = data.select_dtypes(include=numeric_dtypes)
categorical = data.select_dtypes(include=categorical_dtypes)
# Summary statistics of the numerical columns.
numerical.describe()
## user_id time_spent_on_the_page
## count 100.000000 100.000000
## mean 546517.000000 5.377800
## std 52.295779 2.378166
## min 546443.000000 0.190000
## 25% 546467.750000 3.880000
## 50% 546492.500000 5.415000
## 75% 546567.250000 7.022500
## max 546592.000000 10.710000
# Count, number of distinct values, most frequent value and its frequency
# for each categorical column.
categorical.describe()
## group landing_page converted language_preferred
## count 100 100 100 100
## unique 2 2 2 3
## top control old yes Spanish
## freq 50 50 54 34
# SUMMARY STATISTICS FOR NUMERICAL COLUMN
data["time_spent_on_the_page"].describe()
## count 100.000000
## mean 5.377800
## std 2.378166
## min 0.190000
## 25% 3.880000
## 50% 5.415000
## 75% 7.022500
## max 10.710000
## Name: time_spent_on_the_page, dtype: float64
# Plot a histogram with a kernel-density overlay to show the shape of the
# distribution.
plt.figure(figsize=(10,5))
sns.histplot(data["time_spent_on_the_page"], kde=True)
plt.title("A HISTOGRAM OF TIME SPENT ON THE PAGE")
plt.show()
# Plot the boxplot of the same column.
# The series is passed explicitly as `y=`: a bare positional argument is bound
# to `x` by older seaborn releases (<= 0.11, with a FutureWarning) and to `data`
# by seaborn >= 0.12, so the box orientation depended on the installed version.
# `y=` pins the vertical box that current seaborn draws for a positional series.
sns.boxplot(y=data["time_spent_on_the_page"])
plt.title("BOXPLOT OF TIME SPENT ON THE PAGE")
plt.show()
# UNIVARIATE ANALYSIS FOR CATEGORICAL COLUMNS
# Each column gets a frequency table followed by a bar chart of the counts.

# User group
data["group"].value_counts()
## group
## control 50
## treatment 50
## Name: count, dtype: int64
sns.countplot(data=data, x="group")
plt.title("DISTRIBUTION OF USER GROUP")
plt.show()

# Landing page
data["landing_page"].value_counts()
## landing_page
## old 50
## new 50
## Name: count, dtype: int64
sns.countplot(data=data, x="landing_page")
plt.title("DISTRIBUTION OF LANDING PAGES")
plt.show()

# Conversion outcome
data["converted"].value_counts()
## converted
## yes 54
## no 46
## Name: count, dtype: int64
sns.countplot(data=data, x="converted")
plt.title("CONVERSION DISTRIBUTION")
plt.show()

# Preferred language
data["language_preferred"].value_counts()
## language_preferred
## Spanish 34
## French 34
## English 32
## Name: count, dtype: int64
sns.countplot(data=data, x="language_preferred")
plt.title("Distribution of Preferred Languages")
plt.show()
# Compare the time spent on the page between the two landing pages.
sns.boxplot(data=data, x='landing_page', y='time_spent_on_the_page')
plt.title("LANDING PAGE VS TIME SPENT")
plt.show()
# Time-on-page samples for each landing page.
old_page = data.loc[data['landing_page'] == 'old', 'time_spent_on_the_page']
new_page = data.loc[data['landing_page'] == 'new', 'time_spent_on_the_page']
# Two-sample t-test on the means. equal_var=False selects Welch's t-test,
# which does not assume the two groups share the same variance.
# new_page is the first sample, so a positive statistic means the new page
# has the larger sample mean.
t_stat, p_value = stats.ttest_ind(new_page, old_page, equal_var=False)
t_stat, p_value
## (np.float64(3.7867702694199856), np.float64(0.0002784762450333098))
# Contingency table: how many users converted vs. did not convert on each
# landing page. Built once and reused for the test below.
table = pd.crosstab(data['landing_page'], data['converted'])
table
## converted no yes
## landing_page
## new 17 33
## old 29 21
# Both variables are categorical, so use a chi-square test of independence.
# For a 2x2 table (one degree of freedom) chi2_contingency applies Yates'
# continuity correction by default; the statistic below includes it.
# NOTE: p_value rebinds the name that held the t-test p-value.
chi2, p_value, dof, expected = stats.chi2_contingency(table)
chi2, p_value
## (np.float64(4.871175523349437), np.float64(0.02730889175492232))