import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats # <-- this defines "stats"
from statsmodels.stats.proportion import proportions_ztest
# Load the A/B-test export. The location is machine-specific; edit DATA_PATH
# when running this script on another machine.
DATA_PATH = "C:/Users/Admin/Desktop/statistical computing/data/abtest.csv"
abtest = pd.read_csv(DATA_PATH)

# Peek at the first and last five records.
first_rows = abtest.head()
print(first_rows)
## user_id group ... converted language_preferred
## 0 546592 control ... no Spanish
## 1 546468 treatment ... yes English
## 2 546462 treatment ... no Spanish
## 3 546567 control ... no French
## 4 546459 treatment ... yes Spanish
##
## [5 rows x 6 columns]
last_rows = abtest.tail()
print(last_rows)
## user_id group ... converted language_preferred
## 95 546446 treatment ... no Spanish
## 96 546544 control ... yes English
## 97 546472 treatment ... yes Spanish
## 98 546481 treatment ... yes Spanish
## 99 546483 treatment ... yes English
##
## [5 rows x 6 columns]

# Dimensions of the table.
row_count, column_count = abtest.shape
print("Rows:", row_count)
## Rows: 100
print("Columns:", column_count)
## Columns: 6

# describe() summary (the recorded run shows user_id and
# time_spent_on_the_page).
numeric_summary = abtest.describe()
print(numeric_summary)
## user_id time_spent_on_the_page
## count 100.000000 100.000000
## mean 546517.000000 5.377800
## std 52.295779 2.378166
## min 546443.000000 0.190000
## 25% 546467.750000 3.880000
## 50% 546492.500000 5.415000
## 75% 546567.250000 7.022500
## max 546592.000000 10.710000

# Data-quality checks: total NaN cells and fully duplicated rows.
missing_cells = abtest.isna().sum().sum()
print("Missing values:", missing_cells)
## Missing values: 0
duplicate_rows = abtest.duplicated().sum()
print("Duplicated rows:", duplicate_rows)
## Duplicated rows: 0
# Location and spread of the time-on-page variable, printed in a fixed order.
time_spent = abtest['time_spent_on_the_page']
quartile_levels = [0.25, 0.5, 0.75]
time_spent_summary = [
    ("Mean:", time_spent.mean()),
    ("Median:", time_spent.median()),
    ("SD:", time_spent.std()),
    ("IQR:", stats.iqr(time_spent)),
    ("Variance:", time_spent.var()),
    ("Quartiles:", time_spent.quantile(quartile_levels)),
]
for stat_label, stat_value in time_spent_summary:
    print(stat_label, stat_value)
## Mean: 5.377800000000001
## Median: 5.415
## SD: 2.378166078933492
## IQR: 3.1425
## Variance: 5.6556738989899
## Quartiles: 0.25 3.8800
## 0.50 5.4150
## 0.75 7.0225
## Name: time_spent_on_the_page, dtype: float64
# Univariate views of time spent on the page.
time_on_page = abtest['time_spent_on_the_page']

plt.hist(time_on_page)
plt.title("Histogram of Time Spent on Page")
plt.show()

plt.boxplot(time_on_page)
## plt.boxplot returns a dict of Line2D artists ('whiskers', 'caps', 'boxes',
## 'medians', 'fliers', 'means'); the return value is not used here.
plt.title("Boxplot of Time Spent on Page")
plt.show()

# One bar per record, positioned by row order.
row_positions = range(len(time_on_page))
plt.bar(row_positions, time_on_page)
plt.title("Barplot of Time Spent on Page")
plt.show()

# Frequency bar charts for the categorical columns, one figure each.
for column_name, chart_title in (
    ('landing_page', "Landing Page Counts"),
    ('converted', "Conversion Counts"),
    ('group', "Group Counts"),
):
    abtest[column_name].value_counts().plot(kind='bar', title=chart_title)
    plt.show()
## Bivariate visualizations
# Time on page split by landing-page version.
abtest.boxplot(column='time_spent_on_the_page', by='landing_page')
plt.suptitle("")  # blank the figure-level title, keep only the axes title
plt.title("Time Spent on Page: Old vs New")
plt.show()

# Conversion counts per landing page.
landing_vs_converted = pd.crosstab(abtest['landing_page'], abtest['converted'])
landing_vs_converted.plot(kind='bar')
plt.title("Landing Page vs Conversion")
plt.show()

# Conversion counts per preferred language.
language_vs_converted = pd.crosstab(
    abtest['language_preferred'], abtest['converted']
)
language_vs_converted.plot(kind='bar')
plt.title("Language vs Conversion")
plt.show()

# Time on page split by conversion outcome.
abtest.boxplot(column='time_spent_on_the_page', by='converted')
plt.suptitle("")  # blank the figure-level title, keep only the axes title
plt.title("Time Spent by Conversion Status")
plt.show()
## Hypothesis tests
## Question one: one-tailed two-sample t-test on time spent (new vs old page)
alpha = 0.05
on_new_page = abtest['landing_page'] == 'new'
on_old_page = abtest['landing_page'] == 'old'
new_time = abtest.loc[on_new_page, 'time_spent_on_the_page']
old_time = abtest.loc[on_old_page, 'time_spent_on_the_page']
# alternative='greater': H1 is that the mean of the first sample (new page)
# exceeds the mean of the second sample (old page).
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance test);
# confirm the equal-variance assumption holds, otherwise pass equal_var=False
# for Welch's test.
t_stat, p_value = stats.ttest_ind(new_time, old_time, alternative='greater')
print("T-test p-value:", p_value)
## T-test p-value: 0.0001316123528095005
if p_value < alpha:
    ttest_decision = "Reject H0"
else:
    ttest_decision = "Fail to reject H0"
print("Decision:", ttest_decision)
## Decision: Reject H0
# Two-sample z-test for proportions: is the new page's conversion rate larger
# than the old page's?
alpha = 0.05
shown_new = abtest['landing_page'] == 'new'
shown_old = abtest['landing_page'] == 'old'
did_convert = abtest['converted'] == 'yes'

# Visitors per page (n_*) and converted visitors per page (x_*).
n_new = shown_new.sum()
n_old = shown_old.sum()
x_new = (shown_new & did_convert).sum()
x_old = (shown_old & did_convert).sum()

new_rate = x_new / n_new
old_rate = x_old / n_old
print("New page conversion rate:", new_rate)
## New page conversion rate: 0.66
print("Old page conversion rate:", old_rate)
## Old page conversion rate: 0.42

# New page goes first so that alternative='larger' means p_new > p_old.
count = np.array([x_new, x_old])
nobs = np.array([n_new, n_old])
z_stat, p_value = proportions_ztest(count, nobs, alternative='larger')
print("Proportion test p-value:", p_value)
## Proportion test p-value: 0.008026308204056278
if p_value < alpha:
    proportion_decision = "Reject H0"
else:
    proportion_decision = "Fail to reject H0"
print("Decision:", proportion_decision)
## Decision: Reject H0
# Chi-square test of independence between conversion status and preferred
# language. H0: the two categorical variables are independent.
alpha = 0.05
# Rows: converted (yes/no); columns: preferred language.
contingency_table = pd.crosstab(abtest['converted'], abtest['language_preferred'])
# chi2_contingency returns the statistic, the p-value, the degrees of freedom
# and the table of expected frequencies under independence.
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print("Chi-square p-value:", p_value)
## Chi-square p-value: 0.21298887487543447
# Compare against alpha (not a hard-coded 0.05) so this decision follows the
# significance level set above, like the other tests in this script.
print("Decision:", "Reject H0" if p_value < alpha else "Fail to reject H0")
## Decision: Fail to reject H0
# One-way ANOVA: among new-page visitors, does mean time on page differ
# across preferred languages?
alpha = 0.05
new_page_data = abtest.loc[abtest['landing_page'] == 'new']
# One array of time-on-page values per language group.
time_by_language = [
    language_rows["time_spent_on_the_page"].values
    for _, language_rows in new_page_data.groupby("language_preferred")
]
anova_result = stats.f_oneway(*time_by_language)
print("ANOVA p-value:", anova_result.pvalue)
## ANOVA p-value: 0.43204138694325955
if anova_result.pvalue < alpha:
    anova_decision = "Reject H0"
else:
    anova_decision = "Fail to reject H0"
print("Decision:", anova_decision)
## Decision: Fail to reject H0
# Final write-up: a heading followed by one bullet per recommendation.
recommendation_lines = (
    "Recommendations:",
    "- Adopt the new landing page (users spend more time on it).",
    "- No need for specific language optimization (conversion and time spent not affected by language).",
    "- Continue tracking language trends.",
)
for recommendation in recommendation_lines:
    print(recommendation)
## Recommendations:
## - Adopt the new landing page (users spend more time on it).
## - No need for specific language optimization (conversion and time spent not affected by language).
## - Continue tracking language trends.