Analysis of e-news A/B test data

load dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats   # <-- this defines "stats"
from statsmodels.stats.proportion import proportions_ztest
# Read the A/B test data into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where that exact file exists.
abtest = pd.read_csv("C:/Users/Admin/Desktop/statistical computing/data/abtest.csv")

Basic exploration

# Peek at both ends of the table to confirm it loaded as expected.
first_rows = abtest.head()
print(first_rows)
##    user_id      group  ... converted  language_preferred
## 0   546592    control  ...        no             Spanish
## 1   546468  treatment  ...       yes             English
## 2   546462  treatment  ...        no             Spanish
## 3   546567    control  ...        no              French
## 4   546459  treatment  ...       yes             Spanish
## 
## [5 rows x 6 columns]
last_rows = abtest.tail()
print(last_rows)
##     user_id      group  ... converted  language_preferred
## 95   546446  treatment  ...        no             Spanish
## 96   546544    control  ...       yes             English
## 97   546472  treatment  ...       yes             Spanish
## 98   546481  treatment  ...       yes             Spanish
## 99   546483  treatment  ...       yes             English
## 
## [5 rows x 6 columns]
# Dimensions of the dataset: (rows, columns).
n_rows, n_cols = abtest.shape
print("Rows:", n_rows)
## Rows: 100
print("Columns:", n_cols)
## Columns: 6
# Summary statistics; only the numeric columns appear in the output below.
numeric_summary = abtest.describe()
print(numeric_summary)
##              user_id  time_spent_on_the_page
## count     100.000000              100.000000
## mean   546517.000000                5.377800
## std        52.295779                2.378166
## min    546443.000000                0.190000
## 25%    546467.750000                3.880000
## 50%    546492.500000                5.415000
## 75%    546567.250000                7.022500
## max    546592.000000               10.710000
# Data-quality checks: total count of missing cells, then fully duplicated rows.
missing_per_column = abtest.isna().sum()
print("Missing values:", missing_per_column.sum())
## Missing values: 0
duplicate_mask = abtest.duplicated()
print("Duplicated rows:", duplicate_mask.sum())
## Duplicated rows: 0

univariate analysis

# Central tendency and spread of the time-on-page measurements.
# The column is bound once so every statistic reads from the same Series.
time_spent = abtest['time_spent_on_the_page']
print("Mean:", time_spent.mean())
## Mean: 5.377800000000001
print("Median:", time_spent.median())
## Median: 5.415
print("SD:", time_spent.std())
## SD: 2.378166078933492
print("IQR:", stats.iqr(time_spent))
## IQR: 3.1425
print("Variance:", time_spent.var())
## Variance: 5.6556738989899
# First, second and third quartiles, returned as a Series indexed by level.
quartile_levels = [0.25, 0.5, 0.75]
print("Quartiles:", time_spent.quantile(quartile_levels))
## Quartiles: 0.25    3.8800
## 0.50    5.4150
## 0.75    7.0225
## Name: time_spent_on_the_page, dtype: float64

Visualization plots

# Univariate plots of time on page: histogram, boxplot, then one bar per row.
page_time = abtest['time_spent_on_the_page']

plt.hist(page_time)
plt.title("Histogram of Time Spent on Page")
plt.show()

plt.boxplot(page_time)
## {'whiskers': [<matplotlib.lines.Line2D object at 0x000002B541AD81A0>, <matplotlib.lines.Line2D object at 0x000002B541ADB380>], 'caps': [<matplotlib.lines.Line2D object at 0x000002B541ADB650>, <matplotlib.lines.Line2D object at 0x000002B541ADB920>], 'boxes': [<matplotlib.lines.Line2D object at 0x000002B541ADAFC0>], 'medians': [<matplotlib.lines.Line2D object at 0x000002B541ADBC20>], 'fliers': [<matplotlib.lines.Line2D object at 0x000002B541ADBF50>], 'means': []}
plt.title("Boxplot of Time Spent on Page")
plt.show()

# x positions are simply 0..n-1, i.e. one bar per row in file order.
bar_positions = range(len(page_time))
plt.bar(bar_positions, page_time)
plt.title("Barplot of Time Spent on Page")
plt.show()

# Frequency bar chart for each categorical column, one figure per column.
categorical_charts = (
    ('landing_page', "Landing Page Counts"),
    ('converted', "Conversion Counts"),
    ('group', "Group Counts"),
)
for column_name, chart_title in categorical_charts:
    abtest[column_name].value_counts().plot(kind='bar', title=chart_title)
    plt.show()

## Bivariate visualizations

# Time on page split by landing page version.
# The automatic "grouped by" super-title is blanked so only the custom title shows.
abtest.boxplot(by='landing_page', column='time_spent_on_the_page')
plt.suptitle("")
plt.title("Time Spent on Page: Old vs New")
plt.show()

# Conversion counts broken down by landing page, then by preferred language.
conversion_breakdowns = (
    ('landing_page', "Landing Page vs Conversion"),
    ('language_preferred', "Language vs Conversion"),
)
for row_field, chart_title in conversion_breakdowns:
    conversion_counts = pd.crosstab(abtest[row_field], abtest['converted'])
    conversion_counts.plot(kind='bar')
    plt.title(chart_title)
    plt.show()

# Time on page split by whether the user converted.
abtest.boxplot(by='converted', column='time_spent_on_the_page')
plt.suptitle("")
plt.title("Time Spent by Conversion Status")
plt.show()

## Hypothesis tests ## Question one ## One-tailed two-sample t-test

# One-tailed two-sample t-test on time spent on the page.
# H1: mean time on the new page is greater than on the old page.
alpha = 0.05
page_time_column = abtest['time_spent_on_the_page']
old_time = page_time_column[abtest['landing_page'] == 'old']
new_time = page_time_column[abtest['landing_page'] == 'new']

# NOTE(review): ttest_ind pools the variances by default (equal_var=True);
# confirm that assumption holds for these two samples.
t_stat, p_value = stats.ttest_ind(new_time, old_time, alternative='greater')
print("T-test p-value:", p_value)
## T-test p-value: 0.0001316123528095005
if p_value < alpha:
    decision = "Reject H0"
else:
    decision = "Fail to reject H0"
print("Decision:", decision)
## Decision: Reject H0

question two

Two-proportion z-test

# Two-proportion z-test on conversion rate.
# H1: the conversion rate on the new page is larger than on the old page.
alpha = 0.05
on_new_page = abtest['landing_page'] == 'new'
on_old_page = abtest['landing_page'] == 'old'
did_convert = abtest['converted'] == 'yes'

# Sample size and number of conversions for each page.
n_new = on_new_page.sum()
n_old = on_old_page.sum()
x_new = (on_new_page & did_convert).sum()
x_old = (on_old_page & did_convert).sum()

print("New page conversion rate:", x_new/n_new)
## New page conversion rate: 0.66
print("Old page conversion rate:", x_old/n_old)
## Old page conversion rate: 0.42
# The new page is listed first, so 'larger' tests new > old.
count = np.array([x_new, x_old])
nobs = np.array([n_new, n_old])
z_stat, p_value = proportions_ztest(count, nobs, alternative='larger')
print("Proportion test p-value:", p_value)
## Proportion test p-value: 0.008026308204056278
if p_value < alpha:
    decision = "Reject H0"
else:
    decision = "Fail to reject H0"
print("Decision:", decision)
## Decision: Reject H0

question three

Chi square test

# Chi-square test of independence between conversion status and preferred language.
# H0: conversion status is independent of preferred language.
alpha = 0.05
# Rows are conversion outcomes, columns are languages; cells are user counts.
contingency_table = pd.crosstab(abtest['converted'], abtest['language_preferred'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print("Chi-square p-value:", p_value)
## Chi-square p-value: 0.21298887487543447
# Compare against alpha rather than a repeated literal, so the significance
# level is set in one place, as in the other tests in this file.
print("Decision:", "Reject H0" if p_value < alpha else "Fail to reject H0")
## Decision: Fail to reject H0

question four

anova test

# One-way ANOVA restricted to new-page users.
# H0: mean time spent on the page is the same for every preferred language.
alpha = 0.05
new_page_data = abtest[abtest['landing_page'] == 'new']
# One array of time-on-page values per language group.
times_by_language = [
    times.values
    for _, times in new_page_data.groupby("language_preferred")["time_spent_on_the_page"]
]
anova_result = stats.f_oneway(*times_by_language)
print("ANOVA p-value:", anova_result.pvalue)
## ANOVA p-value: 0.43204138694325955
if anova_result.pvalue < alpha:
    decision = "Reject H0"
else:
    decision = "Fail to reject H0"
print("Decision:", decision)
## Decision: Fail to reject H0

Recommendations

# Closing summary: print the header, then each recommendation on its own line.
recommendation_lines = (
    "- Adopt the new landing page (users spend more time on it).",
    "- No need for specific language optimization (conversion and time spent not affected by language).",
    "- Continue tracking language trends.",
)
print("Recommendations:")
## Recommendations:
for recommendation in recommendation_lines:
    print(recommendation)
## - Adopt the new landing page (users spend more time on it).
## - No need for specific language optimization (conversion and time spent not affected by language).
## - Continue tracking language trends.