Basic exploration

# Show the first rows of the dataset (five rows, per the echoed output below).
print(abtest.head())

##    user_id      group  ... converted  language_preferred
## 0   546592    control  ...        no             Spanish
## 1   546468  treatment  ...       yes             English
## 2   546462  treatment  ...        no             Spanish
## 3   546567    control  ...        no              French
## 4   546459  treatment  ...       yes             Spanish
## 
## [5 rows x 6 columns]

# Show the last rows of the dataset (indices 95-99 in the echoed output below).
print(abtest.tail())

##     user_id      group  ... converted  language_preferred
## 95   546446  treatment  ...        no             Spanish
## 96   546544    control  ...       yes             English
## 97   546472  treatment  ...       yes             Spanish
## 98   546481  treatment  ...       yes             Spanish
## 99   546483  treatment  ...       yes             English
## 
## [5 rows x 6 columns]

# Report the dataset's dimensions, unpacking the shape tuple once.
row_count, column_count = abtest.shape
print("Rows:", row_count)

## Rows: 100

print("Columns:", column_count)

## Columns: 6

# Summary statistics. In the echoed output only user_id and
# time_spent_on_the_page appear; user_id looks like an identifier, so its
# mean/std are presumably not informative.
print(abtest.describe())

##              user_id  time_spent_on_the_page
## count     100.000000              100.000000
## mean   546517.000000                5.377800
## std        52.295779                2.378166
## min    546443.000000                0.190000
## 25%    546467.750000                3.880000
## 50%    546492.500000                5.415000
## 75%    546567.250000                7.022500
## max    546592.000000               10.710000

# Data-quality counts: missing cells summed over every column, then the
# number of rows flagged as duplicates.
missing_per_column = abtest.isna().sum()
print("Missing values:", missing_per_column.sum())

## Missing values: 0

duplicate_flags = abtest.duplicated()
print("Duplicated rows:", duplicate_flags.sum())

## Duplicated rows: 0

univariate analysis

# Descriptive statistics for the time-on-page column. The column is looked
# up once and reused for every summary printed below.
time_spent = abtest['time_spent_on_the_page']

print("Mean:", time_spent.mean())

## Mean: 5.377800000000001

print("Median:", time_spent.median())

## Median: 5.415

print("SD:", time_spent.std())

## SD: 2.378166078933492

print("IQR:", stats.iqr(time_spent))

## IQR: 3.1425

print("Variance:", time_spent.var())

## Variance: 5.6556738989899

quartile_levels = [0.25, 0.5, 0.75]
print("Quartiles:", time_spent.quantile(quartile_levels))

## Quartiles: 0.25    3.8800
## 0.50    5.4150
## 0.75    7.0225
## Name: time_spent_on_the_page, dtype: float64

plots of visualization

# Distribution of time on page: a histogram, then a boxplot of the same
# column. The boxplot call stays last because its return value is echoed
# right after it; its title and show() follow that echo.
time_spent = abtest['time_spent_on_the_page']

plt.hist(time_spent)
plt.title("Histogram of Time Spent on Page")
plt.show()

plt.boxplot(time_spent)

## {'whiskers': [<matplotlib.lines.Line2D object at 0x000002B541AD81A0>, <matplotlib.lines.Line2D object at 0x000002B541ADB380>], 'caps': [<matplotlib.lines.Line2D object at 0x000002B541ADB650>, <matplotlib.lines.Line2D object at 0x000002B541ADB920>], 'boxes': [<matplotlib.lines.Line2D object at 0x000002B541ADAFC0>], 'medians': [<matplotlib.lines.Line2D object at 0x000002B541ADBC20>], 'fliers': [<matplotlib.lines.Line2D object at 0x000002B541ADBF50>], 'means': []}

# Title and render the boxplot created just before this point.
plt.title("Boxplot of Time Spent on Page")
plt.show()

# One bar per observation, positioned by row order.
time_spent = abtest['time_spent_on_the_page']
bar_positions = range(len(time_spent))
plt.bar(bar_positions, time_spent)
plt.title("Barplot of Time Spent on Page")
plt.show()

# Frequency bar charts for the three categorical columns, drawn in the
# same order as before: landing page, conversion, group.
count_charts = (
    ('landing_page', "Landing Page Counts"),
    ('converted', "Conversion Counts"),
    ('group', "Group Counts"),
)
for column_name, chart_title in count_charts:
    abtest[column_name].value_counts().plot(kind='bar', title=chart_title)
    plt.show()

## bivariate visualizations

def _boxplot_time_by(category, heading):
    """Draw time on page as a boxplot grouped by *category*, titled *heading*."""
    abtest.boxplot(column='time_spent_on_the_page', by=category)
    plt.title(heading)
    # Replace the figure-level suptitle with an empty string so only
    # the axes title set above is shown.
    plt.suptitle("")
    plt.show()


def _conversion_bars_by(category, heading):
    """Plot conversion counts cross-tabulated by *category* as a bar chart."""
    counts = pd.crosstab(abtest[category], abtest['converted'])
    counts.plot(kind='bar')
    plt.title(heading)
    plt.show()


# Figures are produced in the same order as before.
_boxplot_time_by('landing_page', "Time Spent on Page: Old vs New")
_conversion_bars_by('landing_page', "Landing Page vs Conversion")
_conversion_bars_by('language_preferred', "Language vs Conversion")
_boxplot_time_by('converted', "Time Spent by Conversion Status")

## Hypothesis tests: question one (one-tailed two-sample t-test)

# Question one: do users spend more time on the new page than on the old one?
# One-sided two-sample t-test with H1: mean(new) > mean(old).
# NOTE(review): equal_var is not passed, so SciPy's default applies (the
# pooled-variance test) - confirm the equal-variance assumption or consider
# Welch's test.
alpha = 0.05
page = abtest['landing_page']
time_spent = abtest['time_spent_on_the_page']
old_time = time_spent[page == 'old']
new_time = time_spent[page == 'new']

t_stat, p_value = stats.ttest_ind(new_time, old_time, alternative='greater')
print("T-test p-value:", p_value)

## T-test p-value: 0.0001316123528095005

if p_value < alpha:
    decision = "Reject H0"
else:
    decision = "Fail to reject H0"
print("Decision:", decision)

## Decision: Reject H0

question two

two proportion ztest

# Question two: is the conversion rate higher on the new page than on the old?
# One-sided two-proportion z-test with H1: p(new) > p(old).
alpha = 0.05
on_new_page = abtest['landing_page'] == 'new'
on_old_page = abtest['landing_page'] == 'old'
did_convert = abtest['converted'] == 'yes'

# n_* are visitors per page; x_* are the converted visitors among them.
n_new, n_old = on_new_page.sum(), on_old_page.sum()
x_new = (on_new_page & did_convert).sum()
x_old = (on_old_page & did_convert).sum()

print("New page conversion rate:", x_new/n_new)

## New page conversion rate: 0.66

print("Old page conversion rate:", x_old/n_old)

## Old page conversion rate: 0.42

# The new page is listed first so that 'larger' tests new > old.
count = np.array([x_new, x_old])
nobs = np.array([n_new, n_old])
z_stat, p_value = proportions_ztest(count, nobs, alternative='larger')
print("Proportion test p-value:", p_value)

## Proportion test p-value: 0.008026308204056278

decision = "Reject H0" if p_value < alpha else "Fail to reject H0"
print("Decision:", decision)

## Decision: Reject H0

question three

Chi square test

# Question three: is conversion status associated with preferred language?
# Chi-square test of independence on the converted x language table.
alpha = 0.05
contingency_table = pd.crosstab(abtest['converted'], abtest['language_preferred'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print("Chi-square p-value:", p_value)

## Chi-square p-value: 0.21298887487543447

# Compare against alpha rather than a repeated literal, so the significance
# level is set in one place, as in the other three tests.
print("Decision:", "Reject H0" if p_value < alpha else "Fail to reject H0")

## Decision: Fail to reject H0

question four

anova test

# Question four: among new-page users, does mean time on page differ by
# preferred language? One-way ANOVA with one sample per language.
alpha = 0.05
new_page_data = abtest[abtest['landing_page'] == 'new']
time_by_language = new_page_data.groupby("language_preferred")["time_spent_on_the_page"]
samples = [times.values for _, times in time_by_language]
anova_result = stats.f_oneway(*samples)
print("ANOVA p-value:", anova_result.pvalue)

## ANOVA p-value: 0.43204138694325955

if anova_result.pvalue < alpha:
    print("Decision:", "Reject H0")
else:
    print("Decision:", "Fail to reject H0")

## Decision: Fail to reject H0

Recommendations

# Closing recommendations, printed one statement at a time.
print("Recommendations:")

## Recommendations:

# NOTE(review): the stated reason cites only time on page. The conversion
# test earlier in this report also rejected H0 (0.66 vs 0.42), which could be
# mentioned here - confirm with the author.
print("- Adopt the new landing page (users spend more time on it).")

## - Adopt the new landing page (users spend more time on it).

# NOTE(review): the chi-square and ANOVA decisions were "Fail to reject H0",
# i.e. no significant effect was detected rather than a demonstrated absence
# of one; the ANOVA covered new-page users only.
print("- No need for specific language optimization (conversion and time spent not affected by language).")

## - No need for specific language optimization (conversion and time spent not affected by language).

print("- Continue tracking language trends.")

## - Continue tracking language trends.

project python form

Gitau paul kimani

2025-12-12

analysis of e-news

load dataset

Basic exploration

univariate analysis

plots of visualization

question two

two proportion ztest

question three

Chi square test

question four

anova test

Recommendations