# chi-squared-indpend-test.py

import pandas as pd
import plotly.graph_objects as go
from scipy.stats import chi2_contingency

# Default dataset location. Pass a different CSV path as the first
# command-line argument to override it.
DEFAULT_FILE_PATH = 'C:\\Users\\loydt\\Downloads\\Superstore Sales Dataset.csv'

# Significance level used for the hypothesis decision.
ALPHA = 0.05


def build_heatmap(contingency_table):
    """Return a Plotly heatmap figure of the observed counts.

    Args:
        contingency_table: DataFrame of observed counts; its index labels
            the y-axis and its columns label the x-axis.

    Returns:
        A ``go.Figure`` with each cell annotated with its observed count.
    """
    fig = go.Figure(data=go.Heatmap(
        z=contingency_table.values,
        x=contingency_table.columns,
        y=contingency_table.index,
        colorscale='Viridis',
        text=contingency_table.values,  # Show observed values in heatmap
        hoverinfo='none',  # Disable hover info to avoid confusion
        texttemplate='%{text}',  # Show the observed values as text
        textfont=dict(color='white'),  # Change text color for better visibility
    ))
    fig.update_layout(
        title='Contingency Table: Customer Segment vs Ship Mode',
        xaxis_title='Ship Mode',
        yaxis_title='Customer Segment',
        plot_bgcolor='grey',
        paper_bgcolor='black',
        font=dict(color='ghostwhite'),
    )
    return fig


def build_summary(contingency_table, expected):
    """Return a per-cell table of observed, expected and residual values.

    Args:
        contingency_table: DataFrame of observed counts.
        expected: 2-D array of expected frequencies with the same shape as
            ``contingency_table``.

    Returns:
        DataFrame indexed by (Segment, Ship Mode) with columns
        ``Observed``, ``Expected`` and ``Residuals`` (observed - expected).
    """
    observed = contingency_table.values
    # Row-major flattening matches the ordering of MultiIndex.from_product
    # below (row label varies slowest, column label fastest).
    cell_index = pd.MultiIndex.from_product(
        [contingency_table.index, contingency_table.columns],
        names=['Segment', 'Ship Mode'],
    )
    return pd.DataFrame({
        'Observed': observed.flatten(),
        'Expected': expected.flatten(),
        'Residuals': (observed - expected).flatten(),
    }, index=cell_index)


def decision_message(p_value, alpha=ALPHA):
    """Return the hypothesis-test verdict for ``p_value`` at level ``alpha``.

    The null hypothesis is rejected only when ``p_value`` is strictly less
    than ``alpha``.
    """
    if p_value < alpha:
        return "Reject the null hypothesis: There is a significant association between customer segment and ship mode."
    return "Fail to reject the null hypothesis: There is no significant association between customer segment and ship mode."


def main(file_path=DEFAULT_FILE_PATH):
    """Run the chi-squared test of independence and report the results.

    Reads the CSV at ``file_path`` (which must contain ``Segment`` and
    ``Ship Mode`` columns), shows a heatmap of the contingency table, then
    prints the test statistics, the decision, and a per-cell summary.
    """
    # Load the dataset
    data = pd.read_csv(file_path)

    # Create a contingency table
    contingency_table = pd.crosstab(data['Segment'], data['Ship Mode'])

    # Chi-squared test of independence
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

    # Show the plot
    build_heatmap(contingency_table).show()

    # Print Chi-squared results
    print("Chi-squared Statistic:", chi2_stat)
    print("p-value:", p_value)
    print("Degrees of freedom:", dof)

    # Decision on the null hypothesis
    print(decision_message(p_value, ALPHA))

    # Print summary table
    print("\nSummary Table of Observed, Expected, and Residuals:")
    print(build_summary(contingency_table, expected))


if __name__ == "__main__":
    import sys

    # An optional first argument overrides the default dataset path.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FILE_PATH)