import pandas as pd
import plotly.graph_objects as go
from scipy.stats import chi2_contingency
# Read the sales CSV into a DataFrame.
file_path = 'C:\\Users\\loydt\\Downloads\\Superstore Sales Dataset.csv'
data = pd.read_csv(file_path)

# Cross-tabulate row counts: one row per 'Segment' value, one column per
# 'Ship Mode' value.
contingency_table = pd.crosstab(
    index=data['Segment'],
    columns=data['Ship Mode'],
)

# Chi-squared test of independence on the observed counts. The result
# unpacks as (statistic, p-value, degrees of freedom, expected frequencies).
chi2_stat, p_value, dof, expected = chi2_contingency(observed=contingency_table)

# Raw residuals: observed count minus expected count, cell by cell.
residuals = contingency_table.to_numpy() - expected
# Heatmap of the observed counts; every cell is labelled with its own count.
heatmap_trace = go.Heatmap(
    x=contingency_table.columns,
    y=contingency_table.index,
    z=contingency_table.values,
    text=contingency_table.values,
    texttemplate='%{text}',  # render the observed count inside each cell
    textfont={'color': 'white'},
    hoverinfo='none',  # hover tooltips are switched off; the cell labels carry the values
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap_trace)

# Title, axis labels and a dark colour scheme for the figure.
layout_options = {
    'title': 'Contingency Table: Customer Segment vs Ship Mode',
    'xaxis_title': 'Ship Mode',
    'yaxis_title': 'Customer Segment',
    'plot_bgcolor': 'grey',
    'paper_bgcolor': 'black',
    'font': {'color': 'ghostwhite'},
}
fig.update_layout(**layout_options)

# Render the figure.
fig.show()
# Report the chi-squared statistic and its p-value.
print("Chi-squared Statistic:", chi2_stat)
print("p-value:", p_value)

# Decide on the null hypothesis by comparing the p-value with the
# significance level. The branch bodies must be indented under `if`/`else`;
# without that indentation the script fails with an IndentationError.
alpha = 0.05  # significance level
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant association between customer segment and ship mode.")
else:
    # NOTE: failing to reject only means the data give no evidence of an
    # association at this alpha; it does not prove independence.
    print("Fail to reject the null hypothesis: There is no significant association between customer segment and ship mode.")
# One row per (Segment, Ship Mode) cell. from_product varies the ship mode
# fastest, which matches the row-major order of the flattened arrays below.
cell_index = pd.MultiIndex.from_product(
    [contingency_table.index, contingency_table.columns],
    names=['Segment', 'Ship Mode'],
)

# Observed counts, expected counts and their difference, side by side.
summary_df = pd.DataFrame(
    {
        'Observed': contingency_table.values.ravel(),
        'Expected': expected.ravel(),
        'Residuals': residuals.ravel(),
    },
    index=cell_index,
)

# Print the summary table under a heading.
print("\nSummary Table of Observed, Expected, and Residuals:")
print(summary_df)