# install.packages(c("tidyverse", "skimr", "corrplot", "broom"))
library(tidyverse) # dplyr, ggplot2, tidyr, readr, etc.
library(skimr) # quick summary stats
library(broom) # tidy model outputs
library(corrplot) # correlation heatmapsR vs Python: A Data Analysis Comparison
Side-by-side workflows for common data analysis tasks
Setup
Before running this notebook, ensure the following packages are installed.
R Packages
Python Packages
# pip install pandas numpy matplotlib seaborn scipy scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler1. Creating & Loading Data
1a. Creating a Sample Dataset
We’ll build a synthetic sales dataset used throughout this notebook.
R
set.seed(42)
n <- 200
sales <- tibble(
id = 1:n,
region = sample(c("North","South","East","West"), n, replace = TRUE),
category = sample(c("Electronics","Clothing","Food","Books"), n, replace = TRUE),
units_sold = rpois(n, lambda = 50),
unit_price = round(runif(n, 5, 500), 2),
discount = round(runif(n, 0, 0.4), 2),
date = sample(seq(as.Date("2023-01-01"),
as.Date("2023-12-31"), by = "day"), n, replace = TRUE)
)
# Derived column
sales <- sales |>
mutate(revenue = round(units_sold * unit_price * (1 - discount), 2))
glimpse(sales)Rows: 200
Columns: 8
$ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ region <chr> "North", "North", "North", "North", "South", "West", "South…
$ category <chr> "Books", "Clothing", "Electronics", "Books", "Food", "Elect…
$ units_sold <int> 35, 48, 41, 40, 51, 54, 42, 48, 41, 56, 39, 53, 50, 49, 60,…
$ unit_price <dbl> 398.24, 462.16, 86.41, 100.48, 53.33, 415.41, 403.90, 143.2…
$ discount <dbl> 0.29, 0.28, 0.01, 0.38, 0.21, 0.28, 0.02, 0.37, 0.01, 0.31,…
$ date <date> 2023-07-03, 2023-08-17, 2023-01-03, 2023-10-28, 2023-09-10…
$ revenue <dbl> 9896.26, 15972.25, 3507.38, 2491.90, 2148.67, 16151.14, 166…
Python
rng = np.random.default_rng(42)
n = 200
sales = pd.DataFrame({
"id": range(1, n + 1),
"region": rng.choice(["North","South","East","West"], n),
"category": rng.choice(["Electronics","Clothing","Food","Books"], n),
"units_sold": rng.poisson(lam=50, size=n),
"unit_price": rng.uniform(5, 500, n).round(2),
"discount": rng.uniform(0, 0.4, n).round(2),
"date": pd.to_datetime(
rng.choice(pd.date_range("2023-01-01","2023-12-31"), n)
)
})
# Derived column
sales["revenue"] = (
sales["units_sold"] * sales["unit_price"] * (1 - sales["discount"])
).round(2)
sales.info()<class 'pandas.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 200 non-null int64
1 region 200 non-null str
2 category 200 non-null str
3 units_sold 200 non-null int64
4 unit_price 200 non-null float64
5 discount 200 non-null float64
6 date 200 non-null datetime64[us]
7 revenue 200 non-null float64
dtypes: datetime64[us](1), float64(3), int64(2), str(2)
memory usage: 12.6 KB
1b. Reading & Writing Files
R
# Write
write_csv(sales, "sales.csv")
# Read back
df <- read_csv("sales.csv",
col_types = cols(date = col_date()))
# Read Excel
# df <- readxl::read_excel("sales.xlsx")
# Read from URL
# df <- read_csv("https://example.com/data.csv")Python
# Write
sales.to_csv("sales.csv", index=False)
# Read back
df = pd.read_csv("sales.csv", parse_dates=["date"])
# Read Excel
# df = pd.read_excel("sales.xlsx")
# Read from URL
# df = pd.read_csv("https://example.com/data.csv")
print(df.dtypes)id int64
region str
category str
units_sold int64
unit_price float64
discount float64
date datetime64[us]
revenue float64
dtype: object
2. Exploratory Data Analysis (EDA)
2a. Summary Statistics
R
# Base summary
summary(sales |> select(units_sold, unit_price, discount, revenue)) units_sold unit_price discount revenue
Min. :30.00 Min. : 8.43 Min. :0.0000 Min. : 375.6
1st Qu.:43.00 1st Qu.: 93.91 1st Qu.:0.1100 1st Qu.: 3482.3
Median :49.00 Median :215.55 Median :0.2100 Median : 8222.1
Mean :48.84 Mean :235.03 Mean :0.2057 Mean : 8988.0
3rd Qu.:54.00 3rd Qu.:377.23 3rd Qu.:0.3025 3rd Qu.:13999.3
Max. :68.00 Max. :499.69 Max. :0.4000 Max. :25376.2
# Rich summary via skimr
skim(sales |> select(where(is.numeric)))| Name | select(sales, where(is.nu… |
| Number of rows | 200 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 100.50 | 57.88 | 1.00 | 50.75 | 100.50 | 150.25 | 200.00 | ▇▇▇▇▇ |
| units_sold | 0 | 1 | 48.84 | 7.20 | 30.00 | 43.00 | 49.00 | 54.00 | 68.00 | ▁▆▇▆▁ |
| unit_price | 0 | 1 | 235.03 | 151.10 | 8.43 | 93.91 | 215.55 | 377.23 | 499.69 | ▇▆▅▅▆ |
| discount | 0 | 1 | 0.21 | 0.12 | 0.00 | 0.11 | 0.21 | 0.30 | 0.40 | ▆▆▇▇▆ |
| revenue | 0 | 1 | 8987.98 | 6175.62 | 375.56 | 3482.34 | 8222.08 | 13999.34 | 25376.22 | ▇▇▅▅▁ |
Python
# Numeric summary
print(sales[["units_sold","unit_price","discount","revenue"]].describe().round(2)) units_sold unit_price discount revenue
count 200.00 200.00 200.00 200.00
mean 49.64 242.41 0.21 9511.68
std 7.47 147.14 0.11 6151.08
min 32.00 5.61 0.00 214.19
25% 45.00 111.58 0.11 3932.91
50% 50.00 239.15 0.20 8977.58
75% 55.00 368.71 0.30 13761.40
max 65.00 497.38 0.40 25106.63
# Additional stats
print("\nSkewness:")
Skewness:
print(sales[["units_sold","unit_price","revenue"]].skew().round(3))units_sold -0.180
unit_price 0.047
revenue 0.311
dtype: float64
print("\nKurtosis:")
Kurtosis:
print(sales[["units_sold","unit_price","revenue"]].kurt().round(3))units_sold -0.514
unit_price -1.230
revenue -0.860
dtype: float64
2b. Checking for Missing Values
R
# Inject some NAs for demonstration
sales_na <- sales
sales_na$revenue[sample(n, 10)] <- NA
sales_na$region[sample(n, 5)] <- NA
# Count NAs per column
colSums(is.na(sales_na)) id region category units_sold unit_price discount date
0 5 0 0 0 0 0
revenue
10
# Percentage missing
colMeans(is.na(sales_na)) * 100 id region category units_sold unit_price discount date
0.0 2.5 0.0 0.0 0.0 0.0 0.0
revenue
5.0
Python
# Inject some NAs
sales_na = sales.copy()
sales_na.loc[rng.choice(n, 10, replace=False), "revenue"] = np.nan
sales_na.loc[rng.choice(n, 5, replace=False), "region"] = np.nan
# Count and percentage
missing = pd.DataFrame({
"count": sales_na.isna().sum(),
"percent": (sales_na.isna().mean() * 100).round(2)
})
print(missing[missing["count"] > 0]) count percent
region 5 2.5
revenue 10 5.0
2c. Handling Missing Values
R
sales_clean <- sales_na |>
# Drop rows where region is NA
drop_na(region) |>
# Impute revenue with column median
mutate(revenue = if_else(is.na(revenue),
median(revenue, na.rm = TRUE),
revenue))
cat("Rows before:", nrow(sales_na), "\n")Rows before: 200
cat("Rows after :", nrow(sales_clean), "\n")Rows after : 195
cat("NAs remaining:", sum(is.na(sales_clean)), "\n")NAs remaining: 0
Python
sales_clean = (sales_na
.dropna(subset=["region"]) # drop rows where region is NA
.assign(revenue=lambda d: d["revenue"]
.fillna(d["revenue"].median())) # impute revenue with median
.reset_index(drop=True)
)
print(f"Rows before: {len(sales_na)}")Rows before: 200
print(f"Rows after : {len(sales_clean)}")Rows after : 195
print(f"NAs remaining: {sales_clean.isna().sum().sum()}")NAs remaining: 0
3. Data Wrangling
3a. Filtering & Selecting
R
# Filter and select
high_rev <- sales |>
filter(revenue > 10000, region %in% c("North", "East")) |>
select(id, region, category, revenue, date) |>
arrange(desc(revenue))
head(high_rev, 5)Python
high_rev = (sales
.query("revenue > 10000 and region in ['North', 'East']")
[["id","region","category","revenue","date"]]
.sort_values("revenue", ascending=False)
.reset_index(drop=True)
)
print(high_rev.head(5)) id region category revenue date
0 156 East Books 25106.63 2023-09-27
1 159 East Electronics 23738.44 2023-04-30
2 122 East Electronics 23392.25 2023-07-14
3 119 North Books 20513.72 2023-04-29
4 56 North Electronics 19932.22 2023-02-10
3b. Grouping & Aggregating
R
region_summary <- sales |>
group_by(region, category) |>
summarise(
n_transactions = n(),
total_revenue = sum(revenue),
avg_revenue = mean(revenue),
avg_discount = mean(discount),
.groups = "drop"
) |>
mutate(across(where(is.numeric), \(x) round(x, 2))) |>
arrange(region, desc(total_revenue))
print(region_summary, n = 10)# A tibble: 16 × 6
region category n_transactions total_revenue avg_revenue avg_discount
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 East Electronics 11 99054. 9005. 0.23
2 East Food 10 98761. 9876. 0.25
3 East Clothing 9 76910. 8546. 0.21
4 East Books 4 42313. 10578. 0.18
5 North Electronics 19 177838. 9360. 0.19
6 North Clothing 15 118822. 7921. 0.16
7 North Books 9 80933. 8993. 0.21
8 North Food 12 57852. 4821. 0.26
9 South Electronics 16 151476. 9467. 0.21
10 South Food 15 135052. 9003. 0.19
# ℹ 6 more rows
Python
region_summary = (sales
.groupby(["region","category"])
.agg(
n_transactions=("revenue","count"),
total_revenue =("revenue","sum"),
avg_revenue =("revenue","mean"),
avg_discount =("discount","mean")
)
.round(2)
.sort_values(["region","total_revenue"], ascending=[True, False])
.reset_index()
)
print(region_summary.head(10)) region category n_transactions total_revenue avg_revenue avg_discount
0 East Electronics 16 174242.19 10890.14 0.25
1 East Food 12 123305.67 10275.47 0.23
2 East Books 11 121394.25 11035.84 0.14
3 East Clothing 11 95077.31 8643.39 0.26
4 North Books 16 157301.62 9831.35 0.18
5 North Electronics 13 91077.49 7005.96 0.22
6 North Clothing 8 77460.38 9682.55 0.22
7 North Food 9 50625.58 5625.06 0.23
8 South Clothing 17 143101.31 8417.72 0.18
9 South Electronics 13 135394.67 10414.97 0.22
3c. Reshaping: Wide ↔︎ Long
R
# Pivot wider: revenue by region × category
wide <- sales |>
group_by(region, category) |>
summarise(total = sum(revenue), .groups = "drop") |>
pivot_wider(names_from = category, values_from = total, values_fill = 0)
wide# Pivot longer: back to tidy format
long <- wide |>
pivot_longer(-region, names_to = "category", values_to = "total_revenue")
head(long)Python
# Pivot wider
wide = (sales
.groupby(["region","category"])["revenue"].sum()
.unstack(fill_value=0)
.reset_index()
)
print(wide)category region Books Clothing Electronics Food
0 East 121394.25 95077.31 174242.19 123305.67
1 North 157301.62 77460.38 91077.49 50625.58
2 South 107366.48 143101.31 135394.67 106150.00
3 West 87441.16 205153.80 77321.47 149922.15
# Pivot longer (melt)
long = wide.melt(id_vars="region",
var_name="category",
value_name="total_revenue")
print(long.head()) region category total_revenue
0 East Books 121394.25
1 North Books 157301.62
2 South Books 107366.48
3 West Books 87441.16
4 East Clothing 95077.31
3d. Joining / Merging
R
# Create a lookup table
region_info <- tibble(
region = c("North","South","East","West"),
manager = c("Alice","Bob","Carol","Dave"),
target = c(500000, 450000, 480000, 420000)
)
# Left join
sales_enriched <- sales |>
left_join(region_info, by = "region")
sales_enriched |>
select(id, region, manager, revenue, target) |>
head(5)Python
# Create a lookup table
region_info = pd.DataFrame({
"region": ["North","South","East","West"],
"manager": ["Alice","Bob","Carol","Dave"],
"target": [500000, 450000, 480000, 420000]
})
# Left join (merge)
sales_enriched = sales.merge(region_info, on="region", how="left")
print(sales_enriched[["id","region","manager","revenue","target"]].head(5)) id region manager revenue target
0 1 North Alice 15924.65 500000
1 2 West Dave 7118.75 420000
2 3 East Carol 13259.22 480000
3 4 South Bob 14775.83 450000
4 5 South Bob 13134.72 450000
3e. Window Functions
R
sales_ranked <- sales |>
group_by(region) |>
mutate(
revenue_rank = rank(desc(revenue)),
cumulative_rev = cumsum(revenue),
pct_of_region = round(revenue / sum(revenue) * 100, 2),
revenue_vs_mean = round(revenue - mean(revenue), 2)
) |>
ungroup()
sales_ranked |>
filter(region == "North") |>
select(id, revenue, revenue_rank, pct_of_region, revenue_vs_mean) |>
arrange(revenue_rank) |>
head(5)Python
sales_ranked = sales.copy()
grp = sales_ranked.groupby("region")["revenue"]
sales_ranked["revenue_rank"] = grp.rank(ascending=False)
sales_ranked["cumulative_rev"] = grp.cumsum()
sales_ranked["pct_of_region"] = (sales_ranked["revenue"] /
grp.transform("sum") * 100).round(2)
sales_ranked["revenue_vs_mean"] = (sales_ranked["revenue"] -
grp.transform("mean")).round(2)
subset = (sales_ranked[sales_ranked["region"]=="North"]
[["id","revenue","revenue_rank","pct_of_region","revenue_vs_mean"]]
.sort_values("revenue_rank")
.head(5))
print(subset) id revenue revenue_rank pct_of_region revenue_vs_mean
118 119 20513.72 1.0 5.45 12329.70
55 56 19932.22 2.0 5.29 11748.20
197 198 19794.64 3.0 5.26 11610.62
9 10 18794.32 4.0 4.99 10610.30
181 182 18556.06 5.0 4.93 10372.04
4. Date & Time Operations
R
sales_dt <- sales |>
mutate(
month = month(date, label = TRUE),
quarter = paste0("Q", quarter(date)),
day_of_wk = wday(date, label = TRUE),
week_num = isoweek(date)
)
# Monthly revenue trend
monthly <- sales_dt |>
group_by(month) |>
summarise(total_revenue = sum(revenue), .groups = "drop")
print(monthly)# A tibble: 12 × 2
month total_revenue
<ord> <dbl>
1 Jan 188270.
2 Feb 140978.
3 Mar 176551.
4 Apr 142977.
5 May 250608.
6 Jun 117562.
7 Jul 196480.
8 Aug 133284.
9 Sep 104515.
10 Oct 127692.
11 Nov 115758.
12 Dec 102921.
Python
sales_dt = sales.copy()
sales_dt["month"] = sales_dt["date"].dt.month_name().str[:3]
sales_dt["quarter"] = "Q" + sales_dt["date"].dt.quarter.astype(str)
sales_dt["day_of_wk"] = sales_dt["date"].dt.day_name().str[:3]
sales_dt["week_num"] = sales_dt["date"].dt.isocalendar().week
# Monthly revenue trend
monthly = (sales_dt
.groupby("month")["revenue"].sum()
.reset_index()
.rename(columns={"revenue":"total_revenue"})
)
print(monthly) month total_revenue
0 Apr 212447.16
1 Aug 85233.35
2 Dec 96886.35
3 Feb 170990.52
4 Jan 156386.79
5 Jul 148611.05
6 Jun 165677.16
7 Mar 190572.02
8 May 205893.52
9 Nov 134036.74
10 Oct 206183.47
11 Sep 129417.40
5. Visualization
5a. Distribution Plot
R
ggplot(sales, aes(x = revenue, fill = category)) +
geom_histogram(bins = 30, alpha = 0.7, color = "white") +
facet_wrap(~category, scales = "free_y") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Revenue Distribution by Category",
x = "Revenue ($)", y = "Count") +
theme_minimal() +
theme(legend.position = "none")Python
g = sns.FacetGrid(sales, col="category", col_wrap=2,
hue="category", palette="Set2", height=3)
g.map(sns.histplot, "revenue", bins=30, alpha=0.7, edgecolor="white")g.set_axis_labels("Revenue ($)", "Count")g.figure.suptitle("Revenue Distribution by Category", y=1.02)
plt.tight_layout()
plt.show()5b. Bar Chart with Error Bars
R
sales |>
group_by(region) |>
summarise(
mean_rev = mean(revenue),
se = sd(revenue) / sqrt(n()),
.groups = "drop"
) |>
ggplot(aes(x = reorder(region, -mean_rev), y = mean_rev, fill = region)) +
geom_col(alpha = 0.85) +
geom_errorbar(aes(ymin = mean_rev - se, ymax = mean_rev + se),
width = 0.2, linewidth = 0.8) +
scale_fill_brewer(palette = "Pastel1") +
labs(title = "Average Revenue by Region",
x = "Region", y = "Avg Revenue ($)") +
theme_minimal() +
theme(legend.position = "none")Python
grp = sales.groupby("region")["revenue"]
summary = pd.DataFrame({
"mean_rev": grp.mean(),
"se": grp.sem()
}).reset_index().sort_values("mean_rev", ascending=False)
fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(summary["region"], summary["mean_rev"],
color=sns.color_palette("pastel"), alpha=0.85)
ax.errorbar(summary["region"], summary["mean_rev"],
yerr=summary["se"], fmt="none", color="black", capsize=5)
ax.set_title("Average Revenue by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Avg Revenue ($)")
plt.tight_layout()
plt.show()5c. Scatter Plot with Regression Line
R
ggplot(sales, aes(x = units_sold, y = revenue, color = category)) +
geom_point(alpha = 0.5, size = 1.8) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
scale_color_brewer(palette = "Set1") +
labs(title = "Units Sold vs Revenue by Category",
x = "Units Sold", y = "Revenue ($)", color = "Category") +
theme_minimal()Python
fig, ax = plt.subplots(figsize=(7, 5))
for cat, grp in sales.groupby("category"):
ax.scatter(grp["units_sold"], grp["revenue"],
alpha=0.5, s=20, label=cat)
m, b = np.polyfit(grp["units_sold"], grp["revenue"], 1)
x_line = np.linspace(grp["units_sold"].min(), grp["units_sold"].max(), 100)
ax.plot(x_line, m * x_line + b, linewidth=1.5)
ax.set_title("Units Sold vs Revenue by Category")
ax.set_xlabel("Units Sold")
ax.set_ylabel("Revenue ($)")
ax.legend(title="Category")
plt.tight_layout()
plt.show()5d. Heatmap: Correlation Matrix
R
num_cols <- sales |> select(units_sold, unit_price, discount, revenue)
cor_mat <- cor(num_cols)
corrplot(cor_mat,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
col = colorRampPalette(c("#d73027","white","#1a9850"))(200),
title = "Correlation Matrix",
mar = c(0,0,1,0))Python
num_cols = sales[["units_sold","unit_price","discount","revenue"]]
cor_mat = num_cols.corr()
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cor_mat, annot=True, fmt=".2f", cmap="RdYlGn",
vmin=-1, vmax=1, square=True, ax=ax,
linewidths=0.5, cbar_kws={"shrink": 0.8})
ax.set_title("Correlation Matrix")
plt.tight_layout()
plt.show()5e. Time Series Line Plot
R
sales |>
mutate(month = floor_date(date, "month")) |>
group_by(month, category) |>
summarise(total = sum(revenue), .groups = "drop") |>
ggplot(aes(x = month, y = total, color = category)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_x_date(date_labels = "%b", date_breaks = "1 month") +
scale_color_brewer(palette = "Set1") +
labs(title = "Monthly Revenue by Category",
x = NULL, y = "Total Revenue ($)", color = "Category") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Python
monthly_cat = (sales
.assign(month=sales["date"].dt.to_period("M").dt.to_timestamp())
.groupby(["month","category"])["revenue"].sum()
.reset_index()
)
fig, ax = plt.subplots(figsize=(9, 4))
for cat, grp in monthly_cat.groupby("category"):
ax.plot(grp["month"], grp["revenue"], marker="o",
linewidth=1.5, markersize=4, label=cat)
ax.set_title("Monthly Revenue by Category")
ax.set_ylabel("Total Revenue ($)")
ax.legend(title="Category")
fig.autofmt_xdate()
plt.tight_layout()
plt.show()6. Statistical Analysis
6a. Descriptive Statistics by Group
R
sales |>
group_by(category) |>
summarise(
n = n(),
mean = mean(revenue),
median = median(revenue),
sd = sd(revenue),
iqr = IQR(revenue),
min = min(revenue),
max = max(revenue),
.groups = "drop"
) |>
mutate(across(where(is.numeric), \(x) round(x, 1)))Python
desc = (sales
.groupby("category")["revenue"]
.agg(
n ="count",
mean ="mean",
median ="median",
sd ="std",
iqr =lambda x: x.quantile(0.75) - x.quantile(0.25),
min ="min",
max ="max"
)
.round(1)
.reset_index()
)
print(desc) category n mean median sd iqr min max
0 Books 46 10293.6 9512.4 6831.0 12409.1 227.9 25106.6
1 Clothing 57 9136.7 8559.4 5805.4 9959.5 255.8 22010.7
2 Electronics 51 9373.3 8404.6 6285.8 9235.0 214.2 23738.4
3 Food 46 9347.9 9706.1 5827.7 10236.4 268.6 21705.0
6b. Hypothesis Testing — ANOVA
Is there a significant difference in revenue across regions?
R
# One-way ANOVA
model_anova <- aov(revenue ~ region, data = sales)
summary(model_anova) Df Sum Sq Mean Sq F value Pr(>F)
region 3 1.225e+08 40828459 1.072 0.362
Residuals 196 7.467e+09 38097127
# Post-hoc: Tukey HSD
TukeyHSD(model_anova) |> tidy() |>
select(contrast, estimate, adj.p.value) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
arrange(adj.p.value)Python
from scipy.stats import f_oneway
from itertools import combinations
# One-way ANOVA
groups = [g["revenue"].values for _, g in sales.groupby("region")]
F, p = f_oneway(*groups)
print(f"F-statistic: {F:.4f} p-value: {p:.4f}")F-statistic: 1.2114 p-value: 0.3068
# Post-hoc: pairwise t-tests (Bonferroni)
regions = sales["region"].unique()
rows = []
for r1, r2 in combinations(regions, 2):
g1 = sales.loc[sales["region"]==r1, "revenue"]
g2 = sales.loc[sales["region"]==r2, "revenue"]
t, p_val = stats.ttest_ind(g1, g2)
rows.append({"contrast": f"{r1}-{r2}", "t": round(t,3),
"p_adj": round(p_val * 6, 4)}) # Bonferroni k=6
print(pd.DataFrame(rows).sort_values("p_adj")) contrast t p_adj
1 North-East -1.632 0.6365
0 North-West -1.577 0.7082
2 North-South -0.910 2.1912
5 East-South 0.840 2.4189
4 West-South 0.773 2.6492
3 West-East -0.070 5.6661
6c. Correlation Analysis
R
# Pearson correlation with significance
cor_test <- cor.test(sales$units_sold, sales$revenue, method = "pearson")
cat(sprintf("r = %.4f, t = %.4f, p = %.4e\n",
cor_test$estimate,
cor_test$statistic,
cor_test$p.value))r = 0.1826, t = 2.6136, p = 9.6471e-03
cat(sprintf("95%% CI: [%.4f, %.4f]\n",
cor_test$conf.int[1],
cor_test$conf.int[2]))95% CI: [0.0450, 0.3134]
Python
r, p = stats.pearsonr(sales["units_sold"], sales["revenue"])
n = len(sales)
# 95% CI via Fisher z-transformation
z = np.arctanh(r)
se = 1 / np.sqrt(n - 3)
ci = np.tanh([z - 1.96*se, z + 1.96*se])
print(f"r = {r:.4f}, p = {p:.2e}")r = 0.2528, p = 3.05e-04
print(f"95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")95% CI: [0.1182, 0.3782]
7. Linear Regression
7a. Fitting a Model
R
# Encode category as dummy variables (R does this automatically)
model <- lm(revenue ~ units_sold + unit_price + discount + category,
data = sales)
summary(model)
Call:
lm(formula = revenue ~ units_sold + unit_price + discount + category,
data = sales)
Residuals:
Min 1Q Median 3Q Max
-3485.1 -546.8 -1.8 515.1 4461.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.561e+03 6.483e+02 -10.121 <2e-16 ***
units_sold 1.764e+02 1.248e+01 14.136 <2e-16 ***
unit_price 3.954e+01 5.914e-01 66.854 <2e-16 ***
discount -1.091e+04 7.789e+02 -14.004 <2e-16 ***
categoryClothing -1.384e+02 2.661e+02 -0.520 0.604
categoryElectronics -7.212e+01 2.658e+02 -0.271 0.786
categoryFood -2.223e+02 2.752e+02 -0.808 0.420
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1246 on 193 degrees of freedom
Multiple R-squared: 0.9605, Adjusted R-squared: 0.9593
F-statistic: 782.1 on 6 and 193 DF, p-value: < 2.2e-16
Python
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# One-hot encode category
X = pd.get_dummies(
sales[["units_sold","unit_price","discount","category"]],
drop_first=True
)
y = sales["revenue"]
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"R² = {r2:.4f}")R² = 0.9630
print(f"RMSE = {rmse:.2f}")RMSE = 1179.71
print("\nCoefficients:")
Coefficients:
coef = pd.Series(model.coef_, index=X.columns).round(4)
print(coef)units_sold 167.0509
unit_price 39.5285
discount -11859.0122
category_Clothing -130.0043
category_Electronics 277.8062
category_Food -27.5572
dtype: float64
7b. Tidy Model Output & Diagnostics
R
# Tidy coefficients
tidy(model) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
arrange(p.value)# Model fit metrics
glance(model) |>
select(r.squared, adj.r.squared, sigma, AIC, BIC) |>
mutate(across(everything(), \(x) round(x, 4)))Python
import statsmodels.formula.api as smf
# Rebuild with original columns for formula interface (like R's lm)
sales_model = sales.copy()
# smf.ols uses R-style formula strings — no manual dummy encoding needed
ols = smf.ols(
"revenue ~ units_sold + unit_price + discount + C(category)",
data=sales_model
).fit()
# Key metrics
print(f"R² = {ols.rsquared:.4f}")R² = 0.9630
print(f"Adj R² = {ols.rsquared_adj:.4f}")Adj R² = 0.9619
print(f"AIC = {ols.aic:.2f}")AIC = 3410.78
print(f"BIC = {ols.bic:.2f}")BIC = 3433.87
print("\nTop coefficients by |t|:")
Top coefficients by |t|:
coef_df = ols.summary2().tables[1].round(4)
print(coef_df.reindex(coef_df["t"].abs().sort_values(ascending=False).index).head(8)) Coef. Std.Err. ... [0.025 0.975]
unit_price 39.5285 0.5842 ... 38.3762 40.6808
discount -11859.0122 756.7688 ... -13351.6113 -10366.4131
units_sold 167.0509 11.4784 ... 144.4118 189.6901
Intercept -5940.1655 609.6854 ... -7142.6673 -4737.6638
C(category)[T.Electronics] 277.8062 247.2198 ... -209.7931 765.4056
C(category)[T.Clothing] -130.0043 239.6617 ... -602.6966 342.6880
C(category)[T.Food] -27.5572 251.4714 ... -523.5422 468.4278
[7 rows x 6 columns]
8. Summary: Key Differences at a Glance
| Task | R (tidyverse) | Python (pandas / sklearn) |
|---|---|---|
| Import data | read_csv() |
pd.read_csv() |
| Quick EDA | skim(), glimpse() |
.describe(), .info() |
| Filter rows | filter(x > 10) |
.query("x > 10") |
| Select columns | select(col1, col2) |
df[["col1","col2"]] |
| Add column | mutate(new = expr) |
.assign(new=lambda d: expr) |
| Group + aggregate | group_by() |> summarise() |
.groupby().agg() |
| Pivot wide | pivot_wider() |
.unstack() / pd.pivot_table() |
| Pivot long | pivot_longer() |
.melt() |
| Join tables | left_join(by = "key") |
.merge(on="key", how="left") |
| Window functions | group_by() |> mutate() |
.groupby().transform() |
| Date extraction | month(), wday() (lubridate) |
.dt.month, .dt.day_name() |
| Plotting | ggplot2 |
matplotlib / seaborn |
| Correlation test | cor.test() |
scipy.stats.pearsonr() |
| ANOVA | aov() + TukeyHSD() |
scipy.stats.f_oneway() |
| Linear regression | lm() + broom::tidy() |
sklearn or statsmodels.OLS |
| Missing values | drop_na(), replace_na() |
.dropna(), .fillna() |
| Apply function to column | mutate(across(...)) |
.apply() / vectorized ops |
| String formatting | sprintf(), glue() |
f-strings, .format() |
Rendered with Quarto. Run with quarto render notebook.qmd. Requires R packages: tidyverse, skimr, broom, corrplot; and Python packages: pandas, numpy, matplotlib, seaborn, scipy, scikit-learn, statsmodels.