# ── Libraries ─────────────────────────────────────────────────────────────────
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
library(corrplot)
library(ggcorrplot)
library(broom)
library(knitr)
library(kableExtra)
library(patchwork)
# ── Colour palette ────────────────────────────────────────────────────────────
# Shared hex colours, referenced by name in every table and chart below so the
# report keeps one consistent visual language.
C1 <- "#1C7293" # primary blue — histogram fill, OBP → RS points, Table 3 header
C2 <- "#E63946" # accent red — mean/regression lines, playoff teams, Table 2 header
C3 <- "#2D6A4F" # green — RD → W points, "net winners" label, 90-win line in Chart 1
C4 <- "#9B5DE5" # purple — OBP → W points, 90-win target lines in Chart 7
C5 <- "#F4A261" # orange — high-scoring-era band, SLG bar in the R² chart
C6 <- "#457B9D" # mid-blue — BA → RS points and label
# ── Load & engineer data ──────────────────────────────────────────────────────
# Read the team-season table; text columns stay character, not factor.
baseball <- read.csv("baseball.csv", stringsAsFactors = FALSE)
baseball$RD <- baseball$RS - baseball$RA # Run Differential
# Labelled copy of the 0/1 Playoffs flag for legends, fills and facets.
# Levels are pinned explicitly: without them factor() assigns labels
# positionally to whatever distinct values are present, so a data set (or
# subset) holding only one class would error or be mislabelled.
baseball$Playoffs_f <- factor(baseball$Playoffs,
  levels = c(0, 1),
labels = c("No Playoffs", "Playoffs"))The dataset covers 1,232 MLB team-seasons from 1962 to
2012 — every franchise’s full regular-season record across five
decades. It has 15 original variables plus one
engineered variable (RD).
| Variable | Type | Description | Why It Matters |
|---|---|---|---|
| Team | Categorical | Team abbreviation (e.g. OAK, NYY, BOS) | Enables franchise-level trend and team-comparison analysis |
| League | Categorical | American (AL) or National (NL) League | AL uses designated hitter; structurally higher run environment than NL |
| Year | Integer | Season year — provides era and temporal context | Controls for scoring-era effects (e.g. steroid era peaks in late 1990s) |
| RS | Numeric | Runs Scored — total offensive production | Direct precursor to wins — you cannot win without scoring |
| RA | Numeric | Runs Allowed — pitching + defensive output | Equally important as RS; every run prevented is as valuable as one scored |
| W | Integer | Team wins in a 162-game season (the outcome variable) | PRIMARY outcome — the number GMs, coaches and fans care about most |
| OBP | Ratio | On-Base % = (H + BB + HBP) / PA — all ways to reach base | Sabermetric key insight: captures walks that BA ignores; superior RS predictor |
| SLG | Ratio | Slugging % = Total Bases / AB — measures power hitting | Power metric; SLG explains even more RS variance than OBP alone |
| BA | Ratio | Batting Average = H / AB — traditional hitting metric | Historically overused by scouts; shown here to be a weaker RS predictor |
| Playoffs | Binary | 1 = made playoffs, 0 = did not (binary target for classification) | Defines competitive success; used as filter and grouping variable throughout |
| RankSeason | Ordinal | Regular-season rank among playoff qualifiers (988 NAs = non-playoff teams) | Secondary performance signal; excluded from regression (too many NAs) |
| RankPlayoffs | Ordinal | Playoff bracket finishing rank (NAs = non-playoff teams) | Secondary performance signal; excluded from regression (too many NAs) |
| OOBP | Ratio | Opponent OBP — pitching quality proxy; only available post-~1998 | Would allow full two-sided model; excluded due to 66 % missingness |
| OSLG | Ratio | Opponent SLG — pitching quality proxy; only available post-~1998 | Would allow full two-sided model; excluded due to 66 % missingness |
| RD (engineered) | Numeric | RS − RA: net run balance capturing both offense and defense in one number | Single best predictor of wins (R² = 0.880); bridges offense and defense |
OBP / SLG / BA → Runs Scored (RS)
↓
Run Differential (RS − RA) → Wins (W) → Playoffs
↑
Runs Allowed (RA) ← Pitching / Defense
Every regression in this report maps to one arrow in that chain. Understanding which metrics sit where tells an analyst exactly which levers to pull.
# Missing-data audit: NA count and NA percentage per column, restricted to
# the columns that actually contain gaps.
miss <- local({
  na_count <- colSums(is.na(baseball))
  data.frame(
    Variable = names(baseball),
    N_Missing = na_count,
    Pct_Miss = round(na_count / nrow(baseball) * 100, 1)
  )
}) |>
  filter(N_Missing > 0)
# Render as a styled HTML table (header-row styling follows on the next line).
miss |>
  kable(
    row.names = FALSE,
    caption = "Table 2 — Variables with Missing Data"
  ) |>
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ) |>
row_spec(0, bold = TRUE, background = C2, color = "white")| Variable | N_Missing | Pct_Miss |
|---|---|---|
| RankSeason | 988 | 80.2 |
| RankPlayoffs | 988 | 80.2 |
| OOBP | 812 | 65.9 |
| OSLG | 812 | 65.9 |
RankSeason / RankPlayoffs are NA for
non-playoff teams — expected and structurally fine. OOBP /
OSLG are 66 % missing (only tracked from ~1998 onward) and
are excluded from modelling.
# Descriptive statistics for the key continuous variables.
# Step 1: one wide row holding every <variable>_<statistic> combination.
# Step 2: split each name at its LAST underscore so the result has one row
#         per variable and one column per statistic.
desc <- baseball |>
  select(W, RS, RA, OBP, SLG, BA, RD) |>
  summarise(across(
    .cols = everything(),
    .fns = list(
      Mean = function(v) round(mean(v, na.rm = TRUE), 3),
      SD = function(v) round(sd(v, na.rm = TRUE), 3),
      Min = function(v) round(min(v, na.rm = TRUE), 3),
      Median = function(v) round(median(v, na.rm = TRUE), 3),
      Max = function(v) round(max(v, na.rm = TRUE), 3)
    )
  )) |>
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", ".value"),
    names_sep = "_(?=[^_]+$)"
  )
# Render as a styled HTML table (header-row styling follows on the next line).
desc |>
  kable(caption = "Table 3 — Descriptive Statistics: Key Continuous Variables") |>
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE,
    font_size = 12
  ) |>
row_spec(0, bold = TRUE, background = C1, color = "white")| Variable | Mean | SD | Min | Median | Max |
|---|---|---|---|---|---|
| W | 80.904 | 11.458 | 40.000 | 81.000 | 116.000 |
| RS | 715.082 | 91.534 | 463.000 | 711.000 | 1009.000 |
| RA | 715.082 | 93.080 | 472.000 | 709.000 | 1103.000 |
| OBP | 0.326 | 0.015 | 0.277 | 0.326 | 0.373 |
| SLG | 0.397 | 0.033 | 0.301 | 0.396 | 0.491 |
| BA | 0.259 | 0.013 | 0.214 | 0.260 | 0.294 |
| RD | 0.000 | 102.785 | -337.000 | 4.000 | 309.000 |
Notable observations from the descriptive statistics:
# Correlation matrix of the key numeric metrics. Rows with any NA in the
# selected columns are dropped first; cor() uses its default method (Pearson).
# cor_matrix is reused later by the Chart 2 heatmap.
cor_data <- baseball |> select(RS, RA, W, OBP, SLG, BA, RD) |> na.omit()
cor_matrix <- cor(cor_data)
# Figure 1: lower-triangle heatmap with the coefficient printed in each cell.
# NOTE(review): the r values quoted in the subtitle are typed in by hand, not
# read from cor_matrix — re-check them if the data changes.
ggcorrplot(cor_matrix,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 4.2,
colors = c(C2, "white", C1),
outline.col = "white",
ggtheme = theme_minimal(base_size = 13)) +
labs(title = "Correlation Matrix: Key MLB Performance Metrics",
subtitle = "Strongest signal: RD–W (r = 0.94); OBP outperforms BA in predicting RS") +
theme(plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray50", size = 11))Figure 1 — Pearson correlation heatmap for all key numeric variables.
| Pair | r | Interpretation |
|---|---|---|
| RD ↔︎ W | 0.938 | Strongest relationship in dataset — run gap almost fully determines wins |
| SLG ↔︎ RS | 0.919 | Power hitting is marginally the top single RS predictor |
| OBP ↔︎ RS | 0.900 | OBP is nearly as strong; both far exceed BA |
| BA ↔︎ RS | 0.827 | Traditional metric — r is 0.07–0.09 lower than for OBP/SLG |
| RA ↔︎ W | -0.838 | Allowing fewer runs strongly predicts winning |
| OBP ↔︎ W | 0.482 | OBP predicts wins only moderately — wins also depend on runs allowed (pitching and defense) |
| BA ↔︎ W | 0.395 | Weakest of the direct win-predictor relationships |
# Figure 2: distribution of team wins in 5-win bins, with the overall mean
# drawn as a dashed reference line and labelled just to its right.
ggplot(data = baseball, mapping = aes(x = W)) +
  labs(
    title = "Distribution of Team Wins (1962–2012)",
    subtitle = "Near-normal shape centred on 81 wins — the statistical midpoint of a 162-game season",
    x = "Wins",
    y = "Number of Team-Seasons"
  ) +
  scale_x_continuous(breaks = seq(from = 40, to = 120, by = 10)) +
  geom_histogram(binwidth = 5, fill = C1, color = "white", alpha = 0.9) +
  geom_vline(
    xintercept = mean(baseball$W),
    linetype = "dashed",
    color = C2,
    linewidth = 1
  ) +
  annotate(
    geom = "text",
    x = mean(baseball$W) + 2.5,
    y = 105,
    label = paste0("Mean = ", round(mean(baseball$W), 1)),
    hjust = 0,
    color = C2,
    fontface = "bold",
    size = 3.8
  ) +
  theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))Figure 2 — Distribution of team wins across all 1,232 team-seasons.
# Figure 3: runs scored against runs allowed, one point per team-season,
# coloured by playoff status. The dashed diagonal is the RS = RA line.
ggplot(data = baseball, mapping = aes(x = RS, y = RA, color = Playoffs_f)) +
  labs(
    title = "Runs Scored vs. Runs Allowed (1962–2012)",
    subtitle = "Playoff teams (red) cluster below the RS = RA diagonal — they consistently outscore opponents",
    x = "Runs Scored (RS)",
    y = "Runs Allowed (RA)",
    color = ""
  ) +
  scale_color_manual(values = c("No Playoffs" = "gray70", "Playoffs" = C2)) +
  geom_point(alpha = 0.45, size = 1.8) +
  geom_abline(
    slope = 1,
    intercept = 0,
    linetype = "dashed",
    color = "gray40",
    linewidth = 0.8
  ) +
  # Corner labels naming the two sides of the diagonal.
  annotate(
    geom = "text", x = 540, y = 1090,
    label = "RA > RS\n(net losers)",
    color = "gray45", size = 3.3, hjust = 0
  ) +
  annotate(
    geom = "text", x = 920, y = 490,
    label = "RS > RA\n(net winners)",
    color = C3, size = 3.3, hjust = 0, fontface = "bold"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top",
plot.subtitle = element_text(color = "gray50", size = 11))
Figure 3 — Runs Scored vs. Runs Allowed. Every point above the diagonal is a net run loser; every point below is a net winner. Playoff teams cluster in the lower-right.
# League-wide season averages: one row per Year holding the mean RS, OBP and
# BA across all teams in that season. Reused later by the Chart 10 trend plot.
era <- baseball |>
group_by(Year) |>
summarise(avg_RS = mean(RS),
avg_OBP = mean(OBP),
avg_BA = mean(BA), .groups = "drop")
# First panel: average runs scored per season with a loess trend, plus a
# shaded band over 1994–2005 labelled as the high-scoring era.
p_rs <- ggplot(era, aes(x = Year, y = avg_RS)) +
geom_line(color = C1, linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = C2,
fill = C2, alpha = 0.12, linewidth = 0.6) +
annotate("rect", xmin = 1994, xmax = 2005,
ymin = -Inf, ymax = Inf,
fill = C5, alpha = 0.12) +
annotate("text", x = 1999.5, y = 688, label = "High-scoring\nera", color = C5,
size = 3, fontface = "bold") +
labs(title = "League-Average Runs Scored", x = "Year", y = "Avg RS") +
theme_minimal(base_size = 12)
# Second panel: the same line + loess treatment for average OBP (no era band).
p_obp <- ggplot(era, aes(x = Year, y = avg_OBP)) +
geom_line(color = C3, linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = C2,
fill = C2, alpha = 0.12, linewidth = 0.6) +
labs(title = "League-Average OBP", x = "Year", y = "Avg OBP") +
theme_minimal(base_size = 12)
# patchwork: `+` combines the two panels into one figure; plot_annotation()
# adds the shared title and subtitle.
p_rs + p_obp +
plot_annotation(
title = "Offensive Trends Over Time (1962–2012)",
subtitle = "RS and OBP track together closely — confirming OBP as an RS driver across eras",
theme = theme(plot.subtitle = element_text(color = "gray50", size = 11))
)Figure 4 — League-average RS and OBP per season. The late 1990s–mid 2000s scoring peak (steroid era) and subsequent decline are visible.
# Figure 5: box plots of OBP, SLG and BA split by playoff status.
# The three metrics are stacked into long form so each gets its own facet
# (with its own y-axis), shown in the fixed order OBP, SLG, BA.
baseball |>
  select(Playoffs_f, OBP, SLG, BA) |>
  pivot_longer(
    cols = c(OBP, SLG, BA),
    names_to = "Metric",
    values_to = "Value"
  ) |>
  mutate(Metric = factor(Metric, levels = c("OBP","SLG","BA"))) |>
  ggplot(mapping = aes(x = Playoffs_f, y = Value, fill = Playoffs_f)) +
  labs(
    title = "Offensive Metrics: Playoff vs. Non-Playoff Teams",
    subtitle = "Playoff teams show a larger separation on OBP and SLG than on BA — consistent with the Moneyball thesis",
    x = "",
    y = "Metric Value"
  ) +
  facet_wrap(~Metric, nrow = 1, scales = "free_y") +
  scale_fill_manual(values = c("No Playoffs" = "gray70", "Playoffs" = C2)) +
  geom_boxplot(width = 0.55, alpha = 0.8, outlier.alpha = 0.3) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none",
    strip.text = element_text(face = "bold", size = 12),
plot.subtitle = element_text(color = "gray50", size = 11))Figure 5 — Playoff vs. non-playoff teams across the three offensive metrics. The OBP and SLG gaps are larger than the BA gap, reinforcing the Moneyball argument.
Summary of correlations, patterns and trends:
| Audience | Role | Core Question |
|---|---|---|
| General Manager / Front Office | Roster construction, player acquisition, budget allocation | Which statistics actually predict wins so we can identify undervalued players? |
| Manager / Coaching Staff / Analytics Team | In-season lineup decisions, pitching strategy, game planning | What run differential target does our team need to reach the playoffs? |
| Fans, Media & Broadcasters | Narrative, context, historical comparison | Why do low-payroll teams sometimes outperform big-budget franchises? |
Front Office / GM: - Is OBP actually a better predictor of runs than Batting Average? By how much? - If we improve our team OBP by 0.010, how many more runs should we expect to score? - What OBP and SLG levels do playoff-calibre offenses consistently achieve?
Coaching Staff / Analytics: - What Run Differential do we need across a full season to win 90+ games? - How many runs above or below average are we, and what does that imply for our win projection? - Which is more cost-efficient to improve — our offense (RS) or our pitching (RA)?
Fans & Media: - How does the Moneyball approach explain Oakland’s success with a fraction of a big-market budget? - Has the relationship between OBP and winning changed over time as teams caught on? - Which historical teams had the best and worst run differentials?
Six specific, answerable questions are proposed — one visualisation per question:
| # | Question for the Audience | Visualization |
|---|---|---|
| 1 | Does OBP predict RS better than BA, and by how much? | Side-by-side regression scatter (OBP vs BA), R² labelled |
| 2 | How strongly does Run Differential predict Wins? | Scatter: RD → W with regression line and win-target annotation |
| 3 | Does OBP predict Wins directly? | Scatter: OBP → W showing the weaker, mediated relationship |
| 4 | How do playoff teams differ from non-playoff teams on key metrics? | Grouped box plots: OBP, SLG, BA by playoff status |
| 5 | How has offensive run production shifted across eras? | Line chart: league-average RS and OBP over 50 years |
| 6 | What is the overall structure of relationships between all metrics? | Correlation heatmap with labelled coefficients |
These six questions form a complete narrative arc: from what predicts scoring → what predicts winning → how to compare teams → historical context.
A Moneyball analytics dashboard should answer the central business question — what drives wins, and where does our team stand? — at three levels: strategic overview, diagnostic deep-dive, and historical context.
| Dashboard Panel | What It Shows | Why It Helps |
|---|---|---|
| Panel 1 — KPI Cards (top strip) | Season RD, W total, RS, RA, playoff probability — large numbers at a glance | Executives see the key numbers immediately without navigating charts |
| Panel 2 — Run Differential → Wins Calculator | Interactive: move a target RD slider → read off predicted W from regression formula W = 80.9 + 0.1045 × RD; reference lines at 81 W and 90 W | Translates abstract regression math into an actionable win-target conversation |
| Panel 3 — RS vs. RA Positioning Scatter | All team-seasons as dots; selected team highlighted; diagonal RS=RA break-even line; colour = playoff status | Instantly shows whether the team is a net run winner or loser vs. the league |
| Panel 4 — Metric vs. Metric Regression View | Toggle between OBP→RS, BA→RS, SLG→RS, OBP→W, RD→W; regression line + R² and slope rendered automatically | Lets staff compare which metric they should focus on with one click |
| Panel 5 — Playoff Benchmark Box Plots | Grouped box plots for OBP, SLG, BA split by playoff status; selected team’s value shown as a dot | Shows the team’s absolute metric values in the context of playoff vs. non-playoff norms |
| Panel 6 — Era Trend Line Chart | League-average RS and OBP by year with era annotations (expansion, 1994 strike, steroid peak, post-2006 decline) | Prevents misreading modern benchmarks without era context |
| Panel 7 — Team Rankings Sortable Table | Rows = team-seasons; sortable by RD, W, OBP, SLG, BA; filterable by year, league, team; playoff rows highlighted | Enables ad-hoc comparison — which teams had similar profiles and how did they do? |
Design principles for this dashboard:
When to use: To show the shape, spread and central tendency of a single continuous variable before any modelling.
# Chart 1: distribution of team wins in 5-win bins with two reference lines —
# the overall mean (dashed, C2) and the 90-win mark (dotted, C3).
ggplot(baseball, aes(x = W)) +
  geom_histogram(binwidth = 5, fill = C1, color = "white", alpha = 0.9) +
  geom_vline(xintercept = mean(baseball$W), color = C2,
    linetype = "dashed", linewidth = 1.1) +
  geom_vline(xintercept = 90, color = C3,
    linetype = "dotted", linewidth = 1) +
  # Both labels sit at y = 105. The mean label is right-aligned to the LEFT of
  # its line: placed to the right (as it was), its text extends past x = 91,
  # where the "90-win" label starts, and the two overprint each other.
  annotate("text", x = mean(baseball$W) - 2, y = 105,
    label = paste0("Mean = ", round(mean(baseball$W), 1)),
    hjust = 1, color = C2, fontface = "bold", size = 3.8) +
  annotate("text", x = 91, y = 105, label = "90-win\nthreshold",
    hjust = 0, color = C3, fontface = "bold", size = 3.5) +
  scale_x_continuous(breaks = seq(40, 120, 10)) +
  labs(title = "Distribution of Team Wins (1962–2012)",
    subtitle = "Bell-shaped, centred on 81 — the midpoint of a 162-game season",
    x = "Wins", y = "Number of Team-Seasons") +
  theme_minimal(base_size = 13) +
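theme(plot.subtitle = element_text(color = "gray50", size = 11))
Chart 1 — Histogram of team wins. The symmetric, near-normal shape with no extreme outliers suggests OLS regression is a reasonable modelling approach (strictly, OLS assumes normally distributed residuals rather than a normal outcome, so residual checks are still needed).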
When to use: To generate hypotheses before regression — lets the analyst see every pairwise relationship in one view and prioritise which ones to model formally.
# Chart 2: lower-triangle correlation heatmap with coefficients printed in
# each cell. Relies on cor_matrix, which is computed in the earlier
# correlation chunk.
# NOTE(review): the r values in the subtitle are typed in by hand, not read
# from cor_matrix — re-check them if the data changes.
ggcorrplot(cor_matrix,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 4.2,
colors = c(C2, "white", C1),
outline.col = "white",
ggtheme = theme_minimal(base_size = 13)) +
labs(title = "Correlation Matrix — MLB Performance Metrics",
subtitle = "RD–W: r = 0.938 | OBP–RS: r = 0.900 | BA–RS: r = 0.827") +
theme(plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray50", size = 11))Chart 2 — Correlation heatmap. RD–W dominates; OBP clearly beats BA as an RS predictor.
When to use: To show a bivariate relationship and a natural boundary (the RS = RA break-even line) simultaneously, with a third variable encoded through colour.
# Chart 3: runs scored against runs allowed, coloured by playoff status, with
# the RS = RA diagonal as the break-even reference.
ggplot(data = baseball, mapping = aes(x = RS, y = RA, color = Playoffs_f)) +
  labs(
    title = "Runs Scored vs. Runs Allowed — Playoff Separation",
    subtitle = "Colour distinguishes playoff from non-playoff teams; diagonal is the break-even line",
    x = "Runs Scored (RS)",
    y = "Runs Allowed (RA)",
    color = ""
  ) +
  scale_color_manual(
    values = setNames(c("gray70", C2), c("No Playoffs", "Playoffs"))
  ) +
  geom_point(alpha = 0.45, size = 1.8) +
  geom_abline(
    intercept = 0,
    slope = 1,
    linetype = "dashed",
    color = "gray40",
    linewidth = 0.8
  ) +
  # Corner labels naming the two sides of the diagonal.
  annotate(
    geom = "text", x = 540, y = 1080,
    label = "RA > RS\n(net losers)",
    color = "gray45", size = 3.3
  ) +
  annotate(
    geom = "text", x = 910, y = 495,
    label = "RS > RA\n(net winners)",
    color = C3, size = 3.3, fontface = "bold"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top",
plot.subtitle = element_text(color = "gray50", size = 11))Chart 3 — RS vs. RA scatter. The diagonal line is RS = RA. Playoff teams (red) overwhelmingly sit below it — they outscore opponents.
When to use: To formally quantify a linear relationship, display the regression line with confidence interval, and report R² as the headline measure of explanatory power.
# Chart 4: simple linear regression of Runs Scored on OBP.
# model_obp_rs is reused by the later BA-comparison and R² bar-chart chunks.
model_obp_rs <- lm(RS ~ OBP, data = baseball)
# Read the headline numbers from the fitted model instead of typing them in,
# so the on-chart text cannot drift out of sync with the data.
obp_rs_r2 <- summary(model_obp_rs)$r.squared
obp_rs_slope <- unname(coef(model_obp_rs)["OBP"])
ggplot(baseball, aes(x = OBP, y = RS)) +
  geom_point(alpha = 0.3, color = C1, size = 1.5) +
  geom_smooth(method = "lm", color = C2, se = TRUE, linewidth = 1.2) +
  annotate("text", x = 0.280, y = 990,
    label = paste0("R² = ", round(obp_rs_r2, 3),
      "\nSlope = ", format(round(obp_rs_slope), big.mark = ","),
      # slope * 0.010 = expected extra runs for a 10-point OBP gain
      "\n(+", round(obp_rs_slope * 0.010), " RS per +0.010 OBP)"),
    hjust = 0, color = C1, fontface = "bold", size = 3.8) +
  labs(title = "On-Base Percentage → Runs Scored",
    # NOTE(review): "p < 0.001" is still a fixed string, not read from the model.
    subtitle = paste0("OBP explains ", sprintf("%.1f%%", 100 * obp_rs_r2),
      " of RS variance across ", format(nrow(baseball), big.mark = ","),
      " team-seasons (p < 0.001)"),
    x = "On-Base Percentage (OBP)", y = "Runs Scored") +
  theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))Chart 4 — OBP predicts RS with R² = 0.811. Each 0.010 increase in OBP is worth ~55 additional runs per season.
When to use: As a direct visual comparison to Chart 4. Showing the same chart type with a weaker R² makes the OBP advantage immediately legible.
# Chart 5: simple linear regression of Runs Scored on Batting Average, drawn
# in the same form as the OBP chart so the two fits can be compared directly.
model_ba_rs <- lm(RS ~ BA, data = baseball)
ba_rs_r2 <- summary(model_ba_rs)$r.squared
# Gap to the OBP model in percentage points of R², computed rather than typed
# in. model_obp_rs is fitted in the preceding OBP → RS chunk.
ba_gap_pp <- (summary(model_obp_rs)$r.squared - ba_rs_r2) * 100
ggplot(baseball, aes(x = BA, y = RS)) +
  geom_point(alpha = 0.3, color = C6, size = 1.5) +
  geom_smooth(method = "lm", color = C2, se = TRUE, linewidth = 1.2) +
  annotate("text", x = 0.216, y = 990,
    label = paste0("R² = ", round(ba_rs_r2, 3),
      "\n⚠ ", sprintf("%.1f", ba_gap_pp), " pp weaker\nthan OBP"),
    hjust = 0, color = C6, fontface = "bold", size = 3.8) +
  labs(title = "Batting Average → Runs Scored",
    subtitle = paste0("BA explains only ", sprintf("%.1f%%", 100 * ba_rs_r2),
      " of RS variance — inferior to OBP despite being baseball's most-cited metric"),
    x = "Batting Average (BA)", y = "Runs Scored") +
  theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))Chart 5 — BA predicts RS with R² = 0.684 — 12.7 percentage points weaker than OBP. The wider scatter around the line is visible.
When to use: To deliver the Moneyball punchline in one glance — a bar chart of model fit gives a non-technical audience an immediate takeaway without needing to read scatter plots.
# Chart 6: bar chart comparing the R² of three single-predictor models of RS.
# model_obp_rs and model_ba_rs are fitted in the two preceding chunks; only
# the SLG model is fitted here.
model_slg_rs <- lm(RS ~ SLG, data = baseball)
# One row per model: its label, its R², and the literal bar colour.
r2_df <- data.frame(
Model = c("SLG → RS", "OBP → RS", "BA → RS"),
R2 = c(summary(model_slg_rs)$r.squared,
summary(model_obp_rs)$r.squared,
summary(model_ba_rs)$r.squared),
Fill = c(C5, C1, "gray60")
)
# Freeze the bar order to the row order above (otherwise bars sort alphabetically).
r2_df$Model <- factor(r2_df$Model, levels = r2_df$Model)
ggplot(r2_df, aes(x = Model, y = R2, fill = Fill)) +
geom_col(width = 0.55, alpha = 0.9) +
geom_text(aes(label = round(R2, 3)), vjust = -0.5,
fontface = "bold", size = 4.5) +
# Fill holds actual colour values, so use them as-is rather than mapping them.
scale_fill_identity() +
scale_y_continuous(limits = c(0, 1), labels = percent_format()) +
labs(title = "Model Fit (R²): Which Offensive Metric Best Predicts Runs Scored?",
subtitle = "SLG and OBP clearly outperform BA — validating the core Moneyball hypothesis",
x = "Model", y = "R² (% of RS variance explained)") +
theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))
Chart 6 — R² comparison across offensive metrics. SLG and OBP are near-equivalent and both far exceed BA. The Moneyball argument is visible in one bar chart.
When to use: To show the single most important relationship in the entire dataset — and to anchor the “10 extra runs = 1 extra win” rule of thumb that makes the analysis actionable.
# Chart 7: simple linear regression of Wins on Run Differential.
model_rd_w <- lm(W ~ RD, data = baseball)
# Coefficients as a plain vector: [1] = intercept, [2] = slope on RD.
# unname() stops target_90 inheriting the misleading "(Intercept)" name.
rd_w_coef <- unname(coef(model_rd_w))
# Solve 90 = intercept + slope * RD for the run differential that predicts
# a 90-win season.
target_90 <- (90 - rd_w_coef[1]) / rd_w_coef[2]
ggplot(baseball, aes(x = RD, y = W)) +
  geom_point(alpha = 0.3, color = C3, size = 1.5) +
  geom_smooth(method = "lm", color = C2, se = TRUE, linewidth = 1.2) +
  # Reference lines: break-even RD (dashed) and the 90-win target (dotted).
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = 90, linetype = "dotted", color = C4, linewidth = 0.9) +
  geom_vline(xintercept = target_90, linetype = "dotted", color = C4, linewidth = 0.9) +
  annotate("text", x = 5, y = 44,
    label = "RD = 0\n(break-even)", color = "gray50", size = 3) +
  annotate("text", x = target_90 + 4, y = 44,
    label = paste0("RD ≈ +", round(target_90), "\nfor 90 wins"),
    color = C4, size = 3.3, fontface = "bold", hjust = 0) +
  annotate("text", x = -295, y = 91,
    label = "90-win playoff threshold", color = C4,
    size = 3.3, hjust = 0, fontface = "bold") +
  labs(title = "Run Differential → Team Wins (Strongest Model)",
    # The slope is read from the model, matching how R² is already reported.
    # NOTE(review): "+10 RD ≈ +1 Win" and "p < 0.001" remain fixed strings.
    subtitle = paste0("R² = ", round(summary(model_rd_w)$r.squared, 3),
      " | β₁ = ", sprintf("%.4f", rd_w_coef[2]),
      " → +10 RD ≈ +1 Win | p < 0.001"),
    x = "Run Differential (RS − RA)", y = "Wins") +
  theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))
Chart 7 — RD → W is the strongest model (R² = 0.880). The intercept at RD = 0 is W = 80.9 — almost exactly a break-even (81–81) season. The 90-win threshold corresponds to RD ≈ +87.
When to use: To show why OBP matters for winning but is not sufficient alone — the wide scatter explains why teams must also invest in pitching and defence.
# Chart 8: simple linear regression of Wins on OBP — the direct
# offence-to-wins link, for comparison with the much tighter RD → W fit.
model_obp_w <- lm(W ~ OBP, data = baseball)
# Read R² from the model once so the annotation and the subtitle always agree.
obp_w_r2 <- summary(model_obp_w)$r.squared
ggplot(baseball, aes(x = OBP, y = W)) +
  geom_point(alpha = 0.3, color = C4, size = 1.5) +
  geom_smooth(method = "lm", color = C2, se = TRUE, linewidth = 1.2) +
  annotate("text", x = 0.280, y = 113,
    label = paste0("R² = ", round(obp_w_r2, 3),
      "\nMuch weaker than RD → W\n(defence and pitching moderate the effect)"),
    hjust = 0, color = C4, fontface = "bold", size = 3.5) +
  labs(title = "On-Base Percentage → Wins",
    # NOTE(review): "p < 0.001" is still a fixed string, not read from the model.
    subtitle = paste0("OBP is significant (p < 0.001) but explains only ",
      sprintf("%.1f%%", 100 * obp_w_r2),
      " of win variance — pitching/defence matter too"),
    x = "On-Base Percentage (OBP)", y = "Wins") +
  theme_minimal(base_size = 13) +
theme(plot.subtitle = element_text(color = "gray50", size = 11))
Chart 8 — OBP → Wins (R² = 0.232). The wider scatter vs. Chart 7 shows that OBP alone cannot explain wins: runs allowed (pitching and defence) account for much of the remaining variance.
When to use: To compare distributions of multiple metrics across a categorical grouping — shows median, spread, and outliers simultaneously without requiring the audience to understand regression.
# Chart 9: box plots of OBP, SLG and BA split by playoff status.
# Metrics are stacked into long form and faceted, each with its own y-axis,
# in the fixed order OBP, SLG, BA.
baseball |>
  select(Playoffs_f, OBP, SLG, BA) |>
  pivot_longer(
    cols = c(OBP, SLG, BA),
    names_to = "Metric",
    values_to = "Value"
  ) |>
  mutate(Metric = factor(Metric, levels = c("OBP","SLG","BA"))) |>
  ggplot(mapping = aes(x = Playoffs_f, y = Value, fill = Playoffs_f)) +
  labs(
    title = "Offensive Metrics: Playoff vs. Non-Playoff Teams",
    subtitle = "OBP and SLG gaps are larger than BA gap — better separators of team quality",
    x = "",
    y = "Metric Value"
  ) +
  facet_wrap(~Metric, nrow = 1, scales = "free_y") +
  scale_fill_manual(
    values = setNames(c("gray70", C2), c("No Playoffs", "Playoffs"))
  ) +
  geom_boxplot(width = 0.55, alpha = 0.8, outlier.alpha = 0.3) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none",
    strip.text = element_text(face = "bold", size = 12),
plot.subtitle = element_text(color = "gray50", size = 11))Chart 9 — OBP and SLG show a larger playoff gap than BA, confirming that undervalued metrics (OBP, SLG) are better discriminators of playoff-calibre teams.
When to use: To show temporal trends and era effects that could confound cross-era comparisons — essential context for any audience interpreting historical benchmarks.
# Chart 10: league-average RS and OBP by season, one facet per metric, each
# with its own y-axis. `era` is the per-Year summary built in an earlier chunk.
era |>
  pivot_longer(c(avg_RS, avg_OBP, avg_BA),
    names_to = "Metric", values_to = "Value") |>
  mutate(Metric = recode(Metric,
    avg_RS = "Runs Scored (RS)",
    avg_OBP = "OBP",
    avg_BA = "BA")) |>
  filter(Metric != "BA") |>
  ggplot(aes(x = Year, y = Value, color = Metric, group = Metric)) +
  geom_line(linewidth = 1.1) +
  geom_smooth(method = "loess", se = FALSE,
    linetype = "dashed", linewidth = 0.5) +
  annotate("rect", xmin = 1994, xmax = 2005,
    ymin = -Inf, ymax = Inf, fill = C5, alpha = 0.10) +
  # annotate() draws in EVERY facet. A fixed y on the OBP scale (0.278) also
  # landed in the Runs Scored facet, stretching its free y-axis down to ~0 and
  # flattening the series. y = -Inf pins the label to the bottom of each panel
  # without taking part in axis-range calculation; vjust lifts it off the edge.
  annotate("text", x = 1999.5, y = -Inf, vjust = -0.3,
    label = "High-scoring era\n(1994–2005)",
    color = C5, size = 3, fontface = "bold") +
  scale_color_manual(values = c("Runs Scored (RS)" = C1, "OBP" = C3)) +
  scale_y_continuous(sec.axis = dup_axis(name = NULL)) +
  facet_wrap(~Metric, scales = "free_y", nrow = 1) +
  labs(title = "League-Average Offensive Metrics Over Time (1962–2012)",
    subtitle = "RS and OBP track closely across all eras — era context prevents misreading modern benchmarks",
    x = "Season Year", y = "Value", color = "") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none",
    strip.text = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray50", size = 11))Chart 10 — League-average RS and OBP over five decades. RS and OBP co-move, confirming OBP as a stable RS driver across eras. The shaded band marks the high-scoring steroid era.
| Business Question Answered | Chart Type | Key Statistic | Primary Audience |
|---|---|---|---|
| Does OBP predict RS better than BA? | Regression scatter — OBP → RS | R² = 0.811 | GM / Front Office |
| Does OBP predict RS better than BA? | Regression scatter — BA → RS | R² = 0.684 | GM / Front Office |
| Which single metric best predicts RS? | Bar chart — R² comparison (OBP, SLG, BA) | SLG: 0.844; OBP: 0.811; BA: 0.684 | GM / Executive (quick comparison) |
| Does RD predict Wins? | Regression scatter — RD → W | R² = 0.880; β₁ = 0.1045 | Coaching / Analytics |
| Does OBP predict Wins? | Regression scatter — OBP → W | R² = 0.232 | GM + Analytics |
| What is the structure of all relationships? | Correlation heatmap | RD–W: r = 0.938 | Analytics (hypothesis generation) |
| How do playoff teams differ? | Grouped box plots by playoff status | Playoff OBP and SLG medians both higher | GM + Coaching + Fans |
| How has scoring changed over time? | Dual faceted line chart | RS peaks 1994–2005; OBP tracks RS | Media / Fans / Historical context |
Generated with R 4.5.2 | ggplot2, corrplot, ggcorrplot, patchwork, kableExtra