Code:
# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("nflreadr")) install.packages("nflreadr")
if (!require("plotly")) install.packages("plotly")
library(tidyverse)
library(nflreadr)
library(plotly)
# ----------------------------------------------------------
# Step 2: Set global options
# ----------------------------------------------------------
# Turn off scientific notation
options(scipen = 9999)
# ----------------------------------------------------------
# Step 3: Load NFL play-by-play data (2025 season)
# ----------------------------------------------------------
# Year can be adjusted as needed
data <- load_pbp(2025)
# ----------------------------------------------------------
# Step 4: Filter for run or pass plays with EPA
# ----------------------------------------------------------
pbp_rp <- data %>%
filter((rush == 1 | pass == 1), !is.na(epa))
# ----------------------------------------------------------
# Step 5: Create trimmed data frame for analysis
# ----------------------------------------------------------
mydata <- pbp_rp %>%
select(
posteam,
pass,
wp,
qtr,
down,
half_seconds_remaining
)
glimpse(mydata)
# ----------------------------------------------------------
# Step 6: Filter for early-down, first-half,
# neutral-game-state plays
# ----------------------------------------------------------
# Research question:
# Which teams were the most pass-heavy in the first half on early downs
# with win probability between 20% and 80%, excluding the final
# 2 minutes of the half?
mydata_summary <- mydata %>%
filter(
wp > 0.20,
wp < 0.80,
down <= 2,
qtr <= 2,
half_seconds_remaining > 120
) %>%
group_by(posteam) %>%
summarize(
mean_pass = mean(pass),
plays = n(),
.groups = "drop"
) %>%
arrange(desc(mean_pass))
glimpse(mydata_summary)
# ----------------------------------------------------------
# Step 7: Order teams for plotting
# ----------------------------------------------------------
mydata_summary <- mydata_summary %>%
mutate(
posteam = factor(
posteam,
levels = posteam[order(mean_pass, decreasing = TRUE)]
)
)
# ----------------------------------------------------------
# Step 8: Create Plotly visualization
# ----------------------------------------------------------
graph <- plot_ly(
data = mydata_summary,
x = ~posteam,
y = ~mean_pass,
type = "scatter",
mode = "text",
text = ~posteam,
textposition = "middle center",
hovertemplate = paste(
"<b>%{text}</b><br>",
"Pass rate: %{y:.1%}<br>",
"Plays: %{customdata}<extra></extra>"
),
customdata = ~plays
) %>%
layout(
title = "Tendency to pass, by NFL team, 2025",
xaxis = list(
title = "Team",
showticklabels = FALSE
),
yaxis = list(
title = "Percent pass plays",
tickformat = ".0%"
)
)
graph
############################################################
## 1. Check for Required Packages and Install if Missing
############################################################
# Check for Lahman (historical MLB data)
if (!requireNamespace("Lahman", quietly = TRUE)) {
install.packages("Lahman")
}
# Check for tidyverse
if (!requireNamespace("tidyverse", quietly = TRUE)) {
install.packages("tidyverse")
}
############################################################
## 2. Load Required Packages
############################################################
library(Lahman)
library(tidyverse)
############################################################
## 3. Define the Season of Interest
############################################################
# Most recently completed MLB season
season_year <- 2025
############################################################
## 4. Pull Season-Level Batting Data (Lahman)
############################################################
# The Batting table contains one row per player per season per team
batting_data <- Batting %>%
filter(yearID == season_year)
############################################################
## 5. Inspect and Filter the Data
############################################################
# Calculate plate appearances (AB + BB + HBP + SF)
qualified_hitters <- batting_data %>%
mutate(
PA = AB + BB + HBP + SF
) %>%
filter(PA >= 400)
############################################################
## 6. Summarize: Identify the Top Home Run Hitters
############################################################
# Players may appear multiple times if traded; combine them
hr_leaders <- qualified_hitters %>%
group_by(playerID) %>%
summarise(
HR = sum(HR),
PA = sum(PA),
.groups = "drop"
) %>%
arrange(desc(HR)) %>%
slice_head(n = 10) %>%
left_join(People, by = "playerID") %>%
mutate(
Name = paste(nameFirst, nameLast)
) %>%
select(Name, PA, HR)
############################################################
## 7. Create the Visualization
############################################################
HRPlot <- ggplot(hr_leaders,
aes(x = reorder(Name, HR), y = HR)) +
geom_col(fill = "firebrick") +
coord_flip() +
labs(
title = "Top 10 MLB Home Run Hitters",
subtitle = "2025 Regular Season (Minimum 400 PA)",
x = "Player",
y = "Home Runs",
caption = "Source: Lahman Baseball Database"
) +
theme_minimal()
HRPlot
############################################################
## 8. Optional Extension Ideas (Not Executed)
############################################################
#
# - Compare HR with RBI or runs scored
# - Summarize by teamID
# - Compute HR per PA (rate stat)
# - Compare two seasons side by side
#
############################################################
{r,include=FALSE}
# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("sf")) install.packages("sf")
if (!require("leaflet")) install.packages("leaflet")
if (!require("leafpop")) install.packages("leafpop")
library(tidyverse)
library(sf)
library(leaflet)
library(leafpop)
# ----------------------------------------------------------
# Step 2: Load precinct-level vote data for Rutherford County
# ----------------------------------------------------------
VoteData <- read.csv(
"https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Vote_Data.csv"
)
# ----------------------------------------------------------
# Step 3: Download and load Rutherford County precinct shapefile
# ----------------------------------------------------------
download.file(
"https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Precincts.zip",
"RutherfordPrecinctMap.zip",
mode = "wb"
)
unzip("RutherfordPrecinctMap.zip")
MapInfo <- read_sf("Rutherford_Precincts.shp")
MapInfo <- MapInfo %>%
select(Precinct, geometry)
# ----------------------------------------------------------
# Step 4: Join Rutherford vote data to map and transform CRS
# ----------------------------------------------------------
DataAndMapRuCo <- left_join(
VoteData,
MapInfo,
by = "Precinct") %>%
st_as_sf() %>%
st_transform(4326)
# ----------------------------------------------------------
# Step 5: Load precinct-level vote data for Davidson County
# ----------------------------------------------------------
VoteData <- read.csv(
"https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Vote_Data.csv"
)
# ----------------------------------------------------------
# Step 6: Download and load Davidson County precinct shapefile
# ----------------------------------------------------------
download.file(
"https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Precincts.zip",
"DavidsonPrecinctMap.zip",
mode = "wb"
)
unzip("DavidsonPrecinctMap.zip")
MapInfo <- read_sf("Davidson_Precincts.shp")
MapInfo <- MapInfo %>%
select(Precinct, geometry)
# ----------------------------------------------------------
# Step 7: Join Davidson vote data to map and transform CRS
# ----------------------------------------------------------
DataAndMapDavidson <- left_join(
VoteData,
MapInfo,
by = "Precinct") %>%
st_as_sf() %>%
st_transform(4326)
# ----------------------------------------------------------
# Step 8: Combine Rutherford and Davidson precinct files
# ----------------------------------------------------------
DataAndMap <- DataAndMapRuCo %>%
bind_rows(DataAndMapDavidson) %>%
st_as_sf() %>%
st_transform(4326)
# ----------------------------------------------------------
# Step 9: Compute totals, vote shares, winner, and TRUE margin
# ----------------------------------------------------------
MarginData <- DataAndMap %>%
mutate(
Total = Trump + Harris + Other,
Pct_Trump = (Trump / Total) * 100,
Pct_Harris = (Harris / Total) * 100
) %>%
mutate(
Winner = case_when(
Trump > Harris ~ "Republican",
Harris > Trump ~ "Democratic",
TRUE ~ "Tie"
),
Margin_Pct = round(abs(Pct_Trump - Pct_Harris), 1)
)
# ----------------------------------------------------------
# Step 10: Create party-aware color palettes
# ----------------------------------------------------------
dem_pal <- colorNumeric(
palette = c("#deebf7", "#08519c"),
domain = MarginData$Margin_Pct
)
rep_pal <- colorNumeric(
palette = c("#fee0d2", "#99000d"),
domain = MarginData$Margin_Pct
)
# ----------------------------------------------------------
# Step 11: Assign fill colors using dplyr
# ----------------------------------------------------------
MarginData <- MarginData %>%
mutate(
FillColor = case_when(
Winner == "Democratic" ~ dem_pal(Margin_Pct),
Winner == "Republican" ~ rep_pal(Margin_Pct),
TRUE ~ "gray"
)
)
# ----------------------------------------------------------
# Step 12: Create popup table (with raw vote counts)
# ----------------------------------------------------------
PopupData <- MarginData %>%
st_drop_geometry() %>%
select(
County,
Precinct,
Winner,
Trump,
Harris,
Other,
`Margin (percentage points)` = Margin_Pct
)
popup_content <- popupTable(
PopupData,
feature.id = FALSE,
row.numbers = FALSE
)
# ----------------------------------------------------------
# Step 13: TRUE margin choropleth map
# ----------------------------------------------------------
TrueMarginMap <- leaflet(MarginData) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(
fillColor = ~FillColor,
fillOpacity = 0.7,
color = "black",
weight = 0.4,
popup = popup_content
) %>%
addLegend(
colors = c("#08519c", "#99000d"),
labels = c("Democratic winner", "Republican winner"),
title = "Winner (by margin)",
opacity = 1
)
TrueMarginMap