4/13-4/17 Three Graphs

Code:

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("nflreadr")) install.packages("nflreadr")
if (!require("plotly")) install.packages("plotly")

library(tidyverse)
library(nflreadr)
library(plotly)

# ----------------------------------------------------------
# Step 2: Set global options
# ----------------------------------------------------------

# Turn off scientific notation
options(scipen = 9999)

# ----------------------------------------------------------
# Step 3: Load NFL play-by-play data (2025 season)
# ----------------------------------------------------------

# Year can be adjusted as needed
data <- load_pbp(2025)

# ----------------------------------------------------------
# Step 4: Filter for run or pass plays with EPA
# ----------------------------------------------------------

pbp_rp <- data %>%
  filter((rush == 1 | pass == 1), !is.na(epa))

# ----------------------------------------------------------
# Step 5: Create trimmed data frame for analysis
# ----------------------------------------------------------

mydata <- pbp_rp %>% 
  select(
    posteam,
    pass,
    wp,
    qtr,
    down,
    half_seconds_remaining
  )

glimpse(mydata)

# ----------------------------------------------------------
# Step 6: Filter for early-down, first-half, 
# neutral-game-state plays
# ----------------------------------------------------------
# Research question:
# Which teams were the most pass-heavy in the first half on early downs
# with win probability between 20% and 80%, excluding the final
# 2 minutes of the half?

mydata_summary <- mydata %>%
  filter(
    wp > 0.20,
    wp < 0.80,
    down <= 2,
    qtr <= 2,
    half_seconds_remaining > 120
  ) %>%
  group_by(posteam) %>%
  summarize(
    mean_pass = mean(pass),
    plays = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_pass))

glimpse(mydata_summary)

# ----------------------------------------------------------
# Step 7: Order teams for plotting
# ----------------------------------------------------------

mydata_summary <- mydata_summary %>%
  mutate(
    posteam = factor(
      posteam,
      levels = posteam[order(mean_pass, decreasing = TRUE)]
    )
  )

# ----------------------------------------------------------
# Step 8: Create Plotly visualization
# ----------------------------------------------------------

graph <- plot_ly(
  data = mydata_summary,
  x = ~posteam,
  y = ~mean_pass,
  type = "scatter",
  mode = "text",
  text = ~posteam,
  textposition = "middle center",
  hovertemplate = paste(
    "<b>%{text}</b><br>",
    "Pass rate: %{y:.1%}<br>",
    "Plays: %{customdata}<extra></extra>"
  ),
  customdata = ~plays
) %>%
  layout(
    title = "Tendency to pass, by NFL team, 2025",
    xaxis = list(
      title = "Team",
      showticklabels = FALSE
    ),
    yaxis = list(
      title = "Percent pass plays",
      tickformat = ".0%"
    )
  )

graph

############################################################
## 1. Check for Required Packages and Install if Missing
############################################################

# Check for Lahman (historical MLB data)
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}

# Check for tidyverse
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}


############################################################
## 2. Load Required Packages
############################################################

library(Lahman)
library(tidyverse)


############################################################
## 3. Define the Season of Interest
############################################################

# Most recently completed MLB season
season_year <- 2025


############################################################
## 4. Pull Season-Level Batting Data (Lahman)
############################################################

# The Batting table contains one row per player per season per team
batting_data <- Batting %>%
  filter(yearID == season_year)


############################################################
## 5. Inspect and Filter the Data
############################################################

# Calculate plate appearances (AB + BB + HBP + SF)
qualified_hitters <- batting_data %>%
  mutate(
    PA = AB + BB + HBP + SF
  ) %>%
  filter(PA >= 400)


############################################################
## 6. Summarize: Identify the Top Home Run Hitters
############################################################

# Players may appear multiple times if traded; combine them
hr_leaders <- qualified_hitters %>%
  group_by(playerID) %>%
  summarise(
    HR = sum(HR),
    PA = sum(PA),
    .groups = "drop"
  ) %>%
  arrange(desc(HR)) %>%
  slice_head(n = 10) %>%
  left_join(People, by = "playerID") %>%
  mutate(
    Name = paste(nameFirst, nameLast)
  ) %>%
  select(Name, PA, HR)


############################################################
## 7. Create the Visualization
############################################################

HRPlot <- ggplot(hr_leaders,
                 aes(x = reorder(Name, HR), y = HR)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  labs(
    title = "Top 10 MLB Home Run Hitters",
    subtitle = "2025 Regular Season (Minimum 400 PA)",
    x = "Player",
    y = "Home Runs",
    caption = "Source: Lahman Baseball Database"
  ) +
  theme_minimal()

HRPlot

############################################################
## 8. Optional Extension Ideas (Not Executed)
############################################################
#
# - Compare HR with RBI or runs scored
# - Summarize by teamID
# - Compute HR per PA (rate stat)
# - Compare two seasons side by side
#
############################################################

{r,include=FALSE}

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("sf")) install.packages("sf")
if (!require("leaflet")) install.packages("leaflet")
if (!require("leafpop")) install.packages("leafpop")

library(tidyverse)
library(sf)
library(leaflet)
library(leafpop)

# ----------------------------------------------------------
# Step 2: Load precinct-level vote data for Rutherford County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 3: Download and load Rutherford County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Precincts.zip",
  "RutherfordPrecinctMap.zip",
  mode = "wb"
)

unzip("RutherfordPrecinctMap.zip")

MapInfo <- read_sf("Rutherford_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 4: Join Rutherford vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapRuCo <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 5: Load precinct-level vote data for Davidson County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 6: Download and load Davidson County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Precincts.zip",
  "DavidsonPrecinctMap.zip",
  mode = "wb"
)

unzip("DavidsonPrecinctMap.zip")

MapInfo <- read_sf("Davidson_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 7: Join Davidson vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapDavidson <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 8: Combine Rutherford and Davidson precinct files
# ----------------------------------------------------------

DataAndMap <- DataAndMapRuCo %>%
  bind_rows(DataAndMapDavidson) %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 9: Compute totals, vote shares, winner, and TRUE margin
# ----------------------------------------------------------

MarginData <- DataAndMap %>%
  mutate(
    Total = Trump + Harris + Other,
    Pct_Trump  = (Trump  / Total) * 100,
    Pct_Harris = (Harris / Total) * 100
  ) %>%
  mutate(
    Winner = case_when(
      Trump > Harris ~ "Republican",
      Harris > Trump ~ "Democratic",
      TRUE           ~ "Tie"
    ),
    Margin_Pct = round(abs(Pct_Trump - Pct_Harris), 1)
  )

# ----------------------------------------------------------
# Step 10: Create party-aware color palettes
# ----------------------------------------------------------

dem_pal <- colorNumeric(
  palette = c("#deebf7", "#08519c"),
  domain  = MarginData$Margin_Pct
)

rep_pal <- colorNumeric(
  palette = c("#fee0d2", "#99000d"),
  domain  = MarginData$Margin_Pct
)

# ----------------------------------------------------------
# Step 11: Assign fill colors using dplyr
# ----------------------------------------------------------

MarginData <- MarginData %>%
  mutate(
    FillColor = case_when(
      Winner == "Democratic" ~ dem_pal(Margin_Pct),
      Winner == "Republican" ~ rep_pal(Margin_Pct),
      TRUE                   ~ "gray"
    )
  )

# ----------------------------------------------------------
# Step 12: Create popup table (with raw vote counts)
# ----------------------------------------------------------

PopupData <- MarginData %>%
  st_drop_geometry() %>%
  select(
    County,
    Precinct,
    Winner,
    Trump,
    Harris,
    Other,
    `Margin (percentage points)` = Margin_Pct
  )

popup_content <- popupTable(
  PopupData,
  feature.id = FALSE,
  row.numbers = FALSE
)

# ----------------------------------------------------------
# Step 13: TRUE margin choropleth map
# ----------------------------------------------------------

TrueMarginMap <- leaflet(MarginData) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor   = ~FillColor,
    fillOpacity = 0.7,
    color       = "black",
    weight      = 0.4,
    popup       = popup_content
  ) %>%
  addLegend(
    colors = c("#08519c", "#99000d"),
    labels = c("Democratic winner", "Republican winner"),
    title  = "Winner (by margin)",
    opacity = 1
  )

TrueMarginMap

4/13-4/17 Three Graphs

Jonathan Messer

2026-04-17

Code: