Sports and Election Analyses

Tendency to Pass

The following code shows the percentage likely that an NFL was to run a passing play in the 2025 season. This excludes special plays that would be within two minutes of each half of a game and includes first and second down plays.

CODE:

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("nflreadr")) install.packages("nflreadr")
if (!require("plotly")) install.packages("plotly")

library(tidyverse)
library(nflreadr)
library(plotly)

# ----------------------------------------------------------
# Step 2: Set global options
# ----------------------------------------------------------

# Turn off scientific notation
options(scipen = 9999)

# ----------------------------------------------------------
# Step 3: Load NFL play-by-play data (2025 season)
# ----------------------------------------------------------

# Year can be adjusted as needed
data <- load_pbp(2025)

# ----------------------------------------------------------
# Step 4: Filter for run or pass plays with EPA
# ----------------------------------------------------------

pbp_rp <- data %>%
  filter((rush == 1 | pass == 1), !is.na(epa))

# ----------------------------------------------------------
# Step 5: Create trimmed data frame for analysis
# ----------------------------------------------------------

mydata <- pbp_rp %>% 
  select(
    posteam,
    pass,
    wp,
    qtr,
    down,
    half_seconds_remaining
  )

glimpse(mydata)

# ----------------------------------------------------------
# Step 6: Filter for early-down, first-half, 
# neutral-game-state plays
# ----------------------------------------------------------
# Research question:
# Which teams were the most pass-heavy in the first half on early downs
# with win probability between 20% and 80%, excluding the final
# 2 minutes of the half?

mydata_summary <- mydata %>%
  filter(
    wp > 0.20,
    wp < 0.80,
    down <= 2,
    qtr <= 2,
    half_seconds_remaining > 120
  ) %>%
  group_by(posteam) %>%
  summarize(
    mean_pass = mean(pass),
    plays = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_pass))

glimpse(mydata_summary)

# ----------------------------------------------------------
# Step 7: Order teams for plotting
# ----------------------------------------------------------

mydata_summary <- mydata_summary %>%
  mutate(
    posteam = factor(
      posteam,
      levels = posteam[order(mean_pass, decreasing = TRUE)]
    )
  )

# ----------------------------------------------------------
# Step 8: Create Plotly visualization
# ----------------------------------------------------------

graph <- plot_ly(
  data = mydata_summary,
  x = ~posteam,
  y = ~mean_pass,
  type = "scatter",
  mode = "text",
  text = ~posteam,
  textposition = "middle center",
  hovertemplate = paste(
    "<b>%{text}</b><br>",
    "Pass rate: %{y:.1%}<br>",
    "Plays: %{customdata}<extra></extra>"
  ),
  customdata = ~plays
) %>%
  layout(
    title = "Tendency to pass, by NFL team, 2025",
    xaxis = list(
      title = "Team",
      showticklabels = FALSE
    ),
    yaxis = list(
      title = "Percent pass plays",
      tickformat = ".0%"
    )
  )

graph

Top 10 MLB Home Run Hitters

This chart represents the top 10 home run hitters in the MLB from the 2025 season with a minimum of 400 plate appearances.

CODE:

############################################################
## 1. Check for Required Packages and Install if Missing
############################################################

# Check for Lahman (historical MLB data)
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}

# Check for tidyverse
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}


############################################################
## 2. Load Required Packages
############################################################

library(Lahman)
library(tidyverse)


############################################################
## 3. Define the Season of Interest
############################################################

# Most recently completed MLB season
season_year <- 2025


############################################################
## 4. Pull Season-Level Batting Data (Lahman)
############################################################

# The Batting table contains one row per player per season per team
batting_data <- Batting %>%
  filter(yearID == season_year)


############################################################
## 5. Inspect and Filter the Data
############################################################

# Calculate plate appearances (AB + BB + HBP + SF)
qualified_hitters <- batting_data %>%
  mutate(
    PA = AB + BB + HBP + SF
  ) %>%
  filter(PA >= 400)


############################################################
## 6. Summarize: Identify the Top Home Run Hitters
############################################################

# Players may appear multiple times if traded; combine them
hr_leaders <- qualified_hitters %>%
  group_by(playerID) %>%
  summarise(
    HR = sum(HR),
    PA = sum(PA),
    .groups = "drop"
  ) %>%
  arrange(desc(HR)) %>%
  slice_head(n = 10) %>%
  left_join(People, by = "playerID") %>%
  mutate(
    Name = paste(nameFirst, nameLast)
  ) %>%
  select(Name, PA, HR)


############################################################
## 7. Create the Visualization
############################################################

HRPlot <- ggplot(hr_leaders,
       aes(x = reorder(Name, HR), y = HR)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  labs(
    title = "Top 10 MLB Home Run Hitters",
    subtitle = "2025 Regular Season (Minimum 400 PA)",
    x = "Player",
    y = "Home Runs",
    caption = "Source: Lahman Baseball Database"
  ) +
  theme_minimal()

HRPlot

############################################################
## 8. Optional Extension Ideas (Not Executed)
############################################################
#
# - Compare HR with RBI or runs scored
# - Summarize by teamID
# - Compute HR per PA (rate stat)
# - Compare two seasons side by side
#
############################################################

True Margin Election Map

This last code chunk creates an election map showing true margin in votes for the 2024 presidential election. This shows the total number of votes for Trump, Harris and other candidates by county precinct in Davidson and Rutherford counties.

CODE:

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("sf")) install.packages("sf")
if (!require("leaflet")) install.packages("leaflet")
if (!require("leafpop")) install.packages("leafpop")

library(tidyverse)
library(sf)
library(leaflet)
library(leafpop)

# ----------------------------------------------------------
# Step 2: Load precinct-level vote data for Rutherford County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 3: Download and load Rutherford County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Precincts.zip",
  "RutherfordPrecinctMap.zip",
  mode = "wb"
)

unzip("RutherfordPrecinctMap.zip")

MapInfo <- read_sf("Rutherford_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 4: Join Rutherford vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapRuCo <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 5: Load precinct-level vote data for Davidson County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 6: Download and load Davidson County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Precincts.zip",
  "DavidsonPrecinctMap.zip",
  mode = "wb"
)

unzip("DavidsonPrecinctMap.zip")

MapInfo <- read_sf("Davidson_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 7: Join Davidson vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapDavidson <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 8: Combine Rutherford and Davidson precinct files
# ----------------------------------------------------------

DataAndMap <- DataAndMapRuCo %>%
  bind_rows(DataAndMapDavidson) %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 9: Compute totals, vote shares, winner, and TRUE margin
# ----------------------------------------------------------

MarginData <- DataAndMap %>%
  mutate(
    Total = Trump + Harris + Other,
    Pct_Trump  = (Trump  / Total) * 100,
    Pct_Harris = (Harris / Total) * 100
  ) %>%
  mutate(
    Winner = case_when(
      Trump > Harris ~ "Republican",
      Harris > Trump ~ "Democratic",
      TRUE           ~ "Tie"
    ),
    Margin_Pct = round(abs(Pct_Trump - Pct_Harris), 1)
  )

# ----------------------------------------------------------
# Step 10: Create party-aware color palettes
# ----------------------------------------------------------

dem_pal <- colorNumeric(
  palette = c("#deebf7", "#08519c"),
  domain  = MarginData$Margin_Pct
)

rep_pal <- colorNumeric(
  palette = c("#fee0d2", "#99000d"),
  domain  = MarginData$Margin_Pct
)

# ----------------------------------------------------------
# Step 11: Assign fill colors using dplyr
# ----------------------------------------------------------

MarginData <- MarginData %>%
  mutate(
    FillColor = case_when(
      Winner == "Democratic" ~ dem_pal(Margin_Pct),
      Winner == "Republican" ~ rep_pal(Margin_Pct),
      TRUE                   ~ "gray"
    )
  )

# ----------------------------------------------------------
# Step 12: Create popup table (with raw vote counts)
# ----------------------------------------------------------

PopupData <- MarginData %>%
  st_drop_geometry() %>%
  select(
    County,
    Precinct,
    Winner,
    Trump,
    Harris,
    Other,
    `Margin (percentage points)` = Margin_Pct
  )

popup_content <- popupTable(
  PopupData,
  feature.id = FALSE,
  row.numbers = FALSE
)

# ----------------------------------------------------------
# Step 13: TRUE margin choropleth map
# ----------------------------------------------------------

TrueMarginMap <- leaflet(MarginData) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor   = ~FillColor,
    fillOpacity = 0.7,
    color       = "black",
    weight      = 0.4,
    popup       = popup_content
  ) %>%
  addLegend(
    colors = c("#08519c", "#99000d"),
    labels = c("Democratic winner", "Republican winner"),
    title  = "Winner (by margin)",
    opacity = 1
  )

TrueMarginMap

All the Code:

Football

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("nflreadr")) install.packages("nflreadr")
if (!require("plotly")) install.packages("plotly")

library(tidyverse)
library(nflreadr)
library(plotly)

# ----------------------------------------------------------
# Step 2: Set global options
# ----------------------------------------------------------

# Turn off scientific notation
options(scipen = 9999)

# ----------------------------------------------------------
# Step 3: Load NFL play-by-play data (2025 season)
# ----------------------------------------------------------

# Year can be adjusted as needed
data <- load_pbp(2025)

# ----------------------------------------------------------
# Step 4: Filter for run or pass plays with EPA
# ----------------------------------------------------------

pbp_rp <- data %>%
  filter((rush == 1 | pass == 1), !is.na(epa))

# ----------------------------------------------------------
# Step 5: Create trimmed data frame for analysis
# ----------------------------------------------------------

mydata <- pbp_rp %>% 
  select(
    posteam,
    pass,
    wp,
    qtr,
    down,
    half_seconds_remaining
  )

glimpse(mydata)

# ----------------------------------------------------------
# Step 6: Filter for early-down, first-half, 
# neutral-game-state plays
# ----------------------------------------------------------
# Research question:
# Which teams were the most pass-heavy in the first half on early downs
# with win probability between 20% and 80%, excluding the final
# 2 minutes of the half?

mydata_summary <- mydata %>%
  filter(
    wp > 0.20,
    wp < 0.80,
    down <= 2,
    qtr <= 2,
    half_seconds_remaining > 120
  ) %>%
  group_by(posteam) %>%
  summarize(
    mean_pass = mean(pass),
    plays = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_pass))

glimpse(mydata_summary)

# ----------------------------------------------------------
# Step 7: Order teams for plotting
# ----------------------------------------------------------

mydata_summary <- mydata_summary %>%
  mutate(
    posteam = factor(
      posteam,
      levels = posteam[order(mean_pass, decreasing = TRUE)]
    )
  )

# ----------------------------------------------------------
# Step 8: Create Plotly visualization
# ----------------------------------------------------------

graph <- plot_ly(
  data = mydata_summary,
  x = ~posteam,
  y = ~mean_pass,
  type = "scatter",
  mode = "text",
  text = ~posteam,
  textposition = "middle center",
  hovertemplate = paste(
    "<b>%{text}</b><br>",
    "Pass rate: %{y:.1%}<br>",
    "Plays: %{customdata}<extra></extra>"
  ),
  customdata = ~plays
) %>%
  layout(
    title = "Tendency to pass, by NFL team, 2025",
    xaxis = list(
      title = "Team",
      showticklabels = FALSE
    ),
    yaxis = list(
      title = "Percent pass plays",
      tickformat = ".0%"
    )
  )

graph

Baseball

############################################################
## 1. Check for Required Packages and Install if Missing
############################################################

# Check for Lahman (historical MLB data)
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}

# Check for tidyverse
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}


############################################################
## 2. Load Required Packages
############################################################

library(Lahman)
library(tidyverse)


############################################################
## 3. Define the Season of Interest
############################################################

# Most recently completed MLB season
season_year <- 2025


############################################################
## 4. Pull Season-Level Batting Data (Lahman)
############################################################

# The Batting table contains one row per player per season per team
batting_data <- Batting %>%
  filter(yearID == season_year)


############################################################
## 5. Inspect and Filter the Data
############################################################

# Calculate plate appearances (AB + BB + HBP + SF)
qualified_hitters <- batting_data %>%
  mutate(
    PA = AB + BB + HBP + SF
  ) %>%
  filter(PA >= 400)


############################################################
## 6. Summarize: Identify the Top Home Run Hitters
############################################################

# Players may appear multiple times if traded; combine them
hr_leaders <- qualified_hitters %>%
  group_by(playerID) %>%
  summarise(
    HR = sum(HR),
    PA = sum(PA),
    .groups = "drop"
  ) %>%
  arrange(desc(HR)) %>%
  slice_head(n = 10) %>%
  left_join(People, by = "playerID") %>%
  mutate(
    Name = paste(nameFirst, nameLast)
  ) %>%
  select(Name, PA, HR)


############################################################
## 7. Create the Visualization
############################################################

HRPlot <- ggplot(hr_leaders,
       aes(x = reorder(Name, HR), y = HR)) +
  geom_col(fill = "firebrick") +
  coord_flip() +
  labs(
    title = "Top 10 MLB Home Run Hitters",
    subtitle = "2025 Regular Season (Minimum 400 PA)",
    x = "Player",
    y = "Home Runs",
    caption = "Source: Lahman Baseball Database"
  ) +
  theme_minimal()

HRPlot

############################################################
## 8. Optional Extension Ideas (Not Executed)
############################################################
#
# - Compare HR with RBI or runs scored
# - Summarize by teamID
# - Compute HR per PA (rate stat)
# - Compare two seasons side by side
#
############################################################

Election Map

# ----------------------------------------------------------
# Step 1: Install required packages (if missing)
# ----------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("sf")) install.packages("sf")
if (!require("leaflet")) install.packages("leaflet")
if (!require("leafpop")) install.packages("leafpop")

library(tidyverse)
library(sf)
library(leaflet)
library(leafpop)

# ----------------------------------------------------------
# Step 2: Load precinct-level vote data for Rutherford County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 3: Download and load Rutherford County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Rutherford_Precincts.zip",
  "RutherfordPrecinctMap.zip",
  mode = "wb"
)

unzip("RutherfordPrecinctMap.zip")

MapInfo <- read_sf("Rutherford_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 4: Join Rutherford vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapRuCo <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 5: Load precinct-level vote data for Davidson County
# ----------------------------------------------------------

VoteData <- read.csv(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Vote_Data.csv"
)

# ----------------------------------------------------------
# Step 6: Download and load Davidson County precinct shapefile
# ----------------------------------------------------------

download.file(
  "https://github.com/drkblake/Data/raw/refs/heads/main/Davidson_Precincts.zip",
  "DavidsonPrecinctMap.zip",
  mode = "wb"
)

unzip("DavidsonPrecinctMap.zip")

MapInfo <- read_sf("Davidson_Precincts.shp")
MapInfo <- MapInfo %>% 
  select(Precinct, geometry)

# ----------------------------------------------------------
# Step 7: Join Davidson vote data to map and transform CRS
# ----------------------------------------------------------

DataAndMapDavidson <- left_join(
  VoteData,
  MapInfo,
  by = "Precinct") %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 8: Combine Rutherford and Davidson precinct files
# ----------------------------------------------------------

DataAndMap <- DataAndMapRuCo %>%
  bind_rows(DataAndMapDavidson) %>%
  st_as_sf() %>%
  st_transform(4326)

# ----------------------------------------------------------
# Step 9: Compute totals, vote shares, winner, and TRUE margin
# ----------------------------------------------------------

MarginData <- DataAndMap %>%
  mutate(
    Total = Trump + Harris + Other,
    Pct_Trump  = (Trump  / Total) * 100,
    Pct_Harris = (Harris / Total) * 100
  ) %>%
  mutate(
    Winner = case_when(
      Trump > Harris ~ "Republican",
      Harris > Trump ~ "Democratic",
      TRUE           ~ "Tie"
    ),
    Margin_Pct = round(abs(Pct_Trump - Pct_Harris), 1)
  )

# ----------------------------------------------------------
# Step 10: Create party-aware color palettes
# ----------------------------------------------------------

dem_pal <- colorNumeric(
  palette = c("#deebf7", "#08519c"),
  domain  = MarginData$Margin_Pct
)

rep_pal <- colorNumeric(
  palette = c("#fee0d2", "#99000d"),
  domain  = MarginData$Margin_Pct
)

# ----------------------------------------------------------
# Step 11: Assign fill colors using dplyr
# ----------------------------------------------------------

MarginData <- MarginData %>%
  mutate(
    FillColor = case_when(
      Winner == "Democratic" ~ dem_pal(Margin_Pct),
      Winner == "Republican" ~ rep_pal(Margin_Pct),
      TRUE                   ~ "gray"
    )
  )

# ----------------------------------------------------------
# Step 12: Create popup table (with raw vote counts)
# ----------------------------------------------------------

PopupData <- MarginData %>%
  st_drop_geometry() %>%
  select(
    County,
    Precinct,
    Winner,
    Trump,
    Harris,
    Other,
    `Margin (percentage points)` = Margin_Pct
  )

popup_content <- popupTable(
  PopupData,
  feature.id = FALSE,
  row.numbers = FALSE
)

# ----------------------------------------------------------
# Step 13: TRUE margin choropleth map
# ----------------------------------------------------------

TrueMarginMap <- leaflet(MarginData) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor   = ~FillColor,
    fillOpacity = 0.7,
    color       = "black",
    weight      = 0.4,
    popup       = popup_content
  ) %>%
  addLegend(
    colors = c("#08519c", "#99000d"),
    labels = c("Democratic winner", "Republican winner"),
    title  = "Winner (by margin)",
    opacity = 1
  )

TrueMarginMap