library(readr)
library(dplyr)
library(rvest)
library(purrr)
# Load the leaderboard CSV ----
# Custom leaderboard query: year=2025, type=batter, min=300, sorted by xwoba
# in descending order (see the query string below).
url <- "https://baseballsavant.mlb.com/leaderboard/custom?year=2025&type=batter&filter=&min=300&selections=pa%2Ck_percent%2Cbb_percent%2Cwoba%2Cxwoba%2Csweet_spot_percent%2Cbarrel_batted_rate%2Chard_hit_percent%2Cavg_best_speed%2Cavg_hyper_speed%2Cwhiff_percent%2Cswing_percent&chart=false&x=pa&y=pa&r=no&chartType=beeswarm&sort=xwoba&sortDir=desc"
# Same query with csv=true appended; the response is parsed by read_csv().
csv_url <- paste0(url, "&csv=true")
basref <- read_csv(csv_url)

# Build one game-log page URL per leaderboard row ----
player_urls <- local({
  ids <- basref$player_id
  # A numeric ID column must be formatted explicitly: paste0()/as.character()
  # render a round double such as 600000 as "6e+05", which breaks the URL.
  if (is.numeric(ids)) {
    ids <- format(ids, scientific = FALSE, trim = TRUE)
  }
  # sprintf() returns character(0) for an empty leaderboard, whereas paste0()
  # would return a single URL with a blank player ID.
  sprintf(
    paste0(
      "https://baseballsavant.mlb.com/savant-player/",
      "%s",
      "?stats=gamelogs-r-hitting-mlb&season=2025"
    ),
    ids
  )
})
# Scrape one player's game log table from their player page.
#
# Args:
#   player_url: Player page URL containing "savant-player/<digits>"; the
#     digits are extracted and used as the player ID.
#
# Returns:
#   The table parsed by html_table(), with an added character `player_id`
#   column. If the page cannot be read or the table cannot be parsed, a
#   warning naming the URL and the underlying error is raised and NULL is
#   returned, so a batch run can continue past individual failures.
get_game_log <- function(player_url) {
  tryCatch(
    {
      page <- read_html(player_url)
      # Absolute XPath: this is tied to the exact layout of the page, so a
      # layout change surfaces here as a failed scrape (warning + NULL).
      table_node <- html_node(
        page,
        xpath = "/html/body/div[2]/section/div/section/div[4]/div[1]/div/table"
      )
      game_log <- html_table(table_node)

      # Add player_id column. rep() keeps the assignment valid when the
      # parsed table has zero rows.
      player_id <- gsub(".*savant-player/(\\d+).*", "\\1", player_url)
      game_log$player_id <- rep(player_id, nrow(game_log))
      game_log
    },
    error = function(e) {
      # Report the failure instead of swallowing it silently; still return
      # NULL so callers that expect NULL-on-failure keep working.
      warning(
        "Failed to scrape ", player_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
# Process players in batches ----
# Scrapes every URL in `player_urls` in groups of `batch_size`. Results are
# appended to `all_game_logs_list` (one element per player, named by player
# ID) and each finished batch is also written to "game_logs_batch_<i>.rds".
batch_size <- 20
n_players <- length(player_urls)
# seq_len() gives an empty sequence for zero players; 1:n_players would give
# c(1, 0) and create a bogus batch.
batches <- split(
  seq_len(n_players),
  ceiling(seq_len(n_players) / batch_size)
)
all_game_logs_list <- list()
for (i in seq_along(batches)) {
  idx <- batches[[i]]
  batch_urls <- player_urls[idx]
  cat("Processing batch", i, "of", length(batches), "...\n")

  # Scrape this batch
  batch_logs <- map(batch_urls, function(player_url) {
    Sys.sleep(0.5) # small delay to avoid overloading server
    get_game_log(player_url)
  })

  # Name each list element by player_id. Numeric IDs are formatted
  # explicitly because implicit coercion would turn a round double such as
  # 600000 into the name "6e+05", breaking lookups by ID string.
  batch_ids <- basref$player_id[idx]
  if (is.numeric(batch_ids)) {
    batch_ids <- format(batch_ids, scientific = FALSE, trim = TRUE)
  }
  names(batch_logs) <- batch_ids

  # Store batch logs. Appending per batch (rather than preallocating) means
  # an interrupted run leaves only completed batches in the list.
  all_game_logs_list <- c(all_game_logs_list, batch_logs)

  # Save batch to disk in case of interruption
  saveRDS(batch_logs, paste0("game_logs_batch_", i, ".rds"))
}
# Access individual player game logs
# Elements of 'all_game_logs_list' are named with the player_id values, so a
# player's log is looked up by passing the ID as a string.
# Example: Aaron Judge (player_id 592450)
all_game_logs_list[["592450"]]
# All game logs are in 'all_game_logs_list'; each element is whatever
# get_game_log() returned for that player: a separate data frame, or NULL if
# scraping that player's page failed.