library(readr)
library(dplyr)
library(rvest)
library(purrr)

# Load the Baseball Savant custom leaderboard as CSV.
# The query string below selects year=2025, type=batter, min=300 and a fixed
# list of stat columns; appending "&csv=true" requests the CSV export.
url <- "https://baseballsavant.mlb.com/leaderboard/custom?year=2025&type=batter&filter=&min=300&selections=pa%2Ck_percent%2Cbb_percent%2Cwoba%2Cxwoba%2Csweet_spot_percent%2Cbarrel_batted_rate%2Chard_hit_percent%2Cavg_best_speed%2Cavg_hyper_speed%2Cwhiff_percent%2Cswing_percent&chart=false&x=pa&y=pa&r=no&chartType=beeswarm&sort=xwoba&sortDir=desc"
csv_url <- paste0(url, "&csv=true")
basref <- read_csv(csv_url)

# Build one game-log page URL per leaderboard row.
# read_csv() guesses all-digit columns as doubles, and paste0() converts
# doubles with as.character(), which switches to scientific notation whenever
# that is shorter (e.g. 700000 -> "7e+05"). Format numeric IDs in fixed
# notation so every URL carries the full ID.
player_ids <- basref$player_id
if (is.numeric(player_ids)) {
  player_ids <- format(player_ids, scientific = FALSE, trim = TRUE)
}

# paste0() recycles a zero-length vector to "", which would produce a single
# bogus URL for an empty leaderboard; produce no URLs in that case instead.
player_urls <- if (length(player_ids) > 0) {
  paste0(
    "https://baseballsavant.mlb.com/savant-player/",
    player_ids,
    "?stats=gamelogs-r-hitting-mlb&season=2025"
  )
} else {
  character(0)
}

# Scrape one player's game log table from their Baseball Savant page.
#
# player_url: URL of a savant-player page. The run of digits that follows
#   "savant-player/" is copied into the result's `player_id` column.
#
# Returns the table parsed by html_table() with an extra character
# `player_id` column, or NULL if the page cannot be read or the table is not
# found. Failures are reported as a warning naming the URL and the cause
# rather than being dropped silently, so scraping continues but problems
# remain visible.
get_game_log <- function(player_url) {
  tryCatch(
    {
      page <- read_html(player_url)

      # Absolute XPath tied to the page layout; it stops matching if the
      # layout changes.
      table_node <- page %>%
        html_node(xpath = "/html/body/div[2]/section/div/section/div[4]/div[1]/div/table")

      # html_node() yields a missing node (not an error) when nothing matches;
      # fail here with a clear message instead of a cryptic one downstream.
      if (inherits(table_node, "xml_missing")) {
        stop("game log table not found at the expected XPath", call. = FALSE)
      }

      game_log <- html_table(table_node)

      # Add player_id column
      game_log$player_id <- gsub(".*savant-player/(\\d+).*", "\\1", player_url)

      game_log
    },
    error = function(e) {
      warning(
        "Failed to scrape ", player_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Process players in batches
batch_size <- 20
n_players <- length(player_urls)

# seq_len() is safe for zero players (1:0 would yield c(1, 0)).
player_index <- seq_len(n_players)
batches <- split(player_index, ceiling(player_index / batch_size))

# List names come from the leaderboard IDs. Numeric IDs are formatted in fixed
# notation because the default conversion to character can switch to
# scientific notation (e.g. 700000 -> "7e+05"), which would break lookups by
# ID string such as the example at the bottom.
log_names <- basref$player_id
if (is.numeric(log_names)) {
  log_names <- format(log_names, scientific = FALSE, trim = TRUE)
}
log_names <- log_names[player_index]

# Preallocate one slot per player instead of growing the list every batch.
# Slots stay NULL for players not yet processed or whose scrape failed.
all_game_logs_list <- vector("list", n_players)
names(all_game_logs_list) <- log_names

for (i in seq_along(batches)) {
  idx <- batches[[i]]
  batch_urls <- player_urls[idx]

  cat("Processing batch", i, "of", length(batches), "...\n")

  # Scrape this batch
  batch_logs <- map(batch_urls, function(player_url) {
    Sys.sleep(0.5)  # small delay to avoid overloading server
    get_game_log(player_url)
  })

  # Name each list element by player_id
  names(batch_logs) <- log_names[idx]

  # Store batch logs; assigning a list via `[` keeps NULL entries in place
  all_game_logs_list[idx] <- batch_logs

  # Save batch to disk in case of interruption
  saveRDS(batch_logs, paste0("game_logs_batch_", i, ".rds"))
}

# Access individual player game logs
# Example: Aaron Judge (player_id 592450)
all_game_logs_list[["592450"]]

# All game logs are in 'all_game_logs_list'; each element is one player's game
# log data frame, or NULL if that player's page could not be scraped.