NBA Data Journal

Author

Nick Warino, nickwarino.com

Published

April 10, 2024

1 History of NBA League Average Statistics

Code
library(rvest)
league_per_game_url <- "https://www.basketball-reference.com/leagues/NBA_stats_per_game.html"
league_per_game_url_page <- read_html(league_per_game_url)
nba_season_history <- html_table(league_per_game_url_page)[[1]]

# Create a vector of new column names that match the first row values
new_names <- as.character(nba_season_history[1,])

# Use colnames() function to assign new column names to old column names
colnames(nba_season_history) <- new_names

# Remove the first row using slice()
nba_season_history <- nba_season_history |>
  slice(-1)

nba_season_history <- nba_season_history |>
  filter(!is.na(as.numeric(Rk))) |> 
  mutate(across(6:32, as.numeric))

#mutate the HT column to inches
nba_season_history <- nba_season_history |>
  mutate(Ht = sapply(lapply(strsplit(Ht, "-"), as.numeric), function(x) x[1] * 12 + x[2]))

# Replace TOV values below 2% with NA
nba_season_history <- nba_season_history |>
  mutate(TOV = if_else(TOV < 2, NA_real_, TOV))

# Filter 'nba_season_history' to seasons with `Pace` of NA
nba_season_history_pre_1973 <- nba_season_history |>
  filter(is.na(Pace))

nba_season_history_selected <- nba_season_history |>
  select(Season,
         PTS,
         "3PA",
         "FGA",
         "FTA",
         AST,
         "3P%",
         "FT%",
         "eFG%",
         Pace,
         ORtg,
         Wt,
         Ht,
         TRB,
         STL,
         BLK,
         TOV,) |> 
  rename(Points=PTS,
         "3 Point Attempts"="3PA",
         "Field Goal Attempts"=FGA,
         "Free Throw Attempts"=FTA,
         "Assists"=AST,
         "3 Point %"="3P%",
         "Free Throw %"="FT%",
         "Effective Field Goal %"="eFG%",
         "Pace"=Pace,
         "Points Per 100 Poss."=ORtg,
         "Weight (lbs)"=Wt,
         "Height (Inches)"=Ht,
         "Rebounds"=TRB,
         "Steals"=STL,
         "Blocks"=BLK,
         "Turnovers"=TOV,) |>
  pivot_longer(cols=2:17,
               names_to = "Stat",
               values_to = "Value")

#convert Season from factor to numeric
nba_season_history_selected$Season <- as.numeric(gsub("-.*", "", nba_season_history_selected$Season))

#convert Season from numeric to date
nba_season_history_selected$Season <- as.Date(paste0(nba_season_history_selected$Season,"-10-01"))

#load lubridate package
library(lubridate)

#add one year to every row under season
nba_season_history_selected$Season <- nba_season_history_selected$Season + years(1)


#save as csv file
write.csv(nba_season_history_selected, here("output_data", "nba_season_history_selected.csv"), row.names = FALSE)

#time series line graph with facets
ggplot(nba_season_history_selected,
       aes(x = Season,
           y = Value)) +
    geom_line() +
    facet_wrap(~Stat, scales="free_y") +  
geom_rect(aes(xmin = as.Date("1984-10-01"), xmax = as.Date("1993-07-01"), ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.00199) +
  geom_rect(aes(xmin = as.Date("1995-03-19"), xmax = as.Date("1998-07-01"), ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.00199) +
  geom_rect(aes(xmin = as.Date("2001-10-01"), xmax = as.Date("2003-07-01"), ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.00199) +
  geom_rect(aes(xmin = as.Date("2003-10-01"), xmax = as.Date("2024-07-01"), ymin = -Inf, ymax = Inf), fill = "purple", alpha = 0.00199) +
  geom_rect(aes(xmin = as.Date("1969-10-01"), xmax = as.Date("1989-07-01"), ymin = -Inf, ymax = Inf), fill = "gold", alpha = 0.00199) +
  geom_rect(aes(xmin = as.Date("1956-10-01"), xmax = as.Date("1969-07-01"), ymin = -Inf, ymax = Inf), fill = "green", alpha = 0.00199) +
  labs(title = "NBA League Averages Per Game History, with GOAT Evolution",
       subtitle = "Green = Russell; Gold = Kareem; Red = MJ; Purple = LeBron",
       x = "Season",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)))+
  My_Theme_WithY()

Code
library(plotly)

p <- ggplot(nba_season_history_selected,
       aes(x = Season,
           y = Value)) +
    geom_line() +
    facet_wrap(~Stat, scales="free_y") +  
  labs(title = "NBA League Averages Per Game History",
       x = "Season",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)))

# Use ggplotly() to make it interactive
p <- ggplotly(p)

2 Playoff Clutch

Effective FG% in last 2 minutes of regulation, score within 5 points

Code
clutch_url <- "https://stathead.com/tiny/oQ2cW"
clutch_url_page <- read_html(clutch_url)
clutch_nba_history <- html_table(clutch_url_page)[[1]]

clutch_nba_history <- clutch_nba_history |>
  filter(!is.na(as.numeric(Rk))) |>
  mutate(across(3:14, as.numeric)) |>
  mutate(across(3:14, ~replace_na(., 0)))

# Rename the %Ast'd column
clutch_nba_history <- clutch_nba_history |>
  rename(Ast_d_perc = `%Ast'd`) |> 
  rename(Ast_d_tot = `Ast'd`) |> 
  rename(eFG = `eFG%`)

weighted_average_eFG <- clutch_nba_history |>
  summarize(weighted_avg_eFG = sum(eFG * FGA) / sum(FGA)) |>
  pull(weighted_avg_eFG) |>
  as.numeric()
  
#save as csv file 
write.csv(clutch_nba_history, here("output_data", "clutch_nba_history.csv"), row.names = FALSE)

ggplot(clutch_nba_history,
       aes(x = FGA,
           y = eFG)) +
  geom_point(aes(size = Ast_d_tot, fill = Ast_d_perc), alpha = 0.7, shape = 21, stroke = .5) +
  geom_text_repel(aes(label = Player), size = 5, color = "black", max.overlaps = 5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = median(clutch_nba_history$Ast_d_perc, na.rm = TRUE)) +
  geom_hline(yintercept = weighted_average_eFG, linetype = "dashed", color = "red", linewidth = .5) +
  labs(title = "Most Clutch NBA Players Ever in Playoffs (since 1996)",
       subtitle = "Effective FG% in last 2 minutes of regulation, score within 5 points.\nUniverse of players limited to top 100 players of FGA in same situation",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "FGA",
       y = "Effective FG%",
       size = "Number of FGA assisted",
       fill = "% of FGA assisted") +
  theme(legend.position = c(.9, .9)) +
  My_Theme_WithY()

2.1 Super Clutch Situations

Effective FG% in last 1 minutes of regulation, shot to tie or take lead.

Code
clutch2_url <- "https://stathead.com/tiny/79Hlz"
clutch2_url_page <- read_html(clutch2_url)
clutch2_nba_history <- html_table(clutch2_url_page)[[1]]

clutch2_nba_history <- clutch2_nba_history |>
  filter(!is.na(as.numeric(Rk))) |>
  mutate(across(3:14, as.numeric)) |>
  mutate(across(3:14, ~replace_na(., 0)))

# Rename the %Ast'd column
clutch2_nba_history <- clutch2_nba_history |>
  rename(Ast_d_perc = `%Ast'd`) |> 
  rename(Ast_d_tot = `Ast'd`) |> 
  rename(eFG = `eFG%`)

weighted_average_eFG2 <- clutch2_nba_history |>
  summarize(weighted_avg_eFG = sum(eFG * FGA) / sum(FGA)) |>
  pull(weighted_avg_eFG) |>
  as.numeric()
  
#save as csv file 
write.csv(clutch2_nba_history, here("output_data", "clutch2_nba_history.csv"), row.names = FALSE)


ggplot(clutch2_nba_history,
       aes(x = FGA,
           y = eFG)) +
  geom_point(aes(size = Ast_d_tot, fill = Ast_d_perc), alpha = 0.7, shape = 21, stroke = .5) +
  geom_text_repel(aes(label = Player), size = 5, color = "black", max.overlaps = 10) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = median(clutch2_nba_history$Ast_d_perc, na.rm = TRUE)) +
  geom_hline(yintercept = weighted_average_eFG2, linetype = "dashed", color = "red", linewidth = .5) +
  theme_minimal() +
  labs(title = "Most clutch NBA Players Ever in Playoffs (since 1996)",
       subtitle = "Effective FG% in last 1 minutes of regulation, shot to tie or take lead.\nUniverse of players limited to top 100 players of FGA in same situation",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "FGA",
       y = "Effective FG%",
       size = "Number of FGA assisted",
       fill = "% of FGA assisted") +
  theme(legend.position = c(.9, .9)) +
  My_Theme_WithY()

2.2 Super Super Clutch Situations

Effective FG% in last 50 sec of regulation, shot to take lead.

Code
clutch3_url <- "https://stathead.com/tiny/A0FDq"
clutch3_url_page <- read_html(clutch3_url)
clutch3_nba_history <- html_table(clutch3_url_page)[[1]]

clutch3_nba_history <- clutch3_nba_history |>
  filter(!is.na(as.numeric(Rk))) |>
  mutate(across(3:14, as.numeric)) |>
  mutate(across(3:14, ~replace_na(., 0)))

# Rename the %Ast'd column
clutch3_nba_history <- clutch3_nba_history |>
  rename(Ast_d_perc = `%Ast'd`) |> 
  rename(Ast_d_tot = `Ast'd`) |> 
  rename(eFG = `eFG%`)

weighted_average_eFG3 <- clutch3_nba_history |>
  summarize(weighted_avg_eFG = sum(eFG * FGA) / sum(FGA)) |>
  pull(weighted_avg_eFG) |>
  as.numeric()
  
#save as csv file 
write.csv(clutch3_nba_history, here("output_data", "clutch3_nba_history.csv"), row.names = FALSE)

ggplot(clutch3_nba_history,
       aes(x = FGA,
           y = eFG)) +
  geom_point(aes(size = Ast_d_tot, fill = Ast_d_perc), alpha = 0.7, shape = 21, stroke = .5) +
  geom_text_repel(aes(label = Player), size = 5, color = "black", max.overlaps = 10) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = .5) +
  geom_hline(yintercept = weighted_average_eFG3, linetype = "dashed", color = "red", linewidth = .5) +
  theme_minimal() +
  labs(title = "Most clutch NBA Players Ever in Playoffs (since 1996)",
       subtitle = "Effective FG% in last 50 sec of regulation, shot to take lead.\nUniverse of players limited to top 100 players of FGA in same situation",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "FGA",
       y = "Effective FG%",
       size = "Number of FGA assisted",
       fill = "% of FGA assisted") +
  theme(legend.position = c(.9, .9)) +
  My_Theme_WithY()

3 GOAT PG

Playoff VORP by PG position

Code
pg_playoff_vorp_url <- "https://stathead.com/tiny/FeISc"
pg_playoff_vorp_page <- read_html(pg_playoff_vorp_url)
pg_playoff_vorp <- html_table(pg_playoff_vorp_page)[[1]]

pg_playoff_vorp <- pg_playoff_vorp |>
  select(-3) |> 
  mutate(across(3:26, ~replace_na(., 0)))


# Rename the %Ast'd column
# pg_playoff_vorp <- pg_playoff_vorp |>
#   rename(Ast_d_perc = `%Ast'd`) |>
#   rename(Ast_d_tot = `Ast'd`) |>
#   rename(eFG = `eFG%`)

#save as csv file
write.csv(pg_playoff_vorp, here("output_data", "pg_playoff_vorp.csv"), row.names = FALSE)

ggplot(pg_playoff_vorp,
       aes(x = VORP,
           y = BPM)) +
  geom_point(aes(size = VORP, fill = BPM), alpha = 0.7, shape = 21, stroke = .5) +
  geom_text_repel(aes(label = Player), size = 5, color = "black", max.overlaps = 5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = median(pg_playoff_vorp$BPM, na.rm = TRUE)) +
  theme_minimal() +
  labs(title = "LeBron could've been GOAT Point Guard, But Mostly Played Foward Positions",
       subtitle = "VORP and BPM in Playoffs While Playing PG Position.\nUniverse of players limited to top 100 players of Playoff VORP while playing PG",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "Playoff VORP",
       y = "Playoff BPM",
       size = "Playoff VORP",
       fill = "Playoff BPM") +
  theme(legend.position = c(.9, .9)) +
  My_Theme_WithY()

4 Best Defenders

Code
# Function to extract table from URL
extract_table_from_url <- function(url) {
  page <- read_html(url)
  table <- html_table(page)[[1]]
  return(table)
}

# Extract tables
blockper1 <- extract_table_from_url("https://stathead.com/tiny/tdXLR")
blockper2 <- extract_table_from_url("https://stathead.com/tiny/hBlZm")
stealper1 <- extract_table_from_url("https://stathead.com/tiny/VWeQT")
stealper2 <- extract_table_from_url("https://stathead.com/tiny/krA65")
drebper1 <- extract_table_from_url("https://stathead.com/tiny/MxxJY")
drebper2 <- extract_table_from_url("https://stathead.com/tiny/JhBJx")

# Clean tables. Select `Player`, `BLK%`, `STL%`, `REB%`, MP, GS, DBPM, OBPM, BPM, VORP, Pos columns

# Function to remove duplicate columns and select specific columns
clean_and_select <- function(df, columns_to_select) {
  # Remove duplicate columns
  df <- df[, !duplicated(names(df))]
  
  # Select specific columns
  df |> 
    select(all_of(columns_to_select))
}

# Columns to select
selected_columns <- c("Player", "BLK%", "STL%", "DRB%", "MP", "GS", "DBPM", "OBPM", "BPM", "VORP", "Pos")

# Apply the function to each dataframe
blockper1_clean <- clean_and_select(blockper1, selected_columns)
blockper2_clean <- clean_and_select(blockper2, selected_columns)
stealper1_clean <- clean_and_select(stealper1, selected_columns)
stealper2_clean <- clean_and_select(stealper2, selected_columns)
drebper1_clean <- clean_and_select(drebper1, selected_columns)
drebper2_clean <- clean_and_select(drebper2, selected_columns)

# Combine tables. First combine blocks, steal, and dreb, then join by `Player`

# Combine blocks, steals, and dreb

blockper <- blockper1_clean |>
  bind_rows(blockper2_clean)

stealper <- stealper1_clean |>
  bind_rows(stealper2_clean)

drebper <- drebper1_clean |>
  bind_rows(drebper2_clean)

# bind rows for blocks steals and dreb
combined_def <- blockper |>
  bind_rows(stealper, drebper) |> # remove duplicate players
  distinct()

# Join by `Player`, keep all observations
combined_def <- combined_def |>
  select("Player", "BLK%", "STL%", "DRB%", "MP", "GS", "DBPM", "OBPM", "BPM", "VORP", "Pos") |> 
  mutate("Stocks%" = `BLK%` + `STL%`) |>
  mutate("StockDRB%" = `BLK%` + `STL%` + `DRB%`)

# gets labels of Michael Jordan and LeBron James to add to graph
mj_lbj <- combined_def |>
  filter(Player %in% c("Michael Jordan", "LeBron James"))

combined_def |>
  ggplot(aes(x = `BLK%`, y = `STL%`, fill = DBPM, size = `DRB%`)) + 
  geom_point(shape = 21, alpha = .9) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  scale_size_continuous(range = c(.1, 7)) + # add labels for Michael Jordan and LeBron James
  geom_text_repel(data = mj_lbj, aes(label = paste(Player)), size = 5, max.overlaps = 10) +
  My_Theme_WithY() + 
  ggrepel::geom_text_repel(aes(label = paste(Player)),
                           size = 5, max.overlaps = 10) +
  labs(title = "Best Defenders: Block and Steal %",
       subtitle = "X = BLK%, Y = STL%, Size = DRB%, Fill = DBPM.\nUniverse of players limited to combined top 400 players for Block %, Steal %, or DREB%",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "Block Percentage",  
       y = "Steal Percentage",
       fill = "Defensive Box Plus-Minus") 

Code
combined_def |>
  ggplot(aes(x = `Stocks%`, y = `DRB%`, fill = DBPM, size = `MP`)) + 
  geom_point(shape = 21, alpha = .9) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  scale_size_continuous(range = c(.1, 7)) +
  geom_text_repel(data = mj_lbj, aes(label = paste(Player)), size = 5, max.overlaps = 10) +
  My_Theme_WithY() + 
  ggrepel::geom_text_repel(aes(label = paste(Player)),
                           size = 5, max.overlaps = 10) +
  labs(title = "Best Defenders: Stocks % and DREB %",
       subtitle = "X = BLK%, Y = STL%, Size = MP, Fill = DBPM.\nUniverse of players limited to combined top 400 players for Block %, Steal %, or DREB%",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "Stocks Percentage",  
       y = "Defensive Rebound Percentage",
       fill = "Defensive Box Plus-Minus")

Wemby already unreal on defense. Unlike the two other outliers (Drummond and Whiteside), Wemby isn’t a stiff while defending on the perimeter, and seems much smarter overall on defense (i.e. doesn’t chase blocks and easily fall for pumpfakes). And obviously he’s a much better shooter, passer, and playmaker on offense (also indicating more skill and spatial awareness, which is good for team defense).

I think he’ll go down as GOAT defender. Something to track.

Code
# # Horizontal bar graph by DBPM
combined_def |>
  arrange(desc(DBPM)) |>
  head(50) |>
  ggplot(aes(x=reorder(Player, DBPM), y=DBPM)) +
  geom_segment(aes(x=reorder(Player, DBPM), xend=reorder(Player, DBPM), y=0, yend=DBPM), size=1, color="black") +
  geom_point(color="black", size=10) +
  geom_text(aes(label=sprintf("%.1f", DBPM)), color="white") +  # Formatting label to 1 decimal place
  coord_flip() +
  theme_minimal() +
  labs(title="Top 50 Career DBPM", 
       subtitle="Ranking players by Career Regular Season.\nUniverse of players limited to combined top 400 players for Block %, Steal %, or DREB%",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Player",
       y="DBPM") +
  My_Theme_WithY()

5 Greatest Games Ever

Code
# Download data from BBall Ref
# Function to process the data
process_data <- function(url, bpm = TRUE) {
  page <- read_html(url)
  data <- html_table(page)[[1]]
  
  if (bpm) {
    data <- data |> 
      rename("BPM2" = 3, "MP2" = 12, Road = 8) |> 
      select(-c(1, 3, 12))
  } else {
    data <- data |> 
      rename("GS2" = 3, Road = 7) |> 
      select(-c(1, 3))
  }

  return(data)
}

# URLs for BPM and GS data
bpm_rs_urls <- c("https://stathead.com/tiny/x9frE",
                 "https://stathead.com/tiny/9bRhn",
                 "https://stathead.com/tiny/rWFnt",
                 "https://stathead.com/tiny/zSeTH",
                 "https://stathead.com/tiny/gYZEl")
gs_rs_urls <- c("https://stathead.com/tiny/mqR2d",
                "https://stathead.com/tiny/C3djd",
                "https://stathead.com/tiny/KE9qJ",
                "https://stathead.com/tiny/tghKu",
                "https://stathead.com/tiny/gxAjF")

# Processing the data
greatest_games_bpm_rs_list <- lapply(bpm_rs_urls, process_data, bpm = TRUE)
greatest_games_gs_rs_list <- lapply(gs_rs_urls, process_data, bpm = FALSE)
greatest_games_bpm_combined <- bind_rows(greatest_games_bpm_rs_list)
greatest_games_gs_combined <- bind_rows(greatest_games_gs_rs_list)
Code
#Combine both datasets by rows
greatest_games <- greatest_games_bpm_combined |>
  bind_rows(greatest_games_gs_combined)

# Remove duplicate rows by Player and Date
greatest_games <- greatest_games |>
  distinct(Player, Date, .keep_all = TRUE)

# Write CSV
write_csv(greatest_games, "greatest_games.csv")

# Create Colored Bubble Plot in GGPlot with GmSc on X, BPM/100*MP on Y, size of bubble is MP, colore is "+/-"

greatest_games <- greatest_games |>
  filter(!is.na(`GmSc`)) |>
  filter(!is.na(`BPM`)) |>
  mutate(GameBPM = BPM/25*MP)

# Filter all LeBron games
lebron_greatest_games <- greatest_games |>
  filter(Player == "LeBron James")

greatest_games |>
  ggplot(aes(x = GmSc, y = BPM, fill = PTS)) +
  geom_point(shape = 21, alpha = .9, size = 3) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 50) +
  scale_x_continuous(limits = c(40, 70)) +
  theme(legend.position = c(.9, .9)) +
  My_Theme_WithY() + # add ggrepel labels of `Player` and `Date` to the plot
  ggrepel::geom_text_repel(aes(label = paste(Player, Date)), size = 5, nudge_x = 1, nudge_y = 1, max.overlaps = 5) + # add lebron james labels to the plot
  geom_text_repel(data = lebron_greatest_games, aes(label = paste(Player, Date)), size = 5, nudge_x = -1, nudge_y = 1, max.overlaps = 5) +
  
  labs(title = "Greatest Single-Game Individual Performances Since 1972",
       subtitle = "Single-Game BPM and Game Score.\nUniverse of players limited to top 1000 Game Score games.",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x = "Game Score",
       y = "Box Score Plus-Minus",
       fill = "Points Scored")

6 Greatest Team Offense Ever

Adjust for league offensive rating

Greatest Team Offenses Since 1972
Team Season ORtg+ ORtg SRS
DAL 2003-04 1.089 112.1 4.86
PHO 2004-05 1.079 114.5 7.08
GSW 2015-16 1.076 114.5 10.38
DAL 2001-02 1.074 112.2 4.41
UTA 1997-98 1.073 112.7 5.73
PHO 2009-10 1.072 115.3 4.67
CHI 1996-97 1.072 114.4 10.70
SAC 2003-04 1.072 110.3 5.41
CHI 1995-96 1.071 115.2 11.80
BOS 1987-88 1.069 115.4 6.15
DEN 1981-82 1.069 114.3 0.13
PHO 2006-07 1.069 113.9 7.28
DAL 2002-03 1.069 110.7 7.90
LAL 1986-87 1.067 115.6 8.32
CHI 1991-92 1.067 115.5 10.07
LAL 1997-98 1.066 111.9 6.88
UTA 1996-97 1.065 113.6 7.97
LAC 2014-15 1.064 112.4 6.80
IND 1998-99 1.064 108.7 3.86
ORL 1994-95 1.063 115.1 6.44

7 NBA Goat

Code
# Function to extract table from URL
extract_table_from_url <- function(url) {
  page <- read_html(url)
  table <- html_table(page)[[1]]
  return(table)
}

# Extract tables
r_ws_url <- c("https://stathead.com/tiny/QKiYH",
              "https://stathead.com/tiny/RYDCj",
              "https://stathead.com/tiny/0yUoP",
              "https://stathead.com/tiny/NUtGM",
              "https://stathead.com/tiny/A5XHn")
p_ws_url <- c("https://stathead.com/tiny/ptbsk",
              "https://stathead.com/tiny/EadMu",
              "https://stathead.com/tiny/UJA6r",
              "https://stathead.com/tiny/PWfx6",
              "https://stathead.com/tiny/tiEyh")

# Processing the data
r_ws_list <- lapply(r_ws_url, extract_table_from_url)

# combine them into single dataframes
r_ws <- bind_rows(r_ws_list)

# Processing the data
p_ws_list <- lapply(p_ws_url, extract_table_from_url)

# combine them into single dataframes
p_ws <- bind_rows(p_ws_list)
Code
# Remove column 3 from r_ws and p_ws
r_ws <- r_ws |>
  select(2, 4, 5, 7, 10, 15, 18, 19) |> # add suffix to every column name "_rs"
  rename_with(~ paste0(., "_r"), -1)

p_ws <- p_ws |>
  select(2, 4, 5, 7, 10, 15, 18, 19) |> # add suffix to every column name "_rs"
  rename_with(~ paste0(., "_p"), -1)

p_ws <- p_ws |> 
  separate(From_p, into = c("start_year", "From_YearEnd_P"), sep = "-") |> 
  mutate(
    From_YearEnd_P = paste0(substr(start_year, 1, 2), From_YearEnd_P),
    From_YearEnd_P = if_else(From_YearEnd_P == "1900", "2000", From_YearEnd_P),
    From_YearEnd_P = year(ymd(paste0(From_YearEnd_P, "-01-01")))
  ) |> 
  select(-start_year) |> 
  separate(To_p, into = c("start_year", "To_YearEnd_P"), sep = "-") |> 
  mutate(
    To_YearEnd_P = paste0(substr(start_year, 1, 2), To_YearEnd_P),
    To_YearEnd_P = if_else(To_YearEnd_P == "1900", "2000", To_YearEnd_P),
    To_YearEnd_P = year(ymd(paste0(To_YearEnd_P, "-01-01")))
  ) |> 
  select(-start_year)

r_ws <- r_ws |> 
  separate(From_r, into = c("start_year", "From_YearEnd_r"), sep = "-") |> 
  mutate(
    From_YearEnd_r = paste0(substr(start_year, 1, 2), From_YearEnd_r),
    From_YearEnd_r = if_else(From_YearEnd_r == "1900", "2000", From_YearEnd_r),
    From_YearEnd_r = year(ymd(paste0(From_YearEnd_r, "-01-01")))
  ) |> 
  select(-start_year) |> 
  separate(To_r, into = c("start_year", "To_YearEnd_r"), sep = "-") |> 
  mutate(
    To_YearEnd_r = paste0(substr(start_year, 1, 2), To_YearEnd_r),
    To_YearEnd_r = if_else(To_YearEnd_r == "1900", "2000", To_YearEnd_r),
    To_YearEnd_r = year(ymd(paste0(To_YearEnd_r, "-01-01")))
  ) |> 
  select(-start_year)
Code
# Bubble Graph
ggplot(goat_data,
       aes(x=Rate_Z,
           y=Total_Z))+
  geom_point(alpha=0.7, size=5, shape=21, aes(fill=GOAT_Score)) +
  scale_fill_gradient2(midpoint=0, low="blue", mid="white",
                       high="red", space ="Lab", name="GOAT Score") +
  scale_size(range = c(2, 24)) +
  geom_text_repel(aes(label=Player), size=5, color="black", max.overlaps = 15) +
  theme_minimal()+
  labs(title="GOAT NBA Players", 
       subtitle="Ranking players by value per possession and total career value (playoffs and regular season). BPM and VORP used for post-1974 players, WS and WS/48 used for pre-1974 players.\nUniverse of players limited to unique 1200 players listed in top 1000 list for regular season or playoff WS",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Value Per Possession in Regular Season and Playoffs",
       y="Value (Total) in Regular Season and Playoffs",
       color="GOAT Scores (Mean of Rate and Total Z-Scores") +
  My_Theme_WithY()

Code
goat_data |>
  mutate(Rank_Player = paste(Rank, ". ", Player, sep="")) |> 
  head(100) |> 
  ggplot(aes(x=reorder(Rank_Player, GOAT_Score), y=GOAT_Score)) +
    geom_segment(aes(x=Rank_Player, xend=Rank_Player, y=0, yend=GOAT_Score), size=1, color="black") +
    geom_point(aes(color=if_else(To_YearEnd_r == 2024, "2024", "Other")), size=10) + 
    scale_color_manual(values = c("2024" = "red", "Other" = "black")) +
    geom_text(aes(label=sprintf("%.1f", GOAT_Score)), color="white") +  # Formatting label to 1 decimal place
  coord_flip() +
  labs(title="Top 100 NBA Players", 
       subtitle="Ranking players by value per possession and total career value (playoffs and regular season). BPM and VORP used for post-1974 \nplayers, WS and WS/48 used for pre-1974 players. Universe of players limited to unique 1200 players listed in top 1000 list \nfor regular season or playoff WS",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Player",
       y="GOAT Score (Mean of Rate and Total Z-Scores)") +
  My_Theme_WithY() +
  theme(legend.position = "none")

8 Wemby and Chet Tracker: AKA Greatest Rookies Ever

Code
# Function to extract table from URL
extract_table_from_url <- function(url) {
  page <- read_html(url)
  table <- html_table(page)[[1]]
  return(table)
}

# Extract tables
rookies_1974_2024_tables <- c("https://stathead.com/tiny/skxFd",
                       "https://stathead.com/tiny/3BQme",
                       "https://stathead.com/tiny/CpSrM",
                       "https://stathead.com/tiny/NwDZB",
                       "https://stathead.com/tiny/YC5jw",
                       "https://stathead.com/tiny/UCpvt",
                       "https://stathead.com/tiny/xSwSt",
                       "https://stathead.com/tiny/f02hE",
                       "https://stathead.com/tiny/COE1s",
                       "https://stathead.com/tiny/XyVKE",
                       "https://stathead.com/tiny/kiFnl",
                       "https://stathead.com/tiny/UBgqZ",
                       "https://stathead.com/tiny/NpXJY",
                       "https://stathead.com/tiny/aVbhc") # min 100 min
Code
# Processing the data
rookies_1974_2024 <- lapply(rookies_1974_2024_tables, extract_table_from_url)

# combine them into single dataframes
rookies_1974_2024 <- bind_rows(rookies_1974_2024)

# Clean up the data
rookies_1974_2024 <- rookies_1974_2024 |> 
  select(c(-3)) |> 
  rename(VORP = "VORP...20") |> 
  separate(Season, into = c("start_year", "Season"), sep = "-") |> 
  mutate(
    Season = paste0(substr(start_year, 1, 2), Season),
    Season = if_else(Season == "1900", "2000", Season),
    Season = year(ymd(paste0(Season, "-01-01"))))

# Get average games played

avg_games_played_url <- read_html("https://www.basketball-reference.com/leagues/NBA_2024.html#per_game-team")
avg_games_played <- html_table(avg_games_played_url)[[5]]

# Filter only last row, select column 3
avg_games_played <- avg_games_played |> 
  filter(Team == "League Average") |> 
  select(3) |> # get data as numerical "Values"
  pull()

# Prorate VORP for Season 2024 based on remaining games 
prorate <- 82/avg_games_played

rookies_1974_2024 <- rookies_1974_2024 |> 
  mutate(VORP_prorated = if_else(Season == 2024, VORP * prorate, VORP)) |> # round to 1 decimal
  mutate(VORP_prorated = round(VORP_prorated, 1))
Code
# Bubble Graph

# Highlight Player "Victor Wembenyama" on bubble graphs (geom_point is filled red)
rookies_1974_2024 <- rookies_1974_2024 |> 
  mutate(highlight = if_else(Player %in% c("Victor Wembanyama", "Chet Holmgren", "Scoot Henderson"), "yes", "no"))

ggplot(rookies_1974_2024, aes(x=BPM, y=VORP_prorated)) +
  geom_point(alpha=0.7, shape=21, size=3, aes(color=highlight, fill=highlight)) +
  scale_color_manual(values = c("yes" = "black", "no" = "black")) + # Ensure borders are black for all
  scale_fill_manual(values = c("yes" = "red", "no" = "black")) + # Highlight fill color for Victor Wembanyama
  geom_text_repel(data = subset(rookies_1974_2024, highlight == "yes"), # Filter for highlighted player
                  aes(label=Player), size=5, color="black",
                  nudge_x = 1, nudge_y = 1,
                  box.padding = 0.35, # Slight padding for better visibility
                  point.padding = 0.2, # Adjusted for optimal positioning
                  max.overlaps = 5) +
  geom_text_repel(aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0,
                  point.padding = 0,
                  max.overlaps = 3) +
  labs(title="NBA Rookies by BPM and VORP (BPM by minutes played) (1974-2024)",
       subtitle="Ranking NBA rookies by value per 100 possessions (BPM) and total value (VORP), 2024 rookies VORP prorated to full season.\nUniverse of players limited to all rookies since 1974 with at least 100 minutes played",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Value per 100 possessions (BPM)",
       y="Total Value (VORP)") +
  guides(color=FALSE, fill=FALSE) + # Hide the legend for 'highlight'
  My_Theme_WithY()

Code
rookies_age_less_21 <- rookies_1974_2024 |> 
  filter(Age<21)

ggplot(rookies_age_less_21, aes(x=BPM, y=VORP_prorated)) +
  geom_point(alpha=0.7, shape=21, size=3, aes(color=highlight, fill=highlight)) +
  scale_color_manual(values = c("yes" = "black", "no" = "black")) + # Ensure borders are black for all
  scale_fill_manual(values = c("yes" = "red", "no" = "black")) + # Highlight fill color for Victor Wembanyama
  geom_text_repel(aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0,
                  point.padding = 0,
                  max.overlaps = 10) +
  labs(title="NBA Rookies (20 and younger) by BPM and VORP (BPM by minutes played) (2024)",
       subtitle="Ranking NBA rookies by value per 100 possessions (BPM) and total value (VORP), 2024 rookies VORP prorated to full season.\nUniverse of players limited to all rookies since 1974 age 20 and younger with at least 100 minutes played",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Value per 100 possessions (BPM)",
       y="Total Value (VORP)") +
  guides(color=FALSE, fill=FALSE) + # Hide the legend for 'highlight'
  My_Theme_WithY()

Code
# Highlight Player "Victor Wembenyama" on bubble graphs (geom_point is filled red)
rookies_2024 <- rookies_1974_2024 |> 
  mutate(highlight = if_else(Player %in% c("Victor Wembanyama", "Chet Holmgren", "Scoot Henderson"), "yes", "no")) |>
  filter(Season == 2024)


ggplot(rookies_2024, aes(x=BPM, y=VORP_prorated)) +
  geom_point(alpha=0.7, shape=21, size=3, aes(color=highlight, fill=highlight)) +
  scale_color_manual(values = c("yes" = "black", "no" = "black")) + # Ensure borders are black for all
  scale_fill_manual(values = c("yes" = "red", "no" = "black")) + # Highlight fill color for Victor Wembanyama
  geom_text_repel(aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0,
                  point.padding = 0,
                  max.overlaps = 10) +
  labs(title="NBA Rookies by BPM, VORP, and Minutes Played (2024)",
       subtitle="Ranking 2024 NBA rookies by value per 100 possessions (BPM) and total value (VORP), 2024 rookies VORP prorated to full season.\nUniverse of players limited to all rookies since 1974 age 20 and younger with at least 100 minutes played",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Value per 100 possessions (BPM)",
       y="Total Value (VORP)") +
  guides(color=FALSE, fill=FALSE) + # Hide the legend for 'highlight'
  My_Theme_WithY()

Code
ggplot(rookies_1974_2024, aes(x=BPM, y=VORP_prorated)) +
  geom_point(alpha=0.7, shape=21, size=3, aes(color=highlight, fill=highlight)) +
  scale_color_manual(values = c("yes" = "black", "no" = "black")) + # Ensure borders are black for all
  scale_fill_manual(values = c("yes" = "red", "no" = "black")) + # Highlight fill color for Victor Wembanyama
  geom_text_repel(data = subset(rookies_1974_2024, highlight == "yes"), # Filter for highlighted player
                  aes(label=Player), size=5, color="black",
                  nudge_x = 1, nudge_y = 1,
                  box.padding = 0.35, # Slight padding for better visibility
                  point.padding = 0.2, # Adjusted for optimal positioning
                  max.overlaps = 5) +
  geom_text_repel(aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0,
                  point.padding = 0,
                  max.overlaps = 5) +
  labs(title="NBA Rookies by BPM, VORP, and Minutes Played (1974-2024), by Age",
       subtitle="Ranking NBA rookies by value per 100 possessions (BPM) and total value (VORP), 2024 rookies VORP prorated to full season.\nUniverse of players limited to all rookies since 1974 age 20 and younger with at least 100 minutes played",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Value per 100 possessions (BPM)",
       y="Total Value (VORP)") +
  guides(color=FALSE, fill=FALSE) + # Hide the legend for 'highlight'
  facet_wrap(~Age, ncol=3) + # Facet by 'Age', adjust 'ncol' as needed for layout
  My_Theme_WithY()

Code
# For rookies, calculate Z scores for age, BPM, and VORP_prorated. Age Z score should be reverse scale (lower age scores higher)
rookies_1974_2024 <- rookies_1974_2024 |> 
  mutate(Z_Age = scale(-Age), # Reverse scale for Age
         Z_BPM = scale(BPM),
         Z_VORP_prorated = scale(VORP_prorated))

# For rookies, create "GOAT Rookie Score", which is mean of all 3 Z scores.
rookies_1974_2024 <- rookies_1974_2024 |> 
  mutate(GOAT_Rookie_Score = rowMeans(select(rookies_1974_2024, starts_with("Z_"))))

rookies_1974_2024 <- rookies_1974_2024 |>
  arrange(desc(GOAT_Rookie_Score)) |>
  mutate(Rank = rank(-GOAT_Rookie_Score, ties.method = "min"), # Calculate rank
         Season_2024 = if_else(Season == 2024, TRUE, FALSE)) # Identify 2024 players
Code
rookies_1974_2024 |> 
  mutate(Rank_Player = paste(Rank, ". ", Player, sep="")) |> 
  head(50) |> 
  ggplot(aes(x=reorder(Rank_Player, GOAT_Rookie_Score), y=GOAT_Rookie_Score)) +
    geom_segment(aes(x=Rank_Player, xend=Rank_Player, y=0, yend=GOAT_Rookie_Score), size=1, color="black") +
    geom_point(aes(color=if_else(Season == 2024, "2024", "Other")), size=10) + 
    scale_color_manual(values = c("2024" = "red", "Other" = "black")) +
    geom_text(aes(label=sprintf("%.1f", GOAT_Rookie_Score)), color="white") +  # Formatting label to 1 decimal place
    coord_flip() +
    theme_minimal() +
    labs(title="Top 50 GOAT NBA Rookies (1974-2024) by `GOAT Rookie Score`", 
         subtitle="GOAT Rookie Score calculated as mean of Z-Scores for Age, BPM, and VORP pro-rated to full season. 2024 rookies highlighted in red.",
         caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
         x="GOAT Rookie Score (mean Z-Scores for Age, BPM, and VORP pro-rated)",
         y="") + # Removing y-axis label as the names will serve as labels
    guides(color=FALSE) + # Remove color legend
    My_Theme_WithY()

Code
ggplot(rookies_1974_2024, aes(x=Age, y=VORP_prorated, size=highlight)) +
  geom_point(alpha=0.4, shape=21, aes(color=highlight, fill=highlight)) +
  scale_color_manual(values = c("yes" = "black", "no" = "black")) + # Ensure borders are black for all
  scale_fill_manual(values = c("yes" = "red", "no" = "black")) + # Highlight fill color for Victor Wembanyama
  scale_size_manual(values = c("yes" = 4, "no" = 2)) +  # Adjusted size
  geom_text_repel(aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0,
                  point.padding = 0,
                  max.overlaps = 3) +
  geom_text_repel(data = subset(rookies_1974_2024, highlight == "yes"), # Filter for highlighted player
                  aes(label=Player), size=5, color="black",
                  nudge_x = 0, nudge_y = 0,
                  box.padding = 0, # Slight padding for better visibility
                  point.padding = 0, # Adjusted for optimal positioning
                  max.overlaps = 5) +
  labs(title="NBA Rookies by Age and VORP",
       subtitle="Ranking NBA rookies by total value (VORP), 2024 rookies VORP prorated to full season.\nUniverse of players limited to all rookies since 1974 with at least 100 minutes played",
       caption=paste("Sources: Basketball-Reference.com.\nnickwarino.com, created", as.character(current_date)),
       x="Age",
       y="Total Value (VORP Prorated)") +
  guides(color=FALSE, fill=FALSE, size=FALSE) + # Hide the legend for 'highlight'
  My_Theme_WithY()