# 1. Setup
library(tidyverse) Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'stringr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.2
✔ purrr 1.2.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringi) Warning: package 'stringi' was built under R version 4.5.2
library(ggrepel) Warning: package 'ggrepel' was built under R version 4.5.2
library(patchwork) Warning: package 'patchwork' was built under R version 4.5.2
library(gridExtra) Warning: package 'gridExtra' was built under R version 4.5.2
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
library(plotly) Warning: package 'plotly' was built under R version 4.5.2
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(cluster) Warning: package 'cluster' was built under R version 4.5.3
library(NbClust) Warning: package 'NbClust' was built under R version 4.5.2
library(hoopR)Warning: package 'hoopR' was built under R version 4.5.3
# eliminate scientific notation in salaries
options(scipen = 999)
library(tidyverse)
library(readr)
# ============================================================
# STEP 1 — Load and clean salary CSV
# ============================================================
salaries_raw <- read_csv("sportsref_download.csv", skip = 1)Rows: 530 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Player, Tm, 2025-26, 2026-27, 2027-28, 2028-29, 2029-30, 2030-31, G...
dbl (1): Rk
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview
glimpse(salaries_raw)Rows: 530
Columns: 10
$ Rk <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ Player <chr> "Stephen Curry", "Joel Embiid", "Nikola Jokić", "Kevin Dura…
$ Tm <chr> "GSW", "PHI", "DEN", "HOU", "BOS", "WAS", "MIL", "GSW", "BO…
$ `2025-26` <chr> "$59,606,817", "$55,224,526", "$55,224,526", "$54,708,609",…
$ `2026-27` <chr> "$62,587,158", "$58,100,000", "$59,033,114", "$43,902,439",…
$ `2027-28` <chr> NA, "$62,748,000", "$62,841,702", "$46,097,561", "$62,786,6…
$ `2028-29` <chr> NA, "$67,396,000", NA, NA, "$67,116,798", NA, NA, NA, "$64,…
$ `2029-30` <chr> NA, NA, NA, NA, "$71,446,914", NA, NA, NA, NA, "$69,191,228…
$ `2030-31` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Guaranteed <chr> "$122,193,975", "$176,072,526", "$114,257,640", "$98,611,04…
# Function to clean a salary string and return a numeric value
# Handles inputs like "$59,606,817" or "59606817" or "$59606817"
clean_salary <- function(salary) {
cleaned <- c() # empty vector to store results
for (i in salary) {
if (is.character(i)) {
# if salary is a string, strip $ signs, commas, and whitespace
i <- i |>
str_remove_all("\\$") |>
str_remove_all(",") |>
str_trim() |>
as.numeric()
}
cleaned <- c(cleaned, i)
}
return(cleaned)
}
# Apply the function inside mutate()
salary_df <- salaries_raw |>
select(Player, Tm, `2025-26`) |>
rename(salary = `2025-26`) |>
filter(!is.na(Player), !is.na(salary)) |>
mutate(salary = clean_salary(salary))
#will only have one player per row. keeps highest salary row
salary_df_clean <- salary_df |>
group_by(Player) |>
summarize(
Tm = Tm[which.max(salary)], # keep team associated with highest salary
salary = max(salary), # keep highest salary value
.groups = "drop"
)
# Confirm duplicates are gone
salary_df_clean |>
count(Player) |>
filter(n > 1)# A tibble: 0 × 2
# ℹ 2 variables: Player <chr>, n <int>
# ============================================================
# STEP 2 Performance data (hoopR, 2024-25 regular season)
# ============================================================
# hoopR season labels use end of year, so 2024-25 = 2025
# pull per game box scores for every player
#install.packages("hoopR")
pbox_raw <- hoopR::load_nba_player_box(seasons = 2025)
colnames(pbox_raw) [1] "game_id" "season"
[3] "season_type" "game_date"
[5] "game_date_time" "athlete_id"
[7] "athlete_display_name" "team_id"
[9] "team_name" "team_location"
[11] "team_short_display_name" "minutes"
[13] "field_goals_made" "field_goals_attempted"
[15] "three_point_field_goals_made" "three_point_field_goals_attempted"
[17] "free_throws_made" "free_throws_attempted"
[19] "offensive_rebounds" "defensive_rebounds"
[21] "rebounds" "assists"
[23] "steals" "blocks"
[25] "turnovers" "fouls"
[27] "plus_minus" "points"
[29] "starter" "ejected"
[31] "did_not_play" "reason"
[33] "active" "athlete_jersey"
[35] "athlete_short_name" "athlete_headshot_href"
[37] "athlete_position_name" "athlete_position_abbreviation"
[39] "team_display_name" "team_uid"
[41] "team_slug" "team_logo"
[43] "team_abbreviation" "team_color"
[45] "team_alternate_color" "home_away"
[47] "team_winner" "team_score"
[49] "opponent_team_id" "opponent_team_name"
[51] "opponent_team_location" "opponent_team_display_name"
[53] "opponent_team_abbreviation" "opponent_team_logo"
[55] "opponent_team_color" "opponent_team_alternate_color"
[57] "opponent_team_score"
glimpse(pbox_raw)Rows: 32,193
Columns: 57
$ game_id <int> 401766128, 401766128, 401766128, 401…
$ season <int> 2025, 2025, 2025, 2025, 2025, 2025, …
$ season_type <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
$ game_date <date> 2025-06-22, 2025-06-22, 2025-06-22,…
$ game_date_time <dttm> 2025-06-22 20:00:00, 2025-06-22 20:…
$ athlete_id <int> 3149673, 3133628, 4395712, 4396909, …
$ athlete_display_name <chr> "Pascal Siakam", "Myles Turner", "An…
$ team_id <int> 11, 11, 11, 11, 11, 11, 11, 11, 11, …
$ team_name <chr> "Pacers", "Pacers", "Pacers", "Pacer…
$ team_location <chr> "Indiana", "Indiana", "Indiana", "In…
$ team_short_display_name <chr> "Pacers", "Pacers", "Pacers", "Pacer…
$ minutes <dbl> 37, 24, 37, 30, 7, 21, 1, 5, 28, 16,…
$ field_goals_made <int> 5, 2, 4, 1, 3, 0, 0, 0, 8, 0, 0, 6, …
$ field_goals_attempted <int> 13, 4, 10, 5, 5, 4, 1, 0, 13, 1, 0, …
$ three_point_field_goals_made <int> 2, 1, 2, 1, 3, 0, 0, 0, 0, 0, 0, 2, …
$ three_point_field_goals_attempted <int> 5, 3, 3, 3, 4, 3, 1, 0, 0, 1, 0, 5, …
$ free_throws_made <int> 4, 1, 5, 0, 0, 0, 0, 2, 0, 0, 0, 10,…
$ free_throws_attempted <int> 7, 4, 6, 0, 0, 0, 0, 2, 0, 0, 0, 10,…
$ offensive_rebounds <int> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 7, …
$ defensive_rebounds <int> 3, 3, 4, 5, 0, 2, 0, 1, 6, 3, 0, 6, …
$ rebounds <int> 4, 4, 5, 6, 0, 2, 0, 1, 6, 4, 0, 13,…
$ assists <int> 2, 1, 6, 1, 0, 1, 0, 0, 3, 0, 0, 3, …
$ steals <int> 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, …
$ blocks <int> 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, …
$ turnovers <int> 1, 1, 2, 1, 1, 3, 0, 1, 7, 1, 0, 3, …
$ fouls <int> 2, 3, 3, 6, 0, 2, 0, 2, 2, 1, 0, 3, …
$ plus_minus <chr> "-4", "-12", "-9", "+6", "-2", "-13"…
$ points <int> 16, 6, 15, 3, 9, 0, 0, 2, 16, 0, 0, …
$ starter <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,…
$ ejected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ did_not_play <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ reason <chr> "COACH'S DECISION", "COACH'S DECISIO…
$ active <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, T…
$ athlete_jersey <chr> "43", "33", "2", "23", "0", "1", "3"…
$ athlete_short_name <chr> "P. Siakam", "M. Turner", "A. Nembha…
$ athlete_headshot_href <chr> "https://a.espncdn.com/i/headshots/n…
$ athlete_position_name <chr> "Forward", "Center", "Guard", "Guard…
$ athlete_position_abbreviation <chr> "F", "C", "G", "G", "G", "F", "C", "…
$ team_display_name <chr> "Indiana Pacers", "Indiana Pacers", …
$ team_uid <chr> "s:40~l:46~t:11", "s:40~l:46~t:11", …
$ team_slug <chr> "indiana-pacers", "indiana-pacers", …
$ team_logo <chr> "https://a.espncdn.com/i/teamlogos/n…
$ team_abbreviation <chr> "IND", "IND", "IND", "IND", "IND", "…
$ team_color <chr> "002d62", "002d62", "002d62", "002d6…
$ team_alternate_color <chr> "ffd520", "ffd520", "ffd520", "ffd52…
$ home_away <chr> "away", "away", "away", "away", "awa…
$ team_winner <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ team_score <int> 91, 91, 91, 91, 91, 91, 91, 91, 91, …
$ opponent_team_id <int> 25, 25, 25, 25, 25, 25, 25, 25, 25, …
$ opponent_team_name <chr> "Thunder", "Thunder", "Thunder", "Th…
$ opponent_team_location <chr> "Oklahoma City", "Oklahoma City", "O…
$ opponent_team_display_name <chr> "Oklahoma City Thunder", "Oklahoma C…
$ opponent_team_abbreviation <chr> "OKC", "OKC", "OKC", "OKC", "OKC", "…
$ opponent_team_logo <chr> "https://a.espncdn.com/i/teamlogos/n…
$ opponent_team_color <chr> "007ac1", "007ac1", "007ac1", "007ac…
$ opponent_team_alternate_color <chr> "ef3b24", "ef3b24", "ef3b24", "ef3b2…
$ opponent_team_score <int> 103, 103, 103, 103, 103, 103, 103, 1…
# keep only regular season games, season_type == 2
pbox <- pbox_raw |>
filter(season_type == 2)
# minutes is stored as character in some seasons
pbox <- pbox |>
mutate(minutes = suppressWarnings(as.numeric(minutes)))
# aggregate from per-game rows to per-player-per-season totals
# compute per-game averages
stats_df <- pbox |>
filter(!is.na(athlete_display_name)) |>
group_by(athlete_display_name, team_abbreviation) |>
summarize(
GP = sum(!is.na(points)),
MIN = sum(minutes, na.rm = TRUE),
PTS = sum(points, na.rm = TRUE),
REB = sum(rebounds, na.rm = TRUE),
AST = sum(assists, na.rm = TRUE),
STL = sum(steals, na.rm = TRUE),
BLK = sum(blocks, na.rm = TRUE),
TOV = sum(turnovers, na.rm = TRUE),
FGM = sum(field_goals_made, na.rm = TRUE),
FGA = sum(field_goals_attempted, na.rm = TRUE),
FG3M = sum(three_point_field_goals_made, na.rm = TRUE),
FG3A = sum(three_point_field_goals_attempted, na.rm = TRUE),
FTM = sum(free_throws_made, na.rm = TRUE),
FTA = sum(free_throws_attempted, na.rm = TRUE),
PF = sum(fouls, na.rm = TRUE),
position = athlete_position_abbreviation[which.max(!is.na(athlete_position_abbreviation))], # most common position
.groups = "drop"
) |>
# players traded mid season appear under more than one team. fix them up to a single season total by player
group_by(athlete_display_name) |>
summarize(
team = team_abbreviation[which.max(GP)], # primary team = most games
position = position[which.max(GP)], # position from primary team stint
GP = sum(GP),
MIN = sum(MIN),
PTS = sum(PTS),
REB = sum(REB),
AST = sum(AST),
STL = sum(STL),
BLK = sum(BLK),
TOV = sum(TOV),
FGM = sum(FGM),
FGA = sum(FGA),
FG3M = sum(FG3M),
FG3A = sum(FG3A),
FTM = sum(FTM),
FTA = sum(FTA),
PF = sum(PF),
.groups = "drop"
) |>
filter(GP>=20)|> #drops players with less than 20 games
# shooting percentages
mutate(
FG_PCT = if_else(FGA > 0, FGM / FGA, NA_real_),
FG3_PCT = if_else(FG3A > 0, FG3M / FG3A, NA_real_),
FT_PCT = if_else(FTA > 0, FTM / FTA, NA_real_)
) |>
# per-game averages
mutate(
MPG = MIN / GP,
PPG = PTS / GP,
RPG = REB / GP,
APG = AST / GP,
SPG = STL / GP,
BPG = BLK / GP,
TOPG = TOV / GP,
PFPG = PF / GP
) |>
rename(player_name = athlete_display_name)
stats_df <- stats_df |>
mutate(position = case_when(
position %in% c("G", "PG", "SG") ~ "G",
position %in% c("F", "SF", "PF") ~ "F",
position == "C" ~ "C"
))
# Verify
stats_df |>
count(position, sort = TRUE)# A tibble: 3 × 2
position n
<chr> <int>
1 G 219
2 F 163
3 C 65
# ============================================================
# STEP 3 — Use a join command to combine Salary and Player Stats
# ============================================================
# Join stats_df + salary_df
# inner_join — only keep players with both stats AND a salary
# names differ between tables so we specify both sides
nba_df <- stats_df |>
inner_join(salary_df_clean, by = c("player_name" = "Player"))
# Check for any duplicate player rows
nba_df |>
count(player_name) |>
filter(n > 1)# A tibble: 0 × 2
# ℹ 2 variables: player_name <chr>, n <int>
# Spot check to confirm if the join method worked
nba_df |>
select(player_name, position, team, salary, GP, PPG, RPG, APG) |>
arrange(desc(salary)) |>
head(10)# A tibble: 10 × 8
player_name position team salary GP PPG RPG APG
<chr> <chr> <chr> <dbl> <int> <dbl> <dbl> <dbl>
1 Stephen Curry G GS 59606817 66 23.7 4.5 5.92
2 Kevin Durant F PHX 54708609 60 25.8 5.95 4.27
3 Anthony Davis F LAL 54126450 46 25.3 11.8 3.59
4 Giannis Antetokounmpo F MIL 54126450 62 30.2 12.0 6.31
5 Jayson Tatum F BOS 54126450 69 26.6 8.51 5.81
6 Devin Booker G PHX 53142264 69 26.0 4.01 6.93
7 Jaylen Brown G BOS 53142264 61 21.7 5.75 4.49
8 Karl-Anthony Towns C NY 53142264 67 24.0 12.6 2.93
9 LeBron James F LAL 52627153 64 24.5 7.92 8.39
10 Paul George F PHI 51666090 41 16.2 5.34 4.34
