TylerWasif_FinalProject

Introduction

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

Data Processioning

Data Processioning

# 1. Setup

library(tidyverse)      
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'stringr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringi)       
Warning: package 'stringi' was built under R version 4.5.2
library(ggrepel)       
Warning: package 'ggrepel' was built under R version 4.5.2
library(patchwork)    
Warning: package 'patchwork' was built under R version 4.5.2
library(gridExtra)      
Warning: package 'gridExtra' was built under R version 4.5.2

Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
library(plotly)         
Warning: package 'plotly' was built under R version 4.5.2

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(cluster)     
Warning: package 'cluster' was built under R version 4.5.3
library(NbClust)        
Warning: package 'NbClust' was built under R version 4.5.2
library(hoopR)
Warning: package 'hoopR' was built under R version 4.5.3
# eliminate scientific notation in salaries
options(scipen = 999)
library(tidyverse)
library(readr)

# ============================================================
# STEP 1 — Load and clean salary CSV
# ============================================================

salaries_raw <- read_csv("sportsref_download.csv", skip = 1)
Rows: 530 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Player, Tm, 2025-26, 2026-27, 2027-28, 2028-29, 2029-30, 2030-31, G...
dbl (1): Rk

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview
glimpse(salaries_raw)
Rows: 530
Columns: 10
$ Rk         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ Player     <chr> "Stephen Curry", "Joel Embiid", "Nikola Jokić", "Kevin Dura…
$ Tm         <chr> "GSW", "PHI", "DEN", "HOU", "BOS", "WAS", "MIL", "GSW", "BO…
$ `2025-26`  <chr> "$59,606,817", "$55,224,526", "$55,224,526", "$54,708,609",…
$ `2026-27`  <chr> "$62,587,158", "$58,100,000", "$59,033,114", "$43,902,439",…
$ `2027-28`  <chr> NA, "$62,748,000", "$62,841,702", "$46,097,561", "$62,786,6…
$ `2028-29`  <chr> NA, "$67,396,000", NA, NA, "$67,116,798", NA, NA, NA, "$64,…
$ `2029-30`  <chr> NA, NA, NA, NA, "$71,446,914", NA, NA, NA, NA, "$69,191,228…
$ `2030-31`  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Guaranteed <chr> "$122,193,975", "$176,072,526", "$114,257,640", "$98,611,04…
# Function to clean a salary string and return a numeric value
# Handles inputs like "$59,606,817" or "59606817" or "$59606817"

clean_salary <- function(salary) {
  
  cleaned <- c()  # empty vector to store results
  
  for (i in salary) {
    if (is.character(i)) {
      # if salary is a string, strip $ signs, commas, and whitespace
      i <- i |>
        str_remove_all("\\$") |>
        str_remove_all(",") |>
        str_trim() |>
        as.numeric()
    }
    cleaned <- c(cleaned, i)
  }
  return(cleaned)
}

# Apply the function inside mutate()

salary_df <- salaries_raw |>
  select(Player, Tm, `2025-26`) |>
  rename(salary = `2025-26`) |>
  filter(!is.na(Player), !is.na(salary)) |>
  mutate(salary = clean_salary(salary))

#will only have one player per row. keeps highest salary row
salary_df_clean <- salary_df |>
  group_by(Player) |>
  summarize(
    Tm     = Tm[which.max(salary)],   # keep team associated with highest salary
    salary = max(salary),             # keep highest salary value
    .groups = "drop"                  
  )

# Confirm duplicates are gone
salary_df_clean |>
  count(Player) |>
  filter(n > 1)
# A tibble: 0 × 2
# ℹ 2 variables: Player <chr>, n <int>
# ============================================================
# STEP 2 Performance data (hoopR, 2024-25 regular season)
# ============================================================

# hoopR season labels use end of year, so 2024-25 = 2025
# pull per game box scores for every player
#install.packages("hoopR")
pbox_raw <- hoopR::load_nba_player_box(seasons = 2025)
colnames(pbox_raw)
 [1] "game_id"                           "season"                           
 [3] "season_type"                       "game_date"                        
 [5] "game_date_time"                    "athlete_id"                       
 [7] "athlete_display_name"              "team_id"                          
 [9] "team_name"                         "team_location"                    
[11] "team_short_display_name"           "minutes"                          
[13] "field_goals_made"                  "field_goals_attempted"            
[15] "three_point_field_goals_made"      "three_point_field_goals_attempted"
[17] "free_throws_made"                  "free_throws_attempted"            
[19] "offensive_rebounds"                "defensive_rebounds"               
[21] "rebounds"                          "assists"                          
[23] "steals"                            "blocks"                           
[25] "turnovers"                         "fouls"                            
[27] "plus_minus"                        "points"                           
[29] "starter"                           "ejected"                          
[31] "did_not_play"                      "reason"                           
[33] "active"                            "athlete_jersey"                   
[35] "athlete_short_name"                "athlete_headshot_href"            
[37] "athlete_position_name"             "athlete_position_abbreviation"    
[39] "team_display_name"                 "team_uid"                         
[41] "team_slug"                         "team_logo"                        
[43] "team_abbreviation"                 "team_color"                       
[45] "team_alternate_color"              "home_away"                        
[47] "team_winner"                       "team_score"                       
[49] "opponent_team_id"                  "opponent_team_name"               
[51] "opponent_team_location"            "opponent_team_display_name"       
[53] "opponent_team_abbreviation"        "opponent_team_logo"               
[55] "opponent_team_color"               "opponent_team_alternate_color"    
[57] "opponent_team_score"              
glimpse(pbox_raw)
Rows: 32,193
Columns: 57
$ game_id                           <int> 401766128, 401766128, 401766128, 401…
$ season                            <int> 2025, 2025, 2025, 2025, 2025, 2025, …
$ season_type                       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
$ game_date                         <date> 2025-06-22, 2025-06-22, 2025-06-22,…
$ game_date_time                    <dttm> 2025-06-22 20:00:00, 2025-06-22 20:…
$ athlete_id                        <int> 3149673, 3133628, 4395712, 4396909, …
$ athlete_display_name              <chr> "Pascal Siakam", "Myles Turner", "An…
$ team_id                           <int> 11, 11, 11, 11, 11, 11, 11, 11, 11, …
$ team_name                         <chr> "Pacers", "Pacers", "Pacers", "Pacer…
$ team_location                     <chr> "Indiana", "Indiana", "Indiana", "In…
$ team_short_display_name           <chr> "Pacers", "Pacers", "Pacers", "Pacer…
$ minutes                           <dbl> 37, 24, 37, 30, 7, 21, 1, 5, 28, 16,…
$ field_goals_made                  <int> 5, 2, 4, 1, 3, 0, 0, 0, 8, 0, 0, 6, …
$ field_goals_attempted             <int> 13, 4, 10, 5, 5, 4, 1, 0, 13, 1, 0, …
$ three_point_field_goals_made      <int> 2, 1, 2, 1, 3, 0, 0, 0, 0, 0, 0, 2, …
$ three_point_field_goals_attempted <int> 5, 3, 3, 3, 4, 3, 1, 0, 0, 1, 0, 5, …
$ free_throws_made                  <int> 4, 1, 5, 0, 0, 0, 0, 2, 0, 0, 0, 10,…
$ free_throws_attempted             <int> 7, 4, 6, 0, 0, 0, 0, 2, 0, 0, 0, 10,…
$ offensive_rebounds                <int> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 7, …
$ defensive_rebounds                <int> 3, 3, 4, 5, 0, 2, 0, 1, 6, 3, 0, 6, …
$ rebounds                          <int> 4, 4, 5, 6, 0, 2, 0, 1, 6, 4, 0, 13,…
$ assists                           <int> 2, 1, 6, 1, 0, 1, 0, 0, 3, 0, 0, 3, …
$ steals                            <int> 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, …
$ blocks                            <int> 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, …
$ turnovers                         <int> 1, 1, 2, 1, 1, 3, 0, 1, 7, 1, 0, 3, …
$ fouls                             <int> 2, 3, 3, 6, 0, 2, 0, 2, 2, 1, 0, 3, …
$ plus_minus                        <chr> "-4", "-12", "-9", "+6", "-2", "-13"…
$ points                            <int> 16, 6, 15, 3, 9, 0, 0, 2, 16, 0, 0, …
$ starter                           <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,…
$ ejected                           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ did_not_play                      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ reason                            <chr> "COACH'S DECISION", "COACH'S DECISIO…
$ active                            <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, T…
$ athlete_jersey                    <chr> "43", "33", "2", "23", "0", "1", "3"…
$ athlete_short_name                <chr> "P. Siakam", "M. Turner", "A. Nembha…
$ athlete_headshot_href             <chr> "https://a.espncdn.com/i/headshots/n…
$ athlete_position_name             <chr> "Forward", "Center", "Guard", "Guard…
$ athlete_position_abbreviation     <chr> "F", "C", "G", "G", "G", "F", "C", "…
$ team_display_name                 <chr> "Indiana Pacers", "Indiana Pacers", …
$ team_uid                          <chr> "s:40~l:46~t:11", "s:40~l:46~t:11", …
$ team_slug                         <chr> "indiana-pacers", "indiana-pacers", …
$ team_logo                         <chr> "https://a.espncdn.com/i/teamlogos/n…
$ team_abbreviation                 <chr> "IND", "IND", "IND", "IND", "IND", "…
$ team_color                        <chr> "002d62", "002d62", "002d62", "002d6…
$ team_alternate_color              <chr> "ffd520", "ffd520", "ffd520", "ffd52…
$ home_away                         <chr> "away", "away", "away", "away", "awa…
$ team_winner                       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ team_score                        <int> 91, 91, 91, 91, 91, 91, 91, 91, 91, …
$ opponent_team_id                  <int> 25, 25, 25, 25, 25, 25, 25, 25, 25, …
$ opponent_team_name                <chr> "Thunder", "Thunder", "Thunder", "Th…
$ opponent_team_location            <chr> "Oklahoma City", "Oklahoma City", "O…
$ opponent_team_display_name        <chr> "Oklahoma City Thunder", "Oklahoma C…
$ opponent_team_abbreviation        <chr> "OKC", "OKC", "OKC", "OKC", "OKC", "…
$ opponent_team_logo                <chr> "https://a.espncdn.com/i/teamlogos/n…
$ opponent_team_color               <chr> "007ac1", "007ac1", "007ac1", "007ac…
$ opponent_team_alternate_color     <chr> "ef3b24", "ef3b24", "ef3b24", "ef3b2…
$ opponent_team_score               <int> 103, 103, 103, 103, 103, 103, 103, 1…
# keep only regular season games, season_type == 2
pbox <- pbox_raw |>
  filter(season_type == 2)

# minutes is stored as character in some seasons
pbox <- pbox |>
  mutate(minutes = suppressWarnings(as.numeric(minutes)))

# aggregate from per-game rows to per-player-per-season totals
# compute per-game averages 
stats_df <- pbox |>
  filter(!is.na(athlete_display_name)) |>
  group_by(athlete_display_name, team_abbreviation) |>
  summarize(
    GP    = sum(!is.na(points)),
    MIN   = sum(minutes,                              na.rm = TRUE),
    PTS   = sum(points,                               na.rm = TRUE),
    REB   = sum(rebounds,                             na.rm = TRUE),
    AST   = sum(assists,                              na.rm = TRUE),
    STL   = sum(steals,                               na.rm = TRUE),
    BLK   = sum(blocks,                               na.rm = TRUE),
    TOV   = sum(turnovers,                            na.rm = TRUE),
    FGM   = sum(field_goals_made,                     na.rm = TRUE),
    FGA   = sum(field_goals_attempted,                na.rm = TRUE),
    FG3M  = sum(three_point_field_goals_made,         na.rm = TRUE),
    FG3A  = sum(three_point_field_goals_attempted,    na.rm = TRUE),
    FTM   = sum(free_throws_made,                     na.rm = TRUE),
    FTA   = sum(free_throws_attempted,                na.rm = TRUE),
    PF    = sum(fouls,                                na.rm = TRUE),
    position = athlete_position_abbreviation[which.max(!is.na(athlete_position_abbreviation))],  # most common position
    .groups = "drop"
  ) |>
  # players traded mid season appear under more than one team. fix them up to a single season total by player
  group_by(athlete_display_name) |>
  summarize(
    team     = team_abbreviation[which.max(GP)],   # primary team = most games
    position = position[which.max(GP)],            # position from primary team stint
    GP    = sum(GP),
    MIN   = sum(MIN),
    PTS   = sum(PTS),
    REB   = sum(REB),
    AST   = sum(AST),
    STL   = sum(STL),
    BLK   = sum(BLK),
    TOV   = sum(TOV),
    FGM   = sum(FGM),
    FGA   = sum(FGA),
    FG3M  = sum(FG3M),
    FG3A  = sum(FG3A),
    FTM   = sum(FTM),
    FTA   = sum(FTA),
    PF    = sum(PF),
    .groups = "drop"
  ) |>
  filter(GP>=20)|> #drops players with less than 20 games
  # shooting percentages 
  mutate(
    FG_PCT  = if_else(FGA  > 0, FGM  / FGA,  NA_real_),
    FG3_PCT = if_else(FG3A > 0, FG3M / FG3A, NA_real_),
    FT_PCT  = if_else(FTA  > 0, FTM  / FTA,  NA_real_)
  ) |>
  # per-game averages
  mutate(
    MPG   = MIN / GP,
    PPG   = PTS / GP,
    RPG   = REB / GP,
    APG   = AST / GP,
    SPG   = STL / GP,
    BPG   = BLK / GP,
    TOPG  = TOV / GP,
    PFPG  = PF  / GP
  ) |>
  rename(player_name = athlete_display_name)

stats_df <- stats_df |>
  mutate(position = case_when(
    position %in% c("G", "PG", "SG") ~ "G",
    position %in% c("F", "SF", "PF") ~ "F",
    position == "C"                  ~ "C"
  ))

# Verify
stats_df |>
  count(position, sort = TRUE)
# A tibble: 3 × 2
  position     n
  <chr>    <int>
1 G          219
2 F          163
3 C           65
# ============================================================
# STEP 3 — Use a join command to combine Salary and Player Stats
# ============================================================

# Join stats_df + salary_df
# inner_join — only keep players with both stats AND a salary
# names differ between tables so we specify both sides

nba_df <- stats_df |>
  inner_join(salary_df_clean, by = c("player_name" = "Player"))


# Check for any duplicate player rows
nba_df |>
  count(player_name) |>
  filter(n > 1)
# A tibble: 0 × 2
# ℹ 2 variables: player_name <chr>, n <int>
# Spot check to confirm if the join method worked
nba_df |>
  select(player_name, position, team, salary, GP, PPG, RPG, APG) |>
  arrange(desc(salary)) |>
  head(10)
# A tibble: 10 × 8
   player_name           position team    salary    GP   PPG   RPG   APG
   <chr>                 <chr>    <chr>    <dbl> <int> <dbl> <dbl> <dbl>
 1 Stephen Curry         G        GS    59606817    66  23.7  4.5   5.92
 2 Kevin Durant          F        PHX   54708609    60  25.8  5.95  4.27
 3 Anthony Davis         F        LAL   54126450    46  25.3 11.8   3.59
 4 Giannis Antetokounmpo F        MIL   54126450    62  30.2 12.0   6.31
 5 Jayson Tatum          F        BOS   54126450    69  26.6  8.51  5.81
 6 Devin Booker          G        PHX   53142264    69  26.0  4.01  6.93
 7 Jaylen Brown          G        BOS   53142264    61  21.7  5.75  4.49
 8 Karl-Anthony Towns    C        NY    53142264    67  24.0 12.6   2.93
 9 LeBron James          F        LAL   52627153    64  24.5  7.92  8.39
10 Paul George           F        PHI   51666090    41  16.2  5.34  4.34

Data Visualization

# ============================================================
# STEP 4 — Visualize Data
# ============================================================

# Histogram
p1_hist_salarydistribution <- nba_df |>
  ggplot(aes(x = salary/1e6)) + #dividing by 1e6 to make the Xaxis more readable
  geom_histogram(bins = 30, fill = "blue", color = "white") +
  labs(
    title    = "NBA Player Salary Distribution",
    caption  = "Data: Basketball Reference + hoopR",
    x        = "Salary (USD Millions)",
    y        = "Number of Players"
  ) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
#Graph1
p1_hist_salarydistribution #show data

p_top10_salary <- nba_df |>
  arrange(desc(salary)) |>
  slice_head(n = 10) |>
  ggplot(aes(x = salary / 1e6, y = reorder(player_name, salary), fill = position)) +
  geom_bar(stat = "identity") +
  labs(
    title   = "Top 10 Highest Paid NBA Players",
    x       = "Salary (Millions USD)",
    y       = "Player",
    fill    = "Position",
    caption = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
#Graph2
p_top10_salary #show graph

p_top10_ppg <- nba_df |>
  arrange(desc(PPG)) |>
  slice_head(n = 10) |>
  ggplot(aes(x = PPG, y = reorder(player_name, PPG), fill = position)) +
  geom_bar(stat = "identity") +
  labs(
    title   = "Top 10 Scorers in the NBA",
    x       = "Points Per Game (PPG)",
    y       = "Player",
    fill    = "Position",
    caption = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
#Graph3
p_top10_ppg #show graph

#Boxplot
p_salary_position <- nba_df |>
  ggplot(aes(x = position, y = salary / 1e6, color = position)) +
  geom_boxplot() +
  labs(
    title    = "NBA Salary Distribution by Position",
    subtitle = "How does pay vary across Guards, Forwards and Centers?",
    x        = "Position",
    y        = "Salary (Millions USD)",
    caption  = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title      = element_text(hjust = 0.5),
    plot.subtitle   = element_text(hjust = 0.5),
    legend.position = "none"
  )
#Graph4
p_salary_position

#scatterplot
p_salary_ppg <- nba_df |>
  ggplot(aes(x = PPG, y = salary / 1e6)) +
  geom_point(aes(color = position), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title   = "Salary vs. Points Per Game",
    x       = "Points Per Game (PPG)",
    y       = "Salary (Millions USD)",
    color   = "Position",
    caption = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
#Graph5
p_salary_ppg #scatter salary vs PPG
`geom_smooth()` using formula = 'y ~ x'

p_salary_rpg <- nba_df |>
  ggplot(aes(x = RPG, y = salary / 1e6)) +
  geom_point(aes(color = position), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title   = "Salary vs. Rebounds Per Game",
    x       = "Rebounds Per Game (RPG)",
    y       = "Salary (Millions USD)",
    color   = "Position",
    caption = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))

#Graph6
p_salary_rpg #scatter salary vs RPG
`geom_smooth()` using formula = 'y ~ x'

p_salary_apg <- nba_df |>
  ggplot(aes(x = APG, y = salary / 1e6)) +
  geom_point(aes(color = position), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title   = "Salary vs. Assists Per Game",
    x       = "Assists Per Game (APG)",
    y       = "Salary (Millions USD)",
    color   = "Position",
    caption = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))

#Graph7
p_salary_apg #scatter salary vs APG
`geom_smooth()` using formula = 'y ~ x'

#adding a new variable (value score = PPG + RPG + APG)
nba_df <- nba_df |>
  mutate(
    salary_M    = salary / 1e6,
    value_score = (PPG + RPG + APG) / salary_M
  )

#Graph 8: top 10 most underpaid nba players
p_underpaid <- nba_df |>
  arrange(desc(value_score)) |>
  slice_head(n = 10) |>
  ggplot(aes(x = value_score, y = reorder(player_name, value_score), fill = position)) +
  geom_bar(stat = "identity") +
  labs(
    title    = "Top 10 Most Underpaid NBA Players",
    subtitle = "Highest production per million dollars earned",
    x        = "Value Score (PPG + RPG + APG) / Salary (Millions)",
    y        = "Player",
    fill     = "Position",
    caption  = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title    = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

p_underpaid

#Graph 9: top 10 most overderpaid nba players
p_overpaid <- nba_df |>
  arrange(value_score) |>
  slice_head(n = 10) |>
  ggplot(aes(x = value_score, y = reorder(player_name, value_score), fill = position)) +
  geom_bar(stat = "identity") +
  labs(
    title    = "Top 10 Most Overpaid NBA Players",
    subtitle = "Lowest production per million dollars earned",
    x        = "Value Score (PPG + RPG + APG) / Salary (Millions)",
    y        = "Player",
    fill     = "Position",
    caption  = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title    = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

p_overpaid

#Graph 10: Scatter plot of Salary vs Value score
p_salary_value <- nba_df |>
  ggplot(aes(x = salary_M, y = value_score)) +
  geom_point(aes(color = position), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title    = "NBA Salary vs. Value Score",
    x        = "Salary (Millions USD)",
    y        = "Value Score (PPG + RPG + APG) / Salary (Millions)",
    color    = "Position",
    caption  = "Data: Basketball Reference + hoopR"
  ) +
  theme_light() +
  theme(
    plot.title    = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

p_salary_value
`geom_smooth()` using formula = 'y ~ x'

Interactive Plots

# Salary vs PPG animated by postion (G,F,C)
library(plotly)
library(ggplot2)
plot_ly(
  data=nba_df,
  x=~salary,
  y=~PPG,
  frame = ~position,
  type='histogram2d'
) |>
  layout(
    title='NBA Salary vs Points Per Game By Position',
    xaxis=list(title='Salary'),
    yaxis=list(title='Points Per Game')
  )

Interactive Tables

interactive_nba_df <- nba_df |>
  select(player_name,team,GP,FG_PCT,MPG,PPG,RPG,APG, salary)

#install.packages('DT')
library(DT)
Warning: package 'DT' was built under R version 4.5.3
datatable(interactive_nba_df)

Data Analysis (Linear Regression)

# ============================================================
# Create a linear regression model
# ============================================================

#multiple linear regression using PPG,Apg,rpg, position as predictors
#salary as the dependent variable
lm.salary <- lm(salary~PPG+RPG+APG+position,data=nba_df)


summary(lm.salary)

Call:
lm(formula = salary ~ PPG + RPG + APG + position, data = nba_df)

Residuals:
      Min        1Q    Median        3Q       Max 
-24794744  -5353311   -150095   4022497  26343675 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept) -4603246    1805915  -2.549               0.0112 *  
PPG          1382454     110979  12.457 < 0.0000000000000002 ***
RPG           102851     275646   0.373               0.7093    
APG          1550150     366050   4.235            0.0000291 ***
positionF     223638    1460884   0.153               0.8784    
positionG   -3057859    1748534  -1.749               0.0812 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7928000 on 361 degrees of freedom
Multiple R-squared:  0.6756,    Adjusted R-squared:  0.6711 
F-statistic: 150.4 on 5 and 361 DF,  p-value: < 0.00000000000000022
#Adjusted R Squared Value is 0.6711
lm.salary

Call:
lm(formula = salary ~ PPG + RPG + APG + position, data = nba_df)

Coefficients:
(Intercept)          PPG          RPG          APG    positionF    positionG  
   -4603246      1382454       102851      1550150       223638     -3057858  
  • Adjusted R Squared Value is 0.6711 lm.salary

  • Formula: Salary = -4603246 + 102851RPG + 1382454PPG + 1550150APG + positionF*223638 -3057858*positionG

    • Intercept (-4603246): Baseline Salary when PPG, APG, RPG and Position are 0 (reference point)

    • PPG (1382454): Holding the other predictors constant, a 1 unit increase in PPG increases Salary by 1382454

    • RPG(102851): Holding the other predictors constant, a 1 unit increase in RPG increases Salary by 102851

    • APG (1550150): Holding the other predictors constant, a 1 unit increase in APG increases Salary by 1550150

    • PositionF (223638): Holding the other predictors constant, a 1 unit increase in APG increases Salary by 223638

    • PositionG (-3057858): Holding the other predictors constant, a 1 unit increase in RPG decreases Salary by 3057858

Data Analysis (K means Clustering)

# ============================================================
# Create a clustering model
# ============================================================
install.packages("cluster")
Warning: package 'cluster' is in use and will not be installed
library(cluster)
install.packages("NbClust")
Warning: package 'NbClust' is in use and will not be installed
library(NbClust)

#############Guards Cluster ###############################################
#will leave salary out since that is the dependent variable
Guards_data <- nba_df |>
  filter(position=='G')|>
  select(PPG,RPG,APG)
guard_scale <- scale(Guards_data)

# Estimate the best number of clusters from 2 to 10 using k-means criteria
guard_number_cluster_estimate <- NbClust(
  guard_scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 12 proposed 2 as the best number of clusters 
* 5 proposed 3 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 4 proposed 6 as the best number of clusters 
* 1 proposed 8 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
# Show the voting results for the best number of clusters
guard_number_cluster_estimate$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot TrCovW
Number_clusters 3.0000   2.0000   3.0000  8.0000   3.0000      6.0    6.0
Value_Index     9.7768 230.2336  34.6762 -0.5577 147.0916 259716.7 1354.5
                 TraceW Friedman   Rubin Cindex    DB Silhouette   Duda
Number_clusters  3.0000   6.0000  6.0000 5.0000 2.000     2.0000 2.0000
Value_Index     38.5848   2.7396 -0.5293 0.1688 0.884     0.4818 0.9738
                PseudoT2  Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000 2.0000    2.0000  3.0000     2.0000 2.0000  2.0000
Value_Index       3.6821 0.0453    0.5289 62.1959     0.6311 1.2431  0.4309
                  Dunn Hubert SDindex Dindex    SDbw
Number_clusters 2.0000      0  2.0000      0 10.0000
Value_Index     0.0763      0  2.3432      0  0.2542
# Set seed for reproducibility
set.seed(123)

# Run PAM clustering with 5 clusters
# Note: the slides call this k-means, but this function is PAM
kmeans_guard_scalecluster <- pam(guard_scale, k = 5)

# Show medoids for the clusters
kmeans_guard_scalecluster$medoids
            PPG        RPG        APG
[1,] -1.0424153 -1.0959902 -0.8563508
[2,] -0.2404101  0.1462520 -0.5416298
[3,]  0.7273842  0.6367270  0.3044005
[4,] -0.2868656 -0.4275408  0.6800349
[5,]  1.7910421  1.1503261  1.5637620
# Show the cluster assignment for each row
kmeans_guard_scalecluster$clustering
  [1] 1 1 2 2 1 2 3 2 2 4 3 2 5 2 5 3 1 2 3 3 1 1 3 3 1 1 2 4 2 3 5 1 1 3 1 3 2
 [38] 4 3 3 2 1 3 1 4 1 5 5 4 5 3 5 3 5 5 1 3 1 5 3 3 1 3 1 1 1 1 2 2 2 2 3 4 2
 [75] 5 2 1 3 1 5 3 1 3 5 5 4 5 2 2 5 1 1 1 1 2 1 3 2 2 1 1 3 1 4 5 2 5 2 3 2 1
[112] 3 2 2 2 2 2 2 3 2 1 2 1 5 5 1 2 2 4 1 3 1 4 2 2 3 4 2 1 2 2 2 3 2 1 1 1 3
[149] 1 2 3 1 1 1 3 1 1 4 4 1 3 5 1 5 3 2 4 2 1 2 2 5 4 4 5 1 5 2 5 4 2 5
# Plot the clustering result in two reduced dimensions
plot(kmeans_guard_scalecluster)

# Add the assigned cluster to the Guard data
guard_cluster <- Guards_data %>%
  mutate(cluster = kmeans_guard_scalecluster$clustering)

# Show the dataset with assigned clusters
guard_cluster
# A tibble: 182 × 4
     PPG   RPG   APG cluster
   <dbl> <dbl> <dbl>   <int>
 1  5.87  1.52  2.04       1
 2  5.27  1.27  1.30       1
 3 11.2   3.89  1.26       2
 4 11.5   3.84  1.68       2
 5  6.44  1.92  1.78       1
 6  6.54  2.74  2.38       2
 7 13.9   8.25  3.72       3
 8 10.0   2.31  1.12       2
 9  3.48  2.87  1.27       2
10 10.4   3.28  5.17       4
# ℹ 172 more rows
# Compute the average of each variable by cluster
guard_cluster_summary <- guard_cluster %>%
  group_by(cluster) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))

# Show cluster summaries
guard_cluster_summary
# A tibble: 5 × 4
  cluster   PPG   RPG   APG
    <int> <dbl> <dbl> <dbl>
1       1  4.30  1.40  1.26
2       2  8.98  3.14  1.89
3       3 16.5   4.13  3.88
4       4  9.47  2.72  4.68
5       5 22.6   4.93  6.50
#############Forward Cluster ###############################################
#will leave salary out since that is the dependent variable
Forward_data <- nba_df |>
  filter(position=='F')|>
  select(PPG,RPG,APG)
Forward_scale <- scale(Forward_data)

# Estimate the best number of clusters from 2 to 10 using k-means criteria
forward_number_cluster_estimate <- NbClust(
  Forward_scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 9 proposed 2 as the best number of clusters 
* 4 proposed 3 as the best number of clusters 
* 3 proposed 4 as the best number of clusters 
* 2 proposed 6 as the best number of clusters 
* 3 proposed 8 as the best number of clusters 
* 3 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
# Show the voting results for the best number of clusters
forward_number_cluster_estimate$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot  TrCovW
Number_clusters 6.0000   2.0000   4.0000  4.0000   3.0000      8.0   6.000
Value_Index     4.1034 160.8207  26.4362 -2.3346 104.4787 180430.1 309.911
                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters  3.0000   8.0000  8.0000 10.0000 2.0000     2.0000 2.0000
Value_Index     28.1498   4.8729 -0.6092  0.2154 0.8777     0.5151 0.8345
                PseudoT2  Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000 2.0000    2.0000  3.0000     2.0000 4.0000  2.0000
Value_Index      22.0134 0.3342    0.5231 48.6794     0.6661 3.9738  0.2635
                   Dunn Hubert SDindex Dindex    SDbw
Number_clusters 10.0000      0  3.0000      0 10.0000
Value_Index      0.0446      0  3.5484      0  0.2808
# Set seed for reproducibility
set.seed(123)

# Run PAM clustering with 5 clusters
# Note: the slides call this k-means, but this function is PAM
kmeans_forward_scalecluster <- pam(Forward_scale, k = 5)

# Show medoids for the clusters
kmeans_forward_scalecluster$medoids
            PPG         RPG         APG
[1,]  0.4516973  0.60882104  0.09425922
[2,]  1.2091516  1.02579079  1.76440613
[3,] -1.1050835 -1.10554449 -0.81485321
[4,] -0.6820470 -0.03297669 -0.19356416
[5,] -0.2004506 -0.48536758 -0.42985608
# Show the cluster assignment for each row
kmeans_forward_scalecluster$clustering
  [1] 1 1 2 3 1 4 2 5 4 5 1 5 3 5 3 5 1 1 4 2 5 5 2 3 5 3 2 4 5 3 2 5 2 1 3 5 5
 [38] 4 3 3 4 1 3 1 3 4 4 2 4 5 3 3 1 4 5 4 2 3 5 4 1 1 4 1 4 3 5 3 2 4 5 1 1 5
 [75] 4 2 4 4 1 3 4 4 1 4 4 1 2 3 3 5 3 1 2 1 5 1 3 5 1 5 3 1 3 3 1 2 2 5 2 3 4
[112] 2 4 1 5 5 3 1 2 3 1 5 1 1 4 5 2 3 2 5 4 3 5 2
# Plot the clustering result in two reduced dimensions
plot(kmeans_forward_scalecluster)

# Add the assigned cluster to the Guard data
forward_cluster <- Forward_data %>%
  mutate(cluster = kmeans_forward_scalecluster$clustering)

# Show the dataset with assigned clusters
forward_cluster
# A tibble: 134 × 4
     PPG   RPG   APG cluster
   <dbl> <dbl> <dbl>   <int>
 1 14.8   4.98 2.98        1
 2 17.8   4.47 2.51        1
 3 25.3  11.8  3.59        2
 4  2.35  1.14 0.306       3
 5 13.7   8.41 2.18        1
 6  8.34  5.09 1.03        4
 7 21.0   4.85 3.63        2
 8 10.5   2.79 1.41        5
 9  8.18  4.05 2.18        4
10  9.33  3.02 0.891       5
# ℹ 124 more rows
# Compute the average of each variable by cluster
forward_cluster_summary <- forward_cluster %>%
  group_by(cluster) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))

# Show cluster summaries
forward_cluster_summary
# A tibble: 5 × 4
  cluster   PPG   RPG   APG
    <int> <dbl> <dbl> <dbl>
1       1 14.8   5.94 2.18 
2       2 21.6   7.86 4.87 
3       3  4.08  2.23 0.793
4       4  6.93  4.73 1.73 
5       5  9.91  3.54 1.42 
#############Center Cluster ###############################################
#will leave salary out since that is the dependent variable
Center_data <- nba_df |>
  filter(position=='C')|>
  select(PPG,RPG,APG)
Center_scale <- scale(Center_data)

# Estimate the best number of clusters from 2 to 10 using k-means criteria
Center_number_cluster_estimate <- NbClust(
  Center_scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 9 proposed 2 as the best number of clusters 
* 6 proposed 3 as the best number of clusters 
* 2 proposed 4 as the best number of clusters 
* 4 proposed 7 as the best number of clusters 
* 2 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
# Show the voting results for the best number of clusters
Center_number_cluster_estimate$Best.nc
                     KL      CH Hartigan     CCC   Scott  Marriot  TrCovW
Number_clusters  3.0000  2.0000    3.000  7.0000  3.0000    7.000  4.0000
Value_Index     36.2287 68.2235   16.427 -1.0726 48.6111 5858.348 80.2851
                TraceW Friedman   Rubin  Cindex     DB Silhouette  Duda
Number_clusters   3.00   7.0000  7.0000 10.0000 2.0000     2.0000 2.000
Value_Index      14.97   7.7682 -0.9102  0.2338 0.8429     0.4675 0.794
                PseudoT2 Beale Ratkowsky    Ball PtBiserial Frey McClain   Dunn
Number_clusters   2.0000 2.000    2.0000  3.0000     2.0000    1  2.0000 4.0000
Value_Index       9.8607 0.427    0.5383 17.8517     0.6162   NA  0.4575 0.2025
                Hubert SDindex Dindex    SDbw
Number_clusters      0  3.0000      0 10.0000
Value_Index          0  2.7041      0  0.1098
# Set seed for reproducibility
set.seed(123)

# Run PAM clustering with 5 clusters
# Note: the slides call this k-means, but this function is PAM
kmeans_center_scalecluster <- pam(Center_scale, k = 5)

# Show medoids for the clusters
kmeans_center_scalecluster$medoids
             PPG         RPG        APG
[1,] -0.80209960 -0.95102961 -0.9359590
[2,] -0.28772073  0.07694669  0.5102304
[3,]  1.12873212 -0.16220015  0.8060718
[4,]  0.05087929  0.48400415 -0.6566734
[5,]  1.28897144  1.15779252  1.0362972
# Show the cluster assignment for each row
kmeans_center_scalecluster$clustering
 [1] 1 2 3 4 1 5 1 3 1 3 1 4 2 1 5 2 4 1 5 2 1 5 5 5 5 5 1 1 1 1 5 4 1 2 5 2 1 1
[39] 3 3 1 2 4 1 5 1 1 5 2 4 4
# Plot the clustering result in two reduced dimensions
plot(kmeans_center_scalecluster)

# Add the assigned cluster to the Guard data
center_cluster <- Center_data %>%
  mutate(cluster = kmeans_center_scalecluster$clustering)

# Show the dataset with assigned clusters
center_cluster
# A tibble: 51 × 4
     PPG   RPG   APG cluster
   <dbl> <dbl> <dbl>   <int>
 1  4.82  3.73 0.412       1
 2  8.48  6.11 2.11        2
 3 12.9   6.63 2.33        3
 4  7.32  7.75 0.85        4
 5  1.92  2.04 0.44        1
 6 17.6   9.79 4.37        5
 7  5.82  5.14 1.09        1
 8 12.9   5.11 1.74        3
 9  4.62  4.44 0.5         1
10 14.5   8    1.81        3
# ℹ 41 more rows
# Compute the average of each variable by cluster
center_cluster_summary <- center_cluster %>%
  group_by(cluster) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))

# Show cluster summaries
center_cluster_summary
# A tibble: 5 × 4
  cluster   PPG   RPG   APG
    <int> <dbl> <dbl> <dbl>
1       1  4.53  3.48 0.702
2       2  7.79  6.73 2.01 
3       3 14.1   6.46 1.95 
4       4  8.47  7.94 1.02 
5       5 15.0  10.6  2.63 

Results

Conclusion

GeoTag

#install.packages("leaflet")
library(leaflet)
Warning: package 'leaflet' was built under R version 4.5.3
leaflet() |>
  addTiles()|>
  addMarkers(
    lng = -84.5184,
    lat = 33.9384,
    popup = 'KSU Marietta'
  )

QR Code