library(readr)
library(dplyr) 
library(stringr)
library(ggplot2)
library(knitr)
library(tidyr)
library(stringi)

Import Data

Per 100 Possessions

per_poss <- as.data.frame(read_csv("C:/Users/mikek/Documents/NBA-Player-Clustering/Data/per_poss.csv"))

per_poss1 <- per_poss %>%
  select('Year', everything(),-contains("Unnamed")) %>%
  rename_all(~ str_replace_all(., " ", "_")) %>%
  rename_all(~ str_replace_all(., "3", "Thr")) %>%
  rename_all(~ str_replace_all(., "2", "Two")) 
head(per_poss1)[1:8]

##   Year Rk              Player Pos Age  Tm  G GS
## 1 2001  1  Mahmoud Abdul-Rauf  PG  31 VAN 41  0
## 2 2001  2   Tariq Abdul-Wahad  SG  26 DEN 29 12
## 3 2001  3 Shareef Abdur-Rahim  SF  24 VAN 81 81
## 4 2001  4      Cory Alexander  PG  27 ORL 26  0
## 5 2001  5  Courtney Alexander  PG  23 TOT 65 24
## 6 2001  5  Courtney Alexander  PG  23 DAL 38  6

Advanced Stats

adv <- as.data.frame(read_csv("C:/Users/mikek/Documents/NBA-Player-Clustering/Data/advanced.csv"))

adv1 <- adv %>%
  select('Year', everything(),-contains("Unnamed")) %>%
  rename_all(~ str_replace_all(., "%", "_PCT")) %>%
  rename_all(~ str_replace_all(., " ", "_")) %>%
  rename_all(~ str_replace_all(., "3", "Thr")) %>%
  rename_all(~ str_replace_all(., "2", "Two")) 
head(adv1)[1:8]

##   Year Rk              Player Pos Age  Tm  G   MP
## 1 2001  1  Mahmoud Abdul-Rauf  PG  31 VAN 41  486
## 2 2001  2   Tariq Abdul-Wahad  SG  26 DEN 29  420
## 3 2001  3 Shareef Abdur-Rahim  SF  24 VAN 81 3241
## 4 2001  4      Cory Alexander  PG  27 ORL 26  227
## 5 2001  5  Courtney Alexander  PG  23 TOT 65 1382
## 6 2001  5  Courtney Alexander  PG  23 DAL 38  472

Shooting Stats

shoot <- as.data.frame(read_csv("C:/Users/mikek/Documents/NBA-Player-Clustering/Data/shooting.csv"))

shoot1 <- shoot %>%
  select('Year', everything(),-contains("Unnamed")) %>%
  rename_all(~ str_replace_all(., " ", "_")) %>%
  rename(Avg_Dist_FGA = Dist.) %>%
  rename_all(~ str_replace_all(., "-", "_")) %>%
  rename_all(~ str_replace_all(., "_of", "")) %>%
  rename_all(~ str_replace_all(., "_by", "")) %>%
  rename_all(~ str_replace_all(., "Distance", "Dis"))
head(shoot1)[1:8]

##   Year Rk              Player Pos Age  Tm  G   MP
## 1 2001  1  Mahmoud Abdul-Rauf  PG  31 VAN 41  486
## 2 2001  2   Tariq Abdul-Wahad  SG  26 DEN 29  420
## 3 2001  3 Shareef Abdur-Rahim  SF  24 VAN 81 3241
## 4 2001  4      Cory Alexander  PG  27 ORL 26  227
## 5 2001  5  Courtney Alexander  PG  23 TOT 65 1382
## 6 2001  5  Courtney Alexander  PG  23 DAL 38  472

Per Game

per_game <- as.data.frame(read_csv("C:/Users/mikek/Documents/NBA-Player-Clustering/Data/per_game.csv"))

per_game1 <- per_game %>%
  select('Year', everything(),-contains("Unnamed")) %>%
  rename_all(~ str_replace_all(., "%", "_PCT")) %>%
  rename_all(~ str_replace_all(., "3", "Thr")) %>%
  rename_all(~ str_replace_all(., "2", "Two")) 
head(per_game1)[1:8]

##   Year Rk              Player Pos Age  Tm  G GS
## 1 2001  1  Mahmoud Abdul-Rauf  PG  31 VAN 41  0
## 2 2001  2   Tariq Abdul-Wahad  SG  26 DEN 29 12
## 3 2001  3 Shareef Abdur-Rahim  SF  24 VAN 81 81
## 4 2001  4      Cory Alexander  PG  27 ORL 26  0
## 5 2001  5  Courtney Alexander  PG  23 TOT 65 24
## 6 2001  5  Courtney Alexander  PG  23 DAL 38  6

Cleaning the Data

More Cleaning

There’s a fair amount players that we need to remove from teh dataset given their lack of contribution to the team (Ex. playing only a few games or not playing a lot of minutes).

The data includes a team name called ‘TOT’ which indicates that a player played or more than one team. These players’ stats appear on multiple lines - one for each team and one for the Total that season. We need to remove the underlying teams for each player with ‘TOT’ as their team so we’re left with only one row for each player for each season they played. We’ll create a unique index (since the index wasn’t included on the scrape) based on rank (which is essentially an index value based on player name), player name and year. We’ll arrange by Games (TOT will always be the highest) and remove duplicate instances for each dataframe

Per 100 Possessions

per_poss2 <- per_poss1 %>%
  mutate(player_year = paste0(Rk,Player,Year)) %>% # creates an ID
  mutate(player_year = str_replace_all(player_year," ","")) #removes spaces

per_poss3 <- per_poss2 %>% 
  arrange(Year, Player, desc(G))%>%
  filter(duplicated(player_year) == FALSE)

# compares the old df to the new df based on Team equaling 'TOT'. The TRUE valued should remain the same, the FALSE value should change because we're removing duplicates
per_poss2 %>% count(Tm == 'TOT')

##   Tm == "TOT"     n
## 1       FALSE 11944
## 2        TRUE  1304

per_poss3 %>% count(Tm == 'TOT')

##   Tm == "TOT"    n
## 1       FALSE 9244
## 2        TRUE 1304

Advanced

adv2 <- adv1 %>%
  mutate(player_year = paste0(Rk,Player,Year)) %>% # creates an ID
  mutate(player_year = str_replace_all(player_year," ","")) #removes spaces

adv3 <- adv2 %>% 
  arrange(Year, Player, desc(G))%>%
  filter(duplicated(player_year) == FALSE)

# compares the old df to the new df based on Team equaling 'TOT'. The TRUE valued should remain the same, the FALSE value should change because we're removing duplicates
adv2 %>% count(Tm == 'TOT')

##   Tm == "TOT"     n
## 1       FALSE 11944
## 2        TRUE  1304

adv3 %>% count(Tm == 'TOT')

##   Tm == "TOT"    n
## 1       FALSE 9244
## 2        TRUE 1304

Shooting

shoot2 <- shoot1 %>%
  mutate(player_year = paste0(Rk,Player,Year)) %>% # creates an ID
  mutate(player_year = str_replace_all(player_year," ","")) #removes spaces

shoot3 <- shoot2 %>% 
  arrange(Year, Player, desc(G))%>%
  filter(duplicated(player_year) == FALSE)

# compares the old df to the new df based on Team equaling 'TOT'. The TRUE valued should remain the same, the FALSE value should change because we're removing duplicates
shoot2 %>% count(Tm == 'TOT')

##   Tm == "TOT"     n
## 1       FALSE 11944
## 2        TRUE  1304

shoot3 %>% count(Tm == 'TOT')

##   Tm == "TOT"    n
## 1       FALSE 9244
## 2        TRUE 1304

Per Game

per_game2 <- per_game1 %>%
  mutate(player_year = paste0(Rk,Player,Year)) %>% # creates an ID
  mutate(player_year = str_replace_all(player_year," ","")) #removes spaces

per_game3 <- per_game2 %>% 
  arrange(Year, Player, desc(G))%>%
  filter(duplicated(player_year) == FALSE)

# compares the old df to the new df based on Team equaling 'TOT'. The TRUE valued should remain the same, the FALSE value should change because we're removing duplicates
per_game2 %>% count(Tm == 'TOT')

##   Tm == "TOT"     n
## 1       FALSE 11944
## 2        TRUE  1304

per_game3 %>% count(Tm == 'TOT')

##   Tm == "TOT"    n
## 1       FALSE 9244
## 2        TRUE 1304

There are duplicate column names in each df, so need to change names to include a unique identifier

adv4 <- adv3 %>%
  rename_at(vars(8:28), ~paste0(.,"_adv"))

per_game4 <- per_game3 %>%
  rename_at(vars(9:31), ~paste0(.,"_pg"))

per_poss4 <- per_poss3 %>%
  rename_at(vars(9:32), ~paste0(.,"_pp"))

shoot4 <- shoot3 %>%
  rename_at(vars(8:29), ~paste0(.,"_sho"))

There still could be some duplicate stat columns within each of the dataframes, so a preliminary df will be created and analyzed manually to identify these instances

dupcol_df <- per_poss4 %>%
  select(1:33) %>%
  left_join(select(adv4, c(7:29)), by = 'player_year') %>% 
  left_join(select(shoot4, c(7:31)), by = 'player_year') %>% 
  left_join(select(per_game4, c(7:32)), by = 'player_year')
dupcol_df <- dupcol_df %>% select(order(colnames(dupcol_df)))

Obvious Duplicated Columns

Columns will be removed based on a hierarchy of the dataframes: 1) Per 100, 2) Advanced, 3) Shooting, 4) Per Game

FG_PCT: Per 100, Shooting, and Per Game
FT_PCT: Per 100, Per Game
G: Per 100, Advanced, Shooting, Per Game
GS: Per 100, Per Game
MP: Per 100, Advanced, Shooting
ThrP_PCT: Per 100, Per Game
TwoP_PCT: Per 100, Per Game

dupcol_df1 <- per_poss4 %>%
  select(1,3:33) %>% #remove rank as it's meaningless
  left_join(select(adv4, c(9:29)), by = 'player_year') %>% #remove G and MP
  left_join(select(shoot4, c(10:31)), by = 'player_year') %>% #remove G MP FG_PCT
  left_join(select(per_game4, -c(1:8,12,15,22)), by = 'player_year') #remove FG_PCT, FT_PCT, G, GS, ThrP_PCT, TwoP_PCT

Not so Obvious Duplicated Columns

These columns are named differently, but they account for the same statistics when their values are for each player are reviewed

ThrPAr_adv and PCT_FGA_Dis_3P_sho
ThrP_PCT_pp and FG_PCT_Dis_3P_sho
TwoP_PCT_pp, FG_PCT_Dis_2P_sho,and TwoP_PCT_pg
STL_pp and STL_PCT_adv

These duplicated columns will be removed leaving us another step closer to our final dataset

final_df2 <- dupcol_df1 %>%
  select(-c('PCT_FGA_Dis_3P_sho','FG_PCT_Dis_3P_sho','FG_PCT_Dis_2P_sho','TwoP_PCT_pg','STL_PCT_adv')) %>%
  select(player_year, everything())

It’s likely that players who didn’t play a ton of games will impact the analysis, so it would be beneficial to remove these players.

I’ll start by looking at the number of games played each year. Some season were shortened due to the lockout or to covid, so picking an arbitrary number might not be the best idea.

q <- c(0.25,0.5,0.75)
qs_games_per_year <- final_df2 %>%
  group_by(Year) %>%
  summarize(min = min(G),
            quant25 = quantile(G, probs = q[1]),
            quant50 = quantile(G, probs = q[2]),
            quant75 = quantile(G, probs = q[3]),
            max = max(G)) %>%
  print(n=40)

## # A tibble: 22 × 6
##     Year   min quant25 quant50 quant75   max
##    <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>
##  1  2001     1    34      62      77      83
##  2  2002     1    32.8    63      76.2    83
##  3  2003     1    36      66      78      83
##  4  2004     1    35.2    61      76      85
##  5  2005     1    35      63      76      84
##  6  2006     1    33.8    61      76.8    82
##  7  2007     1    38      63      76      82
##  8  2008     2    35      65      76      82
##  9  2009     1    38      63      77      82
## 10  2010     1    37.2    64.5    76      82
## 11  2011     1    39.8    62.5    76      83
## 12  2012     1    28      49      61      66
## 13  2013     1    39      61      76      82
## 14  2014     1    31.2    61      76      83
## 15  2015     1    33      61      74      83
## 16  2016     1    37      62      76      82
## 17  2017     1    35.2    62.5    75      82
## 18  2018     1    21      55.5    73      82
## 19  2019     1    27      56      73      82
## 20  2020     1    22      48      62      74
## 21  2021     1    26.8    46      61      72
## 22  2022     1    17      48      66      82

# kable(qs_games_per_year)

Players could have played more than 82 games in a year due to trades mid-season.

#creates a new df
final_df_with_qs_g <- left_join(final_df2, qs_games_per_year[1:4], by ="Year")

ggplot(final_df_with_qs_g, aes(x=G, y=as.factor(Year))) + 
  geom_boxplot(color="#63727A", fill="#5A2D81", alpha=0.3) +
  labs(title="Box Plot of Games per Season",x="Games", y = "Season")+
  theme(plot.title=element_text(hjust=0.5),
        plot.subtitle=element_text(hjust=0.5)) +
  scale_x_continuous(limits = c(0,86), expand = c(0,0), breaks=seq(0, 85, 5))

Create a cutoff

Removing players at the 25% percentile or 50% percentile would likely make the most sense.

final_df_with_qs_g <- final_df_with_qs_g %>%
  mutate_all(~ replace_na(., 0)) %>%
  filter(G >= quant25)
  
final_df3 <- final_df_with_qs_g %>%
  select(-c(min, quant25, quant50))

Given the data and the objective, it’s likely helpful to use a minutes cutoff as well as a games played cutoff. 12 minutes will be the cutoff and can always be changed to include a 20 mpg cutoff.

I’ll look at minutes as well.

q <- c(0.25,0.5,0.75)
qs_mins_per_year <- final_df2 %>%
  group_by(Year) %>%
  summarize(min = min(MP_pg),
            quant25 = quantile(MP_pg, probs = q[1]),
            quant50 = quantile(MP_pg, probs = q[2]),
            quant75 = quantile(MP_pg, probs = q[3]),
            max = max(MP_pg)) %>%
  print(n=40)

## # A tibble: 22 × 6
##     Year   min quant25 quant50 quant75   max
##    <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>
##  1  2001   1      11.8    19.7    28.8  42  
##  2  2002   3      11.6    20.2    29.4  43.7
##  3  2003   1.3    12.3    19.7    29.3  42.5
##  4  2004   1.9    12.1    19.6    29.8  42.5
##  5  2005   1.7    12.6    19.2    29.4  42.4
##  6  2006   0      11.0    19.4    28.8  43.1
##  7  2007   1      11.2    20.2    29.5  42.5
##  8  2008   2.4    11.5    20      29.2  41.8
##  9  2009   1.3    12.1    19.8    31    39.9
## 10  2010   0      12.6    21.2    30    41.4
## 11  2011   1.7    12.3    20.2    28.4  40.3
## 12  2012   1      13.5    21      27.5  39.4
## 13  2013   0      12.4    19.5    28    38.7
## 14  2014   1      11.6    19.4    28.5  38.7
## 15  2015   0.7    13.1    19.8    27.8  38.7
## 16  2016   2      13      20.0    28.0  42  
## 17  2017   1      12.8    19.1    27    37.8
## 18  2018   1      12.3    19      26.8  36.9
## 19  2019   0.7    12.3    19.2    27.2  36.9
## 20  2020   1      12      18.9    27.8  37.5
## 21  2021   2      12.6    19.6    27.7  37.6
## 22  2022   1      11.1    18.1    27.3  43.5

#creates a new df
final_df_with_qs_mpg <- left_join(final_df3, qs_mins_per_year[1:4], by ="Year")

final_df_with_qs_mpg <- final_df_with_qs_mpg %>%
  mutate_all(~ replace_na(., 0))

cutoff_df <- final_df_with_qs_mpg %>%
  select(-min,-quant25,-quant50) %>% #removes unneeded variables
  filter(MP_pg >= 12) %>%
  mutate(HOF = ifelse(grepl("\\*", Player),1,0)) %>% #creates a new variable called Hall of Fame
  mutate(Player = stri_trans_general(Player, "Latin-ASCII")) %>%
  mutate(Player = gsub("[.'*]", "", Player)) %>%
  mutate(Player = gsub("\\-", " ", Player)) 

a <- cutoff_df %>% distinct(Player)

A unique player index was created in excel, imported, and joined to the df. The previous column that was used to remove players with more than one line per season (Tm == ‘Tot’) wasn’t useful for identifying unique players overall.

Only valuable columns for our model were selected, columns were reordered, and a few columns were renamed.

library(readxl)
PlayerIndex <- read_excel("C:/Users/mikek/Documents/NBA-Player-Clustering/Data/PlayerIndex.xlsx")

cutoff_df <- left_join(cutoff_df ,PlayerIndex %>% select(Player,index1),by="Player")

cutoff_df <- cutoff_df %>%
  select('Year','Player','index1', everything())

table(cutoff_df$Pos)

## 
##     C  C-PF    PF  PF-C PF-SF    PG PG-SF PG-SG    SF  SF-C SF-PF SF-SG    SG 
##  1345    18  1395    14    19  1429     1    21  1289     1    15    24  1494 
## SG-PF SG-PG SG-SF 
##     4    19    17

# The values in the Pos (position) column sometimes included multiple positions (SG-SF, PF-C, etc.).  These values were amended to only include the first position in the string.
cutoff_df <- cutoff_df %>%
  mutate(Pos = substr(Pos,1,2)) %>%
  mutate(Pos = str_remove_all(Pos,"-"))

table(cutoff_df$Pos)

## 
##    C   PF   PG   SF   SG 
## 1363 1428 1451 1329 1534

#Remove Other columns
model_df <- cutoff_df %>%
  select(-c(player_year,Age,G, GS)) %>%
  select(-MP_pp) # this is total minutes instead of per 100 possessions


a <- model_df %>% select(sort(names(.)))

#update variable names to add better groupings for the analysis - new = old
model_df <- model_df %>%
  rename(
    "Reb_O_pp"="ORB_pp","Reb_D_pp"="DRB_pp", "Reb_T_pp"="TRB_pp",
    
    "Rtg_O_pp" = "ORtg_pp", "Rtg_D_pp" = "DRtg_pp",
    
    "FG_PCT_T" = "TS_PCT_adv",
    "ThrP_Att_Rt_adv" = "ThrPAr_adv",
    "FT_Rt_adv" ="FTr_adv",
    
    "Reb_O_pct_adv" = "ORB_PCT_adv","Reb_D_pct_adv" = "DRB_PCT_adv", "Reb_T_pct_adv" = "TRB_PCT_adv",
    
    "WS_O_adv" = "OWS_adv", "WS_D_adv" = "DWS_adv", "WS_T_adv" = "WS_adv", "WS_48_adv" = "WS_per_48_adv",
    
    "BPM_O_adv" = "OBPM_adv", "BPM_D_adv" = "DBPM_adv", "BPM_T_adv" = "BPM_adv",
    
    "FGA_Avg_Dis_sho" = "Avg_Dist_FGA_sho", "FGA%_2p_sho" = "PCT_FGA_Dis_2P_sho", "FGA%_0003_sho" = "PCT_FGA_Dis_0_3_sho", "FGA%_0310_sho" = "PCT_FGA_Dis_3_10_sho",   "FGA%_1016_sho" = "PCT_FGA_Dis_10_16_sho", "FGA%_163p_sho" = "PCT_FGA_Dis_16_3P_sho",
    
    "FG%_0003_sho" = "FG_PCT_Dis_0_3_sho","FG%_0310_sho" = "FG_PCT_Dis_3_10_sho", "FG%_1016_sho" = "FG_PCT_Dis_10_16_sho", "FG%_163p_sho" = "FG_PCT_Dis_16_3P_sho",
    
    "ASTd_FG%_2p_sho" = "PCT_FG_Astd_2P_sho", "ASTd_FG%_3p_sho" = "PCT_FG_Astd_3P_sho",
    "ThrPA%_Corn_sho" = "Corner_3s_PCT_3PA_sho", "ThrP%_Corn_sho" = "Corner_3s_PCT_3P_sho",
    
    "FG_PCT_e_pg" = "eFG_PCT_pg",
    "Reb_O_pg" = "ORB_pg",
    "Reb_D_pg" = "DRB_pg",
    "Reb_T_pg" = "TRB_pg"
  ) %>%
  select(-c(Dunks_PCT_FGA_sho,Dunks_sho, Heave_Att_sho,Heaves_Comp,index1)) %>%
  rename_with(~ str_replace(., "_PCT", "_pct"), contains("_PCT")) %>%
  rename_with(~ str_replace(., "%", "_pct"), contains("%"))
  # select(sort(colnames(.)))

# write.csv(model_df,"C:/Users/mikek/Documents/NBA-Player-Clustering/Data/model0623.csv")

NBA Player Segmentation Pt. 1 - Cleaning The Data

Mike Kaminski

2023-06-12

Import Data

Per 100 Possessions

Advanced Stats

Shooting Stats

Per Game

Cleaning the Data

More Cleaning

Per 100 Possessions

Advanced

Shooting

Per Game

Obvious Duplicated Columns

Not so Obvious Duplicated Columns

Create a cutoff