Objective

This R Markdown document shows the author’s progress in learning R and developing his skills with it.

Importing the dataset

The dataset is the players table from basketball-reference.com, containing per-100-possession stats for the NBA 2024-25 regular season.

#install.packages("rvest")
library(rvest)
## Warning: package 'rvest' was built under R version 4.4.3
# Page holding the per-100-possessions player table for the 2024-25 season.
url <- "https://www.basketball-reference.com/leagues/NBA_2025_per_poss.html#per_poss::17"

# Download and parse the page, then convert the first <table> element found
# in the document into a table object.
document <- read_html(url)
mainTable <- html_table(html_element(document, "table"))

# Preview the first rows of the scraped table.
head(mainTable)

Cleaning

  • Remove the per-team records of traded players, keeping only their combined “2TM” or “3TM” records.
  • Rename eFG% column.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Drop the "League Average" summary row, then keep exactly one row per player.
# Rows are sorted by Player and Team before distinct(), so the row retained
# for each player is the one whose Team value sorts first; this relies on the
# combined "2TM"/"3TM" labels sorting ahead of the individual team codes.
mainTable <- mainTable %>%
  filter(Player != "League Average") %>%
  arrange(Player, Team) %>%
  distinct(Player, .keep_all = TRUE)

# Give the eFG% column a name that does not need backticks.
names(mainTable) <- replace(names(mainTable), names(mainTable) == "eFG%", "eFG")
#write.csv(mainTable, "mainTable2.csv")

Transforming

  • Get the top and bottom 5 players per position in eFG%, among players meeting minimum thresholds for minutes played and field goal attempts.
  • Remove unnecessary columns.
  • Include positional averages.
library(dplyr)

# Thresholds and display settings.
minG <- 65     # games-played threshold; unused while the G filter is disabled
minMP <- 1500  # minimum minutes played
minFGA <- 15   # minimum value of the FGA column
topN <- 5      # players kept at each end of the eFG ranking, per position

# Order in which positions are sorted in the result.
positionsOrder <- c("PG", "SG", "SF", "PF", "C")

# Keep only the columns needed, store G and MP as doubles, and apply the
# minutes and field-goal-attempt thresholds.
# A games-played filter (G >= minG) used to sit in this pipeline; it is
# currently commented out of the analysis.
crop_df <- mainTable %>%
  select(Player, Team, Pos, G, MP, FGA, eFG) %>%
  mutate(G = as.double(G), MP = as.double(MP)) %>%
  filter(MP >= minMP & FGA >= minFGA)

# Best and worst topN players by eFG within each position (ties cut off so
# each position contributes at most topN rows per side).
top_df <- crop_df %>%
  slice_max(order_by = eFG, n = topN, by = Pos, with_ties = FALSE)
bottom_df <- crop_df %>%
  slice_min(order_by = eFG, n = topN, by = Pos, with_ties = FALSE)

# Combine both ends without duplicating players that appear in both, then
# sort by position order and by descending eFG inside each position.
result_df <- top_df %>%
  union(bottom_df) %>%
  arrange(match(Pos, positionsOrder), desc(eFG))

# Average eFG per position over the filtered players; the summary column is
# left unnamed, so it is called "mean(eFG)".
mean_df <- summarise(group_by(crop_df, Pos), mean(eFG))

# TODO: league average per position with filter - reference line
# TODO: include lookup col (TS%) from another df

head(result_df)

Plotting

library(ggplot2)
# Lollipop-style chart: one row per player, faceted by position.
# Layers are added in the order point, segment, vline.
ggplot(
  data = result_df,
  mapping = aes(x = eFG, y = reorder(Player, FGA), colour = MP)
) +
  # Point at each player's eFG, sized by FGA.
  geom_point(mapping = aes(size = FGA)) +
  # Stem running from the fixed 0.550 baseline to the player's eFG.
  geom_segment(
    mapping = aes(x = 0.550, xend = eFG, y = Player, yend = Player)
  ) +
  # Long-dashed vertical line at each position's average eFG, taken from the
  # "mean(eFG)" column of mean_df.
  geom_vline(
    data = mean_df,
    mapping = aes(xintercept = `mean(eFG)`),
    linetype = 5
  ) +
  # Show eFG with three decimals on the axis.
  scale_x_continuous(labels = scales::label_number(accuracy = 0.001)) +
  # One facet row per position, in positionsOrder; each facet lists only its
  # own players on the y axis.
  facet_grid(factor(Pos, levels = positionsOrder) ~ ., scales = "free_y") +
  labs(
    title = paste("Top and bottom", topN, "players in eFG% by position"),
    subtitle = paste(
      "Min", minFGA, "FGA &", minMP,
      "minutes played in the NBA 2024-25 regular season"
    ),
    caption = "Source: basketball-reference.com",
    x = "eFG% (Positional Average Dashed)",
    y = "Player (Descending FGA)"
  )

Notes

  • Notable efficient players: Payton Pritchard
  • Notable inefficient players: Alex Sarr
  • The difference between point guards’ and centers’ average eFG% is 0.048 (4.8 percentage points).