## [1] "C:/Users/User/Documents/R/AMS"

1.0 Data Set

You can load this code to install nbastatR ‘devtools::install_github(“abresler/nbastatR”)’

# Install and Load the nbastatR package
library('nbastatR')

# Access the game tables for a particular season
gamedata <- game_logs(seasons = 2020)

# Take a quick look at the data
head(gamedata)

1.1. Outline the variables

This data set includes the following variables:

  • Match Statistics and Outcome such as Points scored, Minutes Played

  • Season, Game and Player details

  • Rest days, Days for next game

1.2. Describe the data source

This data source is about the overall statistics of NBA matches from 1897 to 2021

1.1.3 Source of Data

The data is from nbastatR that can be obtain from the ‘nbastatR’ package. It combines the data from NBA’s API and Basketball Reference (https://www.basketball-reference.com/)

1.1.4 Data Access

To install the package, use the following code:

devtools::install_github("abresler/nbastatR")
library(nbastatR)

If an error is produced regarding connection buffer, increase it by running the following code

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)

1.2 Visualisation

library(nbastatR)
library(ggplot2)
library(dplyr)

# Create visualization
gamedata <- game_logs(seasons = 2020)
## Acquiring NBA basic player game logs for the 2019-20 Regular Season
#### Here we look at the minutes played and points scored
plot1 <- ggplot(gamedata, aes(minutes,pts))+
  geom_point()
plot1

#### Here we look at the minutes played and points scored according to teams
plot2 <-ggplot(gamedata, aes(minutes,pts))+
  geom_line()+
  geom_smooth(method = 'loess')+
  facet_wrap(~nameTeam)
plot2

# A summary of points scored for the dataset
summary(gamedata$pts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    4.00    9.00   10.57   15.00   61.00
# Points scored on histogram
hist(gamedata$pts, breaks=50)

# Let's look at the top scorers and their playing time and their match outcome
topscorer <- gamedata %>% 
  select(namePlayer,pts,minutes,outcomeGame) %>% 
  arrange(desc(pts))
topscorer
## # A tibble: 22,393 x 4
##    namePlayer       pts minutes outcomeGame
##    <chr>          <dbl>   <dbl> <chr>      
##  1 Damian Lillard    61      45 W          
##  2 Damian Lillard    61      41 W          
##  3 Damian Lillard    60      40 L          
##  4 James Harden      60      31 W          
##  5 James Harden      59      38 W          
##  6 James Harden      55      41 W          
##  7 Bradley Beal      55      41 L          
##  8 James Harden      54      36 W          
##  9 Kyrie Irving      54      32 W          
## 10 Bradley Beal      53      39 L          
## # ... with 22,383 more rows

2.0 Analytics Piece

# Load the packages
# Make sure to install them first before loading
library(tidyverse)
library(nbastatR)
library(BasketballAnalyzeR)
library(jsonlite)
library(janitor)
library(extrafont)
library(ggrepel)
library(scales)
library(teamcolors)
library(zoo)
library(future)
library(lubridate)

#If an error is produced regarding connection buffer, increase it by running the following codeSys.setenv
("VROOM_CONNECTION_SIZE" = 131072 * 2)
## [1] 262144
# Get the game IDs from the past 3 seasons
# Select seasons from 2018-2020 because it's the recent ones
selectedSeasons <- c(2018:2020)

# Get game IDs for Regular Season and Playoffs
gameIds_Reg <- suppressWarnings(seasons_schedule(seasons = selectedSeasons, season_types = "Regular Season") %>% select(idGame, slugMatchup))
## Acquiring NBA basic team game logs for the 2017-18 Regular Season
## Acquiring NBA basic team game logs for the 2018-19 Regular Season
## Acquiring NBA basic team game logs for the 2019-20 Regular Season
gameIds_PO <- suppressWarnings(seasons_schedule(seasons = selectedSeasons, season_types = "Playoffs") %>% select(idGame, slugMatchup))
## Acquiring NBA basic team game logs for the 2017-18 Playoffs
## Acquiring NBA basic team game logs for the 2018-19 Playoffs
## Acquiring NBA basic team game logs for the 2019-20 Playoffs
gameIds_all <- rbind(gameIds_Reg, gameIds_PO)

# Peek at the game IDs
head(gameIds_all)
## # A tibble: 6 x 2
##     idGame slugMatchup
##      <dbl> <chr>      
## 1 21700001 BOS @ CLE  
## 2 21700002 HOU @ GSW  
## 3 21700003 DET vs. CHA
## 4 21700004 IND vs. BKN
## 5 21700005 MIA @ ORL  
## 6 21700006 PHI @ WAS
tail(gameIds_all)
## # A tibble: 6 x 2
##     idGame slugMatchup
##      <dbl> <chr>      
## 1 41900401 LAL vs. MIA
## 2 41900402 MIA @ LAL  
## 3 41900403 MIA vs. LAL
## 4 41900404 MIA vs. LAL
## 5 41900405 MIA @ LAL  
## 6 41900406 MIA vs. LAL
## Retrieve gamelog data for players and teams
# Get player gamelogs
P_gamelog_reg <- suppressWarnings(game_logs(seasons = selectedSeasons, league = "NBA", result_types = "player", season_types = "Regular Season"))
## Acquiring NBA basic player game logs for the 2017-18 Regular Season
## Acquiring NBA basic player game logs for the 2018-19 Regular Season
## Acquiring NBA basic player game logs for the 2019-20 Regular Season
P_gamelog_po <- suppressWarnings(game_logs(seasons = selectedSeasons, league = "NBA", result_types = "player", season_types = "Playoffs"))
## Acquiring NBA basic player game logs for the 2017-18 Playoffs
## Acquiring NBA basic player game logs for the 2018-19 Playoffs
## Acquiring NBA basic player game logs for the 2019-20 Playoffs
P_gamelog_all <- rbind(P_gamelog_reg, P_gamelog_po)
View(head(P_gamelog_all))

# Get team gamelogs
T_gamelog_reg <- suppressWarnings(game_logs(seasons = selectedSeasons, league = "NBA", result_types = "team", season_types = "Regular Season"))
## Acquiring NBA basic team game logs for the 2017-18 Regular Season
## Acquiring NBA basic team game logs for the 2018-19 Regular Season
## Acquiring NBA basic team game logs for the 2019-20 Regular Season
T_gamelog_po <- suppressWarnings(game_logs(seasons = selectedSeasons, league = "NBA", result_types = "team", season_types = "Playoffs"))
## Acquiring NBA basic team game logs for the 2017-18 Playoffs
## Acquiring NBA basic team game logs for the 2018-19 Playoffs
## Acquiring NBA basic team game logs for the 2019-20 Playoffs
T_gamelog_all <- rbind(T_gamelog_reg, T_gamelog_po)
View(head(T_gamelog_all))


## In this example, we use Regular Season data for our analysis
# Create Tbox (Team boxscore) for each Regular Season and create variables
Tbox <- T_gamelog_reg %>%
  group_by("Season"=yearSeason, "Team"=slugTeam) %>%
  dplyr::summarise(GP=n(), MIN=sum(round(minutesTeam/5)),
                   PTS=sum(ptsTeam),
                   W=sum(outcomeGame=="W"), L=sum(outcomeGame=="L"),
                   P2M=sum(fg2mTeam), P2A=sum(fg2aTeam), P2p=P2M/P2A,
                   P3M=sum(fg3mTeam), P3A=sum(fg3aTeam), P3p=P3M/P3A,
                   FTM=sum(ftmTeam), FTA=sum(ftaTeam), FTp=FTM/FTA,
                   OREB=sum(orebTeam), DREB=sum(drebTeam), AST=sum(astTeam),
                   TOV=sum(tovTeam), STL=sum(stlTeam), BLK=sum(blkTeam),
                   PF=sum(pfTeam), PM=sum(plusminusTeam)) %>%
  as.data.frame()

# Create Obox (Opponent Team boxscore) for each Regular Season
Obox <- T_gamelog_reg %>%
  group_by("Season"=yearSeason, "Team"=slugOpponent) %>%
  dplyr::summarise(GP=n(), MIN=sum(round(minutesTeam/5)),
                   PTS=sum(ptsTeam),
                   W=sum(outcomeGame=="L"), L=sum(outcomeGame=="W"),
                   P2M=sum(fg2mTeam), P2A=sum(fg2aTeam), P2p=P2M/P2A,
                   P3M=sum(fg3mTeam), P3A=sum(fg3aTeam), P3p=P3M/P3A,
                   FTM=sum(ftmTeam), FTA=sum(ftaTeam), FTp=FTM/FTA,
                   OREB=sum(orebTeam), DREB=sum(drebTeam), AST=sum(astTeam),
                   TOV=sum(tovTeam), STL=sum(stlTeam), BLK=sum(blkTeam),
                   PF=sum(pfTeam), PM=sum(plusminusTeam)) %>%
  as.data.frame()

# Create Pbox (Player boxscore) for each Regular Season
Pbox <- P_gamelog_reg %>%
  group_by("Season"=yearSeason, "Team"=slugTeam, "Player"=namePlayer) %>%
  dplyr::summarise(GP=n(), MIN=sum(minutes), PTS=sum(pts),
                   P2M=sum(fg2m), P2A=sum(fg2a), P2p=100*P2M/P2A,
                   P3M=sum(fg3m), P3A=sum(fg3a), P3p=100*P3M/P3A,
                   FTM=sum(ftm), FTA=sum(fta), FTp=100*FTM/FTA,
                   OREB=sum(oreb), DREB=sum(dreb), AST=sum(ast),
                   TOV=sum(tov), STL=sum(stl), BLK=sum(blk),
                   PF=sum(pf)) %>%
  as.data.frame()

View(Pbox[Pbox$Player=="LeBron James",])

###################################################################

# 2.3. Vizualization

# Bar plots

# You can pick any team you want, for this part, I chose LAL and filter players who played more than 1000 mins
teamSelected <- "LAL"
Pbox.sel <- subset(Pbox, Team==teamSelected &
                     MIN>=1000)

# I pick 2020 season
seasonSelected <- 2020

# barline is a combination of bar chart and line chart, take a look at the chart below
barline(data=Pbox.sel[Pbox.sel$Season==seasonSelected,], id="Player",
        bars=c("P2M","P3M","FTM"), line="PTS",
        order.by="PTS", labels.bars=c("2PM","3PM","FTM"),
        title=teamSelected)

# Next we make a scatter plot between two numeric variable, Assist and Turnovers per minute, we differentiate thep points scored by color
# we create a selection of colors by using the colorRampPalette function
teamSelected <- "LAL"
Pbox.sel <- subset(Pbox, Team==teamSelected & MIN>=1000)
attach(Pbox.sel)
X <- data.frame(AST, TOV, PTS)/MIN
detach(Pbox.sel)
mypal <- colorRampPalette(c("blue","yellow","red"))

scatterplot(X, data.var=c("AST","TOV"), z.var="PTS",
            labels=paste(Pbox.sel$Player,", ",Pbox.sel$Season), palette=mypal)

# Let's zoom in 
scatterplot(X, data.var=c("AST","TOV"), z.var="PTS",
            labels=paste(Pbox.sel$Player,", ",Pbox.sel$Season), palette=mypal,
            zoom=c(0.2,0.325,0.05,0.125))

# Lebron consistently excels in terms of assists and poins while having slightly higher turnovers. The other player that could math him in the graph is Rajon Rando which is a pure point guard



###################################################################
# Bubble plots are scatter plots with a third numeric variable and in this example we take 2, 3 points and Freethrow percetange for the whole teams in the NBA for season 2020

teamSelected <- "LAL"
seasonSelected <- 2020
Tbox.sel <- subset(Tbox,Season==seasonSelected)

attach(Tbox.sel)
X <- data.frame(T=Team, P2p, P3p, FTp, AS=P2A+P3A+FTA)
detach(Tbox.sel)
labs <- c("2-point shots (% made)",
          "3-point shots (% made)",
          "free throws (% made)",
          "Total shots attempted")
bubbleplot(X, id="T", x="P2p", y="P3p", col="FTp",
           size="AS", labels=labs)

###################################################################


# Now we select the players in best four teams (all fours are finalists of their respective conference in that season), we analyse their defensive metrics such as steals, defensive rebounds and blocks
teamsSelected <- c("LAL", "MIA", "BOS", "DEN")
seasonSelected <- 2020
Pbox.sel <- subset(Pbox, Team %in% teamsSelected & MIN>=1500 & Season==seasonSelected)
                   
attach(Pbox.sel)
X <- data.frame(ID=Player, Team, V1=DREB/MIN, V2=STL/MIN,
                V3=BLK/MIN, V4=MIN)
detach(Pbox.sel)
labs <- c("Defensive Rebounds","Steals","Blocks",
          "Total minutes played")
bubbleplot(X, id="ID", x="V1", y="V2", col="V3",
           size="V4", text.col="Team", labels=labs,
           title=paste0("NBA Players in ", seasonSelected),
           text.legend=TRUE, text.size=3.5, scale=FALSE)

###################################################################

# 2.4. Clustering

# We use K-means clustering to partition the data set into groups. K represents the number of groups. Each cluster or groups is represented by its centroid. The four factors used are 
# Effective Field Goal Percentage
# Turnover Ratio
# Rebound Percentage
# Free Throw Rate

seasonSelected <- 2020
Tbox.sel <- subset(Tbox.sel, Season==seasonSelected)
Obox.sel <- subset(Obox,Season==seasonSelected)

FF <- fourfactors(Tbox.sel,Obox.sel)
OD.Rtg <- FF$ORtg/FF$DRtg
F1.r <- FF$F1.Off/FF$F1.Def
F2.r <- FF$F2.Def/FF$F2.Off
F3.Off <- FF$F3.Off
F3.Def <- FF$F3.Def
P3M.ff <- Tbox.sel$P3M
STL.r <- Tbox.sel$STL/Obox.sel$STL
data <- data.frame(OD.Rtg, F1.r, F2.r, F3.Off, F3.Def, P3M.ff, STL.r)

RNGkind(sample.kind="Rounding")
set.seed(29)
kclu1 <- kclustering(data)
plot(kclu1)

# The graph above indicates that 7 clusters would be the best choice. We want to minimize the number of clusters while at the same time achieving the most consistency and information.

#####################################################################

# Now we make a radial plots of Cluster Heterogeneity Index (CHI)
set.seed(29)
kclu2 <- kclustering(data, labels=Tbox.sel$Team, k=7)
plot(kclu2)

# The radial plots of the average cluster profiles are shown above. CHI gives us an idea of what the clusters represent. For example, Cluster 3 has Cluster Heterogeneity Index of 0.47 and contains the GSW and NYK, with high offensive rating.

#####################################################################

# Bubble plot
kclu2.W <- tapply(Tbox.sel$W, kclu2$Subjects$Cluster, mean)

cluster <- as.factor(kclu2$Subjects$Cluster)
Xbubble <- data.frame(Team=Tbox.sel$Team, PTS=Tbox.sel$PTS,
                      PTS.Opp=Obox.sel$PTS, cluster,
                      W=Tbox.sel$W)


labs <- c("PTS", "PTS.Opp", "cluster", "Wins")
bubbleplot(Xbubble, id="Team", x="PTS", y="PTS.Opp",
           col="cluster", size="W", labels=labs,
           title=paste0("NBA Team Clusters - ",seasonSelected))

# The bubble plot above depicts the 2020 NBA teams, with the x-axis depicting the points scored and y-axis the points against. The colors indicate the cluster in which a team is placed and the size of the bubble the number of wins.