INTRODUCTION
CHAPTER 2: Data and Basic Statistical Analyses
- 2.2 BASIC STATISTICAL ANALYSES
CHAPTER 3 : Discovering Patterns in Data
CHAPTER 4: Findings Groups in Data
- 4.2 CLUSTER ANALYSIS
CHAPTER 5 Modeling Relationships in Data
- 5.1 LINEAR MODELS
  - 5.1.1 Simple linear regression model
- 5.2 NONPARAMETRIC REGRESSION
  - 5.2.1 Polynomial local regression
  - 5.2.2 Gaussian kernel smoothing
CHAPTER 6: The R package BasketballAnalyzeR

INTRODUCTION

The following R codes allow one to replicate all the analyses and examples in the book “Basketball Data Science with Application in R” (by P. Zuccolotto and M. Manisera).
It is based on the “BasketballAnalyzeR” package developed with M. Sandri. See https://bdsports.unibs.it/basketballanalyzer/ for further explanations and updates.

Warning: If you want to reproduce the figures contained in the book and if the version of your R machine is > 3.6.0, you need to type RNGkind(sample.kind=“Rounding”) at the beginning of your working session.

RNGkind(sample.kind="Rounding")

## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

rm(list=ls())
# install.packages("devtools", repos="https://cran.stat.unipd.it/")
# devtools::install_github("sndmrc/BasketballAnalyzeR",force=TRUE)
library(BasketballAnalyzeR)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## If you want to reproduce the figures contained in the book of
## Zuccolotto and Manisera (2020) and
## if the version of your R machine is >= 3.6.0, you need to type
## RNGkind(sample.kind = "Rounding")
## at the beginning of your working session

CHAPTER 2: Data and Basic Statistical Analyses

This chapter provides an overview of basic methods for analyzing basketball data, primarily focusing on indexes and charts.

data(package="BasketballAnalyzeR")
PbP <- PbPmanipulation(PbP.BDB)

2.2 BASIC STATISTICAL ANALYSES

This part covers various statistical analyses methods such as pace, ratings, four factors, bar-line plots, radial plots, scatter plots, bubble plots, variability analysis, inequality analysis, and shot charts.

2.2.1 Pace, Ratings, Four Factors

rm(list=ls())

tm <- c("BOS","CLE","GSW","HOU")
selTeams <- which(Tadd$team %in% tm)
FF.sel <- fourfactors(Tbox[selTeams,], Obox[selTeams,])

plot(FF.sel)

FF <- fourfactors(Tbox,Obox)
listPlots <- plot(FF)

library(gridExtra)
grid.arrange(grobs=listPlots[1:2], ncol=1)

2.2.2 Bar-line plots

rm(list=ls())

X <- data.frame(Tbox, PTS.O=Obox$PTS, TOV.O=Obox$TOV,
                CONF=Tadd$Conference)
XW <- subset(X, CONF=="W")
labs <- c("Steals","Blocks","Defensive Rebounds")
barline(data=XW, id="Team", bars=c("STL","BLK","DREB"),
        line="TOV.O", order.by="PTS.O", labels.bars=labs)

Pbox.HR <- subset(Pbox, Team=="Houston Rockets" &
                  MIN>=500)
barline(data=Pbox.HR, id="Player",
        bars=c("P2p","P3p","FTp"), line="MIN",
        order.by="PM", labels.bars=c("2P%","3P%","FT%"),
        title="Houston Rockets")

2.2.3 Radial plots

rm(list=ls())

Pbox.PG <- subset(Pbox, Player=="Russell Westbrook" |
                  Player=="Stephen Curry" |
                  Player=="Chris Paul" |
                  Player=="Kyrie Irving" |
                  Player=="Damian Lillard" |
                  Player=="Kyle Lowry" |
                  Player=="John Wall" |
                  Player=="Rajon Rondo" |
                  Player=="Kemba Walker")
attach(Pbox.PG)
X <- data.frame(P2M, P3M, FTM, REB=OREB+DREB, AST,
                STL, BLK)/MIN
detach(Pbox.PG)
radialprofile(data=X, title=Pbox.PG$Player, std=FALSE)

radialprofile(data=X, title=Pbox.PG$Player, std=TRUE)

2.2.4 Scatter plots

rm(list=ls())

Pbox.sel <- subset(Pbox, MIN>= 500)
attach(Pbox.sel)
X <- data.frame(AST, TOV, PTS)/MIN
detach(Pbox.sel)
mypal <- colorRampPalette(c("blue","yellow","red"))
scatterplot(X, data.var=c("AST","TOV"), z.var="PTS",
            labels=Pbox.sel$Player, palette=mypal)

SAS <- which(Pbox.sel$Team=="San Antonio Spurs")
scatterplot(X, data.var=c("AST","TOV"), z.var="PTS",
            labels=Pbox.sel$Player, palette=mypal,
            subset=SAS)

SAS <- which(Pbox.sel$Team=="San Antonio Spurs")
scatterplot(X, data.var=c("AST","TOV"), z.var="PTS",
            labels=Pbox.sel$Player, palette=mypal,
            subset=SAS, zoom=c(0,0.25,0.05,0.10))

## Warning: Removed 169 rows containing missing values (`geom_text()`).

## Warning: Removed 5 rows containing missing values (`geom_label_repel()`).

2.2.5 Bubble plots

rm(list=ls())

attach(Tbox)
X <- data.frame(T=Team, P2p, P3p, FTp, AS=P2A+P3A+FTA)
detach(Tbox)
labs <- c("2-point shots (% made)",
          "3-point shots (% made)",
          "free throws (% made)",
          "Total shots attempted")
bubbleplot(X, id="T", x="P2p", y="P3p", col="FTp",
           size="AS", labels=labs)

Pbox.GSW.CC <- subset(Pbox,
               (Team=="Golden State Warriors" |
                      Team =="Cleveland Cavaliers") &
                      MIN>=500)
attach(Pbox.GSW.CC)
X <- data.frame(ID=Player, Team, V1=DREB/MIN, V2=STL/MIN,
                V3=BLK/MIN, V4=MIN)
detach(Pbox.GSW.CC)
labs <- c("Defensive Rebounds","Steals","Blocks",
          "Total minutes played")
bubbleplot(X, id="ID", x="V1", y="V2", col="V3",
           size="V4", text.col="Team", labels=labs,
           title="GSW and CC during the regular season",
           text.legend=TRUE, text.size=3.5, scale=FALSE)

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

2.2.6 Variability analysis

rm(list=ls())

Pbox.OKC <- subset(Pbox, Team=="Oklahoma City Thunder"
                   & MIN>=500)
vrb1 <- variability(data=Pbox.OKC, data.var="P3p",
                    size.var="P3A")

vrb1 <- variability(data=Pbox.OKC, data.var="P3p",
                    size.var="P3A",weight=TRUE)


vrb2 <- variability(data=Pbox.OKC,
                    data.var=c("P2p","P3p","FTp"),
                    size.var=c("P2A","P3A","FTA"),
                    weight=TRUE)

plot(vrb2, title="Variability diagram - OKC")

2.2.7 Inequality analysis

rm(list=ls())

Pbox.BN <- subset(Pbox, Team=="Brooklyn Nets")
ineqBN <- inequality(Pbox.BN$PTS, nplayers=8)
Pbox.MB <- subset(Pbox, Team=="Milwaukee Bucks")
ineqMB <- inequality(Pbox.MB$PTS, nplayers=8)

library(gridExtra)
p1 <- plot(ineqBN, title="Brooklyn Nets")
p2 <- plot(ineqMB, title="Milwaukee Bucks")
grid.arrange(p1, p2, nrow=1)

no.teams <- nrow(Tbox)
INEQ <- array(0, no.teams)
for (k in 1:no.teams) {
     Teamk <- Tbox$Team[k]
     Pbox.sel <- subset(Pbox, Team==Teamk)
     index <- inequality(Pbox.sel$PTS, npl=8)
     INEQ[k] <- index$Gini
     }

dts <- data.frame(INEQ, PTS=Tbox$PTS,
                  CONF=Tadd$Conference)
mypal <- colorRampPalette(c("blue","red"))
scatterplot(dts, data.var=c("INEQ","PTS"), z.var="CONF",
            labels=Tbox$Team, palette=mypal,
            repel_labels=TRUE)

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP.GSW <- subset(PbP, team="GSW")

lineup <- c("Stephen Curry", "Kevin Durant",
             "Klay Thompson", "Draymond Green",
             "Zaza Pachulia")
filt5 <- apply(PbP.GSW[, 4:13], 1,
               function(x) {
               x <- as.character(x)
               sum(x %in% lineup)==5
               })


subPbP.GSW <- PbP.GSW[filt5, ]
PTS5 <- sapply(lineup,
               function(x) {
               filt <- subPbP.GSW$player==x
               sum(subPbP.GSW$points[filt], na.rm=T)
               })

inequality(PTS5,nplayer=5)

## $Gini
## [1] 16.97
## 
## $Lorenz
##                  F         Q
##                0.0 0.0000000
## Draymond Green 0.2 0.1195029
## Zaza Pachulia  0.4 0.2820268
## Stephen Curry  0.6 0.5086042
## Kevin Durant   0.8 0.7504780
## Klay Thompson  1.0 1.0000000
## 
## attr(,"class")
## [1] "inequality" "list"

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP.GSW.DET <- subset(PbP, team=="GSW" & oppTeam=="DET")

lineup <- c("Stephen Curry", "Kevin Durant",
             "Klay Thompson", "Draymond Green",
             "Zaza Pachulia")
filt5 <- apply(PbP.GSW.DET[, 4:13], 1,
               function(x) {
               x <- as.character(x)
               sum(x %in% lineup)==5
               })


subPbP.GSW.DET <- PbP.GSW.DET[filt5, ]
PTS5 <- sapply(lineup,
               function(x) {
               filt <- subPbP.GSW.DET$player==x
               sum(subPbP.GSW.DET$points[filt], na.rm=T)
               })

inequality(PTS5,nplayer=5)

## $Gini
## [1] 48.44
## 
## $Lorenz
##                  F       Q
##                0.0 0.00000
## Zaza Pachulia  0.2 0.06250
## Draymond Green 0.4 0.12500
## Kevin Durant   0.6 0.31250
## Stephen Curry  0.8 0.53125
## Klay Thompson  1.0 1.00000
## 
## attr(,"class")
## [1] "inequality" "list"

2.2.8 Shot charts

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

subdata <- subset(PbP, player=="Kevin Durant")
subdata$xx <- subdata$original_x/10
subdata$yy <- subdata$original_y/10-41.75

shotchart(data=subdata, x="xx", y="yy", type=NULL,
          scatter=TRUE)

shotchart(data=subdata, x="xx", y="yy", z="result", type=NULL,
          scatter=TRUE)

shotchart(data=subdata, x="xx", y="yy", z="playlength", 
          num.sect=5, type="sectors", scatter = TRUE)

shotchart(data=subdata, x="xx", y="yy", z="playlength", 
          num.sect=5, type="sectors", scatter=FALSE, result="result")

CHAPTER 3 : Discovering Patterns in Data

This chapter is about using Data Science to reveal hidden mechanisms governing the analyzed phenomena in basketball. It discusses various patterns such as distributions, associations, similarities, interactions, classifications, and trends.

3.1 DETECTING ASSOCIATIONS BETWEEN VARIABLES

This part focuses on statistical dependence, mean dependence, and correlation.

3.1.1 Statistical dependence

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP.GSW <- subset(PbP, team=="GSW")
ev <- c("ejection","end of period","jump ball",
        "start of period","unknown","violation",
        "timeout","sub","foul","turnover")
event.unsel <- which(PbP.GSW$event_type %in% ev)
PbP.GSW.ev <- PbP.GSW[-event.unsel,]
attach(PbP.GSW.ev)
T <- table(oppTeam, event_type, exclude=ev)
detach(PbP.GSW.ev)

library(vcd)

## Loading required package: grid

assocstats(T)

##                     X^2 df P(> X^2)
## Likelihood Ratio 115.26 84 0.013396
## Pearson          116.25 84 0.011421
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.097 
## Cramer's V        : 0.056

3.1.2 Mean dependence

rm(list=ls())
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lsr)
library(tibble)
FF <- fourfactors(Tbox, Obox)
attach(Tbox)
attach(FF)

## The following object is masked from Tbox:
## 
##     Team

X <- data.frame(PTS, P2M, P3M, FTM, REB=OREB+DREB, AST,
                STL, BLK, ORtg, DRtg)
detach(Tbox)
detach(FF)

Playoff <- Tadd$Playoff
eta <- sapply(X, function(Y){
  cm <- round(tapply(Y, Playoff, mean), 1)
  eta2 <- etaSquared(aov(Y~Playoff))[1]*100
  c(cm, round(eta2, 2))
}) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(No=N, Yes=Y, eta2=V3) %>%
  rownames_to_column('rownm') %>%
  dplyr::arrange(-eta2) %>%
  column_to_rownames('rownm')

eta

##          No    Yes  eta2
## DRtg  107.9  104.6 42.53
## ORtg  104.0  108.1 40.25
## STL   601.9  659.6 28.77
## PTS  8576.0 8844.8 19.28
## BLK   365.6  420.4 18.12
## FTM  1328.0 1394.4  5.58
## P2M  2353.7 2417.2  3.28
## AST  1875.5 1931.6  3.17
## P3M   846.9  871.9  1.07
## REB  3558.1 3577.5  0.49

3.1.3 Correlation

rm(list=ls())
data <- subset(Pbox, MIN>=500)
attach(data)
X <- data.frame(AST, TOV)/MIN
detach(data)

cor(X$AST, X$TOV)

## [1] 0.6873883

cor(rank(X$AST), rank(X$TOV))

## [1] 0.6679628

cor(X$AST, X$TOV, method="spearman")

## [1] 0.6679628

cor(X)

##           AST       TOV
## AST 1.0000000 0.6873883
## TOV 0.6873883 1.0000000

3.2 ANALYZING PAIRWISE LINEAR CORRELATION AMONG VARIABLES

This part focuses on the analysis of pairwise linear correlation among variables.

rm(list=ls())
data <- base::merge(Pbox, Tadd, by="Team")
data <- subset(data, MIN >= 500)

attach(data)
X <- data.frame(PTS, P3M, P2M, REB=(OREB+DREB), AST,
                TOV, STL, BLK)/MIN
X <- data.frame(X, Playoff=Playoff)
detach(data)

corrmatrix <- corranalysis(X[,1:8], threshold=0.5)
plot(corrmatrix)

## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## ℹ The deprecated feature was likely used in the BasketballAnalyzeR package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

scatterplot(X, data.var=1:8, z.var="Playoff",
            diag=list(continuous="blankDiag"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.3 DISPLAYING INDIVIDUAL CASES ACCORDING TO THEIR SIMILARITY

This part discusses how to visualize similarities among individuals.

rm(list=ls())

attach(Pbox)
data <- data.frame(PTS, P3M, P2M, REB=OREB+DREB,
                   AST, TOV, STL, BLK)
detach(Pbox)
data <- subset(data, Pbox$MIN>=1500)
id <- Pbox$Player[Pbox$MIN>=1500]

mds <- MDSmap(data)

## initial  value 15.910514 
## iter   5 value 13.124944
## final  value 12.967089 
## converged

plot(mds, labels=id)

selp <- which(id=="Al Horford" | id=="Kyle Korver" |
              id=="Myles Turner" | id=="Kyle Kuzma" |
              id=="Andrew Wiggins")

plot(mds, labels=id, subset=selp, col.subset="tomato")

plot(mds, labels=id, subset=selp, col.subset="tomato",
     zoom=c(0,3,0,2))

## Warning: Removed 120 rows containing missing values (`geom_text()`).

## Warning: Removed 4 rows containing missing values (`geom_label_repel()`).

#plot(mds, z.var=c("P2M","P3M","AST","REB"), level.plot=FALSE, palette=topo.colors)

#plot(mds, z.var=c("P2M","P3M","AST","REB"),contour=TRUE,palette=topo.colors)

3.4 ANALYZING NETWORK RELATIONSHIPS

This part focuses on the analysis of network relationships.

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)
PbP.GSW <- subset(PbP, team=="GSW")

netdata <- assistnet(PbP.GSW)
#netdata
#RNGkind(sample.kind="Rounding")

set.seed(7)
plot(netdata)

plot(netdata, layout="circle", edge.thr=20)

cols <- paste0(c("a","h"), rep(1:5,each=2))
PbP.GSW.DG0 <- PbP.GSW[!apply(PbP.GSW[,cols], 1, "%in%",
                       x="Draymond Green"),]
netdata.DG0 <- assistnet(PbP.GSW.DG0)

set.seed(1)
plot(netdata.DG0)

PbP.GSW.DG0 <- subset(PbP.GSW.DG0,
                      ShotType=="2P" | ShotType=="3P")
p0 <- mean(PbP.GSW.DG0$points)
pl0 <- mean(PbP.GSW.DG0$playlength)

PbP.GSW.DG1 <- PbP.GSW[apply(PbP.GSW[,cols], 1, "%in%",
                       x="Draymond Green"),]
PbP.GSW.DG1 <- subset(PbP.GSW.DG1,
                      ShotType=="2P" | ShotType=="3P")
p1 <- mean(PbP.GSW.DG1$points)
pl1 <- mean(PbP.GSW.DG1$playlength)

plot(netdata, layout="circle", edge.thr=20,
     node.col="FGPTS_AST", node.size="ASTPTS")

plot(netdata, layout="circle", edge.thr=20,
     node.col="FGPTS", node.size="FGPTS_ASTp")

TAB <- netdata$assistTable
X <- netdata$nodeStats

names(X)[1] <- "Player"
data <- base::merge(X, Pbox, by="Player")

mypal <- colorRampPalette(c("blue","yellow","red"))
scatterplot(data, data.var=c("FGM","FGM_ASTp"),
            z.var="MIN", labels=data$Player,
            palette=mypal, repel_labels=TRUE)

sel <- which(data$MIN > 984)
tab <- TAB[sel,sel]

no.pl <- nrow(tab)
pR <- pM <- vector(no.pl, mode="list")
GiniM <- array(NA, no.pl)
GiniR <- array(NA, no.pl)
for (pl in 1:no.pl) {
     ineqplM <- inequality(tab[pl,], npl=no.pl)
     GiniM[pl] <- ineqplM$Gini
     ineqplR <- inequality(tab[,pl], npl=no.pl)
     GiniR[pl] <- ineqplR$Gini
     title <- rownames(tab)[pl]
     pM[[pl]] <- plot(ineqplM, title=title)
     pR[[pl]] <- plot(ineqplR, title=title)
     }

library(gridExtra)
grid.arrange(grobs=pM, nrow=2)

grid.arrange(grobs=pR, nrow=2)

library(vcd)
assocstats(tab)

##                     X^2 df P(> X^2)
## Likelihood Ratio 670.48 49        0
## Pearson          507.67 49        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.512 
## Cramer's V        : 0.226

XX <- data.frame(X[sel,], GiniM, GiniR)
labs <- c("Gini Index for assists made",
          "Gini Index for assists received",
          "Assists received", "Assists made")
bubbleplot(XX, id="Player", x="GiniM", y="GiniR",
           col="FGM_AST", size="AST",
           labels=labs, text.size=4)

library(tidygraph)

## 
## Attaching package: 'tidygraph'

## The following object is masked from 'package:stats':
## 
##     filter

library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidygraph':
## 
##     groups

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

#library(CINNA)
net1 <- as_tbl_graph(netdata$assistNet)
class(net1) <- "igraph"
centr_degree(net1)

## $res
##  [1] 27  6 23 29 22 29 28 27 29 27 26 28 26 27 27 23
## 
## $centralization
## [1] 0.1333333
## 
## $theoretical_max
## [1] 450

alpha_centrality(net1)

##   Andre Iguodala     Damian Jones       David West   Draymond Green 
##      -0.06373293       0.51593323      -0.10318665      -0.12746586 
##     JaVale McGee      Jordan Bell     Kevin Durant     Kevon Looney 
##      -0.07587253      -0.12746586      -0.08952959       0.13050076 
##    Klay Thompson       Nick Young      Omri Casspi    Patrick McCaw 
##      -0.12746586      -0.12746586      -0.08345979      -0.12746586 
##       Quinn Cook Shaun Livingston    Stephen Curry    Zaza Pachulia 
##      -0.01213961      -0.08952959      -0.12746586      -0.10318665

closeness(net1, mode="all")

##   Andre Iguodala     Damian Jones       David West   Draymond Green 
##       0.06250000       0.04166667       0.05882353       0.06666667 
##     JaVale McGee      Jordan Bell     Kevin Durant     Kevon Looney 
##       0.05882353       0.06666667       0.06666667       0.06666667 
##    Klay Thompson       Nick Young      Omri Casspi    Patrick McCaw 
##       0.06666667       0.06250000       0.06250000       0.06250000 
##       Quinn Cook Shaun Livingston    Stephen Curry    Zaza Pachulia 
##       0.06666667       0.06250000       0.06250000       0.05555556

betweenness(net1)

##   Andre Iguodala     Damian Jones       David West   Draymond Green 
##        1.4926768        0.0000000        0.1602564        3.7195998 
##     JaVale McGee      Jordan Bell     Kevin Durant     Kevon Looney 
##        0.2511655        3.7195998        2.8723776       14.7650544 
##    Klay Thompson       Nick Young      Omri Casspi    Patrick McCaw 
##        3.7195998        1.4926768        0.8890443        1.5695998 
##       Quinn Cook Shaun Livingston    Stephen Curry    Zaza Pachulia 
##        2.7370241        0.9723776        1.3953574        0.2435897

#calculate_centralities(net1)

3.5 ESTIMATING DENSITY OF EVENTS

This part discusses density with respect to a concurrent variable, density in space, and joint density of two variables.

3.5.1 Density with respect to a concurrent variable

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

data.team <- subset(PbP, team=="GSW" & result!="")
data.opp <- subset(PbP, team!="GSW" & result!="")

densityplot(data=data.team, shot.type="2P",
            var="periodTime", best.scorer=TRUE)

densityplot(data=data.team, shot.type="2P",
            var="totalTime", best.scorer=TRUE)

densityplot(data=data.team, shot.type="2P",
            var="playlength", best.scorer=TRUE)

densityplot(data=data.team, shot.type="2P",
            var="shot_distance", best.scorer=TRUE)

densityplot(data=data.opp, shot.type="2P",
            var="periodTime", best.scorer=TRUE)

densityplot(data=data.opp, shot.type="2P",
            var="totalTime",best.scorer=TRUE)

densityplot(data=data.opp, shot.type="2P",
            var="playlength", best.scorer=TRUE)

densityplot(data=data.opp, shot.type="2P",
            var="shot_distance", best.scorer=TRUE)

KD <- subset(PbP, player=="Kevin Durant" & result!="")
SC <- subset(PbP, player=="Stephen Curry" & result!="")
densityplot(data=KD, shot.type="field",
            var="playlength")

densityplot(data=KD, shot.type="field",
            var="shot_distance")

densityplot(data=SC, shot.type="field",
            var="playlength")

densityplot(data=SC, shot.type="field",
            var="shot_distance")

3.5.2 Density in space

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP$xx <- PbP$original_x/10
PbP$yy <- PbP$original_y/10 - 41.75

KT <- subset(PbP, player=="Klay Thompson")

shotchart(data=KT, x="xx", y="yy",
          type="density-polygons")

## Warning: The dot-dot notation (`..level..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(level)` instead.
## ℹ The deprecated feature was likely used in the BasketballAnalyzeR package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

shotchart(data=KT, x="xx", y="yy", type="density-raster")

shotchart(data=KT, x="xx", y="yy", type="density-hexbin")

shotchart(data=KT, x="xx", y="yy",
          type="density-polygons", scatter=TRUE)

shotchart(data=KT, x="xx", y="yy", type="density-raster",
          scatter=TRUE, pt.col="tomato", pt.alpha=0.1)

shotchart(data=KT, x="xx", y="yy", type="density-hexbin",
          nbins=50, palette="bwr")

3.5.3 Joint density of two variables

rm(list=ls())
data <- subset(Pbox, MIN>=500)
attach(data)
X <- data.frame(PTS, P3M, P2M, REB=OREB+DREB, AST)/MIN
detach(data)

scatterplot(X, data.var=1:5,
            lower=list(continuous="density"),
            diag=list(continuous="densityDiag"))

CHAPTER 4: Findings Groups in Data

This chapter focuses on identifying clusters or groups within basketball data.

4.2 CLUSTER ANALYSIS

4.2.1 k-means clustering of NBA teams

rm(list=ls())

FF <- fourfactors(Tbox,Obox)
OD.Rtg <- FF$ORtg/FF$DRtg
F1.r <- FF$F1.Off/FF$F1.Def
F2.r <- FF$F2.Def/FF$F2.Off
F3.Off <- FF$F3.Off
F3.Def <- FF$F3.Def
P3M <- Tbox$P3M
STL.r <- Tbox$STL/Obox$STL
data <- data.frame(OD.Rtg, F1.r, F2.r, F3.Off, F3.Def,
                   P3M, STL.r)

#RNGkind(sample.kind="Rounding")
set.seed(29)
kclu1 <- kclustering(data)
plot(kclu1)

set.seed(29)
kclu2 <- kclustering(data, labels=Tbox$Team, k=5)
plot(kclu2)

kclu2.PO <- table(kclu2$Subjects$Cluster, Tadd$Playoff)
kclu2.W <- tapply(Tbox$W, kclu2$Subjects$Cluster, mean)

Xbar <- data.frame(cluster=c(1:5), N=kclu2.PO[,1],
                   Y=kclu2.PO[,2], W=kclu2.W)
barline(data=Xbar, id="cluster", bars=c("N","Y"),
        labels.bars=c("Playoff: NO","Playoff: YES"),
        line="W", label.line="average wins",
        decreasing=FALSE)

cluster <- as.factor(kclu2$Subjects$Cluster)
Xbubble <- data.frame(Team=Tbox$Team, PTS=Tbox$PTS,
                      PTS.Opp=Obox$PTS, cluster,
                      W=Tbox$W)
labs <- c("PTS", "PTS.Opp", "cluster", "Wins")
bubbleplot(Xbubble, id="Team", x="PTS", y="PTS.Opp",
           col="cluster", size="W", labels=labs)

4.2.1 k-means clustering of Golden State Warriors shots

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

shots <- subset(PbP,
                !is.na(PbP$shot_distance) &
                PbP$team=="GSW")
shots <- dplyr::mutate_if(shots, is.factor, droplevels)

attach(shots)
data <- data.frame(PTS=points, DIST=shot_distance,
                   TIMEQ=periodTime, PL=playlength)
detach(shots)
#RNGkind(sample.kind="Rounding")
set.seed(1)
kclu1 <- kclustering(data, algorithm="MacQueen",
                     nclumax=15, iter.max=500)
plot(kclu1)

set.seed(1)
kclu2 <- kclustering(data, algorithm="MacQueen",
                     iter.max=500, k=6)
plot(kclu2)

cluster <- as.factor(kclu2$Subjects$Cluster)
shots <- data.frame(shots, cluster)
shots$xx <- shots$original_x/10
shots$yy <- shots$original_y/10 - 41.75

no.clu <- 6
p1 <- p2 <- vector(no.clu, mode="list")
for (k in 1:no.clu) {
     shots.k <- subset(shots,cluster==k)
     p1[[k]] <- shotchart(data=shots.k, x="xx", y="yy",
                          z="result", type=NULL,
                          scatter = TRUE,
                          drop.levels=FALSE)
p2[[k]] <- shotchart(data=shots.k, x="xx", y="yy",
                     z="periodTime",
                     col.limits=c(0,720),
                     result="result", num.sect=5,
                     type="sectors", scatter=FALSE)
     }

library(gridExtra)
grid.arrange(grobs=p1, nrow=3)

grid.arrange(grobs=p2, nrow=3)

shots.pl <- table(shots$player, shots$cluster)
Xineq <- as.data.frame.matrix(shots.pl)

no.clu <- 6
p <- vector(no.clu, mode="list")
for (k in 1:no.clu) {
     ineqC <- inequality(Xineq[,k], npl=nrow(Xineq))
     title <- paste("Cluster", k)
     p[[k]] <- plot(ineqC, title=title)
}

library(gridExtra)
grid.arrange(grobs=p, nrow=3)

shots.perc <- shots.pl/rowSums(shots.pl)
Xbar <- data.frame(player=rownames(shots.pl),
                   rbind(shots.perc),
                   FGA=rowSums(shots.pl))
labclusters <- c("Cluster 1","Cluster 2","Cluster 3",
                 "Cluster 4","Cluster 5","Cluster 6")

barline(data=Xbar, id="player", line="FGA",
        bars=c("X1","X2","X3","X4","X5","X6"),
        order.by="FGA", label.line="Field goals attempted",
        labels.bars=labclusters)

4.2.3 Hierarchical clustering of NBA players

rm(list=ls())

attach(Pbox)
data <- data.frame(PTS, P3M, REB=OREB+DREB,
                   AST, TOV, STL, BLK, PF)
detach(Pbox)

data <- subset(data, Pbox$MIN>=1500)
ID <- Pbox$Player[Pbox$MIN>=1500]

hclu1 <- hclustering(data)
plot(hclu1)

hclu2 <- hclustering(data, labels=ID, k=9)
plot(hclu2, profiles=TRUE)

plot(hclu2, rect=TRUE, colored.branches=TRUE,
     cex.labels=0.5)

## Warning in par(oldmar): argument 1 does not name a graphical parameter

Pbox.subset <- subset(Pbox, MIN>=1500)
MIN <- Pbox.subset$MIN
X <- data.frame(hclu2$Subjects, scale(data), MIN)

dvar <- c("PTS","P3M","REB","AST",
          "TOV","STL","BLK","PF")
svar <- "MIN"
yRange <- range(X[,dvar])
sizeRange <- c(1500, 3300)
no.clu <- 9
p <- vector(no.clu, mode="list")
for (k in 1:no.clu) {
     XC <- subset(X, Cluster==k)
     vrb <- variability(XC[,3:11], data.var=dvar,
                        size.var=svar, weight=FALSE,
                        VC=FALSE)
     title <- paste("Cluster", k)
     p[[k]] <- plot(vrb, size.lim=sizeRange, ylim=yRange,
               title=title, leg.pos=c(0,1),
               leg.just=c(-0.5,0),
               leg.box="vertical",
               leg.brk=seq(1500,3000,500),
               leg.title.pos="left", leg.nrow=1,
               max.circle=7)
     }
library(gridExtra)
grid.arrange(grobs=p, ncol=3)

CHAPTER 5 Modeling Relationships in Data

The chapter discusses the concept of statistical modeling as a way to approximate the mechanisms or rules that govern the functioning of phenomena.

5.1 LINEAR MODELS

This part discusses the most traditional statistical models in the Data Modeling Culture, which is a linear combination of the predictors. It also discusses the concept of multiple linear regression, which can be used when the outcome is a numerical variable for which we can assume a Gaussian distribution.
A flexible generalization of the multiple linear regression model allows for an outcome variable with a probability distribution other than Gaussian, provided it belongs to the exponential family.

5.1.1 Simple linear regression model

rm(list=ls())

Pbox.sel <- subset(Pbox, MIN>=500)
attach(Pbox.sel)
X <- AST/MIN
Y <- TOV/MIN
Pl <- Player
detach(Pbox.sel)
out <- simplereg(x=X, y=Y, type="lin")
xtitle <- "AST per minute"
ytitle <- "TOV per minute"
plot(out, xtitle=xtitle, ytitle=ytitle)

selp <- which(Pl=="Damian Lillard")
plot(out, labels=Pl, subset=selp, xtitle=xtitle,
     ytitle=ytitle)

plot(out, labels=Pl, subset="quant",
     Lx=0, Ux=0.97, Ly=0, Uy=0.97,
     xtitle=xtitle, ytitle=ytitle)

5.2 NONPARAMETRIC REGRESSION

This chapter discusses the concept of nonparametric regression, which is typically performed by means of smoothing techniques, such as kernel smoothing, k-nearest neighbor estimates, and spline smoothing. The chapter also provides examples where nonparametric regression is carried out on basketball data using a polynomial local regression technique and Nadaraya-Watson Gaussian kernel smoothing.

5.2.1 Polynomial local regression

rm(list=ls())

Pbox.sel <- subset(Pbox, MIN>=500)
attach(Pbox.sel)
X <- (DREB+OREB)/MIN
Y <- P3M/MIN
Pl <- Player
detach(Pbox.sel)

out <- simplereg(x=X, y=Y, type="lin")
xtitle <- "REB per minute"
ytitle <- "P3M per minute"
plot(out, xtitle=xtitle, ytitle=ytitle)

out <- simplereg(x=X, y=Y, type="pol")
plot(out, labels=Pl, subset="quant",
     Lx=0, Ux=0.90, Ly=0, Uy=0.95,
     xtitle=xtitle, ytitle=ytitle)

## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

5.2.2 Gaussian kernel smoothing

rm(list=ls())

data <- subset(Pbox, MIN>=500)
attach(data)
X <- data.frame(PTS, P3M, P2M, REB=OREB+DREB, AST)/MIN
detach(data)

scatterplot(X, data.var=1:5,
lower=list(continuous="smooth_loess"),
diag=list(continuous="barDiag"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.2.2.1 Estimation of scoring probability

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP.GSW <- subset(PbP, team=="GSW" & result!="")
p1 <- scoringprob(data=PbP.GSW, shot.type="3P",
                  var="playlength")
p2 <- scoringprob(data=PbP.GSW, shot.type="3P",
                  var="periodTime", bw=300)
library(gridExtra)
grid.arrange(p1, p2, ncol=2)

pl1 <- c("Kevin Durant","Draymond Green","Klay Thompson")
p1 <- scoringprob(data=PbP.GSW, shot.type="2P",
players=pl1, var="shot_distance",
col.team="gray")
pl2 <- c("Kevin Durant","Draymond Green")
p2 <- scoringprob(data=PbP.GSW, shot.type="2P",
players=pl2, var="totalTime", bw=1500,
col.team="gray")
library(gridExtra)
grid.arrange(p1, p2, ncol=2)

5.2.2.2 Estimation of expected points

rm(list=ls())
PbP <- PbPmanipulation(PbP.BDB)

PbP.GSW <- subset(PbP, team=="GSW")
pl <- c("Stephen Curry","Kevin Durant")
mypal <- colorRampPalette(c("red","green"))
expectedpts(data=PbP.GSW, players=pl,
            col.team="gray", palette=mypal,
            col.hline="gray")

Pbox.GSW <- subset(Pbox, PTS>=500 &
                   Team=="Golden State Warriors")
pl <- Pbox.GSW$Player
mypal <- colorRampPalette(c("red","green"))
expectedpts(data=PbP.GSW, players=pl,
            col.team="gray", palette=mypal,
            col.hline="gray")

expectedpts(data=PbP.GSW, bw=300, players=pl,
            col.team="gray", palette=mypal,
            col.hline="gray", var="periodTime",
            xlab="Period time")

rm(list=ls()) 
PbP <- PbPmanipulation(PbP.BDB) 
 
top <- subset(Tadd, Playoff=="Y" & team!="GSW")$team
bot <- subset(Tadd, Playoff=="N")$team 
 
bot_top <- function(X, k) {
           dts <- subset(subset(X, oppTeam %in% base::get(k)),
                         team=="GSW")
           dts$player <- paste(dts$player, k)
           return(dts)
           }

PbP.GSW <- base::rbind(bot_top(PbP, "top"), bot_top(PbP, "bot"))
pl <- c("Stephen Curry top","Stephen Curry bot",
        "Kevin Durant top", "Kevin Durant bot")
mypal <- colorRampPalette(c("red","green"))
expectedpts(data=PbP.GSW, bw=1200, players=pl,
            col.team="gray", palette=mypal,
            col.hline="gray", var="totalTime",
            xlab="Total time", x.range=NULL)

CHAPTER 6: The R package BasketballAnalyzeR

This chapter discusses the open-source package for the statistical language R, designed for the analysis and visualization of basketball data.

6.2 PREPARING DATA

This section discusses how to format the input datasets according to the structure required by the functions in the package.

rm(list=ls())

dts <- read.csv(file="2012-18_teamBoxScore.csv")
dts$gmDate <- as.Date(as.character(dts$gmDate))
year <- as.numeric(format(dts$gmDate,"%Y"))
month <- as.numeric(format(dts$gmDate,"%m"))
dts$season <- ifelse(month<5, paste0(year-1,"-",year),
                              paste0(year,"-",year+1))

library(dplyr)
Tbox2 <- dts %>%
  group_by(season, teamAbbr) %>%
  summarise(GP=n(), MIN=sum(round(teamMin/5)),
    PTS=sum(teamPTS),
    W=sum(teamRslt=="Win"), L=sum(teamRslt=="Loss"),
    P2M=sum(team2PM), P2A=sum(team2PA), P2p=P2M/P2A,
    P3M=sum(team3PM), P3A=sum(team3PA), P3p=P3M/P3A,
    FTM=sum(teamFTM), FTA=sum(teamFTA), FTp=FTM/FTA,
    OREB=sum(teamORB), DREB=sum(teamDRB), AST=sum(teamAST),
    TOV=sum(teamTO), STL=sum(teamSTL), BLK=sum(teamBLK),
    PF=sum(teamPF), PM=sum(teamPTS-opptPTS)) %>%
  rename(Season=season, Team=teamAbbr) %>%
  as.data.frame()

Obox2 <- dts %>%
  group_by(season, teamAbbr) %>%
  summarise(GP=n(), MIN=sum(round(opptMin/5)),
    PTS=sum(opptPTS),
    W=sum(opptRslt=="Win"), L=sum(opptRslt=="Loss"),
    P2M=sum(oppt2PM), P2A=sum(oppt2PA), P2p=100*P2M/P2A,
    P3M=sum(oppt3PM), P3A=sum(oppt3PA), P3p=100*P3M/P3A,
    FTM=sum(opptFTM), FTA=sum(opptFTA), FTp=100*FTM/FTA,
    OREB=sum(opptORB), DREB=sum(opptDRB), AST=sum(opptAST),
    TOV=sum(opptTO), STL=sum(opptSTL), BLK=sum(opptBLK),
    PF=sum(opptPF), PM=sum(teamPTS-opptPTS)) %>%
  rename(Season=season, Team=teamAbbr) %>%
  as.data.frame()

dts <- read.csv(file="2012-18_playerBoxScore.csv",
                encoding="UTF-8")
dts$gmDate <- as.Date(as.character(dts$gmDate))
year <- as.numeric(format(dts$gmDate,"%Y"))
month <- as.numeric(format(dts$gmDate,"%m"))
dts$season <- ifelse(month<5, paste0(year-1,"-",year),
                              paste0(year,"-",year+1))
Pbox2 <- dts %>%
  group_by(season, teamAbbr, playDispNm) %>%
  summarise(GP=n(), MIN=sum(playMin), PTS=sum(playPTS),
    P2M=sum(play2PM), P2A=sum(play2PA), P2p=100*P2M/P2A,
    P3M=sum(play3PM), P3A=sum(play3PA), P3p=100*P3M/P3A,
    FTM=sum(playFTM), FTA=sum(playFTA), FTp=100*FTM/FTA,
    OREB=sum(playORB), DREB=sum(playDRB), AST=sum(playAST),
    TOV=sum(playTO), STL=sum(playSTL), BLK=sum(playBLK),
    PF=sum(playPF)) %>%
  rename(Season=season, Team=teamAbbr,
         Player=playDispNm) %>%
  as.data.frame()

6.3 CUSTOMIZING PLOTS

This section discusses how to customize the plots generated by the package.

rm(list=ls())

Pbox.sel <- subset(Pbox, MIN>=500)
attach(Pbox.sel)
X <- data.frame(AST, TOV, PTSpm=PTS)/MIN
detach(Pbox.sel)
mypal <- colorRampPalette(c("blue","yellow","red"))
p1 <- scatterplot(X, data.var=c("AST","TOV"),
                  z.var="PTSpm", palette=mypal)
print(p1)

class(p1)

## [1] "gg"     "ggplot"

p2 <- p1 +
      labs(title="Scatter plot", x="Assists",
           y="Turnovers") +
      scale_x_continuous(breaks=seq(0,0.35,0.05),
                         limits=c(0,0.35)) +
      theme(panel.background=element_rect(fill="#FFCCCC20",
            colour="red", size=3)) +
      guides(color=FALSE)

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p2)

p3 <- p1 +
      geom_segment(x=0.225, y=0.025, xend=X$AST[143]+0.005,
                   yend=X$TOV[143]-0.001, size=1,
                   color="red",
                   arrow=arrow(length=unit(0.25, "cm"),
                   type="closed", angle=20)) +
annotate("text", x=0.225, y=0.025,
        label=Pbox.sel[143,"Player"],
        color="red", fontface=2, hjust=0)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p3)

p3 + geom_rect(xmin=0.2, ymin=0.075,
               xmax=Inf, ymax=Inf,
               fill="#DDDDDDAA", color=NA)

p3$layers <- c(geom_rect(xmin=0.2, ymin=0.075,
                         xmax=Inf, ymax=Inf,
                         fill="#DDDDDDAA", color=NA),
               p3$layers)
print(p3)

library(cowplot)
ggdraw() +
  draw_plot(p1) +
  draw_plot(p2, x=0.55, y=0.06, width=0.3, height=0.325)

q1 <- ggplot_build(p1)
q1$data[[1]]$shape <- 17
q1$data[[1]]$size <- 3
p1b <- ggplot_gtable(q1)
plot(p1b)

str(q1$data[[1]])

## 'data.frame':    361 obs. of  11 variables:
##  $ colour: chr  "#E1E11D" "#FFAA00" "#DFDF1F" "#C7C737" ...
##  $ x     : num  0.0869 0.2007 0.1274 0.0549 0.0584 ...
##  $ y     : num  0.0775 0.0881 0.0878 0.0588 0.0558 ...
##  $ text  : chr  "AST: 0.0868506493506493<br>TOV: 0.0775162337662338<br>PTSpm: 0.469967532467532" "AST: 0.200673724735322<br>TOV: 0.088065447545717<br>PTSpm: 0.626082771896054" "AST: 0.127445500279486<br>TOV: 0.0877585243152599<br>PTSpm: 0.467300167691448" "AST: 0.0549019607843137<br>TOV: 0.0588235294117647<br>PTSpm: 0.435294117647059" ...
##  $ PANEL : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ group : int  218 339 276 103 120 216 51 324 74 311 ...
##   ..- attr(*, "n")= int 361
##  $ shape : num  17 17 17 17 17 17 17 17 17 17 ...
##  $ size  : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ fill  : logi  NA NA NA NA NA NA ...
##  $ alpha : logi  NA NA NA NA NA NA ...
##  $ stroke: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

6.4 BUILDING INTERACTIVE GRAPHICS

This section discusses how to build interactive graphics using the package.

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:igraph':
## 
##     groups

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Pbox.sel <- subset(Pbox, MIN>=500)
attach(Pbox.sel)
X <- data.frame(AST,TOV, PTSpm=PTS)/MIN
detach(Pbox.sel)
mypal <- colorRampPalette(c("blue","yellow","red"))
p5 <- scatterplot(X, data.var=c("AST","TOV"),
                  z.var="PTSpm", palette=mypal)
ggplotly(p5, tooltip="text")

data <- Pbox[1:64, c("PTS","P3M","P2M","OREB","Team")]
p6 <- scatterplot(data, data.var=1:4, z.var="Team")
ggplotly(p6)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Can only have one: highlight

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Can only have one: highlight

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Can only have one: highlight

Basketball Data Science with Applications in R

R-Markdown file authored by Emmanuel Nartey

2023-05-15