NBA Data Exploration

Author: Jim Hester Created: 2013 Apr 02 03:17:40 PM Last Modified: 2013 Aug 13 03:15:43 PM

Read NBA Data

Team Data

teams = read.csv('data/teams.csv', check.names=F)
opp = read.csv('data/opp.csv', check.names=F)
misc = read.csv('data/misc.csv', check.names=F)
wl = read.csv('data/wl.csv', check.names=F)
buildings = read.csv('data/buildings.csv', check.names=F)
#add defense suffix to the four factors so they do not have the same name, have to specify by number.
names(misc)[16:19] = paste(names(misc)[16:19], '_Def', sep='')
#merge the datasets
team_stats = merge(merge(merge(merge(teams, opp, by='Team', suffixes=c('', '_Opp'),
                                     all.x=T), by=c('Team'), misc, all.x=T),
                         wl, all.x=T), buildings, all.x=T)
names(team_stats) = gsub('3P', 'Three_Point', names(team_stats))
save(team_stats, file='data/team_stats.RData')

Player Data

Compare the first seasons of Bradley Beal and Dion Waiters, both shooting guards picked with the 3rd and 4th picks in 2012. Also get data for Lebron James, to serve as a positive control dataset of exceptional performance.

library(stringr)
library(lubridate)
beal = read.csv('data/beal.csv', check.names=F)
beal$Name = 'Bradley Beal'
beal$BDay = ymd('1993-06-28')
waiters = read.csv('data/waiters.csv', check.names=F)
waiters$Name = 'Dion Waiters'
waiters$BDay = ymd('1991-12-10')
james = read.csv('data/james.csv', check.names=F)
james$Name = 'Lebron James'
james$BDay = ymd('1984-12-30')
players = rbind(beal, waiters, james)
#location to be home and away
location_map = c('Away', 'Home')
names(location_map) = c('', '@')
players$Location = location_map[players$Location]
#split result into two, the actual win or loss, and margin of victory
result_split = str_split_fixed(players$Result, ' ', n=2)
players$Result = result_split[,1]
#remove paranthesis around margin of victory
players$MOV = as.numeric(gsub('[()+]', '', result_split[,2]))
#convert minutes played to seconds played
players$MP = ms(players$MP)
players$seconds = players$MP %/% seconds(1)
#Basketball reference does not accurately take into account leap days for their
#age calcuations.  So do it ourselves based on the players birthdays, see the
#table with three different birthdays for evidence.
players$Date = ymd(as.character(players$Date))
age_split = str_split_fixed(players$Age, '-', n=2)
#table(players$Date - years(age_split[,1]) - days(age_split[,2]))
players$Age = players$BDay %--% players$Date
#Get long name for players team
players$Team = save(players, file='data/players.RData')

Author: Jim Hester Created: 2013 Jul 02 01:30:16 PM Last Modified: 2013 Aug 13 03:23:37 PM

Exploratory Plots

library(ggplot2)
colors = read.table('data/colors',fill=T, header=T, comment.char='')
team_abb_long2short = c('Atlanta Hawks' = 'ATL',
                        'Boston Celtics' = 'BOS',
                        'Brooklyn Nets' = 'BKN',
                        'Charlotte Bobcats' = 'CHA',
                        'Chicago Bulls' = 'CHI',
                        'Cleveland Cavaliers' = 'CLE',
                        'Dallas Mavericks' = 'DAL',
                        'Denver Nuggets' = 'DEN',
                        'Detroit Pistons' = 'DET',
                        'Golden State Warriors' = 'GSW',
                        'Houston Rockets' = 'HOU',
                        'Indiana Pacers' = 'IND',
                        'Los Angeles Clippers' = 'LAC',
                        'Los Angeles Lakers' = 'LAL',
                        'Memphis Grizzlies' = 'MEM',
                        'Miami Heat' = 'MIA',
                        'Milwaukee Bucks' = 'MIL',
                        'Minnesota Timberwolves' = 'MIN',
                        'New Orleans Hornets' = 'NOH',
                        'New York Knicks' = 'NYK',
                        'Oklahoma City Thunder' = 'OKC',
                        'Orlando Magic' = 'ORL',
                        'Philadelphia 76ers' = 'PHI',
                        'Phoenix Suns' = 'PHX',
                        'Phoenix Suns' = 'PHO',
                        'Portland Trail Blazers' = 'POR',
                        'Sacramento Kings' = 'SAC',
                        'San Antonio Spurs' = 'SAS',
                        'Toronto Raptors' = 'TOR',
                        'Utah Jazz' = 'UTA',
                        'Washington Wizards' = 'WAS')
flip_named_vector = function(x){
  ret = names(x)
  names(ret) = x
  ret
}
team_abb_short2long = flip_named_vector(team_abb_long2short)
color1 = as.character(colors$Color1)
names(color1) = team_abb_short2long[as.character(colors$Team)]
color2 = as.character(colors$Color2)
names(color2) = team_abb_short2long[as.character(colors$Team)]
x_angle <- theme(axis.text.x = element_text(size = 7,
      hjust = 1,
      vjust = 1,
      angle = 30))
colors = list(aes(fill=Team, color=Team),
              scale_fill_manual(values=color1),
              scale_color_manual(values=color2),
              theme(legend.position="none"))
point_color = list(geom_point(pch=21, size=5), colors, geom_dl(method='smart.grid2', aes(label=Team)))

Black magic modifying the grid Grob to increase the line width of the points so that they are easily seen.

library(grid)
# gp is the parameter to edit the graphics parameters
edit_par = function(x, ..., geom_regex='geom.*'){
  g = ggplotGrob(x)
  panel_indexes = grep('panel', g$layout$name)
  for(panel_index in panel_indexes){
    g$grobs[[panel_index]] = editGrob(g$grobs[[panel_index]],geom_regex, grep=T, global=T, ...)
  }
  g
}
plot_scatter = function(x){
  grid.newpage()
  grid.draw(make_scatter(x))
}
make_scatter = function(x){
  edit_par(x, geom_regex='geom_point.*', gp=gpar(lwd=4))
}

Points

Points Per Game

load('data/team_stats.RData')
#bar plot of team Points per game
ppg = ggplot(team_stats, aes(x=reorder(Team, `PTS/G`), y=`PTS/G`)) +
  geom_bar(size=2, width=.5, stat='identity') + colors + x_angle +
  labs(x=NULL, y='Points/Game')
ppg

ppg + facet_wrap(~Conference, scales='free_x')

Points Per Possession

Points per game is not the best measurement of scoring efficiency, as it does not take into account pace, so plot points per possession instead.

team_stats = mutate(team_stats, `Points/Possession`=`PTS/G`/Pace)
ppp = ppg %+% team_stats + aes(x=reorder(Team, `Points/Possession`), y=`Points/Possession`) +
  labs(x=NULL)
ppp

ppp + facet_wrap(~Conference, scales='free_x')

Wins Vs Attendance

Building capacities from wikipedia. Note these attendance figures seem to include playoff attendance, so playoff teams have additional games to increase their attendance figures.

library(plyr)
library(scales)
smart.grid2 = list('calc.boxes', 'empty.grid')
team_stats$`%Capacity` = (team_stats$Attendance) / (team_stats$Capacity * 82)
att = ggplot(team_stats, aes(x=`%Capacity`, y=Wins)) +
  geom_smooth(aes(group=1), method=lm, se=FALSE, color="grey") + point_color +
  geom_dl(aes(label=Team), method='smart.grid2') +
  scale_x_continuous(labels=percent) + labs(x='Percent Capacity')
plot_scatter(att)

plot_scatter(att + facet_wrap(~Conference))

There seems to be a clear linear trend with number of games won and the amount of seats filled. Orlando's ticket sales were artificially boosted this year because season tickets for the season were sold before Dwight Howard was traded to the Lakers. Detroit is handicapped by having the largest capacity stadium, as well as the economic downturn hitting that city very hard.

Age Vs Wins

This is an interesting comparison, as it pretty easily shows which teams are under or over performing relative to their experience. In general terms, if a franchise is above the fit line it is on the right track and should continue to do well in the future. However if a franchise is below the fit line it's players are in general performing worse than expected based on their experience.

library(directlabels)
avw = ggplot(team_stats, aes(x=Age, y=Wins)) +
  geom_smooth(aes(group=1), se=FALSE, color="grey") + point_color +
  geom_dl(aes(label=Team), method='smart.grid2') + labs(x='Average Age')
plot_scatter(avw)

plot_scatter(avw + facet_wrap(~Conference))

The runs of Boston, Dallas and the Lakers all seem to be over, and rebuilding their teams seems like the only solution going forward. Houston, Indiana, Denver, Memphis and Oklahoma City all are in great shape in the coming years, forward, they should continue to improve or maintain their quality play. San Antonio, the Clippers, Knicks and Miami all are performing better than their similarity experienced peers, but their windows may be closing soon unless they overhaul their rosters.

Assists Vs Turnovers

#calculate Assists per possession by dividing total assists by games, and adjusting for pace.
team_stats = mutate(team_stats, AST_PG=AST/G, AST_PP=AST_PG/Pace)
at = ggplot(team_stats, aes(x=AST_PP, y=`TOV%`/100)) +
  geom_smooth(aes(group=1), se=FALSE, color="grey", method='lm') + point_color +
  labs(y='Turnover %', x='Assist') + scale_x_continuous(labels=percent) + scale_y_continuous(labels=percent)
plot_scatter(at)

#facet by pace
quantiles = seq(0,1,.25)
quantile_names = function(x){
  names = vector('character', length=length(x)-1)
  for(i in 2:length(x)){
    names[i-1]=paste(x[i-1],x[i], sep='-')
  }
  names
}
plot_scatter(at %+%
               mutate(team_stats,
                      Pace_class =
                        cut(Pace, quantile(Pace, quantiles),
                            labels=quantile_names(percent(quantiles)),
                            include.lowest=TRUE)) + facet_wrap(~Pace_class))

Assists seem to have a positive correlation to turnovers. There also seems to be some effect of Pace as well, although interestingly the top 25% fastest teams have a lower turnover rate than the 50%-75% teams. Clear outliers include the Rockets, with a very high turnover percentage, likely due to the very fast pace they play at. The Knicks, with a very low turnover percentage, but also a very slow pace, and very low assist percentage, likely due to the many jump shooting, shoot first players on their team. The 76ers are an interesting outlier as well, with a very low turnover rate for their assist percentage.

Principle Component Analysis

Principle component analysis is an unsupervised clustering method often used to reduce the dimensionality of highly dimensional data. It can be thought of as revealing the internal structure of the data in way that best explains the variance observed.

#note need to scale the data so that all the observations are on the same scale
team_numeric = scale(subset(team_stats, select=c(FG, FGA, Three_Point, Three_PointA, FT, FTA, ORB,
                                        DRB, AST, STL, BLK, TOV, PF, PTS,
                                        FG_Opp, FGA_Opp, Three_Point_Opp, Three_PointA_Opp,
                                        FT_Opp, FTA_Opp, ORB_Opp, DRB_Opp,
                                        AST_Opp, STL_Opp, BLK_Opp, TOV_Opp,
                                        PF_Opp, PTS_Opp, Age)), center=T, scale=T)
rownames(team_numeric) = team_stats$Team
pc = prcomp(team_numeric)
team_stats_pc = merge(team_stats, pc$x, by.x='Team', by.y='row.names')
library(gridExtra)
p1 = make_scatter(ggplot(team_stats_pc, aes(x=PC1, y=PC2)) + point_color)
p2 = make_scatter(ggplot(team_stats_pc, aes(x=Pace, y=Wins)) + point_color + scale_y_reverse())
grid.arrange(p1, p2, ncol=2)

The first principle component seems to be stratified by pace, and the second by wins. PC1 aligning with pace is only slightly surprising, as all of the teams totals will be higher with an increased pace. PC2 aligning with wins is quite surprising however, although PTS and PTS_Opp could be proxy values for total wins. Still it is a surprise to me. The first plot has PC1 vs PC2 next to Pace vs Wins for easy comparison. Note the Y axis is reversed in the Pace vs Wins to line up with the PC plot.

These plots are PC1 vs Pace and PC2 vs Wins, to show the correlation between them.

plot_scatter(ggplot(team_stats_pc, aes(x=PC1, y=Pace, group=1)) + point_color + geom_smooth(method=lm, se=F, color='grey'))

plot_scatter(ggplot(team_stats_pc, aes(x=PC2, y=Wins, group=1)) + point_color + geom_smooth(method=lm, se=F, color='grey'))

Random Forest Analysis

Use RandomForest to perform a regression on wins to get an idea of which statistics are the most important to predict wins. The dotted line is the absolute value of the lowest variable. A good rule of thumb for which variables are significant, following the idea that random variables will be randomly distributed around 0.

Note that this is a very small number of observations as it is only one season of team data, but you could do the same approach with more seasons to get more confidence.

library(randomForest)
set.seed(Sys.time())
formula = as.formula(Wins ~ FG + FGA + Three_Point + Three_PointA + FT + FTA +
  ORB + DRB + AST + STL + BLK + TOV + PF + PTS + FG_Opp + FGA_Opp +
  Three_Point_Opp + Three_PointA_Opp + FT_Opp + FTA_Opp + ORB_Opp + DRB_Opp +
  AST_Opp + STL_Opp + BLK_Opp + TOV_Opp + PF_Opp + PTS_Opp + Age)
rf = randomForest(formula, data=team_stats, mtry=5, ntree=10000, importance=TRUE)
imp = data.frame(type=rownames(imp), importance(rf), check.names=F)
imp$type = reorder(imp$type, imp$`%IncMSE`)
ggplot(data=imp, aes(x=type, y=`%IncMSE`)) + geom_bar(stat='identity') + geom_hline(yintercept=abs(min(imp$`%IncMSE`)), col=2, linetype='dashed') + coord_flip()

It is interesting to me that the Opponents Assist amount ranks so highly, I would think that assists must be a good proxy for open looks at the basket, so if you have a large number of opponent assists your defense must be very bad. Also interesting is the large importance of steals, contrasted with the negative importance of blocks, this is likely due to the fact that steals always result in a turnover, while blocks often do not, especially with the tendency for big men to block balls into the stands rather than trying to get possession.

Player Comparison

See basketball-reference for the Game Score definition; It is a rough measure of productivity for a individual game.

load('data/players.RData')
players$Team = team_abb_short2long[as.character(players$Tm)]
player_color = function(player, color){
  color_player = color[ player$Team ]
  names(color_player) = player$Name
  color_player
}
color1_player = player_color(players, color1)
color2_player = player_color(players, color2)
colors_players =
  list(aes(fill=Name, color=Name),
              scale_fill_manual(values=color1_player),
              scale_color_manual(values=color2_player))
#overall gamescore
ggplot(players, aes(y=GmSc, x=Name, color=Name)) +
  geom_boxplot(outlier.colour=NA) +
  geom_jitter(position=position_jitter(width=.25)) + colors_players +
  theme(legend.position='none') + labs(x=NULL, y='Game Score')

The boxplots show that Waiters is somewhat more consistent than Beal, but their median game last season was essentially the same in terms of overall productivity. Waiters did have a few outlier games when he did both better and worse than Beal though. James is clearly a much better player than either of the rookies, even his worst games are almost above the 75 percentile of Beal and Waiters.

ggplot(players, aes(y=GmSc, x=Result)) + geom_boxplot(outlier.colour=NA) +
  geom_jitter(position=position_jitter(width=.25)) + facet_wrap(~Name) +
  colors_players + theme(legend.position='none') + labs(y='Game Score')

The same points separated into games in which the team won and lost. All three players seem to perform better in wins than in losses, although it is tough to make too many conclusions as the number of games in each category is not the same.

ggplot(players, aes(x=GmSc, y=MOV)) + geom_point() +
  facet_wrap(~Name) + geom_smooth() + colors_players +
  theme(legend.position='none') + labs(x='Game Score', y='Margin of Victory')

Rather than using discrete wins and losses, plot game score against continuous margin of victory.

#gamescores over time
ggplot(players, aes(y=GmSc, x=Date, color=Name, fill=Name)) + geom_point() + geom_smooth() + colors_players + theme(legend.position='bottom')

These are the same points as the previous graphs, just plotted over the season rather than as a boxplot. You can see that Waiters started off stronger in comparison to Beal, however Beal improved faster as the season progressed until he was injured. Perhaps the mid season return of John Wall helped his improvement, while the injury troubles (Varejao etc) of the Cavs hurt Waiters.

Author: Jim Hester Created: 2013 Jul 02 01:30:16 PM Last Modified: 2013 Aug 03 07:43:47 PM