Today is March 5, 2015. We are 60 games, or about 73%, through the regular season. While team trends can still change as the season nears its end, the available data can help create a snapshot of which teams are best positioned to make the NBA finals.
There are two separate datasets being analyzed. The first comes from basketball-reference.com (http://bit.ly/1AMgKJR) and contains every team and its associated stats from the 1970 season through the present season (2014-15). Some of these stats include total points scored, assists, turnovers, rebounds, effective field goal percentage, offensive rating and defensive rating. The other data file is a simple CSV file containing each year and the corresponding finals teams.
#load libraries
library(ggplot2)
library(ggthemes)
library(dplyr)
library(knitr)
# Load every basketball-reference team export into one data frame (`nba`).
# Directory where the csv files are located. Files are read through their
# full paths, so the working directory is left untouched (no setwd()).
data_dir <- "/Users/pingelmo/Dropbox/data/nba/teams"
# assign files to list
files_list <- list.files(data_dir, full.names = TRUE)
# Fail early with a clear message instead of a cryptic error further down
if (length(files_list) == 0) {
  stop("No team files found in ", data_dir, call. = FALSE)
}
# Read each csv once, then row bind in a single call (avoids re-copying the
# accumulated data frame on every loop iteration). The first two lines of each
# file are skipped before the header row is read.
season_tables <- lapply(
  files_list,
  read.table,
  header = TRUE, sep = ",", skip = 2, stringsAsFactors = FALSE
)
nba <- do.call(rbind, season_tables)
# remove rows not containing team data (rows whose Season field is the
# literal text "Season")
nba <- nba[nba$Season != "Season", ]
rownames(nba) <- NULL
# convert characters to numeric (column 1 and columns 5 through 40)
numeric_cols <- c(1, 5:40)
nba[numeric_cols] <- lapply(nba[numeric_cols], as.numeric)
# convert character to factor
nba$Season <- as.factor(nba$Season)
# Keep only the first three characters of column 3 as the team code, which
# also removes the asterisk from the Team Name
nba$Tm <- substr(nba[, 3], 1, 3)
# season end: first four characters of column 2 (the starting year) plus one
nba$season_end <- as.numeric(substr(nba[, 2], 1, 4)) + 1
# Favour fixed notation and four significant digits in printed output
options("scipen" = 100, "digits" = 4)
# Flag the teams that reached the NBA finals.
# load NBA finals data
finals <- read.csv("/Users/pingelmo/Dropbox/data/nba/finals.csv", header = TRUE)
# create new binary finals variable: every team listed in the file gets a 1
finals$finals <- 1
# rename to match NBA dataset (positional: the first two columns are assumed
# to be the season and the team code -- confirm against finals.csv)
names(finals) <- c("Season", "Tm", "finals")
# Join on explicit keys rather than relying on whichever column names the two
# tables happen to share
nba <- left_join(nba, finals, by = c("Season", "Tm"))
# Teams without a match come back as NA; turn the flag into a clean 0/1
nba$finals <- as.numeric(!is.na(nba$finals))
# Rename fields by position in one assignment: column 8 becomes the win
# percentage, columns 33-36 the team rate stats and columns 37-40 their
# opponent ("opp_") counterparts.
names(nba)[c(8, 33:40)] <- c(
  "win_pct",
  "eFG", "TOV_rate", "ORB_rate", "FT_FGA",
  "opp_eFG", "opp_TOV", "opp_ORB", "opp_FT_FGA"
)
# Replace missing two- and three-point makes/attempts with 0
shot_cols <- c("X2P", "X2PA", "X3P", "X3PA")
nba[shot_cols] <- lapply(
  nba[shot_cols],
  function(shot_col) replace(shot_col, is.na(shot_col), 0)
)
# Derive net rating plus shooting and ball-handling ratios in a single step.
# Ratios whose numerator and denominator are both 0 evaluate to NaN.
nba <- nba %>%
  mutate(
    netRtg = ORtg - DRtg,
    fg_per = FG / FGA,
    x2p_per = X2P / X2PA,
    x3p_per = X3P / X3PA,
    ft_per = FT / FTA,
    ast_turn = AST / TOV,
    pts_per_fga = PTS / FGA
  )
Now that the data has been loaded and cleaned, we can begin to explore it and build some interesting models that can help us determine estimated Wins and the probability of reaching the NBA championship.
I hypothesized that Offensive Rating, Defensive Rating, Wins, Offensive Rebounds, Assists and Effective Field Goal Percentage were most responsible for determining whether a team reached the finals. Before we get into some regressions to test that hypothesis, let’s plot some historical data to see where past finals teams have landed compared to their peers that season.
In the charts below, the larger, light blue dots represent the teams that reached the NBA finals each season. The two most recent lockout-shortened seasons (1998-99, 2011-12) fall well below the Loess regression line whenever the plotted metric is a season total (such as wins) rather than an efficiency metric.
# Shared recipe for the season-by-season scatter plots: points sized and
# coloured by the `finals` flag, a smoothed trend line, the FiveThirtyEight
# theme without a legend, and a text note placed at (note_x, note_y).
#   mapping     - aes() mapping for the plot
#   title       - plot title
#   y_label     - y axis label
#   note_x/y    - position of the "Made the NBA Finals" note
#   title_style - element_text() used for the plot title
plot_finals_metric <- function(mapping, title, y_label, note_x, note_y,
                               title_style = element_text(size = rel(1.6))) {
  ggplot(nba, mapping) +
    geom_point() +
    geom_smooth() +
    scale_size_continuous(range = c(1, 4)) +
    ggtitle(title) +
    theme_fivethirtyeight() +
    theme(legend.position = "none", plot.title = title_style) +
    xlab("Season End") +
    ylab(y_label) +
    annotate(
      "text",
      x = note_x, y = note_y,
      label = "Made the NBA Finals", col = "#1F93F2"
    )
}
# Offensive Rating (the only chart that sets hjust = 0 on its title)
plot_finals_metric(
  aes(x = season_end, y = ORtg, size = finals, col = finals),
  title = "Offensive Rating", y_label = "Offensive Rating",
  note_x = 1977, note_y = 116,
  title_style = element_text(size = rel(1.6), hjust = 0)
)
# Defensive Rating - the lower the better
plot_finals_metric(
  aes(x = season_end, y = DRtg, size = finals, col = finals),
  title = "Defensive Rating", y_label = "Defensive Rating",
  note_x = 1977, note_y = 116
)
# Wins
plot_finals_metric(
  aes(x = season_end, y = W, size = finals, col = finals),
  title = "Wins", y_label = "Wins",
  note_x = 1976.6, note_y = 75
)
# Offensive Rebounds
plot_finals_metric(
  aes(x = season_end, y = ORB, size = finals, col = finals),
  title = "Offensive Rebounds", y_label = "ORB",
  note_x = 1976.5, note_y = 1535
)
# Assists
plot_finals_metric(
  aes(x = season_end, y = AST, size = finals, col = finals),
  title = "Assists", y_label = "Assists",
  note_x = 1976.5, note_y = 2600
)
# Effective Field Goal
plot_finals_metric(
  aes(x = season_end, y = eFG, size = finals, col = finals),
  title = "Effective Field Goal", y_label = "eFG",
  note_x = 1976.5, note_y = .57
)
In order to determine the probability from a binary observation (Reach Finals or Not), we must use a logit regression. After running the regression a few times, Wins, Offensive Rating and Defensive Rating proved to be the most statistically significant in determining finals teams.
# Logistic regression of the finals flag on wins and both ratings
finals_fit <- glm(finals ~ W + DRtg + ORtg, data = nba, family = "binomial")
summary(finals_fit)
##
## Call:
## glm(formula = finals ~ W + DRtg + ORtg, family = "binomial",
## data = nba)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.573 -0.291 -0.124 -0.038 3.279
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0415 4.3081 -0.47 0.63559
## W 0.1221 0.0342 3.57 0.00036 ***
## DRtg -0.2780 0.0937 -2.97 0.00301 **
## ORtg 0.2077 0.0905 2.30 0.02168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 563.17 on 1099 degrees of freedom
## Residual deviance: 362.34 on 1096 degrees of freedom
## AIC: 370.3
##
## Number of Fisher Scoring iterations: 7
If we are going to consider wins a major input into our logistic regression, then we must first project how many wins each team will finish with this year. We can do this with a multiple linear regression. I will only use Offensive Rating and Defensive Rating in order to avoid any multicollinearity issues.
# Linear regression of wins on defensive and offensive rating
wins_fit <- lm(W ~ DRtg + ORtg, data = nba)
summary(wins_fit)
##
## Call:
## lm(formula = W ~ DRtg + ORtg, data = nba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.468 -1.866 0.594 2.927 9.612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1652 4.4039 5.71 0.000000014 ***
## DRtg -2.4062 0.0375 -64.19 < 0.0000000000000002 ***
## ORtg 2.5471 0.0357 71.41 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.65 on 1097 degrees of freedom
## Multiple R-squared: 0.862, Adjusted R-squared: 0.862
## F-statistic: 3.42e+03 on 2 and 1097 DF, p-value: <0.0000000000000002
This simple model is very effective at projecting team wins, with an R-squared of 0.86.
Below are the projected wins for every team this season. Unsurprisingly, the Golden State Warriors sit atop the rankings with a projected 66 wins and a gaudy 10.5-point Net Rating (Offensive Rating - Defensive Rating).
# Projected wins for the season ending in 2015, using the rounded coefficients
# 25.165 (intercept), 2.547 (ORtg) and -2.406 (DRtg); best teams first
win_projection <- nba %>%
  filter(season_end == 2015) %>%
  mutate(projected_wins = 25.165 + 2.547 * ORtg - 2.406 * DRtg) %>%
  select(Tm, ORtg, DRtg, projected_wins) %>%
  arrange(desc(projected_wins))
kable(win_projection)
| Tm | ORtg | DRtg | projected_wins |
|---|---|---|---|
| GSW | 111.3 | 100.8 | 66.12 |
| LAC | 112.6 | 106.0 | 56.92 |
| ATL | 108.9 | 102.3 | 56.40 |
| POR | 107.6 | 102.4 | 52.85 |
| DAL | 110.3 | 105.5 | 52.27 |
| MEM | 106.7 | 102.3 | 50.80 |
| TOR | 110.5 | 106.5 | 50.37 |
| CLE | 110.4 | 106.5 | 50.11 |
| HOU | 106.5 | 102.8 | 49.08 |
| SAS | 105.8 | 102.2 | 48.74 |
| CHI | 107.7 | 104.8 | 47.33 |
| OKC | 106.1 | 103.3 | 46.86 |
| MIL | 103.1 | 101.5 | 43.55 |
| PHO | 107.7 | 107.0 | 42.03 |
| NOP | 108.5 | 108.0 | 41.67 |
| WAS | 104.2 | 103.5 | 41.54 |
| IND | 102.2 | 103.0 | 37.65 |
| BOS | 104.5 | 105.6 | 37.25 |
| DET | 104.8 | 106.1 | 36.81 |
| UTA | 105.6 | 107.0 | 36.69 |
| CHO | 100.4 | 102.9 | 33.31 |
| BRK | 103.2 | 106.5 | 31.78 |
| MIA | 103.8 | 107.2 | 31.62 |
| SAC | 104.3 | 108.8 | 29.04 |
| DEN | 103.0 | 108.1 | 27.42 |
| ORL | 101.2 | 107.2 | 25.00 |
| LAL | 104.0 | 110.8 | 23.47 |
| MIN | 102.6 | 110.8 | 19.90 |
| NYK | 101.6 | 110.3 | 18.56 |
| PHI | 93.7 | 104.7 | 11.91 |
# Projected Wins and Championship probability
# Wins are projected with the rounded coefficients 25.165, 2.547 (ORtg) and
# -2.406 (DRtg), then fed into the logistic equation -2.0415 + .1221 * wins
# + .2077 * ORtg - .278 * DRtg. plogis() is the inverse logit,
# exp(z) / (1 + exp(z)), so the linear predictor is written only once.
# The result is scaled to a percentage.
nba %>%
  filter(season_end == 2015) %>%
  mutate(
    projected_wins = 25.165 + 2.547 * ORtg - 2.406 * DRtg,
    finals_prob = plogis(
      -2.0415 + .1221 * projected_wins + .2077 * ORtg - .278 * DRtg
    ) * 100
  ) %>%
  select(Tm, ORtg, DRtg, projected_wins, finals_prob) %>%
  arrange(desc(finals_prob)) %>%
  kable()
| Tm | ORtg | DRtg | projected_wins | finals_prob |
|---|---|---|---|---|
| GSW | 111.3 | 100.8 | 66.12 | 75.5196 |
| ATL | 108.9 | 102.3 | 56.40 | 27.3688 |
| LAC | 112.6 | 106.0 | 56.92 | 23.6421 |
| POR | 107.6 | 102.4 | 52.85 | 15.3493 |
| DAL | 110.3 | 105.5 | 52.27 | 11.1105 |
| MEM | 106.7 | 102.3 | 50.80 | 10.7445 |
| HOU | 106.5 | 102.8 | 49.08 | 7.5388 |
| SAS | 105.8 | 102.2 | 48.74 | 7.4004 |
| TOR | 110.5 | 106.5 | 50.37 | 7.2592 |
| CLE | 110.4 | 106.5 | 50.11 | 6.9177 |
| OKC | 106.1 | 103.3 | 46.86 | 4.7422 |
| CHI | 107.7 | 104.8 | 47.33 | 4.6183 |
| MIL | 103.1 | 101.5 | 43.55 | 2.8555 |
| WAS | 104.2 | 103.5 | 41.54 | 1.6303 |
| PHO | 107.7 | 107.0 | 42.03 | 1.3576 |
| NOP | 108.5 | 108.0 | 41.67 | 1.1629 |
| IND | 102.2 | 103.0 | 37.65 | 0.7756 |
| BOS | 104.5 | 105.6 | 37.25 | 0.5794 |
| DET | 104.8 | 106.1 | 36.81 | 0.5090 |
| UTA | 105.6 | 107.0 | 36.69 | 0.4610 |
| CHO | 100.4 | 102.9 | 33.31 | 0.3243 |
| BRK | 103.2 | 106.5 | 31.78 | 0.1772 |
| MIA | 103.8 | 107.2 | 31.62 | 0.1621 |
| SAC | 104.3 | 108.8 | 29.04 | 0.0842 |
| DEN | 103.0 | 108.1 | 27.42 | 0.0641 |
| ORL | 101.2 | 107.2 | 25.00 | 0.0421 |
| LAL | 104.0 | 110.8 | 23.47 | 0.0230 |
| MIN | 102.6 | 110.8 | 19.90 | 0.0111 |
| NYK | 101.6 | 110.3 | 18.56 | 0.0088 |
| PHI | 93.7 | 104.7 | 11.91 | 0.0036 |
This model gives Golden State an incredible 75% probability of reaching the NBA Finals. This is driven by GSW’s historically great Net Rating. In a distant second are the Atlanta Hawks, with a 27% probability. Of course, there are some limitations with this model. First, it does not take into account the incredibly difficult journey Golden State must take through the Western Conference playoffs. Of the top 10 teams listed above, 7 are in the Western Conference. Atlanta’s journey will prove much easier, likely facing the Toronto Raptors or Cleveland Cavaliers in the Eastern Conference Finals. Another limitation of this model is that its ratings are not time-sensitive. For instance, the Cleveland Cavaliers began the season at an under-.500 clip, at one point sitting at 19-20. Since that moment, the team has begun to jell, improved both its Offensive and Defensive Ratings, and is now 2nd in the East at 37-24. The model does not take this improved play into account, which would likely contribute to a higher probability of making the NBA Finals.
Below are the probabilities for every finals team since 1980.
Some observations:
# Finals probability for every finals team since 1980, most recent season
# first, using actual wins. plogis() is the inverse logit,
# exp(z) / (1 + exp(z)), so the linear predictor is written only once.
# The result is scaled to a percentage.
nba %>%
  filter(finals == 1, season_end >= 1980) %>%
  mutate(
    finals_prob = plogis(
      -2.0415 + .1221 * W + .2077 * ORtg - .278 * DRtg
    ) * 100
  ) %>%
  select(season_end, Tm, ORtg, DRtg, W, finals_prob) %>%
  arrange(desc(season_end)) %>%
  kable()
| season_end | Tm | ORtg | DRtg | W | finals_prob |
|---|---|---|---|---|---|
| 2014 | SAS | 110.5 | 102.4 | 62 | 50.3087 |
| 2014 | MIA | 110.9 | 105.8 | 54 | 13.8647 |
| 2013 | MIA | 112.3 | 103.7 | 66 | 62.5559 |
| 2013 | SAS | 108.3 | 101.6 | 58 | 32.9468 |
| 2012 | OKC | 109.8 | 103.2 | 47 | 10.0929 |
| 2012 | MIA | 106.6 | 100.2 | 46 | 10.5300 |
| 2011 | MIA | 111.7 | 103.5 | 58 | 36.9908 |
| 2011 | DAL | 109.7 | 105.0 | 57 | 18.4356 |
| 2010 | LAL | 108.8 | 103.7 | 57 | 21.2045 |
| 2010 | BOS | 107.7 | 103.8 | 50 | 8.1390 |
| 2009 | LAL | 112.8 | 104.7 | 65 | 55.4028 |
| 2009 | ORL | 109.2 | 101.9 | 59 | 38.1082 |
| 2008 | BOS | 110.2 | 98.9 | 66 | 80.3993 |
| 2008 | LAL | 113.0 | 105.5 | 57 | 28.0759 |
| 2007 | SAS | 109.2 | 99.9 | 58 | 48.7238 |
| 2007 | CLE | 105.5 | 101.3 | 50 | 10.1056 |
| 2006 | DAL | 111.8 | 105.0 | 60 | 33.5226 |
| 2006 | MIA | 108.7 | 104.5 | 52 | 10.2817 |
| 2005 | SAS | 107.5 | 98.8 | 59 | 50.5937 |
| 2005 | DET | 105.6 | 101.2 | 54 | 16.1301 |
| 2004 | LAL | 105.5 | 101.3 | 56 | 18.9548 |
| 2004 | DET | 102.0 | 95.4 | 54 | 31.3480 |
| 2003 | SAS | 105.6 | 99.7 | 60 | 37.7780 |
| 2003 | NJN | 103.8 | 98.1 | 49 | 14.5401 |
| 2002 | LAL | 109.4 | 101.7 | 58 | 37.5212 |
| 2002 | NJN | 104.0 | 99.5 | 52 | 14.7732 |
| 2001 | LAL | 108.4 | 104.8 | 56 | 13.9000 |
| 2001 | PHI | 103.6 | 98.9 | 56 | 23.4984 |
| 2000 | LAL | 107.3 | 98.2 | 67 | 75.5065 |
| 2000 | IND | 108.5 | 103.6 | 56 | 18.7057 |
| 1999 | SAS | 104.0 | 95.0 | 37 | 8.8427 |
| 1999 | NYK | 98.6 | 97.5 | 27 | 0.4630 |
| 1998 | CHI | 107.7 | 99.8 | 62 | 53.8322 |
| 1998 | UTA | 112.7 | 105.4 | 62 | 40.9820 |
| 1997 | CHI | 114.4 | 102.4 | 69 | 84.2517 |
| 1997 | UTA | 113.6 | 104.0 | 64 | 61.1975 |
| 1996 | CHI | 115.2 | 101.8 | 72 | 91.5005 |
| 1996 | SEA | 110.3 | 102.1 | 64 | 57.4054 |
| 1995 | ORL | 115.1 | 107.8 | 57 | 24.1600 |
| 1995 | HOU | 109.7 | 107.4 | 47 | 3.3076 |
| 1994 | HOU | 105.9 | 101.4 | 58 | 23.9856 |
| 1994 | NYK | 105.7 | 98.2 | 57 | 39.4721 |
| 1993 | PHO | 113.3 | 106.7 | 62 | 35.4003 |
| 1993 | CHI | 112.9 | 106.1 | 57 | 24.4481 |
| 1992 | CHI | 115.5 | 104.5 | 67 | 74.6030 |
| 1992 | POR | 111.4 | 104.2 | 57 | 28.6668 |
| 1991 | CHI | 114.6 | 105.2 | 61 | 49.0856 |
| 1991 | LAL | 112.1 | 105.0 | 58 | 29.5977 |
| 1990 | DET | 109.9 | 103.5 | 59 | 31.3379 |
| 1990 | POR | 110.5 | 104.4 | 59 | 28.7010 |
| 1989 | DET | 110.8 | 104.7 | 63 | 39.1112 |
| 1989 | LAL | 113.8 | 106.7 | 57 | 24.8218 |
| 1988 | LAL | 113.1 | 107.3 | 62 | 30.7927 |
| 1988 | DET | 110.5 | 105.3 | 54 | 14.5462 |
| 1987 | LAL | 115.6 | 106.5 | 65 | 57.3983 |
| 1987 | BOS | 113.5 | 106.8 | 59 | 27.8071 |
| 1986 | BOS | 111.8 | 102.6 | 67 | 69.7888 |
| 1986 | HOU | 110.1 | 107.6 | 51 | 5.4196 |
| 1985 | BOS | 112.8 | 106.3 | 63 | 38.4134 |
| 1985 | LAL | 114.1 | 107.0 | 62 | 37.3149 |
| 1984 | BOS | 110.9 | 104.4 | 62 | 38.6851 |
| 1984 | LAL | 110.9 | 107.3 | 54 | 9.5906 |
| 1983 | PHI | 108.3 | 100.9 | 65 | 58.3877 |
| 1983 | LAL | 110.5 | 105.2 | 58 | 22.1931 |
| 1982 | PHI | 109.6 | 103.9 | 58 | 25.3510 |
| 1982 | LAL | 110.2 | 105.5 | 57 | 17.9129 |
| 1981 | BOS | 108.4 | 102.6 | 62 | 38.2389 |
| 1981 | HOU | 107.0 | 106.7 | 40 | 0.9989 |
| 1980 | LAL | 109.5 | 103.9 | 60 | 29.8061 |
| 1980 | PHI | 105.0 | 101.0 | 59 | 24.8412 |