Title

This is an R HTML document. When you click the Knit HTML button a web page will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#nba home advantage and offensive/defensive analysis of 2018-19 season
#using bayesian hierarchical modeling

#get data
#NOTE(review): the path is relative, so it is resolved against the current
#working directory -- confirm the script is run from the expected folder
path <- 'Desktop/nba_database/Regular+Season/team_stats_nba.com/game_logs/boxscore/2018-19.csv'
data <- read.csv(file = path)

#get rid of duplicate games
#rows whose MATCHUP has '@' as its 5th character are dropped, which leaves one
#row per game (presumably the home team's 'XXX vs. YYY' row -- confirm against
#the csv)
#a logical mask replaces the old index-collecting loop: data[-l, ] removed
#EVERY row when no '@' row was found (empty l), and the loop failed on an
#empty or NA MATCHUP; %in% never yields NA, so rows with a missing MATCHUP are
#kept as they are instead of turning into all-NA rows
is_away_row <- substr(as.character(data$MATCHUP), 5, 5) %in% '@'
data <- data[!is_away_row, ]

#create dataframe to store relevant data
#we will only use home team, away team, home points and away points
#(team_id is carried along as a character identifier)
#appropriate to note some of these games went to overtime and this is not
#accounted for in this analysis

#build all columns in one vectorized step; the previous row-by-row fill pushed
#each row through c(), which coerced the point totals to character, and it
#looped over 1:nrow(df), which misbehaves when there are no rows
df <- data.frame(
  home_team = as.character(data$TEAM_ABBREVIATION),
  #characters 9-12 of MATCHUP are taken as the opponent abbreviation
  #(presumably 'ATL vs. BOS' -> 'BOS' -- confirm against the csv)
  away_team = substr(as.character(data$MATCHUP), 9, 12),
  home_points = data$PTS,
  #opponent score is recovered as the row's points minus its PLUS_MINUS
  away_points = data$PTS - data$PLUS_MINUS,
  team_id = as.character(data$TEAM_ID),
  stringsAsFactors = FALSE
)

###############################
###############################
###############################



#create a mixed effects model in stan to sample from 
library(rstan)

## Loading required package: StanHeaders

## Loading required package: ggplot2

## rstan (Version 2.21.7, GitRev: 2e1f913d3ca3)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)

#let rstan use every core that parallel::detectCores() reports, as the rstan
#startup message above recommends
options(mc.cores = parallel::detectCores())
#write compiled Stan programs to disk so unchanged models are not recompiled
rstan_options(auto_write = TRUE)
#stan_model = "
#data {
#int<lower=0> n_teams; //number of teams
#int<lower=0> n_games; //number of games
#int<lower=0> home_team[n_games]; //home team index
#int<lower=0> away_team[n_games]; //away team index
#int<lower=0> home_pts[n_games]; //home team points
#int<lower=0> away_pts[n_games]; //away team points
#}

#parameters {
#//hyperparameters
#real mu_off; //hyper prior for mean of offensive ability of team
#real<lower=0> tau_off; //hyper prior for sd of offensive ability of team

#real mu_def; //hyper prior for mean of defensive ability of team
#real<lower=0> tau_def; //hyper prior for sd of defensive ability of team

#//parameters
#real home; //home team advantage in pts
#vector[n_teams] off; //offensive ability of each team
#vector[n_teams] def; //defensive ability of each team
#}

#model {
#//hyperpriors, model should not be very sensitive to these 
#mu_off ~ normal(0,.5);
#tau_off ~ normal(0,2);
#mu_def ~ normal(0,.5);
#tau_def ~ normal(0,2);

#//priors
#off~normal(mu_off,tau_off);
#def~normal(mu_def,tau_def);
#home~normal(3,5); //they say 3pts for home court is what vegas prices in

#//likelihood, poisson for counts, 111.2 is the average pts a team scores
#home_pts~poisson(111+home+off[home_team]-def[away_team]);
#away_pts~poisson(111+off[away_team]-def[home_team]);
#}

#"
stan_model = [1256 chars quoted with '"']
#fit the stan model
#compiles the Stan program held in the `stan_model` string; the string shares
#its name with rstan's stan_model() function, so the function is called
#through its namespace to make clear which one is meant
model <- rstan::stan_model(model_code = stan_model)

## Trying to compile a simple C file

## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1

#prepare parameters to be fed into stan model
teams <- unique(df$home_team)
nteams <- length(teams)
ngames <- nrow(df)

#translate each game's team abbreviation into its position in `teams`
#match() returns exactly one index per game (NA when the team is not found);
#the previous unlist(sapply(..., which(...))) silently dropped unmatched games,
#leaving the index vectors shorter than, and misaligned with, the points
home_team <- match(df$home_team, teams)
away_team <- match(df$away_team, teams)
#fail loudly if any game refers to a team that never appears as a home team
stopifnot(!anyNA(home_team), !anyNA(away_team))

#points may have been stored as character, so convert before passing to stan
home_pts <- as.numeric(df$home_points)
away_pts <- as.numeric(df$away_points)



#fit model to data
#NOTE(review): `teams` is passed along with the numeric inputs, but the
#commented-out copy of the model declares no data variable of that name --
#confirm whether it is needed
r <- sampling(
  object = model,
  data = list(
    n_teams = nteams,
    n_games = ngames,
    teams = teams,
    home_team = home_team,
    away_team = away_team,
    home_pts = home_pts,
    away_pts = away_pts
  )
)



#extract parameter estimates
#pull the posterior draws out of the fitted object
params <- rstan::extract(r)



#posterior distribution of home parameter
#params$home is written out as-is because hist() builds its default axis
#label from the expression it is given
hist(params$home, main = "home court advantage in 2018-19")

#posterior mean of the home court advantage
mean(params$home)

## [1] 2.643348

#take out posterior mean to be best estimates of parameters
#we also have estimates for sd of these estimates but will not visualize them here
#colMeans() averages over the draws, leaving one value per column of off / def
off <- colMeans(params$off)
def <- colMeans(params$def)



#create df to visualize estimates of off and def parameters for each team

#include which round the team reached that postseason, keyed by abbreviation:
#0 = no playoffs, 1 = 1st round exit, 2 = 2nd round exit,
#3 = conf finals, 4 = finals
#ORL is 1: Orlando made the 2019 playoffs and lost in the first round; it was
#previously coded 0, which left only seven first-round exits
playoff_round <- c(
  MEM = 0, CHA = 0, DEN = 2, ATL = 0, MIL = 3, POR = 3, LAC = 1, BKN = 1,
  NYK = 0, SAS = 1, PHI = 2, NOP = 0, OKC = 1, WAS = 0, MIN = 0, DAL = 0,
  UTA = 1, DET = 1, MIA = 0, CLE = 0, CHI = 0, LAL = 0, TOR = 4, SAC = 0,
  IND = 1, HOU = 2, BOS = 2, GSW = 4, ORL = 1, PHX = 0
)

#off and def are indexed by position in `teams`, so the labels and playoff
#rounds are looked up through `teams` rather than typed out in an order that
#has to match the data file by hand
team_labels <- as.character(teams)
stopifnot(
  length(off) == length(team_labels),
  length(def) == length(team_labels),
  all(team_labels %in% names(playoff_round))
)

results <- data.frame(
  def = unname(def),
  off = unname(off),
  TEAM_NAME = team_labels,
  #fixed levels keep the factor codes (and therefore plot colours) stable
  playoff_round_reached = factor(unname(playoff_round[team_labels]),
                                 levels = 0:4),
  #team_id is kept as an unfilled placeholder column, as before
  team_id = NA,
  stringsAsFactors = FALSE
)



#plot results for off and def
#points are coloured by the playoff_round_reached factor; the title doubles as
#the colour legend (its first entry used to read 'back' instead of 'black')
plot(results$off, results$def, col = results$playoff_round_reached, pch = 19,
     main = 'black~no playoffs, red~1st round exit, green~2nd round exit, 
     blue ~ conf finals, light blue~finals')
#label each point with the team abbreviation, placed above the point
text(results$off, results$def, labels=results$TEAM_NAME, pos=3)

#comparison with traditional nba stats, off and def rating via nba.com
path <- 'Desktop/nba_database/Regular+Season/team_stats_nba.com/general/advanced/2018-19.csv'
data <- read.csv(file = path)

#keep only the two ratings and the team identifiers
data <- data[c('OFF_RATING', 'DEF_RATING', 'TEAM_NAME', 'TEAM_ID')]
#replace the team names with abbreviations so the table can be joined on
#TEAM_NAME later
#NOTE(review): assumes the csv rows come in exactly this order -- confirm
data$TEAM_NAME <- c(
  'ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET',
  'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
  'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS',
  'TOR', 'UTA', 'WAS'
)


#create one dataframe for plotting and regression ease
#joins the nba.com ratings with the model estimates on the team abbreviation
new <- merge(data, results, by = 'TEAM_NAME')

#official offensive rating (x) against the model's off parameter (y),
#coloured by playoff round and labelled with the team abbreviation
plot(new$OFF_RATING, new$off, col = new$playoff_round_reached, pch = 19)
text(new$OFF_RATING, new$off, labels=new$TEAM_NAME, pos=3)

#linear regression of the official offensive rating on the off parameter
summary(lm(new$OFF_RATING~new$off))

## 
## Call:
## lm(formula = new$OFF_RATING ~ new$off)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6450 -1.5027  0.2735  1.0807  3.6458 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.07315    0.32754 336.063  < 2e-16 ***
## new$off       0.62391    0.08765   7.118 9.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.773 on 28 degrees of freedom
## Multiple R-squared:  0.6441, Adjusted R-squared:  0.6314 
## F-statistic: 50.67 on 1 and 28 DF,  p-value: 9.575e-08

#official defensive rating (x) against the model's def parameter (y),
#coloured by playoff round and labelled with the team abbreviation
plot(new$DEF_RATING, new$def, col = new$playoff_round_reached, pch = 19)
text(new$DEF_RATING, new$def, labels=new$TEAM_NAME, pos=3)

#linear regression of the official defensive rating on the def parameter
#this call used to repeat the offensive regression (OFF_RATING ~ off), so the
#recorded output that follows is still the offensive fit and needs a re-knit
summary(lm(new$DEF_RATING~new$def))

## 
## Call:
## lm(formula = new$OFF_RATING ~ new$off)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6450 -1.5027  0.2735  1.0807  3.6458 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.07315    0.32754 336.063  < 2e-16 ***
## new$off       0.62391    0.08765   7.118 9.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.773 on 28 degrees of freedom
## Multiple R-squared:  0.6441, Adjusted R-squared:  0.6314 
## F-statistic: 50.67 on 1 and 28 DF,  p-value: 9.575e-08

#these are only estimates with uncertainty attached to them, and this
#uncertainty is not visualized here. However, we might over-extrapolate to say
#offensive rating overrated Portland's and Houston's offense this season
#while underrating the lakers and hawks offense

#more over-extrapolation, defensive rating underrated the Cavs' (still awful)
#defense, and was too kind to the Hawks' defense

#interestingly enough, the bucks had the best defensive rating this regular
#season, but are more middle of the pack by this metric. 

#you might interpret these graphs as follows:
#for offense, a team to the right of another team has a better offensive rating
#while a team above another team has a better offense by my parameter

#for defense, a team to the left of another team has a better defensive rating
#while a team above another team has a better defense by my parameter