nba <- nba %>%
distinct(Year, Player, Tm, .keep_all = T) %>%
filter(G > 5)
Create new variables
nba <- nba %>%
mutate(PPG = PTS/G, .after = PTS,
positive_WS = ifelse(WS > 0, 1, 0),
TOVPG = TOV/G,
STLPG = STL/G,
TRBPG = TRB/G,
ASTPG = AST/G,
PFPG = PF/G)
WS - Win Shares
This variable attempts to estimate the number of wins a player directly contributed to his team(s). If the value of this variable is negative that it essentially means a that if replaced, with the right player, the team would have won more games.
nba %>%
group_by(positive_WS) %>%
summarise(counts = n(),
proportions = n()/nrow(nba))
## # A tibble: 2 × 3
## positive_WS counts proportions
## <dbl> <int> <dbl>
## 1 0 2822 0.141
## 2 1 17190 0.859
Response variable: positive_WS
Explanatory variables: PPG (points per game), TRBPG (total rebounds per game), TOVPG (turnovers per game), ASTPG (assists per game), and STLPG (steals per game).
model <- glm(positive_WS ~ PPG + ASTPG + TRBPG + STLPG + TOVPG, data = nba,
family = binomial(link = 'logit'))
summary(model)
##
## Call:
## glm(formula = positive_WS ~ PPG + ASTPG + TRBPG + STLPG + TOVPG,
## family = binomial(link = "logit"), data = nba)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.96834 0.05645 -17.15 <2e-16 ***
## PPG 0.58092 0.01880 30.89 <2e-16 ***
## ASTPG 0.94982 0.04673 20.33 <2e-16 ***
## TRBPG 0.96323 0.03149 30.59 <2e-16 ***
## STLPG 1.42504 0.13972 10.20 <2e-16 ***
## TOVPG -4.66497 0.12036 -38.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16281.9 on 20011 degrees of freedom
## Residual deviance: 9427.7 on 20006 degrees of freedom
## AIC: 9439.7
##
## Number of Fisher Scoring iterations: 7
PPG \(e^{0.58092} = 1.788\) Based on the coefficient for PPG in the model, increasing the amount of PPG by 1 a player 1.788 times more likely to be classified in the positive_WS group. Another way to say this is that by increasing your PPG by one it increases your odds of being in the positive_WS group by 78.8%
Other positive coefficients can be interpreted in this same way.
TOVPG
\(e^{-4.66497} = 0.009\)
Based on the coefficient for TOVPG in the model, by increasing a players TOVPG by 1 their odds of being in the positive_WS group will decrease by 99.1%.
ppg_coef <- model$coefficients['PPG']
ppg_se <- 0.01880
z_crit <- qnorm(0.975)
logodd_LCL <- ppg_coef - z_crit*ppg_se
logodd_UCL <- ppg_coef + z_crit*ppg_se
odd_LCL <- exp(logodd_LCL)
odd_UCL <- exp(logodd_UCL)
paste("log odds 95% CI = (", logodd_LCL, ",",logodd_UCL , ")")
## [1] "log odds 95% CI = ( 0.544073610353817 , 0.617768256172523 )"
paste("odds 95% CI = (", odd_LCL, ",", odd_UCL, ")")
## [1] "odds 95% CI = ( 1.72301146282636 , 1.85478401658463 )"
nba %>%
ggplot(mapping = aes(x=PPG, y = OWS)) +
geom_point() +
geom_smooth(method = "lm",color = "gray", se = F) +
geom_smooth(color = "blue", se = F) +
labs(title = "OWS vs PPG")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
model <- lm(nba$OWS~nba$PPG)
summary(model)
##
## Call:
## lm(formula = nba$OWS ~ nba$PPG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9962 -0.6645 0.0193 0.5546 8.9630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.874630 0.016789 -52.1 <2e-16 ***
## nba$PPG 0.252271 0.001616 156.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.361 on 20010 degrees of freedom
## Multiple R-squared: 0.549, Adjusted R-squared: 0.549
## F-statistic: 2.436e+04 on 1 and 20010 DF, p-value: < 2.2e-16
Looking at this plot it appears there is a non linear relationship between PPG and OWS. It looks similar to a parabola. In order to model this better we should square the PPG values and model these againse OWS to see if this is a better fit.
nba <- nba %>%
mutate(PPG_squard = PPG^2)
nba %>%
ggplot(mapping = aes(x=PPG_squard, y = OWS)) +
geom_point() +
geom_smooth(method = "lm",color = "gray", se = F) +
geom_smooth(color = "blue", se = F) +
labs(title = "OWS vs PPG")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
model <- lm(nba$OWS~nba$PPG_squard + nba$PPG)
summary(model)
##
## Call:
## lm(formula = nba$OWS ~ nba$PPG_squard + nba$PPG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0884 -0.5958 -0.0541 0.4921 8.9070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3200025 0.0248126 -12.90 <2e-16 ***
## nba$PPG_squard 0.0061763 0.0002071 29.83 <2e-16 ***
## nba$PPG 0.1088519 0.0050619 21.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.331 on 20009 degrees of freedom
## Multiple R-squared: 0.5682, Adjusted R-squared: 0.5682
## F-statistic: 1.316e+04 on 2 and 20009 DF, p-value: < 2.2e-16
Here the relationship between these two variables look much more linear. A couple other things to notes, this model has a slightly higher adjusted r squared value so therefore would be the model of choice if we were choosing strictly between these two.