nba <- nba %>%
distinct(Year, Player, Tm, .keep_all = T) %>%
filter(G > 5)
Create new variables
nba <- nba %>%
mutate(PPG = PTS/G, .after = PTS,
ORBPG = ORB/G,
DRBPG = DRB/G,
BLKPG = BLK/G,
height = Feet *12 + Inches)
nba <- nba %>%
mutate(decade = Year - (Year %% 10), .after = Year)
Response Variable - Height (in)
Explanatory Variables - Weight, Decade, Offensive Rebounds Per Game (ORBPG), Defensive Rebounds Per Game (DRBPG), and Blocks Per Game (BLKPG).
nba %>%
ggplot(mapping = aes(x= height)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What transformations need to be done in order to make them more linearly related to height?
nba %>%
ggplot(mapping = aes(x = Wt, y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs Weight")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
nba %>%
ggplot(mapping = aes(x = sqrt(Wt), y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs Sqrt(Weight)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
nba %>%
ggplot(mapping = aes(x = DRBPG, y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs DRBPG")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
nba %>%
ggplot(mapping = aes(x = log(DRBPG), y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs log(DRBPG)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 29 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 29 rows containing non-finite values (`stat_smooth()`).
nba %>%
ggplot(mapping = aes(x = ORBPG, y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs ORBPG")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
nba %>%
ggplot(mapping = aes(x = log(ORBPG), y = height)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs log(ORBPG)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 269 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 269 rows containing non-finite values (`stat_smooth()`).
nba %>%
ggplot(mapping = aes(x = BLKPG, y = height)) +
geom_point() +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs BLKPG")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
nba %>%
ggplot(mapping = aes(x = sqrt(BLKPG), y = height)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
geom_smooth(color = "grey") +
labs("Height vs sqrt(BLKPG)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
None of these above transformations appear to be vast improvements from their standard relationship
m_data <- nba %>%
select(height, decade, Wt, ORBPG, DRBPG, BLKPG)
corr <- round(cor(m_data), 2)
ggcorrplot(corr, lab = TRUE)
attach(nba)
m <- lm(height ~ Wt + DRBPG + ORBPG + BLKPG + as.factor(decade))
summary(m)
##
## Call:
## lm(formula = height ~ Wt + DRBPG + ORBPG + BLKPG + as.factor(decade))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4720 -1.2296 -0.0466 1.3341 12.9299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.5190310 0.1234708 465.851 < 2e-16 ***
## Wt 0.1001309 0.0006232 160.671 < 2e-16 ***
## DRBPG -0.1747243 0.0139883 -12.491 < 2e-16 ***
## ORBPG 0.2364444 0.0309638 7.636 2.34e-14 ***
## BLKPG 1.5753609 0.0395331 39.849 < 2e-16 ***
## as.factor(decade)1990 -0.4677784 0.0453045 -10.325 < 2e-16 ***
## as.factor(decade)2000 -0.9186704 0.0458170 -20.051 < 2e-16 ***
## as.factor(decade)2010 -1.1910754 0.0466262 -25.545 < 2e-16 ***
## as.factor(decade)2020 -1.2881367 0.0605642 -21.269 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.983 on 20003 degrees of freedom
## Multiple R-squared: 0.7053, Adjusted R-squared: 0.7051
## F-statistic: 5983 on 8 and 20003 DF, p-value: < 2.2e-16
detach(nba)
The meaning of the weight coefficient: If we have players from the same decade that have the same stats we would expect a player that weights 10 more pounds than the other to be a little more than 1 inch taller.
Something else that is interesting to note is that the decade coefficient becomes a more negative number as the decades increase. I personally would have expected this phenomena to occur as the NBA has become more of a small ball league focused on shooting and speed rather than throwing the ball into the post.
autoplot(m, which = 1, ncol = 1, label.size = 3)
autoplot(m, which = 2, ncol = 1, label.size = 3)
autoplot(m, which = 3, ncol = 1, label.size = 3)
autoplot(m, which = 4, ncol = 1, label.size = 3)
autoplot(m, which = 5, ncol = 1, label.size = 3)
autoplot(m, which = 6, ncol = 1, label.size = 3)
Pretty much all of these diagnostic plots show some reason to be alarmed about the model not being appropriate. I think a lot of this is caused by there being a lot in data in the middle and sparse data at the extremes. A good example of this can be seen in the Cook’s D chart. Here all the big spikes marked with their observation numbers correspond to Manute Bol. He was 7ft 7in and only weighed 210 pounds. There are a lot of guards and forward that are much shorter, likely more than a foot shorter, and weight the same. This data is interesting because it involves a lot highly unique body types, but this same fact also causes a lot of problems when trying to predict height based on these metrics.