library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data <- read.csv('Thesis/MASTER_UPDATED.csv')
totalPoints <- data$PTS_home + data$PTS_away
data = subset(data, select = -c(PTS_home,PTS_away))
data$total_points = totalPoints
ggplot(data, aes(FG_PCT_home, total_points, size = FG3_PCT_home,
color = FT_PCT_home))+geom_point(alpha = .5)+
xlab("Field Goal %")+ylab("Total Points")+ggtitle("Shooting %'s vs. Total Points")+
labs(size = "3-Point FG%", color = 'Free Throw %')
ggplot(data, aes(AST_home, total_points, size = REB_home))+
geom_point(alpha=.4)+xlab('Assists')+ylab('Points')+
ggtitle('Assists and Rebounds vs. Total Points')+labs(size = "Rebounds")
ggplot(data, aes(DEF.RTG.y, total_points, size = BLK.y, color = STL.y))+
geom_point(alpha = .5)+xlab('Opponent Defensive Rating')+
ylab('Total Points')+ggtitle('Defensive Stats vs. Total Points')+
labs(color = 'Opponent Steals', size = 'Opponent Blocks')
My categorical variable is the season the data point came from. My response variable will be total points. My numeric predictor will be home team field goal percentage.
slm <- lm(total_points ~ FG_PCT_home, data = data)
summary(slm)
##
## Call:
## lm(formula = total_points ~ FG_PCT_home, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.487 -12.393 -0.758 11.529 107.174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 135.094 2.216 60.98 <2e-16 ***
## FG_PCT_home 174.863 4.739 36.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.49 on 5128 degrees of freedom
## Multiple R-squared: 0.2098, Adjusted R-squared: 0.2097
## F-statistic: 1362 on 1 and 5128 DF, p-value: < 2.2e-16
As can be seen in the model summary graphic above, home field goal percentage seems to be significant to the response variable.
season <- cut(data$SEASON, br = c(2015,2016,2017,2018,2019))
data$SEASON <- season
model <- lm(total_points~SEASON, data = data)
summary(model)
##
## Call:
## lm(formula = total_points ~ SEASON, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68.145 -13.867 -0.867 13.133 107.133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 210.7217 0.5366 392.718 <2e-16 ***
## SEASON(2016,2017] 1.4237 0.7620 1.868 0.0618 .
## SEASON(2017,2018] 11.1455 0.7625 14.616 <2e-16 ***
## SEASON(2018,2019] 11.6399 0.8409 13.842 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.11 on 5126 degrees of freedom
## Multiple R-squared: 0.06567, Adjusted R-squared: 0.06512
## F-statistic: 120.1 on 3 and 5126 DF, p-value: < 2.2e-16
anova(model)
## Analysis of Variance Table
##
## Response: total_points
## Df Sum Sq Mean Sq F value Pr(>F)
## SEASON 3 145740 48580 120.09 < 2.2e-16 ***
## Residuals 5126 2073539 405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data, aes(y=total_points, x=SEASON, fill=SEASON))+
geom_boxplot()
According to the ANOVA table F-test, season seems to be a significant variable. However, this is partially visible in the box-plot. It can be seen that the scores seem to increase very slightly each successive season and the low end and high end scores box increase as well.
mlr <- lm(total_points ~ FG_PCT_home + SEASON, data = data)
summary(mlr)
##
## Call:
## lm(formula = total_points ~ FG_PCT_home + SEASON, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.193 -11.842 -0.806 11.223 101.661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.7739 2.1564 60.180 <2e-16 ***
## FG_PCT_home 174.7002 4.5408 38.473 <2e-16 ***
## SEASON(2016,2017] 1.0934 0.6713 1.629 0.103
## SEASON(2017,2018] 10.9137 0.6718 16.246 <2e-16 ***
## SEASON(2018,2019] 11.5327 0.7408 15.568 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.72 on 5125 degrees of freedom
## Multiple R-squared: 0.2751, Adjusted R-squared: 0.2745
## F-statistic: 486.1 on 4 and 5125 DF, p-value: < 2.2e-16
ggplot(data, aes(x=FG_PCT_home, y=total_points))+geom_point()+
geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[3], slope = mlr$coefficients[2], color="red", lwd=1) + geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[4], slope = mlr$coefficients[2], color="blue", lwd=1) + geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[5], slope = mlr$coefficients[2], color="green", lwd=1)
2016-17: \(y = 130.86 + 174.70x\)
2017-18: \(y = 140.68 + 174.70x\)
2018-19: \(y = 141.30 + 174.70x\)
model2 <- lm(total_points ~ FG_PCT_home * SEASON, data = data)
summary(model2)
##
## Call:
## lm(formula = total_points ~ FG_PCT_home * SEASON, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.914 -11.826 -0.793 11.287 101.677
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133.436 4.103 32.520 <2e-16 ***
## FG_PCT_home 166.797 8.796 18.962 <2e-16 ***
## SEASON(2016,2017] -5.099 5.792 -0.880 0.379
## SEASON(2017,2018] 7.485 5.851 1.279 0.201
## SEASON(2018,2019] 6.127 6.247 0.981 0.327
## FG_PCT_home:SEASON(2016,2017] 13.342 12.390 1.077 0.282
## FG_PCT_home:SEASON(2017,2018] 7.401 12.525 0.591 0.555
## FG_PCT_home:SEASON(2018,2019] 11.661 13.377 0.872 0.383
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.72 on 5122 degrees of freedom
## Multiple R-squared: 0.2752, Adjusted R-squared: 0.2742
## F-statistic: 277.9 on 7 and 5122 DF, p-value: < 2.2e-16
ggplot(data, aes(x = FG_PCT_home, y = total_points)) + geom_point() +
geom_abline(intercept = model2$coefficients[1] + model2$coefficients[3], slope = model2$coefficients[2] +model2$coefficients[6], color="red", lwd=1) + geom_abline(intercept = model2$coefficients[1] + model2$coefficients[4], slope = model2$coefficients[2] +model2$coefficients[7], color="blue", lwd=1) + geom_abline(intercept = model2$coefficients[1] + model2$coefficients[5], slope = model2$coefficients[2] +model2$coefficients[8], color="green", lwd=1)
First model MSE:
mean(slm$residuals^2)
## [1] 341.8425
Second model MSE:
mean(model$residuals^2)
## [1] 404.1987
Third model MSE:
mean(mlr$residuals^2)
## [1] 313.6189
Fourth model MSE:
mean(model2$residuals^2)
## [1] 313.537
This was an interesting exercise. My data set has many continuous variables that could be significant. It was interesting to see that season was significant. I believe this could be due to the increasing popularity of three point shots in NBA games.
library(leaps)
regfit.fwd<-regsubsets(total_points~., data=data,
method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(total_points ~ ., data = data, method = "forward")
## 31 Variables (and intercept)
## Forced in Forced out
## SEASON(2016,2017] FALSE FALSE
## SEASON(2017,2018] FALSE FALSE
## SEASON(2018,2019] FALSE FALSE
## FG_PCT_home FALSE FALSE
## FT_PCT_home FALSE FALSE
## FG3_PCT_home FALSE FALSE
## AST_home FALSE FALSE
## REB_home FALSE FALSE
## FG_PCT_away FALSE FALSE
## FT_PCT_away FALSE FALSE
## FG3_PCT_away FALSE FALSE
## AST_away FALSE FALSE
## REB_away FALSE FALSE
## DEF.RTG.x FALSE FALSE
## DREB.x FALSE FALSE
## DREB..x FALSE FALSE
## STL.x FALSE FALSE
## BLK.x FALSE FALSE
## OPP.PTS.OFF.TOV.x FALSE FALSE
## OPP.PTS.2ND.CHANCE.x FALSE FALSE
## OPP.PTS.FB.x FALSE FALSE
## OPP.PTS.PAINT.x FALSE FALSE
## DEF.RTG.y FALSE FALSE
## DREB.y FALSE FALSE
## DREB..y FALSE FALSE
## STL.y FALSE FALSE
## BLK.y FALSE FALSE
## OPP.PTS.OFF.TOV.y FALSE FALSE
## OPP.PTS.2ND.CHANCE.y FALSE FALSE
## OPP.PTS.FB.y FALSE FALSE
## OPP.PTS.PAINT.y FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## SEASON(2016,2017] SEASON(2017,2018] SEASON(2018,2019] FG_PCT_home
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " "*"
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) " " " " " " "*"
## 5 ( 1 ) " " " " " " "*"
## 6 ( 1 ) " " " " " " "*"
## 7 ( 1 ) " " " " " " "*"
## 8 ( 1 ) " " " " " " "*"
## FT_PCT_home FG3_PCT_home AST_home REB_home FG_PCT_away FT_PCT_away
## 1 ( 1 ) " " " " " " " " "*" " "
## 2 ( 1 ) " " " " " " " " "*" " "
## 3 ( 1 ) " " " " " " "*" "*" " "
## 4 ( 1 ) " " " " " " "*" "*" " "
## 5 ( 1 ) " " " " " " "*" "*" "*"
## 6 ( 1 ) "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" "*"
## FG3_PCT_away AST_away REB_away DEF.RTG.x DREB.x DREB..x STL.x BLK.x
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " "*" " " " " " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " " " "
## OPP.PTS.OFF.TOV.x OPP.PTS.2ND.CHANCE.x OPP.PTS.FB.x OPP.PTS.PAINT.x
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) " " " " " " " "
## DEF.RTG.y DREB.y DREB..y STL.y BLK.y OPP.PTS.OFF.TOV.y
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## OPP.PTS.2ND.CHANCE.y OPP.PTS.FB.y OPP.PTS.PAINT.y
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) " " " " " "
## 8 ( 1 ) " " " " "*"
## 75% of the sample size
smp_size <- floor(0.75 * nrow(data))
## set the seed to make your partition reproducible
set.seed(1)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
train <- data[train_ind, ]
test <- data[-train_ind, ]
best_model <- lm(total_points ~ FG_PCT_away + FG_PCT_home + REB_home + REB_away + FT_PCT_away + FT_PCT_home + FG3_PCT_away + FG3_PCT_home, data = train)
summary(best_model)
##
## Call:
## lm(formula = total_points ~ FG_PCT_away + FG_PCT_home + REB_home +
## REB_away + FT_PCT_away + FT_PCT_home + FG3_PCT_away + FG3_PCT_home,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.685 -7.366 -0.484 6.664 47.200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -263.66386 5.36571 -49.14 <2e-16 ***
## FG_PCT_away 289.18803 4.34057 66.62 <2e-16 ***
## FG_PCT_home 280.02564 4.37867 63.95 <2e-16 ***
## REB_home 1.66041 0.03274 50.71 <2e-16 ***
## REB_away 1.61792 0.03308 48.92 <2e-16 ***
## FT_PCT_away 35.03222 1.70698 20.52 <2e-16 ***
## FT_PCT_home 33.97552 1.74915 19.42 <2e-16 ***
## FG3_PCT_away 30.81403 2.25986 13.63 <2e-16 ***
## FG3_PCT_home 27.11195 2.26966 11.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.72 on 3838 degrees of freedom
## Multiple R-squared: 0.7392, Adjusted R-squared: 0.7386
## F-statistic: 1360 on 8 and 3838 DF, p-value: < 2.2e-16
pred <- predict(best_model, test)
mean(pred)
## [1] 215.5694
mean(data$total_points)
## [1] 216.2887