library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data <- read.csv('Thesis/MASTER_UPDATED.csv')
totalPoints <- data$PTS_home + data$PTS_away
data = subset(data, select = -c(PTS_home,PTS_away))
data$total_points = totalPoints
ggplot(data, aes(FG_PCT_home, total_points, size = FG3_PCT_home,
                 color = FT_PCT_home))+geom_point(alpha = .5)+
  xlab("Field Goal %")+ylab("Total Points")+ggtitle("Shooting %'s vs. Total Points")+
  labs(size = "3-Point FG%", color = 'Free Throw %')

ggplot(data, aes(AST_home, total_points, size = REB_home))+
  geom_point(alpha=.4)+xlab('Assists')+ylab('Points')+
  ggtitle('Assists and Rebounds vs. Total Points')+labs(size = "Rebounds")

ggplot(data, aes(DEF.RTG.y, total_points, size = BLK.y, color = STL.y))+
  geom_point(alpha = .5)+xlab('Opponent Defensive Rating')+
  ylab('Total Points')+ggtitle('Defensive Stats vs. Total Points')+
  labs(color = 'Opponent Steals', size = 'Opponent Blocks')

My categorical variable is the season the data point came from. My response variable will be total points. My numeric predictor will be home team field goal percentage.

slm <- lm(total_points ~ FG_PCT_home, data = data)
summary(slm)
## 
## Call:
## lm(formula = total_points ~ FG_PCT_home, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.487 -12.393  -0.758  11.529 107.174 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  135.094      2.216   60.98   <2e-16 ***
## FG_PCT_home  174.863      4.739   36.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.49 on 5128 degrees of freedom
## Multiple R-squared:  0.2098, Adjusted R-squared:  0.2097 
## F-statistic:  1362 on 1 and 5128 DF,  p-value: < 2.2e-16

As can be seen in the model summary graphic above, home field goal percentage seems to be significant to the response variable.

season <- cut(data$SEASON, br = c(2015,2016,2017,2018,2019))
data$SEASON <- season
model <- lm(total_points~SEASON, data = data)
summary(model)
## 
## Call:
## lm(formula = total_points ~ SEASON, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.145 -13.867  -0.867  13.133 107.133 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       210.7217     0.5366 392.718   <2e-16 ***
## SEASON(2016,2017]   1.4237     0.7620   1.868   0.0618 .  
## SEASON(2017,2018]  11.1455     0.7625  14.616   <2e-16 ***
## SEASON(2018,2019]  11.6399     0.8409  13.842   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.11 on 5126 degrees of freedom
## Multiple R-squared:  0.06567,    Adjusted R-squared:  0.06512 
## F-statistic: 120.1 on 3 and 5126 DF,  p-value: < 2.2e-16
anova(model)
## Analysis of Variance Table
## 
## Response: total_points
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## SEASON       3  145740   48580  120.09 < 2.2e-16 ***
## Residuals 5126 2073539     405                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data, aes(y=total_points, x=SEASON, fill=SEASON))+
  geom_boxplot()

According to the ANOVA table F-test, season seems to be a significant variable. However, this is partially visible in the box-plot. It can be seen that the scores seem to increase very slightly each successive season and the low end and high end scores box increase as well.

E)

mlr <- lm(total_points ~ FG_PCT_home + SEASON, data = data)
summary(mlr)
## 
## Call:
## lm(formula = total_points ~ FG_PCT_home + SEASON, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.193 -11.842  -0.806  11.223 101.661 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       129.7739     2.1564  60.180   <2e-16 ***
## FG_PCT_home       174.7002     4.5408  38.473   <2e-16 ***
## SEASON(2016,2017]   1.0934     0.6713   1.629    0.103    
## SEASON(2017,2018]  10.9137     0.6718  16.246   <2e-16 ***
## SEASON(2018,2019]  11.5327     0.7408  15.568   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.72 on 5125 degrees of freedom
## Multiple R-squared:  0.2751, Adjusted R-squared:  0.2745 
## F-statistic: 486.1 on 4 and 5125 DF,  p-value: < 2.2e-16
ggplot(data, aes(x=FG_PCT_home, y=total_points))+geom_point()+
  geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[3], slope = mlr$coefficients[2], color="red", lwd=1) + geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[4], slope = mlr$coefficients[2], color="blue", lwd=1) + geom_abline(intercept = mlr$coefficients[1] + mlr$coefficients[5], slope = mlr$coefficients[2], color="green", lwd=1)

2016-17: \(y = 130.86 + 174.70x\)
2017-18: \(y = 140.68 + 174.70x\)
2018-19: \(y = 141.30 + 174.70x\)

F)

model2 <- lm(total_points ~ FG_PCT_home * SEASON, data = data)
summary(model2)
## 
## Call:
## lm(formula = total_points ~ FG_PCT_home * SEASON, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.914 -11.826  -0.793  11.287 101.677 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    133.436      4.103  32.520   <2e-16 ***
## FG_PCT_home                    166.797      8.796  18.962   <2e-16 ***
## SEASON(2016,2017]               -5.099      5.792  -0.880    0.379    
## SEASON(2017,2018]                7.485      5.851   1.279    0.201    
## SEASON(2018,2019]                6.127      6.247   0.981    0.327    
## FG_PCT_home:SEASON(2016,2017]   13.342     12.390   1.077    0.282    
## FG_PCT_home:SEASON(2017,2018]    7.401     12.525   0.591    0.555    
## FG_PCT_home:SEASON(2018,2019]   11.661     13.377   0.872    0.383    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.72 on 5122 degrees of freedom
## Multiple R-squared:  0.2752, Adjusted R-squared:  0.2742 
## F-statistic: 277.9 on 7 and 5122 DF,  p-value: < 2.2e-16
ggplot(data, aes(x = FG_PCT_home, y = total_points)) + geom_point() +
  geom_abline(intercept = model2$coefficients[1] + model2$coefficients[3], slope = model2$coefficients[2] +model2$coefficients[6], color="red", lwd=1) + geom_abline(intercept = model2$coefficients[1] + model2$coefficients[4], slope = model2$coefficients[2] +model2$coefficients[7], color="blue", lwd=1) + geom_abline(intercept = model2$coefficients[1] + model2$coefficients[5], slope = model2$coefficients[2] +model2$coefficients[8], color="green", lwd=1)

G)

First model MSE:

mean(slm$residuals^2)
## [1] 341.8425

Second model MSE:

mean(model$residuals^2)
## [1] 404.1987

Third model MSE:

mean(mlr$residuals^2)
## [1] 313.6189

Fourth model MSE:

mean(model2$residuals^2)
## [1] 313.537

H)

This was an interesting exercise. My data set has many continuous variables that could be significant. It was interesting to see that season was significant. I believe this could be due to the increasing popularity of three point shots in NBA games.

library(leaps)
regfit.fwd<-regsubsets(total_points~., data=data,
                       method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(total_points ~ ., data = data, method = "forward")
## 31 Variables  (and intercept)
##                      Forced in Forced out
## SEASON(2016,2017]        FALSE      FALSE
## SEASON(2017,2018]        FALSE      FALSE
## SEASON(2018,2019]        FALSE      FALSE
## FG_PCT_home              FALSE      FALSE
## FT_PCT_home              FALSE      FALSE
## FG3_PCT_home             FALSE      FALSE
## AST_home                 FALSE      FALSE
## REB_home                 FALSE      FALSE
## FG_PCT_away              FALSE      FALSE
## FT_PCT_away              FALSE      FALSE
## FG3_PCT_away             FALSE      FALSE
## AST_away                 FALSE      FALSE
## REB_away                 FALSE      FALSE
## DEF.RTG.x                FALSE      FALSE
## DREB.x                   FALSE      FALSE
## DREB..x                  FALSE      FALSE
## STL.x                    FALSE      FALSE
## BLK.x                    FALSE      FALSE
## OPP.PTS.OFF.TOV.x        FALSE      FALSE
## OPP.PTS.2ND.CHANCE.x     FALSE      FALSE
## OPP.PTS.FB.x             FALSE      FALSE
## OPP.PTS.PAINT.x          FALSE      FALSE
## DEF.RTG.y                FALSE      FALSE
## DREB.y                   FALSE      FALSE
## DREB..y                  FALSE      FALSE
## STL.y                    FALSE      FALSE
## BLK.y                    FALSE      FALSE
## OPP.PTS.OFF.TOV.y        FALSE      FALSE
## OPP.PTS.2ND.CHANCE.y     FALSE      FALSE
## OPP.PTS.FB.y             FALSE      FALSE
## OPP.PTS.PAINT.y          FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          SEASON(2016,2017] SEASON(2017,2018] SEASON(2018,2019] FG_PCT_home
## 1  ( 1 ) " "               " "               " "               " "        
## 2  ( 1 ) " "               " "               " "               "*"        
## 3  ( 1 ) " "               " "               " "               "*"        
## 4  ( 1 ) " "               " "               " "               "*"        
## 5  ( 1 ) " "               " "               " "               "*"        
## 6  ( 1 ) " "               " "               " "               "*"        
## 7  ( 1 ) " "               " "               " "               "*"        
## 8  ( 1 ) " "               " "               " "               "*"        
##          FT_PCT_home FG3_PCT_home AST_home REB_home FG_PCT_away FT_PCT_away
## 1  ( 1 ) " "         " "          " "      " "      "*"         " "        
## 2  ( 1 ) " "         " "          " "      " "      "*"         " "        
## 3  ( 1 ) " "         " "          " "      "*"      "*"         " "        
## 4  ( 1 ) " "         " "          " "      "*"      "*"         " "        
## 5  ( 1 ) " "         " "          " "      "*"      "*"         "*"        
## 6  ( 1 ) "*"         " "          " "      "*"      "*"         "*"        
## 7  ( 1 ) "*"         " "          " "      "*"      "*"         "*"        
## 8  ( 1 ) "*"         " "          " "      "*"      "*"         "*"        
##          FG3_PCT_away AST_away REB_away DEF.RTG.x DREB.x DREB..x STL.x BLK.x
## 1  ( 1 ) " "          " "      " "      " "       " "    " "     " "   " "  
## 2  ( 1 ) " "          " "      " "      " "       " "    " "     " "   " "  
## 3  ( 1 ) " "          " "      " "      " "       " "    " "     " "   " "  
## 4  ( 1 ) " "          " "      "*"      " "       " "    " "     " "   " "  
## 5  ( 1 ) " "          " "      "*"      " "       " "    " "     " "   " "  
## 6  ( 1 ) " "          " "      "*"      " "       " "    " "     " "   " "  
## 7  ( 1 ) "*"          " "      "*"      " "       " "    " "     " "   " "  
## 8  ( 1 ) "*"          " "      "*"      " "       " "    " "     " "   " "  
##          OPP.PTS.OFF.TOV.x OPP.PTS.2ND.CHANCE.x OPP.PTS.FB.x OPP.PTS.PAINT.x
## 1  ( 1 ) " "               " "                  " "          " "            
## 2  ( 1 ) " "               " "                  " "          " "            
## 3  ( 1 ) " "               " "                  " "          " "            
## 4  ( 1 ) " "               " "                  " "          " "            
## 5  ( 1 ) " "               " "                  " "          " "            
## 6  ( 1 ) " "               " "                  " "          " "            
## 7  ( 1 ) " "               " "                  " "          " "            
## 8  ( 1 ) " "               " "                  " "          " "            
##          DEF.RTG.y DREB.y DREB..y STL.y BLK.y OPP.PTS.OFF.TOV.y
## 1  ( 1 ) " "       " "    " "     " "   " "   " "              
## 2  ( 1 ) " "       " "    " "     " "   " "   " "              
## 3  ( 1 ) " "       " "    " "     " "   " "   " "              
## 4  ( 1 ) " "       " "    " "     " "   " "   " "              
## 5  ( 1 ) " "       " "    " "     " "   " "   " "              
## 6  ( 1 ) " "       " "    " "     " "   " "   " "              
## 7  ( 1 ) " "       " "    " "     " "   " "   " "              
## 8  ( 1 ) " "       " "    " "     " "   " "   " "              
##          OPP.PTS.2ND.CHANCE.y OPP.PTS.FB.y OPP.PTS.PAINT.y
## 1  ( 1 ) " "                  " "          " "            
## 2  ( 1 ) " "                  " "          " "            
## 3  ( 1 ) " "                  " "          " "            
## 4  ( 1 ) " "                  " "          " "            
## 5  ( 1 ) " "                  " "          " "            
## 6  ( 1 ) " "                  " "          " "            
## 7  ( 1 ) " "                  " "          " "            
## 8  ( 1 ) " "                  " "          "*"
## 75% of the sample size
smp_size <- floor(0.75 * nrow(data))

## set the seed to make your partition reproducible
set.seed(1)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

train <- data[train_ind, ]
test <- data[-train_ind, ]
best_model <- lm(total_points ~ FG_PCT_away + FG_PCT_home + REB_home + REB_away + FT_PCT_away + FT_PCT_home + FG3_PCT_away + FG3_PCT_home, data = train)
summary(best_model)
## 
## Call:
## lm(formula = total_points ~ FG_PCT_away + FG_PCT_home + REB_home + 
##     REB_away + FT_PCT_away + FT_PCT_home + FG3_PCT_away + FG3_PCT_home, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.685  -7.366  -0.484   6.664  47.200 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -263.66386    5.36571  -49.14   <2e-16 ***
## FG_PCT_away   289.18803    4.34057   66.62   <2e-16 ***
## FG_PCT_home   280.02564    4.37867   63.95   <2e-16 ***
## REB_home        1.66041    0.03274   50.71   <2e-16 ***
## REB_away        1.61792    0.03308   48.92   <2e-16 ***
## FT_PCT_away    35.03222    1.70698   20.52   <2e-16 ***
## FT_PCT_home    33.97552    1.74915   19.42   <2e-16 ***
## FG3_PCT_away   30.81403    2.25986   13.63   <2e-16 ***
## FG3_PCT_home   27.11195    2.26966   11.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.72 on 3838 degrees of freedom
## Multiple R-squared:  0.7392, Adjusted R-squared:  0.7386 
## F-statistic:  1360 on 8 and 3838 DF,  p-value: < 2.2e-16
pred <- predict(best_model, test)
mean(pred)
## [1] 215.5694
mean(data$total_points)
## [1] 216.2887