5.0a Appendix: NBA Statistics and Salaries

library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyr)
library(leaps)
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa

Packages required to run R code and analysis.

salary<-read.csv("NBA_season1718_salary (1).csv", header=TRUE, 
                 stringsAsFactors = FALSE)
head(salary)
##   X         Player  Tm season17_18
## 1 1  Stephen Curry GSW    34682550
## 2 2   LeBron James CLE    33285709
## 3 3   Paul Millsap DEN    31269231
## 4 4 Gordon Hayward BOS    29727900
## 5 5  Blake Griffin DET    29512900
## 6 6     Kyle Lowry TOR    28703704
player_stats<-read.csv("nba_extra2.csv", header=TRUE, 
                       stringsAsFactors = FALSE)
head(player_stats)
##   Rk                 Player Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA
## 1 NA                             NA     NA NA   NA  NA  NA    NA  NA   NA
## 2  1 Alex Abrines:abrinal01  SG  24 OKC 75  8 1134 115 291 0.395  84  221
## 3 NA                             NA     NA NA   NA  NA  NA    NA  NA   NA
## 4  2     Quincy Acy:acyqu01  PF  27 BRK 70  8 1359 130 365 0.356 102  292
## 5 NA                             NA     NA NA   NA  NA  NA    NA  NA   NA
## 6  3 Steven Adams:adamsst01   C  24 OKC 76 76 2487 448 712 0.629   0    2
##    X3P. X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF
## 1    NA  NA   NA    NA    NA  NA  NA    NA  NA  NA  NA  NA  NA  NA  NA  NA
## 2 0.380  31   70 0.443 0.540  39  46 0.848  26  88 114  28  38   8  25 124
## 3    NA  NA   NA    NA    NA  NA  NA    NA  NA  NA  NA  NA  NA  NA  NA  NA
## 4 0.349  28   73 0.384 0.496  49  60 0.817  40 217 257  57  33  29  60 149
## 5    NA  NA   NA    NA    NA  NA  NA    NA  NA  NA  NA  NA  NA  NA  NA  NA
## 6 0.000 448  710 0.631 0.629 160 286 0.559 384 301 685  88  92  78 128 215
##    PTS
## 1   NA
## 2  353
## 3   NA
## 4  411
## 5   NA
## 6 1056
summary(player_stats)
##        Rk           Player              Pos                 Age       
##  Min.   :  1.0   Length:1328        Length:1328        Min.   :19.00  
##  1st Qu.:139.0   Class :character   Class :character   1st Qu.:23.00  
##  Median :266.5   Mode  :character   Mode  :character   Median :26.00  
##  Mean   :270.8                                         Mean   :26.19  
##  3rd Qu.:401.2                                         3rd Qu.:29.00  
##  Max.   :540.0                                         Max.   :41.00  
##  NA's   :664                                           NA's   :664    
##       Tm                  G               GS              MP        
##  Length:1328        Min.   : 1.00   Min.   : 0.00   Min.   :   1.0  
##  Class :character   1st Qu.:17.00   1st Qu.: 0.00   1st Qu.: 186.0  
##  Mode  :character   Median :46.00   Median : 4.00   Median : 755.0  
##                     Mean   :43.28   Mean   :19.71   Mean   : 972.9  
##                     3rd Qu.:71.00   3rd Qu.:35.00   3rd Qu.:1651.5  
##                     Max.   :82.00   Max.   :82.00   Max.   :3026.0  
##                     NA's   :664     NA's   :664     NA's   :664     
##        FG             FGA              FG.              X3P        
##  Min.   :  0.0   Min.   :   0.0   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.: 22.0   1st Qu.:  58.0   1st Qu.:0.3950   1st Qu.:  1.75  
##  Median :102.0   Median : 224.5   Median :0.4400   Median : 18.00  
##  Mean   :159.5   Mean   : 347.2   Mean   :0.4414   Mean   : 42.27  
##  3rd Qu.:253.0   3rd Qu.: 554.0   3rd Qu.:0.4930   3rd Qu.: 64.25  
##  Max.   :857.0   Max.   :1687.0   Max.   :1.0000   Max.   :265.00  
##  NA's   :664     NA's   :664      NA's   :668      NA's   :664     
##       X3PA            X3P.             X2P             X2PA       
##  Min.   :  0.0   Min.   :0.0000   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.:  7.0   1st Qu.:0.2500   1st Qu.: 15.0   1st Qu.:  33.0  
##  Median : 56.5   Median :0.3370   Median : 71.0   Median : 143.0  
##  Mean   :117.2   Mean   :0.3100   Mean   :117.2   Mean   : 230.0  
##  3rd Qu.:189.2   3rd Qu.:0.3795   3rd Qu.:181.2   3rd Qu.: 361.2  
##  Max.   :722.0   Max.   :1.0000   Max.   :725.0   Max.   :1361.0  
##  NA's   :664     NA's   :729      NA's   :664     NA's   :664     
##       X2P.             eFG.             FT              FTA        
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:0.4422   1st Qu.:0.458   1st Qu.:  8.00   1st Qu.: 12.00  
##  Median :0.4980   Median :0.506   Median : 37.50   Median : 51.00  
##  Mean   :0.4931   Mean   :0.498   Mean   : 66.93   Mean   : 87.19  
##  3rd Qu.:0.5468   3rd Qu.:0.551   3rd Qu.: 97.00   3rd Qu.:120.75  
##  Max.   :1.0000   Max.   :1.500   Max.   :624.00   Max.   :727.00  
##  NA's   :682      NA's   :668     NA's   :664      NA's   :664     
##       FT.              ORB              DRB             TRB        
##  Min.   :0.0000   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.:0.6670   1st Qu.:  5.00   1st Qu.: 22.0   1st Qu.:  29.0  
##  Median :0.7680   Median : 21.50   Median : 95.5   Median : 121.5  
##  Mean   :0.7411   Mean   : 39.01   Mean   :135.3   Mean   : 174.3  
##  3rd Qu.:0.8330   3rd Qu.: 53.00   3rd Qu.:208.0   3rd Qu.: 259.2  
##  Max.   :1.0000   Max.   :399.00   Max.   :848.0   Max.   :1247.0  
##  NA's   :722      NA's   :664      NA's   :664     NA's   :664     
##       AST              STL              BLK              TOV        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 11.75   1st Qu.:  5.00   1st Qu.:  2.00   1st Qu.:  8.00  
##  Median : 51.00   Median : 23.00   Median : 10.00   Median : 36.00  
##  Mean   : 93.18   Mean   : 31.15   Mean   : 19.01   Mean   : 55.01  
##  3rd Qu.:126.25   3rd Qu.: 47.00   3rd Qu.: 25.00   3rd Qu.: 86.00  
##  Max.   :820.00   Max.   :177.00   Max.   :193.00   Max.   :381.00  
##  NA's   :664      NA's   :664      NA's   :664      NA's   :664     
##        PF              PTS        
##  Min.   :  0.00   Min.   :   0.0  
##  1st Qu.: 16.75   1st Qu.:  59.0  
##  Median : 66.50   Median : 264.0  
##  Mean   : 79.91   Mean   : 428.1  
##  3rd Qu.:132.00   3rd Qu.: 667.2  
##  Max.   :285.00   Max.   :2251.0  
##  NA's   :664      NA's   :664

Loading and labeling of the two datasets that will be manipulated and joined to create the final dataset that will be used in the NBA Statistics and Salaries analysis.

player_stats2<-player_stats%>%
  filter(is.na(Rk)==FALSE)

head(player_stats2)
##   Rk                  Player Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA
## 1  1  Alex Abrines:abrinal01  SG  24 OKC 75  8 1134 115 291 0.395  84  221
## 2  2      Quincy Acy:acyqu01  PF  27 BRK 70  8 1359 130 365 0.356 102  292
## 3  3  Steven Adams:adamsst01   C  24 OKC 76 76 2487 448 712 0.629   0    2
## 4  4   Bam Adebayo:adebaba01   C  20 MIA 69 19 1368 174 340 0.512   0    7
## 5  5 Arron Afflalo:afflaar01  SG  32 ORL 53  3  682  65 162 0.401  27   70
## 6  6  Cole Aldrich:aldrico01   C  29 MIN 21  0   49   5  15 0.333   0    0
##    X3P. X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF
## 1 0.380  31   70 0.443 0.540  39  46 0.848  26  88 114  28  38   8  25 124
## 2 0.349  28   73 0.384 0.496  49  60 0.817  40 217 257  57  33  29  60 149
## 3 0.000 448  710 0.631 0.629 160 286 0.559 384 301 685  88  92  78 128 215
## 4 0.000 174  333 0.523 0.512 129 179 0.721 118 263 381 101  32  41  66 138
## 5 0.386  38   92 0.413 0.485  22  26 0.846   4  62  66  30   4   9  21  56
## 6    NA   5   15 0.333 0.333   2   6 0.333   3  12  15   3   2   1   1  11
##    PTS
## 1  353
## 2  411
## 3 1056
## 4  477
## 5  179
## 6   12
player_stats3 <- player_stats2[-c(24, 25, 28, 29, 45, 46, 59, 60, 68, 69, 70, 73, 74, 77, 78, 97, 98, 102, 103, 118, 119, 122, 123, 142, 143, 145, 146, 167, 168, 174, 175, 180, 181, 190, 191, 202, 203, 229, 230, 237, 238, 242, 243, 250, 251, 252, 259, 260, 266, 267, 276, 277, 288, 289, 299, 300, 306, 307, 314, 315, 319, 320, 328, 329, 331, 332, 342, 342, 343, 344, 368, 369, 386, 387, 401, 402, 419, 420, 425, 426, 427, 439, 440, 442, 443, 450, 451, 454, 455, 471, 472, 481, 482, 493, 494, 496, 497, 523, 524, 535, 536, 550, 551, 568, 569, 585, 586, 602, 603, 604, 606, 607, 610, 611, 623, 624, 642, 643, 650, 651, 660, 661),]
head (player_stats3)
##   Rk                  Player Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA
## 1  1  Alex Abrines:abrinal01  SG  24 OKC 75  8 1134 115 291 0.395  84  221
## 2  2      Quincy Acy:acyqu01  PF  27 BRK 70  8 1359 130 365 0.356 102  292
## 3  3  Steven Adams:adamsst01   C  24 OKC 76 76 2487 448 712 0.629   0    2
## 4  4   Bam Adebayo:adebaba01   C  20 MIA 69 19 1368 174 340 0.512   0    7
## 5  5 Arron Afflalo:afflaar01  SG  32 ORL 53  3  682  65 162 0.401  27   70
## 6  6  Cole Aldrich:aldrico01   C  29 MIN 21  0   49   5  15 0.333   0    0
##    X3P. X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF
## 1 0.380  31   70 0.443 0.540  39  46 0.848  26  88 114  28  38   8  25 124
## 2 0.349  28   73 0.384 0.496  49  60 0.817  40 217 257  57  33  29  60 149
## 3 0.000 448  710 0.631 0.629 160 286 0.559 384 301 685  88  92  78 128 215
## 4 0.000 174  333 0.523 0.512 129 179 0.721 118 263 381 101  32  41  66 138
## 5 0.386  38   92 0.413 0.485  22  26 0.846   4  62  66  30   4   9  21  56
## 6    NA   5   15 0.333 0.333   2   6 0.333   3  12  15   3   2   1   1  11
##    PTS
## 1  353
## 2  411
## 3 1056
## 4  477
## 5  179
## 6   12

Due to the format of the Kaggle data set “nba_extra2” accounting for certain players playing for multiple teams in one year (trades, free-agent signings,etc). Some players for example orginially had three rows or sets of oberservations: one for each team they played for and a total row (the two separate team stats together). These instances were manually identified and removed using this section of code. After this procedure, each NBA player only had one row/set of observations that accounts for his total statistics from that season.

playerS<-separate(data=player_stats3,col=Player,into=c("Player", "nickname"), sep=":")
head(playerS)
##   Rk        Player  nickname Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA
## 1  1  Alex Abrines abrinal01  SG  24 OKC 75  8 1134 115 291 0.395  84  221
## 2  2    Quincy Acy   acyqu01  PF  27 BRK 70  8 1359 130 365 0.356 102  292
## 3  3  Steven Adams adamsst01   C  24 OKC 76 76 2487 448 712 0.629   0    2
## 4  4   Bam Adebayo adebaba01   C  20 MIA 69 19 1368 174 340 0.512   0    7
## 5  5 Arron Afflalo afflaar01  SG  32 ORL 53  3  682  65 162 0.401  27   70
## 6  6  Cole Aldrich aldrico01   C  29 MIN 21  0   49   5  15 0.333   0    0
##    X3P. X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF
## 1 0.380  31   70 0.443 0.540  39  46 0.848  26  88 114  28  38   8  25 124
## 2 0.349  28   73 0.384 0.496  49  60 0.817  40 217 257  57  33  29  60 149
## 3 0.000 448  710 0.631 0.629 160 286 0.559 384 301 685  88  92  78 128 215
## 4 0.000 174  333 0.523 0.512 129 179 0.721 118 263 381 101  32  41  66 138
## 5 0.386  38   92 0.413 0.485  22  26 0.846   4  62  66  30   4   9  21  56
## 6    NA   5   15 0.333 0.333   2   6 0.333   3  12  15   3   2   1   1  11
##    PTS
## 1  353
## 2  411
## 3 1056
## 4  477
## 5  179
## 6   12

In this section of code the separate function is used to separate the “Player” and “nickname” by ‘:’ so the two datasets can be joined by the similar column of “Player” each player’s full name.

data<-left_join(playerS, salary)
## Joining, by = c("Player", "Tm")
data<-data%>%
  mutate(MPG = MP/G, PPG = PTS/G, APG = AST/G, RPG = TRB/G, TOG = TOV/G, GSR = GS/G)
data<-na.omit(data)
attach(data)

This section of code joins “playerS” and “salary” using the left_join function and creates the per game basis metrics using the mutata function. Na.omit works to removed any empty values (NAs) from the dataset.

data.pure <- data
data.pure$Player <- data.pure$nickname <- data.pure$Tm <- data.pure$Pos <- data.pure$Rk <- data.pure$X <- NULL

The final dataset titled data is copied and any non-numerical or unimportant variable is removed. This allows for no confusion in later outputs of pairs correlation plots and puts a focus on the statisitics that will be used in analysis.

stats_salary_cor <- data %>%
  select(season17_18, PTS, TOV, PPG, MPG, TOG, RPG, APG, GSR, GS, Age)
ggpairs(stats_salary_cor)

cor(stats_salary_cor)[,"season17_18"]
## season17_18         PTS         TOV         PPG         MPG         TOG 
##   1.0000000   0.5700031   0.5041297   0.6256246   0.5770201   0.5142743 
##         RPG         APG         GSR          GS         Age 
##   0.4696220   0.4472140   0.6165369   0.5814952   0.3400804

A pairs correlation plot is created to highlight any initial correlation betwen 2017-2018 season salaries adn certain basketball performance statistics.

### importance plot with Random Forest
set.seed(1)
train<-sample(1:nrow(data.pure), nrow(data.pure)/2)
NBA.test <- data.pure[-train, "season17_18"]
rf.NBA<-randomForest(season17_18~., data=data.pure, subset=train, 
                     importance=TRUE)
yhat.rf<-predict(rf.NBA, newdata=data.pure[-train,])
mean((yhat.rf-NBA.test)^2)
## [1] 2.384387e+13
importance(rf.NBA)
##          %IncMSE IncNodePurity
## Age  19.82939349  8.888248e+14
## G    -0.13743537  1.015118e+14
## GS    7.71891629  5.355915e+14
## MP    3.78119908  2.517207e+14
## FG    2.95021921  3.009064e+14
## FGA   2.49550348  1.631564e+14
## FG.   2.17875857  1.444834e+14
## X3P   1.93042097  1.524405e+14
## X3PA  1.58665840  1.760458e+14
## X3P.  0.02162664  1.774951e+14
## X2P   3.41344048  1.198227e+14
## X2PA  2.93144331  9.615173e+13
## X2P.  1.75340769  1.402969e+14
## eFG.  4.04310256  1.959106e+14
## FT    4.52118980  2.628892e+14
## FTA   3.34048057  2.111980e+14
## FT.  -0.73169010  1.012629e+14
## ORB   5.77684594  1.640142e+14
## DRB   3.23270186  2.494882e+14
## TRB   2.44718503  1.543428e+14
## AST   3.28501983  1.513149e+14
## STL   3.57376725  1.290821e+14
## BLK   3.67570702  2.060660e+14
## TOV   1.34628620  1.178515e+14
## PF    2.05348264  9.718817e+13
## PTS   4.72941165  3.838420e+14
## MPG  10.92719034  1.262266e+15
## PPG  12.52826764  1.349134e+15
## APG   2.68593451  3.051066e+14
## RPG   9.12615393  2.679921e+14
## TOG   3.44002007  1.690668e+14
## GSR  20.11974893  2.313002e+15
varImpPlot(rf.NBA)

Using the randomForest package, a training subset is taken from the data.pure dataset and applied to produce a variable importance plot (discussed in body of the report).

### tree with cross validation for pruning
set.seed(1)
train<-sample(1:nrow(data.pure), nrow(data.pure)/2)
NBA.test <- data.pure[-train, "season17_18"]
tree.NBA<-tree(season17_18~., data.pure, subset=train)
summary(tree.NBA)
## 
## Regression tree:
## tree(formula = season17_18 ~ ., data = data.pure, subset = train)
## Variables actually used in tree construction:
## [1] "GSR"  "Age"  "ORB"  "MP"   "PPG"  "TRB"  "X3P." "STL" 
## Number of terminal nodes:  11 
## Residual mean deviance:  1.003e+13 = 1.735e+15 / 173 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -10110000  -1884000   -454700         0   1388000   9705000
plot(tree.NBA)
text(tree.NBA, pretty = 0)

The same train subset is created and applied to the tree algorithim to create the recursive binary spltting diagram (discussed in body of the report).

cv.NBA<-cv.tree(tree.NBA)
plot(cv.NBA$size, cv.NBA$dev, type='b')

prune.NBA<-prune.tree(tree.NBA, best=4)
plot(prune.NBA)
text(prune.NBA, pretty=0)

Cross-validation for the number of terminal nodes(4) is performed and used in coordination with the prune.tree function is modify the tree plot.

mod1a <- lm(season17_18~MPG, data = data.pure)
anova(mod1a)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## MPG         1 7.0049e+15 7.0049e+15  182.69 < 2.2e-16 ***
## Residuals 366 1.4034e+16 3.8344e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.4034e+16

mod1b <- lm(season17_18~PPG, data = data.pure)
anova(mod1b)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15  235.39 < 2.2e-16 ***
## Residuals 366 1.2804e+16 3.4984e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2803e+16

mod1c <- lm(season17_18~Age, data = data.pure)
anova(mod1c)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Age         1 2.4332e+15 2.4332e+15  47.865 2.049e-11 ***
## Residuals 366 1.8605e+16 5.0834e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.8605e+16

mod1d <- lm(season17_18~GS, data = data.pure)
anova(mod1d)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## GS          1 7.1139e+15 7.1139e+15  186.98 < 2.2e-16 ***
## Residuals 366 1.3925e+16 3.8046e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.3925e+16

mod1e <- lm(season17_18~GSR, data = data.pure)
anova(mod1e)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## GSR         1 7.9972e+15 7.9972e+15  224.43 < 2.2e-16 ***
## Residuals 366 1.3041e+16 3.5632e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#RSS = 1.3041e+16

mod1e <- lm(season17_18~TOG, data = data.pure)
anova(mod1e)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## TOG         1 5.5643e+15 5.5643e+15  131.61 < 2.2e-16 ***
## Residuals 366 1.5474e+16 4.2280e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.5474e+16 

First step of forward selection: testing each explanatory variable in a simple regression format and comaring residual sum of squares (RSS). PPG determined to be the first variable in the model.

mod2a <- lm(season17_18~PPG + Age, data = data.pure)
anova(mod2a)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 278.828 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  68.547 2.367e-15 ***
## Residuals 365 1.0780e+16 2.9533e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.078e+16

mod2b <- lm(season17_18~PPG + GSR, data = data.pure)
anova(mod2b)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 255.170 < 2.2e-16 ***
## GSR         1 1.0250e+15 1.0250e+15  31.761 3.491e-08 ***
## Residuals 365 1.1779e+16 3.2271e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.1779e+16

mod2c <- lm(season17_18~PPG + MPG, data = data.pure)
anova(mod2c)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 236.264 <2e-16 ***
## MPG         1 8.2429e+13 8.2429e+13   2.365  0.125    
## Residuals 365 1.2722e+16 3.4854e+13                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2722e+16

mod2d <- lm(season17_18~PPG + GS, data = data.pure)
anova(mod2d)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 249.544 < 2.2e-16 ***
## GS          1 7.5942e+14 7.5942e+14  23.014 2.348e-06 ***
## Residuals 365 1.2045e+16 3.2999e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2045e+16

mod2e <- lm(season17_18~PPG + TOG, data = data.pure)
anova(mod2e)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 234.7687 <2e-16 ***
## TOG         1 1.3884e+12 1.3884e+12   0.0396 0.8424    
## Residuals 365 1.2803e+16 3.5076e+13                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant

Second step of forward selection: Age determined to be the second vairable in the model.

mod3a <- lm(season17_18~PPG + Age + GSR, data = data.pure)
anova(mod3a)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 302.999 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  74.489 < 2.2e-16 ***
## GSR         1 8.8708e+14 8.8708e+14  32.641 2.308e-08 ***
## Residuals 364 9.8925e+15 2.7177e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#RSS = 9.8925e+15

mod3b <- lm(season17_18~PPG + Age + MPG, data = data.pure)
anova(mod3b)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 279.5529 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  68.7252  2.21e-15 ***
## MPG         1 5.7407e+13 5.7407e+13   1.9489    0.1636    
## Residuals 364 1.0722e+16 2.9456e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MPG not significant

mod3c <- lm(season17_18~PPG + Age + GS, data = data.pure)
anova(mod3c)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 296.532 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  72.899 3.760e-16 ***
## GS          1 6.7135e+14 6.7135e+14  24.175 1.333e-06 ***
## Residuals 364 1.0108e+16 2.7770e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.0108e+16

mod3d <- lm(season17_18~PPG + Age + TOG, data = data.pure)
anova(mod3d)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 278.1187 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  68.3727 2.569e-15 ***
## TOG         1 2.1150e+12 2.1150e+12   0.0714    0.7894    
## Residuals 364 1.0777e+16 2.9608e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant

Third step of forward selection: GSR is determined to be the third variable in the model.

mod4a <- lm(season17_18~PPG + Age + GSR + MPG, data = data.pure)
anova(mod4a)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 305.1463 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  75.0171 < 2.2e-16 ***
## GSR         1 8.8708e+14 8.8708e+14  32.8719 2.073e-08 ***
## MPG         1 9.6612e+13 9.6612e+13   3.5801   0.05927 .  
## Residuals 363 9.7959e+15 2.6986e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MPG not significant

mod4b <- lm(season17_18~PPG + Age + GSR + GS, data = data.pure)
anova(mod4b)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 302.1710 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  74.2857 < 2.2e-16 ***
## GSR         1 8.8708e+14 8.8708e+14  32.5514 2.412e-08 ***
## GS          1 1.6006e+11 1.6006e+11   0.0059     0.939    
## Residuals 363 9.8923e+15 2.7252e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GS not significant

mod4c <- lm(season17_18~PPG + Age + GSR + TOG, data = data.pure)
anova(mod4c)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 302.2270 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  74.2994 < 2.2e-16 ***
## GSR         1 8.8708e+14 8.8708e+14  32.5574 2.405e-08 ***
## TOG         1 1.9904e+12 1.9904e+12   0.0731    0.7871    
## Residuals 363 9.8905e+15 2.7247e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant 

Fourth step of forward selection: No other variables determined to be significant

set.seed(52)
data[sample(nrow(data),1),]
##      Rk       Player  nickname Pos Age  Tm  G GS   MP  FG FGA   FG. X3P
## 116 114 Allen Crabbe crabbal01  SG  25 BRK 75 68 2197 337 827 0.407 201
##     X3PA  X3P. X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK
## 116  532 0.378 136  295 0.461 0.529 115 135 0.852  28 296 324 117  47  35
##     TOV  PF PTS  X season17_18      MPG  PPG  APG  RPG  TOG       GSR
## 116  78 168 990 44    19332500 29.29333 13.2 1.56 4.32 1.04 0.9066667
final.model <- lm(season17_18~PPG + Age +GSR, data = data.pure)
anova(final.model)
## Analysis of Variance Table
## 
## Response: season17_18
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## PPG         1 8.2346e+15 8.2346e+15 302.999 < 2.2e-16 ***
## Age         1 2.0244e+15 2.0244e+15  74.489 < 2.2e-16 ***
## GSR         1 8.8708e+14 8.8708e+14  32.641 2.308e-08 ***
## Residuals 364 9.8925e+15 2.7177e+13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new.data<- data.frame(PPG = 13.2, Age = 25, GSR = 1)

final.test <- predict(final.model, new.data)
final.test
##        1 
## 11381341

This code takes a random sample for one row/player from the data set and uses the predict function to input Allen Crabbe’s stats to createsalary output.

5.0b Appendix: College/International and NBA Statistics

firstQT<-data%>%
  filter(season17_18<1465920)
dim(firstQT)
## [1] 66 39
set.seed(1)
samp1<-sample(121, 10)
sampFirst<-firstQT[samp1,]

secondQT<-data%>%
  filter(season17_18>=1465920 & season17_18<3028410)
dim(secondQT)
## [1] 99 39
set.seed(2)
samp2<-sample(122, 10)
sampSecond<-secondQT[samp2,]

thirdQT<-data%>%
  filter(season17_18>=3028410 & season17_18<9539057)
dim(thirdQT)
## [1] 99 39
set.seed(3)
samp3<-sample(122, 10)
sampThird<-thirdQT[samp3,]

fourthQT<-data%>%
  filter(season17_18>=9539097 & season17_18<=34682550)
dim(fourthQT)
## [1] 104  39
set.seed(4)
samp4<-sample(122, 10)
sampFourth<-fourthQT[samp4,]

This set of code performs a stratified sample based on the five number summary of the overall dataset (all the players’ salaries in the NBA during the 2017-2018 season) to create four stratas based on salary in dollar amounts. The results of each strata were used to separately research and create a dataframe with thses 40 players (10 from each strata). This new dataset will be used for regression analysis.

college.data <- read.csv("Stratified Sample_NBA stats_Sheet1.csv", header = TRUE,
                         stringsAsFactors = FALSE)
head(college.data)
##              Player Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA  X3P.
## 1 Willy Hernangomez   C  23 TOT 48  1  495  91 164 0.555   5   12 0.417
## 2         Ivan Rabb  PF  20 MEM 36  5  516  86 152 0.566   0    0    NA
## 3       David Nwaba  SG  25 CHI 70 21 1646 202 423 0.478  18   52 0.346
## 4   Tyler Cavanaugh  PF  23 ATL 39  1  518  67 152 0.441  32   89 0.360
## 5        Tyler Ulis  PG  22 PHO 71 43 1658 214 551 0.388  42  146 0.288
## 6    Ersan Ilyasova  PF  30 TOT 69 43 1729 277 613 0.452  91  253 0.360
##   X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF PTS
## 1  86  152 0.566 0.570  59  90 0.656  61 122 183  33  18  16  35  64 246
## 2  86  152 0.566 0.566  29  36 0.806  53 105 158  32  12  13  35  55 201
## 3 184  371 0.496 0.499 133 203 0.655  89 237 326 104  59  29  77 158 555
## 4  35   63 0.556 0.546  17  21 0.810  45  82 127  27   9   4  14  61 183
## 5 172  405 0.425 0.426  84 101 0.832  24 104 128 311  70   7 127 120 554
## 6 186  360 0.517 0.526 105 135 0.778 118 287 405  90  61  26  72 157 750
college<-read.csv("Stat Project_College_International Data_Sheet1.csv", header=TRUE, stringsAsFactors = FALSE)
head(college)
##              Player              Team cMPG cPPG cFGM cFGA  cFGp cAPG cRPG
## 1 Willy Hernangomez       Real Madrid 12.6 18.8  7.5 11.2 0.667  1.2 11.0
## 2         Ivan Rabb              Cal  32.6 14.0  4.9 10.1 0.484  1.5 10.5
## 3       David Nwaba          Cal Poly 28.0 12.5  4.4  9.5 0.462  3.5  6.3
## 4   Tyler Cavanaugh George Washington 32.2 18.3  5.7 12.7 0.448  2.0  8.4
## 5        Tyler Ulis          Kentucky 36.8 17.3  5.5 12.6 0.434  7.0  3.0
## 6    Ersan Ilyasova      FC Barcelona 29.0 10.7  4.0  8.0 0.500  0.7  7.6

Loading and labeling of the two data sets that will be used in the College and NBA statistics analysis.

pro.college <- left_join(college.data, college)
## Joining, by = "Player"
pro.college<-pro.college%>%
  mutate(MPG = MP/G, PPG = PTS/G, APG = AST/G, RPG = TRB/G, TOG = TOV/G)
pro.college <- na.omit(pro.college)
head(pro.college)
##              Player Pos Age  Tm  G GS   MP  FG FGA   FG. X3P X3PA  X3P.
## 1 Willy Hernangomez   C  23 TOT 48  1  495  91 164 0.555   5   12 0.417
## 3       David Nwaba  SG  25 CHI 70 21 1646 202 423 0.478  18   52 0.346
## 4   Tyler Cavanaugh  PF  23 ATL 39  1  518  67 152 0.441  32   89 0.360
## 5        Tyler Ulis  PG  22 PHO 71 43 1658 214 551 0.388  42  146 0.288
## 6    Ersan Ilyasova  PF  30 TOT 69 43 1729 277 613 0.452  91  253 0.360
## 7       Travis Wear  SF  27 LAL 17  0  228  25  72 0.347  17   47 0.362
##   X2P X2PA  X2P.  eFG.  FT FTA   FT. ORB DRB TRB AST STL BLK TOV  PF PTS
## 1  86  152 0.566 0.570  59  90 0.656  61 122 183  33  18  16  35  64 246
## 3 184  371 0.496 0.499 133 203 0.655  89 237 326 104  59  29  77 158 555
## 4  35   63 0.556 0.546  17  21 0.810  45  82 127  27   9   4  14  61 183
## 5 172  405 0.425 0.426  84 101 0.832  24 104 128 311  70   7 127 120 554
## 6 186  360 0.517 0.526 105 135 0.778 118 287 405  90  61  26  72 157 750
## 7   8   25 0.320 0.465   8   8 1.000   0  38  38   7   4   5   6  30  75
##                Team cMPG cPPG cFGM cFGA  cFGp cAPG cRPG      MPG       PPG
## 1       Real Madrid 12.6 18.8  7.5 11.2 0.667  1.2 11.0 10.31250  5.125000
## 3          Cal Poly 28.0 12.5  4.4  9.5 0.462  3.5  6.3 23.51429  7.928571
## 4 George Washington 32.2 18.3  5.7 12.7 0.448  2.0  8.4 13.28205  4.692308
## 5          Kentucky 36.8 17.3  5.5 12.6 0.434  7.0  3.0 23.35211  7.802817
## 6      FC Barcelona 29.0 10.7  4.0  8.0 0.500  0.7  7.6 25.05797 10.869565
## 7              UCLA 23.9  7.2  3.1  5.8 0.530  1.4  3.2 13.41176  4.411765
##         APG      RPG       TOG
## 1 0.6875000 3.812500 0.7291667
## 3 1.4857143 4.657143 1.1000000
## 4 0.6923077 3.256410 0.3589744
## 5 4.3802817 1.802817 1.7887324
## 6 1.3043478 5.869565 1.0434783
## 7 0.4117647 2.235294 0.3529412

This section of code joins “college.data” and “college” by the Player column using the left_join function and creates the per game basis metrics using the mutata function. Na.omit works to removed any empty values (NAs) from the dataset.

pro.college <- na.omit(pro.college)
pro.college.pure <- pro.college
pro.college.pure$Player <- pro.college.pure$Tm <- pro.college.pure$Team <- pro.college.pure$Pos <- NULL

The final dataset titled data is copied and any non-numerical or unimportant variable is removed. This allows for no confusion in later outputs of pairs correlation plots and puts a focus on the statisitics that will be used in analysis.

stats_college_cor <- pro.college %>%
  select(MPG, cMPG, cPPG, cFGA, cFGp, cAPG, cRPG, cFGM)
ggpairs(stats_college_cor)

cor(stats_college_cor)[,"MPG"]
##         MPG        cMPG        cPPG        cFGA        cFGp        cAPG 
##  1.00000000  0.15077490 -0.02568161  0.03141627 -0.34032515  0.39943917 
##        cRPG        cFGM 
## -0.23779916 -0.09621396

A pairs correlation plot is created to highlight any initial correlation betwen MGP in the 2017-2018 NBA season and certain basketball performance statistics from the college or international season prior to entering the league.

set.seed(2)
train<-sample(1:nrow(pro.college.pure), nrow(pro.college.pure)/2)
set.seed(2)
rf.College<-randomForest(MPG~cMPG+cRPG+cPPG+cAPG+cFGM+cFGA+cFGp, data=pro.college.pure, subset=train, 
                     importance=TRUE)
yhat.rf<-predict(rf.College, newdata=pro.college.pure[-train,])
mean((yhat.rf-NBA.test)^2)
## Warning in yhat.rf - NBA.test: longer object length is not a multiple of
## shorter object length
## [1] 9.713284e+13
importance(rf.College)
##         %IncMSE IncNodePurity
## cMPG  0.5627210     193.13778
## cRPG -0.3501000     146.28648
## cPPG  0.4079435     126.09444
## cAPG  5.9093233     296.70085
## cFGM -2.4516593      90.52356
## cFGA -1.5832037     127.53237
## cFGp -1.3091603     197.97435
varImpPlot(rf.College)

Using the randomForest package, a training subset is taken from the pro.college.pure dataset and applied to produce a variable importance plot (discussed in body of the report).

College.test <- data.pure[-train, "MPG"]
tree.College<-tree(MPG~cMPG+cRPG+cPPG+cAPG+cFGM+cFGA+cFGp, pro.college.pure, subset=train)
summary(tree.College)
## 
## Regression tree:
## tree(formula = MPG ~ cMPG + cRPG + cPPG + cAPG + cFGM + cFGA + 
##     cFGp, data = pro.college.pure, subset = train)
## Variables actually used in tree construction:
## [1] "cAPG"
## Number of terminal nodes:  2 
## Residual mean deviance:  58.38 = 875.7 / 15 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -14.380  -5.358  -1.667   0.000   6.364  11.350
plot(tree.College)
text(tree.College, pretty = 0)

cv.College<-cv.tree(tree.College)
plot(cv.College$size, cv.College$dev, type='b')

This code sets a seed for the train subset of the pro.college.pure dataset and uses that subset to apply the tree algorithim and plot. No pruning by cross-validation needed in this instance.

MOD1a <- lm(MPG~cRPG, data = pro.college.pure)
anova(MOD1a)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value Pr(>F)
## cRPG       1  139.32 139.322  1.9779  0.169
## Residuals 33 2324.44  70.438
# RSS = 2324.44

MOD1b <- lm(MPG~cAPG, data = pro.college.pure)
anova(MOD1b)
## Analysis of Variance Table
## 
## Response: MPG
##           Df Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.1  393.10  6.2648 0.01744 *
## Residuals 33 2070.7   62.75                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 2070.7

MOD1c <- lm(MPG~cFGA, data = pro.college.pure)
anova(MOD1c)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value Pr(>F)
## cFGA       1    2.43   2.432  0.0326 0.8578
## Residuals 33 2461.33  74.586
# RSS = 2440.96

MOD1d <- lm(MPG~cFGp, data = pro.college.pure)
anova(MOD1d)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## cFGp       1  285.36 285.356  4.3228 0.04545 *
## Residuals 33 2178.41  66.012                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 2178.41

MOD1e <- lm(MPG~cPPG, data = pro.college.pure)
anova(MOD1e)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value Pr(>F)
## cPPG       1    1.62   1.625  0.0218 0.8836
## Residuals 33 2462.14  74.610
# RSS = 2462.14

MOD1f <- lm(MPG~cMPG, data = pro.college.pure)
anova(MOD1f)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value Pr(>F)
## cMPG       1   56.01  56.009  0.7676 0.3873
## Residuals 33 2407.75  72.962
# RSS = 2407.75

First step of forward selection: testing each explanatory variable in a simple regression format and comaring residual sum of squares (RSS). cAPG determined to be the first variable in the model.

MOD2a <- lm(MPG~cAPG + cFGA, data = pro.college.pure)
anova(MOD2a)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.10  393.10  6.1743 0.01838 *
## cFGA       1   33.33   33.33  0.5236 0.47458  
## Residuals 32 2037.33   63.67                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2b <- lm(MPG~cAPG + cPPG, data = pro.college.pure)
anova(MOD2b)
## Analysis of Variance Table
## 
## Response: MPG
##           Df Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.1  393.10  6.1964 0.01819 *
## cPPG       1   40.6   40.60  0.6400 0.42961  
## Residuals 32 2030.1   63.44                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2c <- lm(MPG~cAPG + cFGM, data = pro.college.pure)
anova(MOD2c)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value Pr(>F)  
## cAPG       1  393.10  393.10  6.3531 0.0169 *
## cFGM       1   90.66   90.66  1.4652 0.2350  
## Residuals 32 1980.00   61.88                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2d <- lm(MPG~cAPG + cMPG, data = pro.college.pure)
anova(MOD2d)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.10  393.10  6.1678 0.01844 *
## cMPG       1   31.18   31.18  0.4893 0.48931  
## Residuals 32 2039.48   63.73                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2e <- lm(MPG~cAPG + cRPG, data = pro.college.pure)
anova(MOD2e)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.10  393.10  6.2956 0.01736 *
## cRPG       1   72.59   72.59  1.1625 0.28901  
## Residuals 32 1998.08   62.44                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2f <- lm(MPG~cAPG + cFGp, data = pro.college.pure)
anova(MOD2f)
## Analysis of Variance Table
## 
## Response: MPG
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## cAPG       1  393.10  393.10  6.4364 0.01625 *
## cFGp       1  116.31  116.31  1.9044 0.17716  
## Residuals 32 1954.36   61.07                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Second step of forward selection: No other variables determined to be significant.