library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyr)
library(leaps)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
Packages required to run R code and analysis.
salary<-read.csv("NBA_season1718_salary (1).csv", header=TRUE,
stringsAsFactors = FALSE)
head(salary)
## X Player Tm season17_18
## 1 1 Stephen Curry GSW 34682550
## 2 2 LeBron James CLE 33285709
## 3 3 Paul Millsap DEN 31269231
## 4 4 Gordon Hayward BOS 29727900
## 5 5 Blake Griffin DET 29512900
## 6 6 Kyle Lowry TOR 28703704
player_stats<-read.csv("nba_extra2.csv", header=TRUE,
stringsAsFactors = FALSE)
head(player_stats)
## Rk Player Pos Age Tm G GS MP FG FGA FG. X3P X3PA
## 1 NA NA NA NA NA NA NA NA NA NA
## 2 1 Alex Abrines:abrinal01 SG 24 OKC 75 8 1134 115 291 0.395 84 221
## 3 NA NA NA NA NA NA NA NA NA NA
## 4 2 Quincy Acy:acyqu01 PF 27 BRK 70 8 1359 130 365 0.356 102 292
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 3 Steven Adams:adamsst01 C 24 OKC 76 76 2487 448 712 0.629 0 2
## X3P. X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 0.380 31 70 0.443 0.540 39 46 0.848 26 88 114 28 38 8 25 124
## 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4 0.349 28 73 0.384 0.496 49 60 0.817 40 217 257 57 33 29 60 149
## 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6 0.000 448 710 0.631 0.629 160 286 0.559 384 301 685 88 92 78 128 215
## PTS
## 1 NA
## 2 353
## 3 NA
## 4 411
## 5 NA
## 6 1056
summary(player_stats)
## Rk Player Pos Age
## Min. : 1.0 Length:1328 Length:1328 Min. :19.00
## 1st Qu.:139.0 Class :character Class :character 1st Qu.:23.00
## Median :266.5 Mode :character Mode :character Median :26.00
## Mean :270.8 Mean :26.19
## 3rd Qu.:401.2 3rd Qu.:29.00
## Max. :540.0 Max. :41.00
## NA's :664 NA's :664
## Tm G GS MP
## Length:1328 Min. : 1.00 Min. : 0.00 Min. : 1.0
## Class :character 1st Qu.:17.00 1st Qu.: 0.00 1st Qu.: 186.0
## Mode :character Median :46.00 Median : 4.00 Median : 755.0
## Mean :43.28 Mean :19.71 Mean : 972.9
## 3rd Qu.:71.00 3rd Qu.:35.00 3rd Qu.:1651.5
## Max. :82.00 Max. :82.00 Max. :3026.0
## NA's :664 NA's :664 NA's :664
## FG FGA FG. X3P
## Min. : 0.0 Min. : 0.0 Min. :0.0000 Min. : 0.00
## 1st Qu.: 22.0 1st Qu.: 58.0 1st Qu.:0.3950 1st Qu.: 1.75
## Median :102.0 Median : 224.5 Median :0.4400 Median : 18.00
## Mean :159.5 Mean : 347.2 Mean :0.4414 Mean : 42.27
## 3rd Qu.:253.0 3rd Qu.: 554.0 3rd Qu.:0.4930 3rd Qu.: 64.25
## Max. :857.0 Max. :1687.0 Max. :1.0000 Max. :265.00
## NA's :664 NA's :664 NA's :668 NA's :664
## X3PA X3P. X2P X2PA
## Min. : 0.0 Min. :0.0000 Min. : 0.0 Min. : 0.0
## 1st Qu.: 7.0 1st Qu.:0.2500 1st Qu.: 15.0 1st Qu.: 33.0
## Median : 56.5 Median :0.3370 Median : 71.0 Median : 143.0
## Mean :117.2 Mean :0.3100 Mean :117.2 Mean : 230.0
## 3rd Qu.:189.2 3rd Qu.:0.3795 3rd Qu.:181.2 3rd Qu.: 361.2
## Max. :722.0 Max. :1.0000 Max. :725.0 Max. :1361.0
## NA's :664 NA's :729 NA's :664 NA's :664
## X2P. eFG. FT FTA
## Min. :0.0000 Min. :0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:0.4422 1st Qu.:0.458 1st Qu.: 8.00 1st Qu.: 12.00
## Median :0.4980 Median :0.506 Median : 37.50 Median : 51.00
## Mean :0.4931 Mean :0.498 Mean : 66.93 Mean : 87.19
## 3rd Qu.:0.5468 3rd Qu.:0.551 3rd Qu.: 97.00 3rd Qu.:120.75
## Max. :1.0000 Max. :1.500 Max. :624.00 Max. :727.00
## NA's :682 NA's :668 NA's :664 NA's :664
## FT. ORB DRB TRB
## Min. :0.0000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:0.6670 1st Qu.: 5.00 1st Qu.: 22.0 1st Qu.: 29.0
## Median :0.7680 Median : 21.50 Median : 95.5 Median : 121.5
## Mean :0.7411 Mean : 39.01 Mean :135.3 Mean : 174.3
## 3rd Qu.:0.8330 3rd Qu.: 53.00 3rd Qu.:208.0 3rd Qu.: 259.2
## Max. :1.0000 Max. :399.00 Max. :848.0 Max. :1247.0
## NA's :722 NA's :664 NA's :664 NA's :664
## AST STL BLK TOV
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 11.75 1st Qu.: 5.00 1st Qu.: 2.00 1st Qu.: 8.00
## Median : 51.00 Median : 23.00 Median : 10.00 Median : 36.00
## Mean : 93.18 Mean : 31.15 Mean : 19.01 Mean : 55.01
## 3rd Qu.:126.25 3rd Qu.: 47.00 3rd Qu.: 25.00 3rd Qu.: 86.00
## Max. :820.00 Max. :177.00 Max. :193.00 Max. :381.00
## NA's :664 NA's :664 NA's :664 NA's :664
## PF PTS
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 16.75 1st Qu.: 59.0
## Median : 66.50 Median : 264.0
## Mean : 79.91 Mean : 428.1
## 3rd Qu.:132.00 3rd Qu.: 667.2
## Max. :285.00 Max. :2251.0
## NA's :664 NA's :664
Loading and labeling of the two datasets that will be manipulated and joined to create the final dataset that will be used in the NBA Statistics and Salaries analysis.
player_stats2<-player_stats%>%
filter(is.na(Rk)==FALSE)
head(player_stats2)
## Rk Player Pos Age Tm G GS MP FG FGA FG. X3P X3PA
## 1 1 Alex Abrines:abrinal01 SG 24 OKC 75 8 1134 115 291 0.395 84 221
## 2 2 Quincy Acy:acyqu01 PF 27 BRK 70 8 1359 130 365 0.356 102 292
## 3 3 Steven Adams:adamsst01 C 24 OKC 76 76 2487 448 712 0.629 0 2
## 4 4 Bam Adebayo:adebaba01 C 20 MIA 69 19 1368 174 340 0.512 0 7
## 5 5 Arron Afflalo:afflaar01 SG 32 ORL 53 3 682 65 162 0.401 27 70
## 6 6 Cole Aldrich:aldrico01 C 29 MIN 21 0 49 5 15 0.333 0 0
## X3P. X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF
## 1 0.380 31 70 0.443 0.540 39 46 0.848 26 88 114 28 38 8 25 124
## 2 0.349 28 73 0.384 0.496 49 60 0.817 40 217 257 57 33 29 60 149
## 3 0.000 448 710 0.631 0.629 160 286 0.559 384 301 685 88 92 78 128 215
## 4 0.000 174 333 0.523 0.512 129 179 0.721 118 263 381 101 32 41 66 138
## 5 0.386 38 92 0.413 0.485 22 26 0.846 4 62 66 30 4 9 21 56
## 6 NA 5 15 0.333 0.333 2 6 0.333 3 12 15 3 2 1 1 11
## PTS
## 1 353
## 2 411
## 3 1056
## 4 477
## 5 179
## 6 12
player_stats3 <- player_stats2[-c(24, 25, 28, 29, 45, 46, 59, 60, 68, 69, 70, 73, 74, 77, 78, 97, 98, 102, 103, 118, 119, 122, 123, 142, 143, 145, 146, 167, 168, 174, 175, 180, 181, 190, 191, 202, 203, 229, 230, 237, 238, 242, 243, 250, 251, 252, 259, 260, 266, 267, 276, 277, 288, 289, 299, 300, 306, 307, 314, 315, 319, 320, 328, 329, 331, 332, 342, 342, 343, 344, 368, 369, 386, 387, 401, 402, 419, 420, 425, 426, 427, 439, 440, 442, 443, 450, 451, 454, 455, 471, 472, 481, 482, 493, 494, 496, 497, 523, 524, 535, 536, 550, 551, 568, 569, 585, 586, 602, 603, 604, 606, 607, 610, 611, 623, 624, 642, 643, 650, 651, 660, 661),]
head (player_stats3)
## Rk Player Pos Age Tm G GS MP FG FGA FG. X3P X3PA
## 1 1 Alex Abrines:abrinal01 SG 24 OKC 75 8 1134 115 291 0.395 84 221
## 2 2 Quincy Acy:acyqu01 PF 27 BRK 70 8 1359 130 365 0.356 102 292
## 3 3 Steven Adams:adamsst01 C 24 OKC 76 76 2487 448 712 0.629 0 2
## 4 4 Bam Adebayo:adebaba01 C 20 MIA 69 19 1368 174 340 0.512 0 7
## 5 5 Arron Afflalo:afflaar01 SG 32 ORL 53 3 682 65 162 0.401 27 70
## 6 6 Cole Aldrich:aldrico01 C 29 MIN 21 0 49 5 15 0.333 0 0
## X3P. X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF
## 1 0.380 31 70 0.443 0.540 39 46 0.848 26 88 114 28 38 8 25 124
## 2 0.349 28 73 0.384 0.496 49 60 0.817 40 217 257 57 33 29 60 149
## 3 0.000 448 710 0.631 0.629 160 286 0.559 384 301 685 88 92 78 128 215
## 4 0.000 174 333 0.523 0.512 129 179 0.721 118 263 381 101 32 41 66 138
## 5 0.386 38 92 0.413 0.485 22 26 0.846 4 62 66 30 4 9 21 56
## 6 NA 5 15 0.333 0.333 2 6 0.333 3 12 15 3 2 1 1 11
## PTS
## 1 353
## 2 411
## 3 1056
## 4 477
## 5 179
## 6 12
Due to the format of the Kaggle data set “nba_extra2” accounting for certain players playing for multiple teams in one year (trades, free-agent signings,etc). Some players for example orginially had three rows or sets of oberservations: one for each team they played for and a total row (the two separate team stats together). These instances were manually identified and removed using this section of code. After this procedure, each NBA player only had one row/set of observations that accounts for his total statistics from that season.
playerS<-separate(data=player_stats3,col=Player,into=c("Player", "nickname"), sep=":")
head(playerS)
## Rk Player nickname Pos Age Tm G GS MP FG FGA FG. X3P X3PA
## 1 1 Alex Abrines abrinal01 SG 24 OKC 75 8 1134 115 291 0.395 84 221
## 2 2 Quincy Acy acyqu01 PF 27 BRK 70 8 1359 130 365 0.356 102 292
## 3 3 Steven Adams adamsst01 C 24 OKC 76 76 2487 448 712 0.629 0 2
## 4 4 Bam Adebayo adebaba01 C 20 MIA 69 19 1368 174 340 0.512 0 7
## 5 5 Arron Afflalo afflaar01 SG 32 ORL 53 3 682 65 162 0.401 27 70
## 6 6 Cole Aldrich aldrico01 C 29 MIN 21 0 49 5 15 0.333 0 0
## X3P. X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF
## 1 0.380 31 70 0.443 0.540 39 46 0.848 26 88 114 28 38 8 25 124
## 2 0.349 28 73 0.384 0.496 49 60 0.817 40 217 257 57 33 29 60 149
## 3 0.000 448 710 0.631 0.629 160 286 0.559 384 301 685 88 92 78 128 215
## 4 0.000 174 333 0.523 0.512 129 179 0.721 118 263 381 101 32 41 66 138
## 5 0.386 38 92 0.413 0.485 22 26 0.846 4 62 66 30 4 9 21 56
## 6 NA 5 15 0.333 0.333 2 6 0.333 3 12 15 3 2 1 1 11
## PTS
## 1 353
## 2 411
## 3 1056
## 4 477
## 5 179
## 6 12
In this section of code the separate function is used to separate the “Player” and “nickname” by ‘:’ so the two datasets can be joined by the similar column of “Player” each player’s full name.
data<-left_join(playerS, salary)
## Joining, by = c("Player", "Tm")
data<-data%>%
mutate(MPG = MP/G, PPG = PTS/G, APG = AST/G, RPG = TRB/G, TOG = TOV/G, GSR = GS/G)
data<-na.omit(data)
attach(data)
This section of code joins “playerS” and “salary” using the left_join function and creates the per game basis metrics using the mutata function. Na.omit works to removed any empty values (NAs) from the dataset.
data.pure <- data
data.pure$Player <- data.pure$nickname <- data.pure$Tm <- data.pure$Pos <- data.pure$Rk <- data.pure$X <- NULL
The final dataset titled data is copied and any non-numerical or unimportant variable is removed. This allows for no confusion in later outputs of pairs correlation plots and puts a focus on the statisitics that will be used in analysis.
stats_salary_cor <- data %>%
select(season17_18, PTS, TOV, PPG, MPG, TOG, RPG, APG, GSR, GS, Age)
ggpairs(stats_salary_cor)
cor(stats_salary_cor)[,"season17_18"]
## season17_18 PTS TOV PPG MPG TOG
## 1.0000000 0.5700031 0.5041297 0.6256246 0.5770201 0.5142743
## RPG APG GSR GS Age
## 0.4696220 0.4472140 0.6165369 0.5814952 0.3400804
A pairs correlation plot is created to highlight any initial correlation betwen 2017-2018 season salaries adn certain basketball performance statistics.
### importance plot with Random Forest
set.seed(1)
train<-sample(1:nrow(data.pure), nrow(data.pure)/2)
NBA.test <- data.pure[-train, "season17_18"]
rf.NBA<-randomForest(season17_18~., data=data.pure, subset=train,
importance=TRUE)
yhat.rf<-predict(rf.NBA, newdata=data.pure[-train,])
mean((yhat.rf-NBA.test)^2)
## [1] 2.384387e+13
importance(rf.NBA)
## %IncMSE IncNodePurity
## Age 19.82939349 8.888248e+14
## G -0.13743537 1.015118e+14
## GS 7.71891629 5.355915e+14
## MP 3.78119908 2.517207e+14
## FG 2.95021921 3.009064e+14
## FGA 2.49550348 1.631564e+14
## FG. 2.17875857 1.444834e+14
## X3P 1.93042097 1.524405e+14
## X3PA 1.58665840 1.760458e+14
## X3P. 0.02162664 1.774951e+14
## X2P 3.41344048 1.198227e+14
## X2PA 2.93144331 9.615173e+13
## X2P. 1.75340769 1.402969e+14
## eFG. 4.04310256 1.959106e+14
## FT 4.52118980 2.628892e+14
## FTA 3.34048057 2.111980e+14
## FT. -0.73169010 1.012629e+14
## ORB 5.77684594 1.640142e+14
## DRB 3.23270186 2.494882e+14
## TRB 2.44718503 1.543428e+14
## AST 3.28501983 1.513149e+14
## STL 3.57376725 1.290821e+14
## BLK 3.67570702 2.060660e+14
## TOV 1.34628620 1.178515e+14
## PF 2.05348264 9.718817e+13
## PTS 4.72941165 3.838420e+14
## MPG 10.92719034 1.262266e+15
## PPG 12.52826764 1.349134e+15
## APG 2.68593451 3.051066e+14
## RPG 9.12615393 2.679921e+14
## TOG 3.44002007 1.690668e+14
## GSR 20.11974893 2.313002e+15
varImpPlot(rf.NBA)
Using the randomForest package, a training subset is taken from the data.pure dataset and applied to produce a variable importance plot (discussed in body of the report).
### tree with cross validation for pruning
set.seed(1)
train<-sample(1:nrow(data.pure), nrow(data.pure)/2)
NBA.test <- data.pure[-train, "season17_18"]
tree.NBA<-tree(season17_18~., data.pure, subset=train)
summary(tree.NBA)
##
## Regression tree:
## tree(formula = season17_18 ~ ., data = data.pure, subset = train)
## Variables actually used in tree construction:
## [1] "GSR" "Age" "ORB" "MP" "PPG" "TRB" "X3P." "STL"
## Number of terminal nodes: 11
## Residual mean deviance: 1.003e+13 = 1.735e+15 / 173
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10110000 -1884000 -454700 0 1388000 9705000
plot(tree.NBA)
text(tree.NBA, pretty = 0)
The same train subset is created and applied to the tree algorithim to create the recursive binary spltting diagram (discussed in body of the report).
cv.NBA<-cv.tree(tree.NBA)
plot(cv.NBA$size, cv.NBA$dev, type='b')
prune.NBA<-prune.tree(tree.NBA, best=4)
plot(prune.NBA)
text(prune.NBA, pretty=0)
Cross-validation for the number of terminal nodes(4) is performed and used in coordination with the prune.tree function is modify the tree plot.
mod1a <- lm(season17_18~MPG, data = data.pure)
anova(mod1a)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## MPG 1 7.0049e+15 7.0049e+15 182.69 < 2.2e-16 ***
## Residuals 366 1.4034e+16 3.8344e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.4034e+16
mod1b <- lm(season17_18~PPG, data = data.pure)
anova(mod1b)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 235.39 < 2.2e-16 ***
## Residuals 366 1.2804e+16 3.4984e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2803e+16
mod1c <- lm(season17_18~Age, data = data.pure)
anova(mod1c)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 2.4332e+15 2.4332e+15 47.865 2.049e-11 ***
## Residuals 366 1.8605e+16 5.0834e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.8605e+16
mod1d <- lm(season17_18~GS, data = data.pure)
anova(mod1d)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## GS 1 7.1139e+15 7.1139e+15 186.98 < 2.2e-16 ***
## Residuals 366 1.3925e+16 3.8046e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.3925e+16
mod1e <- lm(season17_18~GSR, data = data.pure)
anova(mod1e)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## GSR 1 7.9972e+15 7.9972e+15 224.43 < 2.2e-16 ***
## Residuals 366 1.3041e+16 3.5632e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#RSS = 1.3041e+16
mod1e <- lm(season17_18~TOG, data = data.pure)
anova(mod1e)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## TOG 1 5.5643e+15 5.5643e+15 131.61 < 2.2e-16 ***
## Residuals 366 1.5474e+16 4.2280e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.5474e+16
First step of forward selection: testing each explanatory variable in a simple regression format and comaring residual sum of squares (RSS). PPG determined to be the first variable in the model.
mod2a <- lm(season17_18~PPG + Age, data = data.pure)
anova(mod2a)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 278.828 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 68.547 2.367e-15 ***
## Residuals 365 1.0780e+16 2.9533e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.078e+16
mod2b <- lm(season17_18~PPG + GSR, data = data.pure)
anova(mod2b)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 255.170 < 2.2e-16 ***
## GSR 1 1.0250e+15 1.0250e+15 31.761 3.491e-08 ***
## Residuals 365 1.1779e+16 3.2271e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.1779e+16
mod2c <- lm(season17_18~PPG + MPG, data = data.pure)
anova(mod2c)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 236.264 <2e-16 ***
## MPG 1 8.2429e+13 8.2429e+13 2.365 0.125
## Residuals 365 1.2722e+16 3.4854e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2722e+16
mod2d <- lm(season17_18~PPG + GS, data = data.pure)
anova(mod2d)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 249.544 < 2.2e-16 ***
## GS 1 7.5942e+14 7.5942e+14 23.014 2.348e-06 ***
## Residuals 365 1.2045e+16 3.2999e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.2045e+16
mod2e <- lm(season17_18~PPG + TOG, data = data.pure)
anova(mod2e)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 234.7687 <2e-16 ***
## TOG 1 1.3884e+12 1.3884e+12 0.0396 0.8424
## Residuals 365 1.2803e+16 3.5076e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant
Second step of forward selection: Age determined to be the second vairable in the model.
mod3a <- lm(season17_18~PPG + Age + GSR, data = data.pure)
anova(mod3a)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 302.999 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 74.489 < 2.2e-16 ***
## GSR 1 8.8708e+14 8.8708e+14 32.641 2.308e-08 ***
## Residuals 364 9.8925e+15 2.7177e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#RSS = 9.8925e+15
mod3b <- lm(season17_18~PPG + Age + MPG, data = data.pure)
anova(mod3b)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 279.5529 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 68.7252 2.21e-15 ***
## MPG 1 5.7407e+13 5.7407e+13 1.9489 0.1636
## Residuals 364 1.0722e+16 2.9456e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MPG not significant
mod3c <- lm(season17_18~PPG + Age + GS, data = data.pure)
anova(mod3c)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 296.532 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 72.899 3.760e-16 ***
## GS 1 6.7135e+14 6.7135e+14 24.175 1.333e-06 ***
## Residuals 364 1.0108e+16 2.7770e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 1.0108e+16
mod3d <- lm(season17_18~PPG + Age + TOG, data = data.pure)
anova(mod3d)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 278.1187 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 68.3727 2.569e-15 ***
## TOG 1 2.1150e+12 2.1150e+12 0.0714 0.7894
## Residuals 364 1.0777e+16 2.9608e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant
Third step of forward selection: GSR is determined to be the third variable in the model.
mod4a <- lm(season17_18~PPG + Age + GSR + MPG, data = data.pure)
anova(mod4a)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 305.1463 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 75.0171 < 2.2e-16 ***
## GSR 1 8.8708e+14 8.8708e+14 32.8719 2.073e-08 ***
## MPG 1 9.6612e+13 9.6612e+13 3.5801 0.05927 .
## Residuals 363 9.7959e+15 2.6986e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MPG not significant
mod4b <- lm(season17_18~PPG + Age + GSR + GS, data = data.pure)
anova(mod4b)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 302.1710 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 74.2857 < 2.2e-16 ***
## GSR 1 8.8708e+14 8.8708e+14 32.5514 2.412e-08 ***
## GS 1 1.6006e+11 1.6006e+11 0.0059 0.939
## Residuals 363 9.8923e+15 2.7252e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GS not significant
mod4c <- lm(season17_18~PPG + Age + GSR + TOG, data = data.pure)
anova(mod4c)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 302.2270 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 74.2994 < 2.2e-16 ***
## GSR 1 8.8708e+14 8.8708e+14 32.5574 2.405e-08 ***
## TOG 1 1.9904e+12 1.9904e+12 0.0731 0.7871
## Residuals 363 9.8905e+15 2.7247e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TOG not significant
Fourth step of forward selection: No other variables determined to be significant
set.seed(52)
data[sample(nrow(data),1),]
## Rk Player nickname Pos Age Tm G GS MP FG FGA FG. X3P
## 116 114 Allen Crabbe crabbal01 SG 25 BRK 75 68 2197 337 827 0.407 201
## X3PA X3P. X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK
## 116 532 0.378 136 295 0.461 0.529 115 135 0.852 28 296 324 117 47 35
## TOV PF PTS X season17_18 MPG PPG APG RPG TOG GSR
## 116 78 168 990 44 19332500 29.29333 13.2 1.56 4.32 1.04 0.9066667
final.model <- lm(season17_18~PPG + Age +GSR, data = data.pure)
anova(final.model)
## Analysis of Variance Table
##
## Response: season17_18
## Df Sum Sq Mean Sq F value Pr(>F)
## PPG 1 8.2346e+15 8.2346e+15 302.999 < 2.2e-16 ***
## Age 1 2.0244e+15 2.0244e+15 74.489 < 2.2e-16 ***
## GSR 1 8.8708e+14 8.8708e+14 32.641 2.308e-08 ***
## Residuals 364 9.8925e+15 2.7177e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new.data<- data.frame(PPG = 13.2, Age = 25, GSR = 1)
final.test <- predict(final.model, new.data)
final.test
## 1
## 11381341
This code takes a random sample for one row/player from the data set and uses the predict function to input Allen Crabbe’s stats to createsalary output.
firstQT<-data%>%
filter(season17_18<1465920)
dim(firstQT)
## [1] 66 39
set.seed(1)
samp1<-sample(121, 10)
sampFirst<-firstQT[samp1,]
secondQT<-data%>%
filter(season17_18>=1465920 & season17_18<3028410)
dim(secondQT)
## [1] 99 39
set.seed(2)
samp2<-sample(122, 10)
sampSecond<-secondQT[samp2,]
thirdQT<-data%>%
filter(season17_18>=3028410 & season17_18<9539057)
dim(thirdQT)
## [1] 99 39
set.seed(3)
samp3<-sample(122, 10)
sampThird<-thirdQT[samp3,]
fourthQT<-data%>%
filter(season17_18>=9539097 & season17_18<=34682550)
dim(fourthQT)
## [1] 104 39
set.seed(4)
samp4<-sample(122, 10)
sampFourth<-fourthQT[samp4,]
This set of code performs a stratified sample based on the five number summary of the overall dataset (all the players’ salaries in the NBA during the 2017-2018 season) to create four stratas based on salary in dollar amounts. The results of each strata were used to separately research and create a dataframe with thses 40 players (10 from each strata). This new dataset will be used for regression analysis.
college.data <- read.csv("Stratified Sample_NBA stats_Sheet1.csv", header = TRUE,
stringsAsFactors = FALSE)
head(college.data)
## Player Pos Age Tm G GS MP FG FGA FG. X3P X3PA X3P.
## 1 Willy Hernangomez C 23 TOT 48 1 495 91 164 0.555 5 12 0.417
## 2 Ivan Rabb PF 20 MEM 36 5 516 86 152 0.566 0 0 NA
## 3 David Nwaba SG 25 CHI 70 21 1646 202 423 0.478 18 52 0.346
## 4 Tyler Cavanaugh PF 23 ATL 39 1 518 67 152 0.441 32 89 0.360
## 5 Tyler Ulis PG 22 PHO 71 43 1658 214 551 0.388 42 146 0.288
## 6 Ersan Ilyasova PF 30 TOT 69 43 1729 277 613 0.452 91 253 0.360
## X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF PTS
## 1 86 152 0.566 0.570 59 90 0.656 61 122 183 33 18 16 35 64 246
## 2 86 152 0.566 0.566 29 36 0.806 53 105 158 32 12 13 35 55 201
## 3 184 371 0.496 0.499 133 203 0.655 89 237 326 104 59 29 77 158 555
## 4 35 63 0.556 0.546 17 21 0.810 45 82 127 27 9 4 14 61 183
## 5 172 405 0.425 0.426 84 101 0.832 24 104 128 311 70 7 127 120 554
## 6 186 360 0.517 0.526 105 135 0.778 118 287 405 90 61 26 72 157 750
college<-read.csv("Stat Project_College_International Data_Sheet1.csv", header=TRUE, stringsAsFactors = FALSE)
head(college)
## Player Team cMPG cPPG cFGM cFGA cFGp cAPG cRPG
## 1 Willy Hernangomez Real Madrid 12.6 18.8 7.5 11.2 0.667 1.2 11.0
## 2 Ivan Rabb Cal 32.6 14.0 4.9 10.1 0.484 1.5 10.5
## 3 David Nwaba Cal Poly 28.0 12.5 4.4 9.5 0.462 3.5 6.3
## 4 Tyler Cavanaugh George Washington 32.2 18.3 5.7 12.7 0.448 2.0 8.4
## 5 Tyler Ulis Kentucky 36.8 17.3 5.5 12.6 0.434 7.0 3.0
## 6 Ersan Ilyasova FC Barcelona 29.0 10.7 4.0 8.0 0.500 0.7 7.6
Loading and labeling of the two data sets that will be used in the College and NBA statistics analysis.
pro.college <- left_join(college.data, college)
## Joining, by = "Player"
pro.college<-pro.college%>%
mutate(MPG = MP/G, PPG = PTS/G, APG = AST/G, RPG = TRB/G, TOG = TOV/G)
pro.college <- na.omit(pro.college)
head(pro.college)
## Player Pos Age Tm G GS MP FG FGA FG. X3P X3PA X3P.
## 1 Willy Hernangomez C 23 TOT 48 1 495 91 164 0.555 5 12 0.417
## 3 David Nwaba SG 25 CHI 70 21 1646 202 423 0.478 18 52 0.346
## 4 Tyler Cavanaugh PF 23 ATL 39 1 518 67 152 0.441 32 89 0.360
## 5 Tyler Ulis PG 22 PHO 71 43 1658 214 551 0.388 42 146 0.288
## 6 Ersan Ilyasova PF 30 TOT 69 43 1729 277 613 0.452 91 253 0.360
## 7 Travis Wear SF 27 LAL 17 0 228 25 72 0.347 17 47 0.362
## X2P X2PA X2P. eFG. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF PTS
## 1 86 152 0.566 0.570 59 90 0.656 61 122 183 33 18 16 35 64 246
## 3 184 371 0.496 0.499 133 203 0.655 89 237 326 104 59 29 77 158 555
## 4 35 63 0.556 0.546 17 21 0.810 45 82 127 27 9 4 14 61 183
## 5 172 405 0.425 0.426 84 101 0.832 24 104 128 311 70 7 127 120 554
## 6 186 360 0.517 0.526 105 135 0.778 118 287 405 90 61 26 72 157 750
## 7 8 25 0.320 0.465 8 8 1.000 0 38 38 7 4 5 6 30 75
## Team cMPG cPPG cFGM cFGA cFGp cAPG cRPG MPG PPG
## 1 Real Madrid 12.6 18.8 7.5 11.2 0.667 1.2 11.0 10.31250 5.125000
## 3 Cal Poly 28.0 12.5 4.4 9.5 0.462 3.5 6.3 23.51429 7.928571
## 4 George Washington 32.2 18.3 5.7 12.7 0.448 2.0 8.4 13.28205 4.692308
## 5 Kentucky 36.8 17.3 5.5 12.6 0.434 7.0 3.0 23.35211 7.802817
## 6 FC Barcelona 29.0 10.7 4.0 8.0 0.500 0.7 7.6 25.05797 10.869565
## 7 UCLA 23.9 7.2 3.1 5.8 0.530 1.4 3.2 13.41176 4.411765
## APG RPG TOG
## 1 0.6875000 3.812500 0.7291667
## 3 1.4857143 4.657143 1.1000000
## 4 0.6923077 3.256410 0.3589744
## 5 4.3802817 1.802817 1.7887324
## 6 1.3043478 5.869565 1.0434783
## 7 0.4117647 2.235294 0.3529412
This section of code joins “college.data” and “college” by the Player column using the left_join function and creates the per game basis metrics using the mutata function. Na.omit works to removed any empty values (NAs) from the dataset.
pro.college <- na.omit(pro.college)
pro.college.pure <- pro.college
pro.college.pure$Player <- pro.college.pure$Tm <- pro.college.pure$Team <- pro.college.pure$Pos <- NULL
The final dataset titled data is copied and any non-numerical or unimportant variable is removed. This allows for no confusion in later outputs of pairs correlation plots and puts a focus on the statisitics that will be used in analysis.
stats_college_cor <- pro.college %>%
select(MPG, cMPG, cPPG, cFGA, cFGp, cAPG, cRPG, cFGM)
ggpairs(stats_college_cor)
cor(stats_college_cor)[,"MPG"]
## MPG cMPG cPPG cFGA cFGp cAPG
## 1.00000000 0.15077490 -0.02568161 0.03141627 -0.34032515 0.39943917
## cRPG cFGM
## -0.23779916 -0.09621396
A pairs correlation plot is created to highlight any initial correlation betwen MGP in the 2017-2018 NBA season and certain basketball performance statistics from the college or international season prior to entering the league.
set.seed(2)
train<-sample(1:nrow(pro.college.pure), nrow(pro.college.pure)/2)
set.seed(2)
rf.College<-randomForest(MPG~cMPG+cRPG+cPPG+cAPG+cFGM+cFGA+cFGp, data=pro.college.pure, subset=train,
importance=TRUE)
yhat.rf<-predict(rf.College, newdata=pro.college.pure[-train,])
mean((yhat.rf-NBA.test)^2)
## Warning in yhat.rf - NBA.test: longer object length is not a multiple of
## shorter object length
## [1] 9.713284e+13
importance(rf.College)
## %IncMSE IncNodePurity
## cMPG 0.5627210 193.13778
## cRPG -0.3501000 146.28648
## cPPG 0.4079435 126.09444
## cAPG 5.9093233 296.70085
## cFGM -2.4516593 90.52356
## cFGA -1.5832037 127.53237
## cFGp -1.3091603 197.97435
varImpPlot(rf.College)
Using the randomForest package, a training subset is taken from the pro.college.pure dataset and applied to produce a variable importance plot (discussed in body of the report).
College.test <- data.pure[-train, "MPG"]
tree.College<-tree(MPG~cMPG+cRPG+cPPG+cAPG+cFGM+cFGA+cFGp, pro.college.pure, subset=train)
summary(tree.College)
##
## Regression tree:
## tree(formula = MPG ~ cMPG + cRPG + cPPG + cAPG + cFGM + cFGA +
## cFGp, data = pro.college.pure, subset = train)
## Variables actually used in tree construction:
## [1] "cAPG"
## Number of terminal nodes: 2
## Residual mean deviance: 58.38 = 875.7 / 15
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.380 -5.358 -1.667 0.000 6.364 11.350
plot(tree.College)
text(tree.College, pretty = 0)
cv.College<-cv.tree(tree.College)
plot(cv.College$size, cv.College$dev, type='b')
This code sets a seed for the train subset of the pro.college.pure dataset and uses that subset to apply the tree algorithim and plot. No pruning by cross-validation needed in this instance.
MOD1a <- lm(MPG~cRPG, data = pro.college.pure)
anova(MOD1a)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cRPG 1 139.32 139.322 1.9779 0.169
## Residuals 33 2324.44 70.438
# RSS = 2324.44
MOD1b <- lm(MPG~cAPG, data = pro.college.pure)
anova(MOD1b)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.1 393.10 6.2648 0.01744 *
## Residuals 33 2070.7 62.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 2070.7
MOD1c <- lm(MPG~cFGA, data = pro.college.pure)
anova(MOD1c)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cFGA 1 2.43 2.432 0.0326 0.8578
## Residuals 33 2461.33 74.586
# RSS = 2440.96
MOD1d <- lm(MPG~cFGp, data = pro.college.pure)
anova(MOD1d)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cFGp 1 285.36 285.356 4.3228 0.04545 *
## Residuals 33 2178.41 66.012
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RSS = 2178.41
MOD1e <- lm(MPG~cPPG, data = pro.college.pure)
anova(MOD1e)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cPPG 1 1.62 1.625 0.0218 0.8836
## Residuals 33 2462.14 74.610
# RSS = 2462.14
MOD1f <- lm(MPG~cMPG, data = pro.college.pure)
anova(MOD1f)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cMPG 1 56.01 56.009 0.7676 0.3873
## Residuals 33 2407.75 72.962
# RSS = 2407.75
First step of forward selection: testing each explanatory variable in a simple regression format and comaring residual sum of squares (RSS). cAPG determined to be the first variable in the model.
MOD2a <- lm(MPG~cAPG + cFGA, data = pro.college.pure)
anova(MOD2a)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.10 393.10 6.1743 0.01838 *
## cFGA 1 33.33 33.33 0.5236 0.47458
## Residuals 32 2037.33 63.67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2b <- lm(MPG~cAPG + cPPG, data = pro.college.pure)
anova(MOD2b)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.1 393.10 6.1964 0.01819 *
## cPPG 1 40.6 40.60 0.6400 0.42961
## Residuals 32 2030.1 63.44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2c <- lm(MPG~cAPG + cFGM, data = pro.college.pure)
anova(MOD2c)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.10 393.10 6.3531 0.0169 *
## cFGM 1 90.66 90.66 1.4652 0.2350
## Residuals 32 1980.00 61.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2d <- lm(MPG~cAPG + cMPG, data = pro.college.pure)
anova(MOD2d)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.10 393.10 6.1678 0.01844 *
## cMPG 1 31.18 31.18 0.4893 0.48931
## Residuals 32 2039.48 63.73
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2e <- lm(MPG~cAPG + cRPG, data = pro.college.pure)
anova(MOD2e)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.10 393.10 6.2956 0.01736 *
## cRPG 1 72.59 72.59 1.1625 0.28901
## Residuals 32 1998.08 62.44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MOD2f <- lm(MPG~cAPG + cFGp, data = pro.college.pure)
anova(MOD2f)
## Analysis of Variance Table
##
## Response: MPG
## Df Sum Sq Mean Sq F value Pr(>F)
## cAPG 1 393.10 393.10 6.4364 0.01625 *
## cFGp 1 116.31 116.31 1.9044 0.17716
## Residuals 32 1954.36 61.07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Second step of forward selection: No other variables determined to be significant.