1 Executive Summary

  1. NBA player Salary based on their statistics

2 Introduction

3 Loading and Exploring Data

3.1 Loading libraries required

library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)

3.3 File description

3.3.1 players.csv

Source: https://data.world/datadavis/nba-salaries
It has primary key id, their birth date, birth place, career statistics, name, position

dim(players)
## [1] 4685   24
str(players)
## 'data.frame':    4685 obs. of  24 variables:
##  $ X_id       : chr  "abdelal01" "abdulza01" "abdulka01" "abdulma02" ...
##  $ birthDate  : chr  "June 24, 1968" "April 7, 1946" "April 16, 1947" "March 9, 1969" ...
##  $ birthPlace : chr  "Cairo, Egypt" "Brooklyn, New York" "New York, New York" "Gulfport, Mississippi" ...
##  $ career_AST : num  0.3 1.2 3.6 3.5 1.1 2.5 1.2 1 0.7 0.5 ...
##  $ career_FG. : chr  "50.2" "42.8" "55.9" "44.2" ...
##  $ career_FG3.: chr  "0.0" "" "5.6" "35.4" ...
##  $ career_FT. : chr  "70.1" "72.8" "72.1" "90.5" ...
##  $ career_G   : int  256 505 1560 586 236 830 319 1 56 174 ...
##  $ career_PER : chr  "13.0" "15.1" "24.6" "15.4" ...
##  $ career_PTS : num  5.7 9 24.6 14.6 7.8 18.1 5.6 0 9.5 5.3 ...
##  $ career_TRB : chr  "3.3" "8.0" "11.2" "1.9" ...
##  $ career_WS  : chr  "4.8" "17.5" "273.4" "25.2" ...
##  $ career_eFG.: chr  "50.2" "" "55.9" "47.2" ...
##  $ college    : chr  "Duke University" "Iowa State University" "University of California, Los Angeles" "Louisiana State University" ...
##  $ draft_pick : chr  "25th overall" "5th overall" "1st overall" "3rd overall" ...
##  $ draft_round: chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team : chr  "Portland Trail Blazers" "Cincinnati Royals" "Milwaukee Bucks" "Denver Nuggets" ...
##  $ draft_year : chr  "1990" "1968" "1969" "1990" ...
##  $ height     : chr  "6-10" "6-9" "7-2" "6-1" ...
##  $ highSchool : chr  "Bloomfield in Bloomfield, New Jersey" "John Jay in Brooklyn, New York" "Power Memorial in New York, New York" "Gulfport in Gulfport, Mississippi" ...
##  $ name       : chr  "Alaa Abdelnaby" "Zaid Abdul-Aziz" "Kareem Abdul-Jabbar" "Mahmoud Abdul-Rauf" ...
##  $ position   : chr  "Power Forward" "Power Forward and Center" "Center" "Point Guard" ...
##  $ shoots     : chr  "Right" "Right" "Right" "Right" ...
##  $ weight     : chr  "240lb" "235lb" "225lb" "162lb" ...

3.3.2 salaries_1985to2018.csv

Source: https://data.world/datadavis/nba-salaries
It record the player_id (as a foreign key to the players.csv), their salary, the season and the teams they played in.

dim(salaries)
## [1] 14163     7
str(salaries)
## 'data.frame':    14163 obs. of  7 variables:
##  $ league      : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ player_id   : chr  "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
##  $ salary      : int  395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
##  $ season      : chr  "1990-91" "1991-92" "1992-93" "1993-94" ...
##  $ season_end  : int  1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
##  $ season_start: int  1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
##  $ team        : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...

3.3.3 all_seasons.csv

Source: https://www.kaggle.com/datasets/justinas/nba-players-data
It record the player names, teams, age, and other statistics for each individual season from 1996 season to 2020 season. (the index in this table is not linked to the previous tables)

dim(Pstats)
## [1] 11700    22
str(Pstats)
## 'data.frame':    11700 obs. of  22 variables:
##  $ X                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ player_name      : chr  "Travis Knight" "Matt Fish" "Matt Bullard" "Marty Conlon" ...
##  $ team_abbreviation: chr  "LAL" "MIA" "HOU" "BOS" ...
##  $ age              : num  22 27 30 29 22 22 36 26 33 32 ...
##  $ player_height    : num  213 211 208 211 206 ...
##  $ player_weight    : num  107 107 107 111 107 ...
##  $ college          : chr  "Connecticut" "North Carolina-Wilmington" "Iowa" "Providence" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year       : chr  "1996" "1992" "Undrafted" "Undrafted" ...
##  $ draft_round      : chr  "1" "2" "Undrafted" "Undrafted" ...
##  $ draft_number     : chr  "29" "50" "Undrafted" "Undrafted" ...
##  $ gp               : int  71 6 71 74 42 9 70 31 70 82 ...
##  $ pts              : num  4.8 0.3 4.5 7.8 3.7 1.6 3.2 2 11.3 9.9 ...
##  $ reb              : num  4.5 0.8 1.6 4.4 1.6 0.7 2.7 1.2 2.6 4.8 ...
##  $ ast              : num  0.5 0 0.9 1.4 0.5 0.4 0.3 0 4.9 11.4 ...
##  $ net_rating       : num  6.2 -15.1 0.9 -9 -14.5 -3.5 3.5 -17.1 -3.1 -2 ...
##  $ oreb_pct         : num  0.127 0.143 0.016 0.083 0.109 0.087 0.092 0.109 0.023 0.035 ...
##  $ dreb_pct         : num  0.182 0.267 0.115 0.152 0.118 0.045 0.146 0.152 0.088 0.116 ...
##  $ usg_pct          : num  0.142 0.265 0.151 0.167 0.233 0.135 0.137 0.232 0.192 0.155 ...
##  $ ts_pct           : num  0.536 0.333 0.535 0.542 0.482 0.47 0.555 0.448 0.597 0.525 ...
##  $ ast_pct          : num  0.052 0 0.099 0.101 0.114 0.125 0.034 0.013 0.289 0.464 ...
##  $ season           : chr  "1996-97" "1996-97" "1996-97" "1996-97" ...

4 Preprossing data

4.1 Merge data tables

4.1.1 Merging players and salaries

Since it is from the same source and the two tables are linked by a primary key (players.X_id) and foreign key (salaries.player_id). I will link them with merge column X_id and player_id.

player_salary <- merge(players, salaries, by.x = "X_id", by.y = "player_id")
dim(player_salary)
## [1] 14163    30
str(player_salary)
## 'data.frame':    14163 obs. of  30 variables:
##  $ X_id        : chr  "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
##  $ birthDate   : chr  "June 24, 1968" "June 24, 1968" "June 24, 1968" "June 24, 1968" ...
##  $ birthPlace  : chr  "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" ...
##  $ career_AST  : num  0.3 0.3 0.3 0.3 0.3 3.6 3.6 3.6 3.6 3.5 ...
##  $ career_FG.  : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ career_FG3. : chr  "0.0" "0.0" "0.0" "0.0" ...
##  $ career_FT.  : chr  "70.1" "70.1" "70.1" "70.1" ...
##  $ career_G    : int  256 256 256 256 256 1560 1560 1560 1560 586 ...
##  $ career_PER  : chr  "13.0" "13.0" "13.0" "13.0" ...
##  $ career_PTS  : num  5.7 5.7 5.7 5.7 5.7 24.6 24.6 24.6 24.6 14.6 ...
##  $ career_TRB  : chr  "3.3" "3.3" "3.3" "3.3" ...
##  $ career_WS   : chr  "4.8" "4.8" "4.8" "4.8" ...
##  $ career_eFG. : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ college     : chr  "Duke University" "Duke University" "Duke University" "Duke University" ...
##  $ draft_pick  : chr  "25th overall" "25th overall" "25th overall" "25th overall" ...
##  $ draft_round : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team  : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" ...
##  $ draft_year  : chr  "1990" "1990" "1990" "1990" ...
##  $ height      : chr  "6-10" "6-10" "6-10" "6-10" ...
##  $ highSchool  : chr  "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" ...
##  $ name        : chr  "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" ...
##  $ position    : chr  "Power Forward" "Power Forward" "Power Forward" "Power Forward" ...
##  $ shoots      : chr  "Right" "Right" "Right" "Right" ...
##  $ weight      : chr  "240lb" "240lb" "240lb" "240lb" ...
##  $ league      : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ salary      : int  395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
##  $ season      : chr  "1990-91" "1991-92" "1992-93" "1993-94" ...
##  $ season_end  : int  1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
##  $ season_start: int  1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
##  $ team        : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...

4.2 Merging table from two source

teams <- unique(player_salary$team)
teamsABB <- c("POR", "BOS", "SAC", "LAL", "DEN", "VAN", "DAL",
    "ATL", "OKC", "DET", "LAC", "ORL", "HOU", "TOR", "NYK", "BKN",
    "PHX", "NJN", "MEM", "WAS", "CHA", "MIA", "GSW", "CHA", "MIN",
    "SAS", "NOP", "PHI", "WAS", "NOH", "CLE", "MIL", "CHI", "IND",
    "SEA", "UTA", "NOK", "None", "KCK")

teamABBTab <- data.frame(team_name = teams, team_ABB = teamsABB)

player_salary$team_abbreviation <- sapply(player_salary$team,
    function(x) teamABBTab$team_ABB[which(teamABBTab$team_name ==
        x)])
player_salary <- rename(player_salary, player_name = name)

Pstats_salary <- merge(player_salary, Pstats, by = c("player_name",
    "season", "team_abbreviation"))
dim(Pstats_salary)
## [1] 8722   50
str(Pstats_salary)
## 'data.frame':    8722 obs. of  50 variables:
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ season           : chr  "1996-97" "1997-98" "1998-99" "1999-00" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ career_AST       : num  1.1 1.1 1.1 1.1 1.1 0 1.8 1.8 1.8 3 ...
##  $ career_FG.       : chr  "49.4" "49.4" "49.4" "49.4" ...
##  $ career_FG3.      : chr  "25.4" "25.4" "25.4" "25.4" ...
##  $ career_FT.       : chr  "73.4" "73.4" "73.4" "73.4" ...
##  $ career_G         : int  1278 1278 1278 1278 1278 8 80 80 80 645 ...
##  $ career_PER       : chr  "14.4" "14.4" "14.4" "14.4" ...
##  $ career_PTS       : num  9.6 9.6 9.6 9.6 9.6 1 5.5 5.5 5.5 9.7 ...
##  $ career_TRB       : chr  "7.4" "7.4" "7.4" "7.4" ...
##  $ career_WS        : chr  "99.5" "99.5" "99.5" "99.5" ...
##  $ career_eFG.      : chr  "50.1" "50.1" "50.1" "50.1" ...
##  $ college.x        : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_pick       : chr  "23rd overall" "23rd overall" "23rd overall" "23rd overall" ...
##  $ draft_round.x    : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ draft_year.x     : chr  "1985" "1985" "1985" "1985" ...
##  $ height           : chr  "6-9" "6-9" "6-9" "6-9" ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : chr  "220lb" "220lb" "220lb" "220lb" ...
##  $ league           : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ season_end       : int  1997 1998 1999 2000 2001 2000 2001 2002 2003 2008 ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ team             : chr  "Dallas Mavericks" "Dallas Mavericks" "Dallas Mavericks" "Los Angeles Lakers" ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : num  33 34 35 36 37 23 23 24 25 23 ...
##  $ player_height    : num  206 206 206 206 206 ...
##  $ player_weight    : num  102 102 102 102 102 ...
##  $ college.y        : chr  "Oregon State" "Oregon State" "Oregon State" "Oregon State" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year.y     : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round.y    : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

4.3 saving new data table

write.csv(Pstats_salary, "dataset/all.csv")

4.3.1 Read in the saved table

all <- read.csv("dataset/all.csv")

4.4 Removing repeated or directly correlated variables

dim(all)
## [1] 8722   51
str(all)
## 'data.frame':    8722 obs. of  51 variables:
##  $ X.1              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ season           : chr  "1996-97" "1997-98" "1998-99" "1999-00" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ career_AST       : num  1.1 1.1 1.1 1.1 1.1 0 1.8 1.8 1.8 3 ...
##  $ career_FG.       : chr  "49.4" "49.4" "49.4" "49.4" ...
##  $ career_FG3.      : chr  "25.4" "25.4" "25.4" "25.4" ...
##  $ career_FT.       : chr  "73.4" "73.4" "73.4" "73.4" ...
##  $ career_G         : int  1278 1278 1278 1278 1278 8 80 80 80 645 ...
##  $ career_PER       : chr  "14.4" "14.4" "14.4" "14.4" ...
##  $ career_PTS       : num  9.6 9.6 9.6 9.6 9.6 1 5.5 5.5 5.5 9.7 ...
##  $ career_TRB       : num  7.4 7.4 7.4 7.4 7.4 2.8 1 1 1 1.7 ...
##  $ career_WS        : num  99.5 99.5 99.5 99.5 99.5 -0.2 0.3 0.3 0.3 19.2 ...
##  $ career_eFG.      : chr  "50.1" "50.1" "50.1" "50.1" ...
##  $ college.x        : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_pick       : chr  "23rd overall" "23rd overall" "23rd overall" "23rd overall" ...
##  $ draft_round.x    : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ draft_year.x     : int  1985 1985 1985 1985 1985 1999 2000 2000 2000 2007 ...
##  $ height           : chr  "6-9" "6-9" "6-9" "6-9" ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : chr  "220lb" "220lb" "220lb" "220lb" ...
##  $ league           : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ season_end       : int  1997 1998 1999 2000 2001 2000 2001 2002 2003 2008 ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ team             : chr  "Dallas Mavericks" "Dallas Mavericks" "Dallas Mavericks" "Los Angeles Lakers" ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : int  33 34 35 36 37 23 23 24 25 23 ...
##  $ player_height    : num  206 206 206 206 206 ...
##  $ player_weight    : num  102 102 102 102 102 ...
##  $ college.y        : chr  "Oregon State" "Oregon State" "Oregon State" "Oregon State" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year.y     : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round.y    : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

4.4.1 College

Since college.x and college.y means the same thing (the college they played in before NBA). I will remove college.y and rename college.x as college.

all <- all %>%
    select(!college.y) %>%
    rename(college = college.x)

4.4.2 Height and Weight

4.4.2.1 Height

There are height and player_height, the player_height is in cm while height is in feet. I will first convert feet to inches.

heightinch <- as.numeric(sapply(all$height, function(x) substring(x,
    1, 1))) * 12 + as.numeric(sapply(all$height, function(x) substring(x,
    3, nchar(x))))

height_cor <- cor(heightinch, all$player_height)

height_cor
## [1] 0.9944585

Since the height and player_height is highly correlated, I will remove player_height since the imperial unit system (feet and inches) is more widely used in the NBA.

all <- all %>%
    select(!player_height) %>%
    mutate(height = heightinch)

4.4.2.2 Weight

There are player_weight and weight, I will check if there are the same all the time. From the first column, since the difference is about 2.2 times, the player_weight should be in kg while weight is in lb.

weightlb <- as.numeric(sapply(all$weight, function(x) substring(x,
    1, nchar(x) - 2)))

weight_cor <- cor(weightlb, all$player_weight)
weight_cor
## [1] 0.9462853

Since the weight and player_weight is highly correlated (0.9462853), I will remove the player_weight as the imperial unit system (lb) is more widely used in the NBA.

all <- all %>%
    select(!player_weight) %>%
    mutate(weight = weightlb)

4.4.3 Drafting

range(all$draft_year.x[!is.na(all$draft_year.x)])
## [1] 1976 2017

From Wikipedia (https://en.wikipedia.org/wiki/NBA_draft), there are 10 draft rounds from 1976 to 1984, 7 draft rounds from 1985 to 1988, and 2 rounds from 1989 onward.

4.4.3.1 Draft Year

There is repeated draft year: draft_year.x and draft_year.y.

unique(all$draft_year.y[is.na(all$draft_year.x)])
## [1] "Undrafted" "1995"      "1992"      "1999"      "1994"      "1997"     
## [7] "1982"      "2011"
unique(all$draft_year.x[all$draft_year.y == "Undrafted"])
## [1] NA

As some NA in draft_year.x corresponds to some year in draft_year.y while all Undrafted in draft_year.y is NA in draft_year.x, I will remove draft_year.x and rename draft_year.y to draft_year.

all <- all %>%
    select(!draft_year.x) %>%
    rename(draft_year = draft_year.y)

4.4.3.2 Draft round

There are draft_round.x and draft_round.y

unique(all$draft_round.x)
## [1] "1st round" "2nd round" ""          "3rd round" "4th round" "8th round"
## [7] "7th round" "6th round"
unique(all$draft_round.y)
## [1] "1"         "2"         "Undrafted" "3"         "8"         "4"        
## [7] "7"         "6"

I will keep draft_round.y as the data is cleaner (“1” vs “1st round”) and I will change all third round or later to undrafted as there is only first round, second round and undrafted after 1989.

all <- all %>%
    select(!draft_round.x) %>%
    rename(draft_round = draft_round.y)

all$draft_round[which(!(all$draft_round %in% c("1", "2", "Undrafted")))] <- "Undrafted"

4.4.3.3 Draft number and Draft pick

unique(all$draft_number)
##  [1] "23"        "39"        "32"        "26"        "4"         "49"       
##  [7] "Undrafted" "17"        "58"        "11"        "10"        "3"        
## [13] "8"         "42"        "25"        "15"        "14"        "16"       
## [19] "29"        "12"        "60"        "5"         "45"        "20"       
## [25] "31"        "1"         "2"         "50"        "21"        "56"       
## [31] "47"        "9"         "30"        "46"        "35"        "24"       
## [37] "34"        "44"        "28"        "19"        "38"        "52"       
## [43] "53"        "6"         "36"        "27"        "18"        "51"       
## [49] "43"        "7"         "33"        "22"        "40"        "37"       
## [55] "41"        "13"        "48"        "165"       "75"        "55"       
## [61] "57"        "59"        "54"        "79"        "63"        "160"      
## [67] "120"       "127"       "78"
unique(all$draft_pick)
##  [1] "23rd overall"  "39th overall"  "32nd overall"  "26th overall" 
##  [5] "4th overall"   "49th overall"  ""              "17th overall" 
##  [9] "58th overall"  "11th overall"  "10th overall"  "3rd overall"  
## [13] "8th overall"   "42nd overall"  "25th overall"  "15th overall" 
## [17] "14th overall"  "16th overall"  "29th overall"  "12th overall" 
## [21] "60th overall"  "5th overall"   "45th overall"  "20th overall" 
## [25] "31st overall"  "1st overall"   "2nd overall"   "51st overall" 
## [29] "21st overall"  "57th overall"  "47th overall"  "9th overall"  
## [33] "56th overall"  "30th overall"  "46th overall"  "35th overall" 
## [37] "24th overall"  "34th overall"  "44th overall"  "28th overall" 
## [41] "19th overall"  "38th overall"  "52nd overall"  "53rd overall" 
## [45] "6th overall"   "36th overall"  "48th overall"  "27th overall" 
## [49] "77th overall"  "18th overall"  "43rd overall"  "7th overall"  
## [53] "33rd overall"  "22nd overall"  "40th overall"  "37th overall" 
## [57] "13th overall"  "165th overall" "50th overall"  "75th overall" 
## [61] "55th overall"  "41st overall"  "59th overall"  "54th overall" 
## [65] "79th overall"  "63rd overall"  "160th overall" "120th overall"
## [69] "127th overall"
temp_dp <- as.numeric(str_extract(all$draft_pick, "[0-9]+"))
sum(temp_dp != as.numeric(all$draft_number), na.rm = TRUE)
## Warning: NAs introduced by coercion
## [1] 192
kable(head(all[which(temp_dp != as.numeric(all$draft_number)),
    c("X.1", "player_name", "draft_pick", "draft_number", "draft_year",
        "season", "team")], 20))
## Warning in which(temp_dp != as.numeric(all$draft_number)): NAs introduced by
## coercion
X.1 player_name draft_pick draft_number draft_year season team
243 243 Alton Ford 51st overall 50 2001 2001-02 Phoenix Suns
244 244 Alton Ford 51st overall 50 2001 2002-03 Phoenix Suns
245 245 Alton Ford 51st overall 50 2001 2003-04 Houston Rockets
248 248 Alvin Jones 57th overall 56 2001 2001-02 Philadelphia 76ers
631 631 Antonis Fotsis 48th overall 47 2001 2001-02 Memphis Grizzlies
665 665 Arvydas Sabonis 77th overall 24 1986 1996-97 Portland Trail Blazers
666 666 Arvydas Sabonis 77th overall 24 1986 1997-98 Portland Trail Blazers
667 667 Arvydas Sabonis 77th overall 24 1986 1998-99 Portland Trail Blazers
668 668 Arvydas Sabonis 77th overall 24 1986 1999-00 Portland Trail Blazers
669 669 Arvydas Sabonis 77th overall 24 1986 2000-01 Portland Trail Blazers
670 670 Arvydas Sabonis 77th overall 24 1986 2002-03 Portland Trail Blazers
875 875 Bobby Simmons 42nd overall 41 2001 2001-02 Washington Wizards
876 876 Bobby Simmons 42nd overall 41 2001 2002-03 Washington Wizards
877 877 Bobby Simmons 42nd overall 41 2001 2003-04 Los Angeles Clippers
878 878 Bobby Simmons 42nd overall 41 2001 2004-05 Los Angeles Clippers
879 879 Bobby Simmons 42nd overall 41 2001 2005-06 Milwaukee Bucks
880 880 Bobby Simmons 42nd overall 41 2001 2007-08 Milwaukee Bucks
881 881 Bobby Simmons 42nd overall 41 2001 2008-09 New Jersey Nets
882 882 Bobby Simmons 42nd overall 41 2001 2009-10 New Jersey Nets
883 883 Bobby Simmons 42nd overall 41 2001 2010-11 San Antonio Spurs

I manually search up the draft pick and found out the draft_number is mostly correct when their is a difference.

all <- all %>%
    select(!draft_pick)

4.4.4 League

unique(all$league)
## [1] "NBA"

Since there is no variation in the league, I will remove the variable

all <- all %>%
    select(!league)

4.4.5 team

Team and team_abbreviation is directly correlated. I will remove team and keep team_abbreviation

all <- all %>%
    select(!team)

4.4.6 Season

Since season_end, season_start and season represent the same information, I will remove season and season_end.

all <- all %>%
    select(!c(season, season_end))

4.4.7 Career Stats

all <- all[, -grep("^career", names(all))]

5 Exploratory Analysis

dim(all)
## [1] 8722   31
str(all)
## 'data.frame':    8722 obs. of  31 variables:
##  $ X.1              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ college          : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ height           : num  81 81 81 81 81 82 73 73 73 72 ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : num  220 220 220 220 220 227 180 180 180 161 ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : int  33 34 35 36 37 23 23 24 25 23 ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year       : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round      : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

5.1 The response variable: salary

5.1.1 Before inflation

sal_year <- all %>%
    group_by(season_start) %>%
    summarise(median = median(salary), mean = mean(salary), sd = sd(salary))

fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
    type = "scatter", mode = "lines") %>%
    add_trace(y = ~mean, name = "Mean salary") %>%
    layout(title = "Salary in NBA from 1996-97 season to 2017-18 season",
        yaxis = list(title = "Salary (USD)"), xaxis = list(title = "Season"))
fig1
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
    type = "scatter", mode = "lines") %>%
    layout(title = "Standard Deviation of NBA Salary")
fig2
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary, type = "box")
fig3

5.1.2 After inflation

inflation <- read.csv("files/inflation.csv")

all <- merge(all, inflation[, c(1, 3)], by.x = "season_start",
    by.y = "year")
all$salary_infl <- all$salary/all$pct
sal_year <- all %>%
    group_by(season_start) %>%
    summarise(median = median(salary_infl), mean = mean(salary_infl),
        sd = sd(salary_infl))

fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
    type = "scatter", mode = "lines") %>%
    add_trace(y = ~mean, name = "Mean salary") %>%
    layout(title = "Salary in NBA from 1996-97 season to 2017-18 season",
        yaxis = list(title = "Salary (USD)"), xaxis = list(title = "Season"))
fig1
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
    type = "scatter", mode = "lines") %>%
    layout(title = "Standard Deviation of NBA Salary")
fig2
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary, type = "box")
fig3

5.2 The most important numeric variables

numVar <- which(sapply(all, is.numeric))
numVarName <- names(numVar)

There are 19 numeric variables

all_numVar <- all[, numVar]
all_numVar <- select(all_numVar, !salary)
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")

cor_sorted <- as.matrix(sort(cor_numVar[, "salary_infl"], decreasing = TRUE))

cor_high <- row.names(cor_sorted)[1:10]

cor_numVar <- cor_numVar[cor_high, cor_high]

corrplot.mixed(cor_numVar, tl.pos = "lt")