“players.csv” + “salaries_1985to2018.csv”: https://data.world/datadavis/nba-salaries
“all_seasons.csv”: https://www.kaggle.com/datasets/justinas/nba-players-data
library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)
Source: https://data.world/datadavis/nba-salaries
It has primary key id, their birth date, birth place, career statistics,
name, position
dim(players)
## [1] 4685 24
str(players)
## 'data.frame': 4685 obs. of 24 variables:
## $ X_id : chr "abdelal01" "abdulza01" "abdulka01" "abdulma02" ...
## $ birthDate : chr "June 24, 1968" "April 7, 1946" "April 16, 1947" "March 9, 1969" ...
## $ birthPlace : chr "Cairo, Egypt" "Brooklyn, New York" "New York, New York" "Gulfport, Mississippi" ...
## $ career_AST : num 0.3 1.2 3.6 3.5 1.1 2.5 1.2 1 0.7 0.5 ...
## $ career_FG. : chr "50.2" "42.8" "55.9" "44.2" ...
## $ career_FG3.: chr "0.0" "" "5.6" "35.4" ...
## $ career_FT. : chr "70.1" "72.8" "72.1" "90.5" ...
## $ career_G : int 256 505 1560 586 236 830 319 1 56 174 ...
## $ career_PER : chr "13.0" "15.1" "24.6" "15.4" ...
## $ career_PTS : num 5.7 9 24.6 14.6 7.8 18.1 5.6 0 9.5 5.3 ...
## $ career_TRB : chr "3.3" "8.0" "11.2" "1.9" ...
## $ career_WS : chr "4.8" "17.5" "273.4" "25.2" ...
## $ career_eFG.: chr "50.2" "" "55.9" "47.2" ...
## $ college : chr "Duke University" "Iowa State University" "University of California, Los Angeles" "Louisiana State University" ...
## $ draft_pick : chr "25th overall" "5th overall" "1st overall" "3rd overall" ...
## $ draft_round: chr "1st round" "1st round" "1st round" "1st round" ...
## $ draft_team : chr "Portland Trail Blazers" "Cincinnati Royals" "Milwaukee Bucks" "Denver Nuggets" ...
## $ draft_year : chr "1990" "1968" "1969" "1990" ...
## $ height : chr "6-10" "6-9" "7-2" "6-1" ...
## $ highSchool : chr "Bloomfield in Bloomfield, New Jersey" "John Jay in Brooklyn, New York" "Power Memorial in New York, New York" "Gulfport in Gulfport, Mississippi" ...
## $ name : chr "Alaa Abdelnaby" "Zaid Abdul-Aziz" "Kareem Abdul-Jabbar" "Mahmoud Abdul-Rauf" ...
## $ position : chr "Power Forward" "Power Forward and Center" "Center" "Point Guard" ...
## $ shoots : chr "Right" "Right" "Right" "Right" ...
## $ weight : chr "240lb" "235lb" "225lb" "162lb" ...
Source: https://data.world/datadavis/nba-salaries
It record the player_id (as a foreign key to the players.csv), their
salary, the season and the teams they played in.
dim(salaries)
## [1] 14163 7
str(salaries)
## 'data.frame': 14163 obs. of 7 variables:
## $ league : chr "NBA" "NBA" "NBA" "NBA" ...
## $ player_id : chr "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
## $ salary : int 395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
## $ season : chr "1990-91" "1991-92" "1992-93" "1993-94" ...
## $ season_end : int 1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
## $ season_start: int 1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
## $ team : chr "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...
Source: https://www.kaggle.com/datasets/justinas/nba-players-data
It record the player names, teams, age, and other statistics for each
individual season from 1996 season to 2020 season. (the index in this
table is not linked to the previous tables)
dim(Pstats)
## [1] 11700 22
str(Pstats)
## 'data.frame': 11700 obs. of 22 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ player_name : chr "Travis Knight" "Matt Fish" "Matt Bullard" "Marty Conlon" ...
## $ team_abbreviation: chr "LAL" "MIA" "HOU" "BOS" ...
## $ age : num 22 27 30 29 22 22 36 26 33 32 ...
## $ player_height : num 213 211 208 211 206 ...
## $ player_weight : num 107 107 107 111 107 ...
## $ college : chr "Connecticut" "North Carolina-Wilmington" "Iowa" "Providence" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year : chr "1996" "1992" "Undrafted" "Undrafted" ...
## $ draft_round : chr "1" "2" "Undrafted" "Undrafted" ...
## $ draft_number : chr "29" "50" "Undrafted" "Undrafted" ...
## $ gp : int 71 6 71 74 42 9 70 31 70 82 ...
## $ pts : num 4.8 0.3 4.5 7.8 3.7 1.6 3.2 2 11.3 9.9 ...
## $ reb : num 4.5 0.8 1.6 4.4 1.6 0.7 2.7 1.2 2.6 4.8 ...
## $ ast : num 0.5 0 0.9 1.4 0.5 0.4 0.3 0 4.9 11.4 ...
## $ net_rating : num 6.2 -15.1 0.9 -9 -14.5 -3.5 3.5 -17.1 -3.1 -2 ...
## $ oreb_pct : num 0.127 0.143 0.016 0.083 0.109 0.087 0.092 0.109 0.023 0.035 ...
## $ dreb_pct : num 0.182 0.267 0.115 0.152 0.118 0.045 0.146 0.152 0.088 0.116 ...
## $ usg_pct : num 0.142 0.265 0.151 0.167 0.233 0.135 0.137 0.232 0.192 0.155 ...
## $ ts_pct : num 0.536 0.333 0.535 0.542 0.482 0.47 0.555 0.448 0.597 0.525 ...
## $ ast_pct : num 0.052 0 0.099 0.101 0.114 0.125 0.034 0.013 0.289 0.464 ...
## $ season : chr "1996-97" "1996-97" "1996-97" "1996-97" ...
Since it is from the same source and the two tables are linked by a primary key (players.X_id) and foreign key (salaries.player_id). I will link them with merge column X_id and player_id.
player_salary <- merge(players, salaries, by.x = "X_id", by.y = "player_id")
dim(player_salary)
## [1] 14163 30
str(player_salary)
## 'data.frame': 14163 obs. of 30 variables:
## $ X_id : chr "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
## $ birthDate : chr "June 24, 1968" "June 24, 1968" "June 24, 1968" "June 24, 1968" ...
## $ birthPlace : chr "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" ...
## $ career_AST : num 0.3 0.3 0.3 0.3 0.3 3.6 3.6 3.6 3.6 3.5 ...
## $ career_FG. : chr "50.2" "50.2" "50.2" "50.2" ...
## $ career_FG3. : chr "0.0" "0.0" "0.0" "0.0" ...
## $ career_FT. : chr "70.1" "70.1" "70.1" "70.1" ...
## $ career_G : int 256 256 256 256 256 1560 1560 1560 1560 586 ...
## $ career_PER : chr "13.0" "13.0" "13.0" "13.0" ...
## $ career_PTS : num 5.7 5.7 5.7 5.7 5.7 24.6 24.6 24.6 24.6 14.6 ...
## $ career_TRB : chr "3.3" "3.3" "3.3" "3.3" ...
## $ career_WS : chr "4.8" "4.8" "4.8" "4.8" ...
## $ career_eFG. : chr "50.2" "50.2" "50.2" "50.2" ...
## $ college : chr "Duke University" "Duke University" "Duke University" "Duke University" ...
## $ draft_pick : chr "25th overall" "25th overall" "25th overall" "25th overall" ...
## $ draft_round : chr "1st round" "1st round" "1st round" "1st round" ...
## $ draft_team : chr "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" ...
## $ draft_year : chr "1990" "1990" "1990" "1990" ...
## $ height : chr "6-10" "6-10" "6-10" "6-10" ...
## $ highSchool : chr "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" ...
## $ name : chr "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" ...
## $ position : chr "Power Forward" "Power Forward" "Power Forward" "Power Forward" ...
## $ shoots : chr "Right" "Right" "Right" "Right" ...
## $ weight : chr "240lb" "240lb" "240lb" "240lb" ...
## $ league : chr "NBA" "NBA" "NBA" "NBA" ...
## $ salary : int 395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
## $ season : chr "1990-91" "1991-92" "1992-93" "1993-94" ...
## $ season_end : int 1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
## $ season_start: int 1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
## $ team : chr "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...
teams <- unique(player_salary$team)
teamsABB <- c("POR", "BOS", "SAC", "LAL", "DEN", "VAN", "DAL",
"ATL", "OKC", "DET", "LAC", "ORL", "HOU", "TOR", "NYK", "BKN",
"PHX", "NJN", "MEM", "WAS", "CHA", "MIA", "GSW", "CHA", "MIN",
"SAS", "NOP", "PHI", "WAS", "NOH", "CLE", "MIL", "CHI", "IND",
"SEA", "UTA", "NOK", "None", "KCK")
teamABBTab <- data.frame(team_name = teams, team_ABB = teamsABB)
player_salary$team_abbreviation <- sapply(player_salary$team,
function(x) teamABBTab$team_ABB[which(teamABBTab$team_name ==
x)])
player_salary <- rename(player_salary, player_name = name)
Pstats_salary <- merge(player_salary, Pstats, by = c("player_name",
"season", "team_abbreviation"))
dim(Pstats_salary)
## [1] 8722 50
str(Pstats_salary)
## 'data.frame': 8722 obs. of 50 variables:
## $ player_name : chr "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
## $ season : chr "1996-97" "1997-98" "1998-99" "1999-00" ...
## $ team_abbreviation: chr "DAL" "DAL" "DAL" "LAL" ...
## $ X_id : chr "greenac01" "greenac01" "greenac01" "greenac01" ...
## $ birthDate : chr "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
## $ birthPlace : chr "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
## $ career_AST : num 1.1 1.1 1.1 1.1 1.1 0 1.8 1.8 1.8 3 ...
## $ career_FG. : chr "49.4" "49.4" "49.4" "49.4" ...
## $ career_FG3. : chr "25.4" "25.4" "25.4" "25.4" ...
## $ career_FT. : chr "73.4" "73.4" "73.4" "73.4" ...
## $ career_G : int 1278 1278 1278 1278 1278 8 80 80 80 645 ...
## $ career_PER : chr "14.4" "14.4" "14.4" "14.4" ...
## $ career_PTS : num 9.6 9.6 9.6 9.6 9.6 1 5.5 5.5 5.5 9.7 ...
## $ career_TRB : chr "7.4" "7.4" "7.4" "7.4" ...
## $ career_WS : chr "99.5" "99.5" "99.5" "99.5" ...
## $ career_eFG. : chr "50.1" "50.1" "50.1" "50.1" ...
## $ college.x : chr "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
## $ draft_pick : chr "23rd overall" "23rd overall" "23rd overall" "23rd overall" ...
## $ draft_round.x : chr "1st round" "1st round" "1st round" "1st round" ...
## $ draft_team : chr "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
## $ draft_year.x : chr "1985" "1985" "1985" "1985" ...
## $ height : chr "6-9" "6-9" "6-9" "6-9" ...
## $ highSchool : chr "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
## $ position : chr "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
## $ shoots : chr "Right" "Right" "Right" "Right" ...
## $ weight : chr "220lb" "220lb" "220lb" "220lb" ...
## $ league : chr "NBA" "NBA" "NBA" "NBA" ...
## $ salary : int 4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
## $ season_end : int 1997 1998 1999 2000 2001 2000 2001 2002 2003 2008 ...
## $ season_start : int 1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
## $ team : chr "Dallas Mavericks" "Dallas Mavericks" "Dallas Mavericks" "Los Angeles Lakers" ...
## $ X : int 300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
## $ age : num 33 34 35 36 37 23 23 24 25 23 ...
## $ player_height : num 206 206 206 206 206 ...
## $ player_weight : num 102 102 102 102 102 ...
## $ college.y : chr "Oregon State" "Oregon State" "Oregon State" "Oregon State" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year.y : chr "1985" "1985" "1985" "1985" ...
## $ draft_round.y : chr "1" "1" "1" "1" ...
## $ draft_number : chr "23" "23" "23" "23" ...
## $ gp : int 83 82 50 82 82 8 33 45 2 51 ...
## $ pts : num 7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
## $ reb : num 7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
## $ ast : num 0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
## $ net_rating : num -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
## $ oreb_pct : num 0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
## $ dreb_pct : num 0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
## $ usg_pct : num 0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
## $ ts_pct : num 0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
## $ ast_pct : num 0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...
write.csv(Pstats_salary, "dataset/all.csv")
all <- read.csv("dataset/all.csv")
dim(all)
## [1] 8722 31
str(all)
## 'data.frame': 8722 obs. of 31 variables:
## $ X.1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ player_name : chr "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
## $ team_abbreviation: chr "DAL" "DAL" "DAL" "LAL" ...
## $ X_id : chr "greenac01" "greenac01" "greenac01" "greenac01" ...
## $ birthDate : chr "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
## $ birthPlace : chr "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
## $ college : chr "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
## $ draft_team : chr "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
## $ height : num 81 81 81 81 81 82 73 73 73 72 ...
## $ highSchool : chr "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
## $ position : chr "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
## $ shoots : chr "Right" "Right" "Right" "Right" ...
## $ weight : num 220 220 220 220 220 227 180 180 180 161 ...
## $ salary : int 4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
## $ season_start : int 1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
## $ X : int 300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
## $ age : int 33 34 35 36 37 23 23 24 25 23 ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year : chr "1985" "1985" "1985" "1985" ...
## $ draft_round : chr "1" "1" "1" "1" ...
## $ draft_number : chr "23" "23" "23" "23" ...
## $ gp : int 83 82 50 82 82 8 33 45 2 51 ...
## $ pts : num 7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
## $ reb : num 7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
## $ ast : num 0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
## $ net_rating : num -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
## $ oreb_pct : num 0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
## $ dreb_pct : num 0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
## $ usg_pct : num 0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
## $ ts_pct : num 0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
## $ ast_pct : num 0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...
sal_year <- all %>%
group_by(season_start) %>%
summarise(median = median(salary), mean = mean(salary), sd = sd(salary))
fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
type = "scatter", mode = "lines") %>%
add_trace(y = ~mean, name = "Mean salary") %>%
layout(title = "Salary in NBA from 1996-97 season to 2017-18 season",
yaxis = list(title = "Salary (USD)"), xaxis = list(title = "Season"))
fig1
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
type = "scatter", mode = "lines") %>%
layout(title = "Standard Deviation of NBA Salary")
fig2
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary, type = "box")
fig3
inflation <- read.csv("files/inflation.csv")
all <- merge(all, inflation[, c(1, 3)], by.x = "season_start",
by.y = "year")
all$salary_infl <- all$salary/all$pct
sal_year <- all %>%
group_by(season_start) %>%
summarise(median = median(salary_infl), mean = mean(salary_infl),
sd = sd(salary_infl))
fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
type = "scatter", mode = "lines") %>%
add_trace(y = ~mean, name = "Mean salary") %>%
layout(title = "Salary in NBA from 1996-97 season to 2017-18 season",
yaxis = list(title = "Salary (USD)"), xaxis = list(title = "Season"))
fig1
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
type = "scatter", mode = "lines") %>%
layout(title = "Standard Deviation of NBA Salary")
fig2
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary, type = "box")
fig3
numVar <- which(sapply(all, is.numeric))
numVarName <- names(numVar)
There are 19 numeric variables
all_numVar <- all[, numVar]
all_numVar <- select(all_numVar, !salary)
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")
cor_sorted <- as.matrix(sort(cor_numVar[, "salary_infl"], decreasing = TRUE))
cor_high <- row.names(cor_sorted)[1:10]
cor_numVar <- cor_numVar[cor_high, cor_high]
corrplot.mixed(cor_numVar, tl.pos = "lt")