library(tidyverse)
## ── Attaching packages ────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(ggplot2)
# Read the player data from the Excel workbook in the working directory.
# The workbook's first column has no header, so readxl names it `...1`
# (see the message echoed below).
bb <- read_excel(path = "basketball.xlsx")
## New names:
## * `` -> ...1
# Show the type and leading values of every column.
str(object = bb)
## Classes 'tbl_df', 'tbl' and 'data.frame': 559 obs. of 29 variables:
## $ ...1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Player : chr "Kay Felder" "Vince Hunter" "Josh Smith" "Jeremy Evans" ...
## $ Country : chr "USA" "USA" "USA" "USA" ...
## $ Country_Dummy : num 1 1 1 1 1 0 0 1 0 0 ...
## $ Salary : num 1312611 50000 5400000 104059 950000 ...
## $ Guaranteed : num 456529 50000 16200000 50000 1640000 ...
## $ Position : chr "Point Guard" "Power Forward" "Power Forward" "Small Forward" ...
## $ Age : num 22 23 32 30 19 29 24 22 23 24 ...
## $ Team : chr "Detroit Pistons" "Memphis Grizzlies" "New Orleans Pride" "Atlanta Hawks" ...
## $ Player_Efficiency_Rating : num -31.6 35.9 3.5 10.4 23.2 30.3 11.7 8.8 25 20.8 ...
## $ True_Shooting_Percentage : num 0 0.6 0.25 1 0.558 0.635 0.521 0.465 0.589 0.628 ...
## $ Three_Point_Field_Goal_Percentage: num 50 0 0 0 0 0 16.7 0 9.2 0.3 ...
## $ Free_Throw_Percentage : num 0 0 0 0 66.7 71.1 100 44.4 81.5 39.7 ...
## $ Offensive_Rebound_Percentage : num 35.9 32.3 28.1 22.4 20.4 20.4 18.7 17.9 17.2 16.9 ...
## $ Defensive_Rebound_Percentage : num 0 17 8.8 0 16.1 32.9 12.4 20.8 29.8 13.9 ...
## $ Total_Rebound_Percentage : num 18.4 24.8 18.1 11.2 18.2 26.7 15.5 19.4 23.4 15.5 ...
## $ Assist_Percentage : num 0 0 0 0 0 9.7 0 5.9 7.1 5.5 ...
## $ Steal_Percentage : num 0 0 0 0 1.8 2.1 2.6 0 2 1.9 ...
## $ Block_Percentage : num 0 13.4 0 0 9.5 3.6 0 6.7 3 2.9 ...
## $ Turnover_Percentage : num 33.3 16.7 0 50 14.7 11.9 18.8 21.8 7.3 13.3 ...
## $ Usage_Percentage : num 44.2 38.5 14.4 17.5 21.8 25.2 12.7 23.5 21 16.7 ...
## $ Offensive_Win_Shares : num -0.1 0 0 0 0.1 0.5 0 0 0.7 6.4 ...
## $ Defensive_Win_Shares : num 0 0 0 0 0 0.2 0 0 0.3 2.9 ...
## $ Win_Shares : num -0.1 0 0 0 0.1 0.7 0.1 0 1.1 9.3 ...
## $ Win_Shares_Per_48_Minutes : num -1.005 0.099 -0.015 -0.03 0.177 ...
## $ Offense_Box_Plus_Minus : num -29.5 -3.1 -6 -2.3 -4.1 2.5 -3.1 -7.5 1.4 2.2 ...
## $ Defense_Box_Plus_Minus : num -11.9 -5.8 -7.4 -6.1 -1.4 0.8 -4.6 0.6 0.4 1.2 ...
## $ Box_Plus_Minus : num -41.4 -9 -13.4 -8.3 -5.4 3.2 -7.7 -6.9 1.8 3.4 ...
## $ Value_Over_Replacement_Player : num 0 0 0 0 0 0.2 -0.1 0 0.2 3.3 ...
# Print the column names of the imported data.
names(bb)
## [1] "...1" "Player"
## [3] "Country" "Country_Dummy"
## [5] "Salary" "Guaranteed"
## [7] "Position" "Age"
## [9] "Team" "Player_Efficiency_Rating"
## [11] "True_Shooting_Percentage" "Three_Point_Field_Goal_Percentage"
## [13] "Free_Throw_Percentage" "Offensive_Rebound_Percentage"
## [15] "Defensive_Rebound_Percentage" "Total_Rebound_Percentage"
## [17] "Assist_Percentage" "Steal_Percentage"
## [19] "Block_Percentage" "Turnover_Percentage"
## [21] "Usage_Percentage" "Offensive_Win_Shares"
## [23] "Defensive_Win_Shares" "Win_Shares"
## [25] "Win_Shares_Per_48_Minutes" "Offense_Box_Plus_Minus"
## [27] "Defense_Box_Plus_Minus" "Box_Plus_Minus"
## [29] "Value_Over_Replacement_Player"
# Simple linear regression: Salary explained by Age alone.
modbb <- lm(formula = Salary ~ Age, data = bb)
# Auto-print the call and the estimated coefficients.
modbb
##
## Call:
## lm(formula = Salary ~ Age, data = bb)
##
## Coefficients:
## (Intercept) Age
## -8117036 563483
# Residual summary, coefficient table, R-squared and F-test for the
# age-only model.
summary(modbb)
##
## Call:
## lm(formula = Salary ~ Age, data = bb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12093644 -4265093 -2186556 2409749 26458570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8117036 1824186 -4.450 1.04e-05 ***
## Age 563483 68059 8.279 9.23e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6826000 on 557 degrees of freedom
## Multiple R-squared: 0.1096, Adjusted R-squared: 0.108
## F-statistic: 68.55 on 1 and 557 DF, p-value: 9.228e-16
# Salary against Age with two overlays: the fitted line from modbb
# (blue, dashed) and ggplot2's default smoother (orange).
ggplot(data = bb, mapping = aes(x = Age, y = Salary)) +
  geom_jitter() +
  geom_abline(
    intercept = coef(modbb)[1],
    slope = coef(modbb)[2],
    color = "blue", lty = 2, lwd = 1
  ) +
  geom_smooth(col = "orange")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Diagnostic plots for the age-only model, requested explicitly:
# residuals vs fitted (1), normal Q-Q (2), scale-location (3) and
# residuals vs leverage (5).
plot(modbb, which = c(1, 2, 3, 5))




# Histogram of the response variable.
hist(x = bb$Salary)

# Scatterplot matrices of Salary against the numeric predictors, drawn as
# five separate matrices (one per formula, in the order listed).
salary_panels <- list(
  ~ Salary + Guaranteed + Age + Player_Efficiency_Rating +
    True_Shooting_Percentage + Three_Point_Field_Goal_Percentage,
  ~ Salary + Free_Throw_Percentage + Offensive_Rebound_Percentage +
    Defensive_Rebound_Percentage + Total_Rebound_Percentage +
    Assist_Percentage,
  ~ Salary + Steal_Percentage + Block_Percentage + Turnover_Percentage +
    Usage_Percentage + Offensive_Win_Shares,
  ~ Salary + Defensive_Win_Shares + Win_Shares +
    Win_Shares_Per_48_Minutes + Offense_Box_Plus_Minus,
  ~ Salary + Defense_Box_Plus_Minus + Box_Plus_Minus +
    Value_Over_Replacement_Player
)
for (panel in salary_panels) {
  pairs(panel, data = bb)
}

# Analysis-of-variance table for the age-only model.
anova(modbb)
## Analysis of Variance Table
##
## Response: Salary
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 3.1939e+15 3.1939e+15 68.547 9.228e-16 ***
## Residuals 557 2.5953e+16 4.6594e+13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Multiple regression of Salary on Age, Guaranteed and the numeric
# performance metrics in the data set.
# NOTE(review): Win_Shares and Box_Plus_Minus enter alongside their
# offensive/defensive components, and the summary output reports very large,
# near-identical standard errors on those terms -- presumably collinearity;
# confirm (e.g. with variance inflation factors) before interpreting the
# individual coefficients.
modbb2 <- lm(
  formula = Salary ~ Age + Guaranteed + Player_Efficiency_Rating +
    True_Shooting_Percentage + Three_Point_Field_Goal_Percentage +
    Free_Throw_Percentage + Offensive_Rebound_Percentage +
    Defensive_Rebound_Percentage + Total_Rebound_Percentage +
    Assist_Percentage + Steal_Percentage + Block_Percentage +
    Turnover_Percentage + Usage_Percentage + Offensive_Win_Shares +
    Defensive_Win_Shares + Win_Shares + Win_Shares_Per_48_Minutes +
    Offense_Box_Plus_Minus + Defense_Box_Plus_Minus + Box_Plus_Minus +
    Value_Over_Replacement_Player,
  data = bb
)
# Auto-print the call and the estimated coefficients.
modbb2
##
## Call:
## lm(formula = Salary ~ Age + Guaranteed + Player_Efficiency_Rating +
## True_Shooting_Percentage + Three_Point_Field_Goal_Percentage +
## Free_Throw_Percentage + Offensive_Rebound_Percentage + Defensive_Rebound_Percentage +
## Total_Rebound_Percentage + Assist_Percentage + Steal_Percentage +
## Block_Percentage + Turnover_Percentage + Usage_Percentage +
## Offensive_Win_Shares + Defensive_Win_Shares + Win_Shares +
## Win_Shares_Per_48_Minutes + Offense_Box_Plus_Minus + Defense_Box_Plus_Minus +
## Box_Plus_Minus + Value_Over_Replacement_Player, data = bb)
##
## Coefficients:
## (Intercept) Age
## -7.300e+06 4.011e+05
## Guaranteed Player_Efficiency_Rating
## 1.774e-01 1.384e+05
## True_Shooting_Percentage Three_Point_Field_Goal_Percentage
## -2.283e+06 -1.546e+04
## Free_Throw_Percentage Offensive_Rebound_Percentage
## -3.037e+03 -1.045e+06
## Defensive_Rebound_Percentage Total_Rebound_Percentage
## -8.493e+05 1.911e+06
## Assist_Percentage Steal_Percentage
## -5.664e+04 -2.103e+05
## Block_Percentage Turnover_Percentage
## -1.739e+05 1.448e+04
## Usage_Percentage Offensive_Win_Shares
## 5.667e+04 6.708e+04
## Defensive_Win_Shares Win_Shares
## 4.394e+05 2.403e+05
## Win_Shares_Per_48_Minutes Offense_Box_Plus_Minus
## -1.330e+07 -2.668e+06
## Defense_Box_Plus_Minus Box_Plus_Minus
## -2.804e+06 2.954e+06
## Value_Over_Replacement_Player
## -1.550e+05
# Residual summary, coefficient table, R-squared and F-test for the
# multiple-regression model.
summary(modbb2)
##
## Call:
## lm(formula = Salary ~ Age + Guaranteed + Player_Efficiency_Rating +
## True_Shooting_Percentage + Three_Point_Field_Goal_Percentage +
## Free_Throw_Percentage + Offensive_Rebound_Percentage + Defensive_Rebound_Percentage +
## Total_Rebound_Percentage + Assist_Percentage + Steal_Percentage +
## Block_Percentage + Turnover_Percentage + Usage_Percentage +
## Offensive_Win_Shares + Defensive_Win_Shares + Win_Shares +
## Win_Shares_Per_48_Minutes + Offense_Box_Plus_Minus + Defense_Box_Plus_Minus +
## Box_Plus_Minus + Value_Over_Replacement_Player, data = bb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22609397 -1963642 -455650 1548429 14898466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.300e+06 2.849e+06 -2.562 0.0107 *
## Age 4.011e+05 3.873e+04 10.356 <2e-16 ***
## Guaranteed 1.774e-01 7.314e-03 24.255 <2e-16 ***
## Player_Efficiency_Rating 1.384e+05 1.887e+05 0.734 0.4635
## True_Shooting_Percentage -2.283e+06 3.018e+06 -0.756 0.4498
## Three_Point_Field_Goal_Percentage -1.546e+04 1.491e+04 -1.037 0.3002
## Free_Throw_Percentage -3.037e+03 6.738e+03 -0.451 0.6524
## Offensive_Rebound_Percentage -1.045e+06 6.002e+05 -1.741 0.0823 .
## Defensive_Rebound_Percentage -8.493e+05 5.921e+05 -1.434 0.1520
## Total_Rebound_Percentage 1.911e+06 1.186e+06 1.611 0.1078
## Assist_Percentage -5.664e+04 2.986e+04 -1.897 0.0584 .
## Steal_Percentage -2.103e+05 2.870e+05 -0.733 0.4640
## Block_Percentage -1.739e+05 2.128e+05 -0.817 0.4143
## Turnover_Percentage 1.448e+04 3.311e+04 0.437 0.6620
## Usage_Percentage 5.667e+04 7.048e+04 0.804 0.4217
## Offensive_Win_Shares 6.708e+04 3.083e+06 0.022 0.9827
## Defensive_Win_Shares 4.394e+05 3.083e+06 0.143 0.8867
## Win_Shares 2.403e+05 3.086e+06 0.078 0.9380
## Win_Shares_Per_48_Minutes -1.330e+07 6.777e+06 -1.962 0.0503 .
## Offense_Box_Plus_Minus -2.668e+06 3.304e+06 -0.807 0.4199
## Defense_Box_Plus_Minus -2.804e+06 3.257e+06 -0.861 0.3896
## Box_Plus_Minus 2.954e+06 3.268e+06 0.904 0.3665
## Value_Over_Replacement_Player -1.550e+05 3.718e+05 -0.417 0.6770
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3721000 on 536 degrees of freedom
## Multiple R-squared: 0.7453, Adjusted R-squared: 0.7349
## F-statistic: 71.31 on 22 and 536 DF, p-value: < 2.2e-16
# Observed Salary against the Salary predicted by modbb2.
#
# The predictors are measured in different units (dollars, years,
# percentages, win shares), so adding them together does not give a
# meaningful x-axis. The model's fitted values are the one-dimensional
# combination of the predictors that the regression actually uses, so they
# are plotted instead. The response is taken from model.frame() so that it
# covers the same rows lm() used for the fit.
fit_vs_obs <- data.frame(
  Fitted = fitted(modbb2),
  Salary = model.response(model.frame(modbb2))
)
ggplot(fit_vs_obs, aes(x = Fitted, y = Salary)) +
  geom_point() +
  # Default smoother (loess for a data set of this size).
  geom_smooth(col = "orange") +
  # Perfect-prediction reference: points on this line have Salary == Fitted.
  geom_abline(slope = 1, intercept = 0, color = "blue", lty = 2, lwd = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Diagnostic plots for the multiple-regression model, requested explicitly:
# residuals vs fitted (1), normal Q-Q (2), scale-location (3) and
# residuals vs leverage (5).
plot(modbb2, which = c(1, 2, 3, 5))




- Introduction: The purpose of this paper is to analyze player statistics from the 2017–2018 NBA season and their correlation with players’ salaries. Our dataset contains 559 observations and 28 variables (not counting the unnamed row-index column that `read_excel()` imports as `...1`). Of these 28 variables, 4 are character variables. The central question of this analysis is whether player statistics affect players’ salaries. The response variable is Salary, and the explanatory variables include a dummy variable coded 1 for players from the USA and 0 for players from outside the USA, together with Guaranteed, Age, Player_Efficiency_Rating, True_Shooting_Percentage, Three_Point_Field_Goal_Percentage, Free_Throw_Percentage, Offensive_Rebound_Percentage, Defensive_Rebound_Percentage, Total_Rebound_Percentage, Assist_Percentage, Steal_Percentage, Block_Percentage, Turnover_Percentage, Usage_Percentage, Offensive_Win_Shares, Defensive_Win_Shares, Win_Shares, Win_Shares_Per_48_Minutes, Offense_Box_Plus_Minus, Defense_Box_Plus_Minus, Box_Plus_Minus, and Value_Over_Replacement_Player. In the residuals-versus-fitted plot, no non-linear relationship is apparent. In the normal Q-Q plot, we can see that the residuals are normally distributed. Outliers are not influential in this linear model because no Cook’s distance contour lines appear in the residuals-versus-leverage plot.