# load data
library(tidyverse)
library(DT)
library(psych)
# Read the NBA draft data set directly from the project's GitHub repository.
df <- read.csv(
  file = "https://raw.githubusercontent.com/moham6839/Data606_ProjectProposal/main/nbaplayersdraft.csv"
)
# Show the raw data frame.
print(df)
# Total number of missing values across all columns.
df %>%
  is.na() %>%
  sum()
## [1] 4467
# Column-wise summary of the full data set (quartiles/mean for numeric
# columns, length/class for character columns, and NA counts).
df %>%
  summary()
## id year rank overall_pick
## Min. : 1.0 Min. :1989 Min. : 1.00 Min. : 1.00
## 1st Qu.: 481.2 1st Qu.:1997 1st Qu.:15.00 1st Qu.:15.00
## Median : 961.5 Median :2005 Median :30.00 Median :30.00
## Mean : 961.5 Mean :2005 Mean :29.69 Mean :29.69
## 3rd Qu.:1441.8 3rd Qu.:2013 3rd Qu.:44.00 3rd Qu.:44.00
## Max. :1922.0 Max. :2021 Max. :60.00 Max. :60.00
##
## team player college years_active
## Length:1922 Length:1922 Length:1922 Min. : 1.000
## Class :character Class :character Class :character 1st Qu.: 2.000
## Mode :character Mode :character Mode :character Median : 5.000
## Mean : 6.333
## 3rd Qu.:10.000
## Max. :22.000
## NA's :253
## games minutes_played points total_rebounds
## Min. : 1 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 72 1st Qu.: 838 1st Qu.: 265 1st Qu.: 128
## Median : 235 Median : 4204 Median : 1552 Median : 656
## Mean : 348 Mean : 8399 Mean : 3580 Mean : 1497
## 3rd Qu.: 584 3rd Qu.:13246 3rd Qu.: 5150 3rd Qu.: 2139
## Max. :1541 Max. :52139 Max. :37062 Max. :15091
## NA's :253 NA's :253 NA's :253 NA's :253
## assists field_goal_percentage X3_point_percentage
## Min. : 0.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 46.0 1st Qu.:0.4040 1st Qu.:0.2220
## Median : 257.0 Median :0.4350 Median :0.3170
## Mean : 774.3 Mean :0.4366 Mean :0.2724
## 3rd Qu.: 910.0 3rd Qu.:0.4740 3rd Qu.:0.3560
## Max. :12091.0 Max. :1.0000 Max. :1.0000
## NA's :253 NA's :257 NA's :377
## free_throw_percentage average_minutes_played points_per_game
## Min. :0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.6590 1st Qu.:11.00 1st Qu.: 3.400
## Median :0.7360 Median :17.70 Median : 6.200
## Mean :0.7168 Mean :18.13 Mean : 7.276
## 3rd Qu.:0.7970 3rd Qu.:24.80 3rd Qu.:10.000
## Max. :1.0000 Max. :41.10 Max. :27.200
## NA's :289 NA's :253 NA's :253
## average_total_rebounds average_assists win_shares
## Min. : 0.000 Min. :0.000 Min. : -1.70
## 1st Qu.: 1.700 1st Qu.:0.500 1st Qu.: 0.40
## Median : 2.800 Median :1.100 Median : 5.30
## Mean : 3.194 Mean :1.551 Mean : 17.87
## 3rd Qu.: 4.200 3rd Qu.:2.100 3rd Qu.: 24.50
## Max. :13.300 Max. :9.500 Max. :249.50
## NA's :253 NA's :253 NA's :253
## win_shares_per_48_minutes box_plus_minus value_over_replacement
## Min. :-1.26400 Min. :-52.000 Min. : -8.500
## 1st Qu.: 0.03000 1st Qu.: -3.900 1st Qu.: -0.400
## Median : 0.06900 Median : -2.000 Median : 0.000
## Mean : 0.06169 Mean : -2.311 Mean : 4.403
## 3rd Qu.: 0.10400 3rd Qu.: -0.300 3rd Qu.: 4.500
## Max. : 1.44200 Max. : 51.100 Max. :142.600
## NA's :254 NA's :254 NA's :253
# Transposed overview: one line per column showing its type and first values.
df %>%
  glimpse()
## Rows: 1,922
## Columns: 24
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ year <int> 1989, 1989, 1989, 1989, 1989, 1989, 1989, 19…
## $ rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ overall_pick <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ team <chr> "SAC", "LAC", "SAS", "MIA", "CHH", "CHI", "I…
## $ player <chr> "Pervis Ellison", "Danny Ferry", "Sean Ellio…
## $ college <chr> "Louisville", "Duke", "Arizona", "Michigan",…
## $ years_active <int> 11, 13, 12, 15, 11, 8, 12, 5, 12, 10, 13, 13…
## $ games <int> 474, 917, 742, 1000, 672, 438, 766, 281, 687…
## $ minutes_played <int> 11593, 18133, 24502, 34985, 15370, 7406, 174…
## $ points <int> 4494, 6439, 10544, 18336, 5680, 2819, 6925, …
## $ total_rebounds <int> 3170, 2550, 3204, 4387, 3381, 1460, 2342, 13…
## $ assists <int> 691, 1185, 1897, 2097, 639, 387, 1769, 175, …
## $ field_goal_percentage <dbl> 0.510, 0.446, 0.465, 0.456, 0.472, 0.478, 0.…
## $ X3_point_percentage <dbl> 0.050, 0.393, 0.375, 0.400, 0.135, 0.235, 0.…
## $ free_throw_percentage <dbl> 0.689, 0.840, 0.799, 0.846, 0.716, 0.707, 0.…
## $ average_minutes_played <dbl> 24.5, 19.8, 33.0, 35.0, 22.9, 16.9, 22.8, 19…
## $ points_per_game <dbl> 9.5, 7.0, 14.2, 18.3, 8.5, 6.4, 9.0, 7.4, 5.…
## $ average_total_rebounds <dbl> 6.7, 2.8, 4.3, 4.4, 5.0, 3.3, 3.1, 4.9, 3.3,…
## $ average_assists <dbl> 1.5, 1.3, 2.6, 2.1, 1.0, 0.9, 2.3, 0.6, 0.6,…
## $ win_shares <dbl> 21.8, 34.8, 55.7, 88.7, 22.5, 10.9, 24.6, 1.…
## $ win_shares_per_48_minutes <dbl> 0.090, 0.092, 0.109, 0.122, 0.070, 0.071, 0.…
## $ box_plus_minus <dbl> -0.5, -0.9, 0.2, 0.8, -2.9, -3.4, -0.8, -5.0…
## $ value_over_replacement <dbl> 4.4, 4.9, 13.5, 24.9, -3.7, -2.7, 5.3, -4.0,…
# `rank` carries the same information as `overall_pick`, so remove it.
df <- df %>%
  select(-rank)
# Preview the first rows of the trimmed data frame.
df %>%
  head()
## id year overall_pick team player college years_active games
## 1 1 1989 1 SAC Pervis Ellison Louisville 11 474
## 2 2 1989 2 LAC Danny Ferry Duke 13 917
## 3 3 1989 3 SAS Sean Elliott Arizona 12 742
## 4 4 1989 4 MIA Glen Rice Michigan 15 1000
## 5 5 1989 5 CHH J.R. Reid UNC 11 672
## 6 6 1989 6 CHI Stacey King Oklahoma 8 438
## minutes_played points total_rebounds assists field_goal_percentage
## 1 11593 4494 3170 691 0.510
## 2 18133 6439 2550 1185 0.446
## 3 24502 10544 3204 1897 0.465
## 4 34985 18336 4387 2097 0.456
## 5 15370 5680 3381 639 0.472
## 6 7406 2819 1460 387 0.478
## X3_point_percentage free_throw_percentage average_minutes_played
## 1 0.050 0.689 24.5
## 2 0.393 0.840 19.8
## 3 0.375 0.799 33.0
## 4 0.400 0.846 35.0
## 5 0.135 0.716 22.9
## 6 0.235 0.707 16.9
## points_per_game average_total_rebounds average_assists win_shares
## 1 9.5 6.7 1.5 21.8
## 2 7.0 2.8 1.3 34.8
## 3 14.2 4.3 2.6 55.7
## 4 18.3 4.4 2.1 88.7
## 5 8.5 5.0 1.0 22.5
## 6 6.4 3.3 0.9 10.9
## win_shares_per_48_minutes box_plus_minus value_over_replacement
## 1 0.090 -0.5 4.4
## 2 0.092 -0.9 4.9
## 3 0.109 0.2 13.5
## 4 0.122 0.8 24.9
## 5 0.070 -2.9 -3.7
## 6 0.071 -3.4 -2.7
# Keep only the response (win_shares) and the candidate explanatory variables.
new_df <- select(
  df,
  overall_pick, win_shares, minutes_played, games, years_active, box_plus_minus
)
# Preview the first six rows of the reduced data frame.
head(new_df, n = 6)
## overall_pick win_shares minutes_played games years_active box_plus_minus
## 1 1 21.8 11593 474 11 -0.5
## 2 2 34.8 18133 917 13 -0.9
## 3 3 55.7 24502 742 12 0.2
## 4 4 88.7 34985 1000 15 0.8
## 5 5 22.5 15370 672 11 -2.9
## 6 6 10.9 7406 438 8 -3.4
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
What factors affect a player’s WinShares?
What are the cases, and how many are there?
There are 1,922 cases and 24 columns. Each case is an NBA player who was drafted between 1989 and 2021.
Describe the method of data collection.
According to the Kaggle page from which the dataset was downloaded, the data were collected from Basketball Reference (https://www.basketball-reference.com).
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
I found the dataset on Kaggle. Here is the link: https://www.kaggle.com/datasets/mattop/nba-draft-basketball-player-data-19892021
What is the response variable? Is it quantitative or qualitative?
The response variable will be WinShares, which is a quantitative variable.
My independent variables are those that could affect how high or low a player's WinShares are: overall_pick, games, years_active, minutes_played, and box_plus_minus. Overall draft pick is categorical (ordinal), while the other variables are numerical.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Descriptive statistics (psych) for the response variable.
describe(new_df[["win_shares"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1669 17.87 27.99 5.3 11.77 8.01 -1.7 249.5 251.2 2.77 10.83
## se
## X1 0.69
# Descriptive statistics (psych) for games.
describe(new_df[["games"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1669 348.04 324.9 235 307.41 292.07 1 1541 1540 0.94 0.03
## se
## X1 7.95
# Descriptive statistics (psych) for years_active.
describe(new_df[["years_active"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1669 6.33 4.66 5 5.87 4.45 1 22 21 0.73 -0.44 0.11
# Descriptive statistics (psych) for box_plus_minus.
describe(new_df[["box_plus_minus"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1668 -2.31 4.14 -2 -2.1 2.67 -52 51.1 103.1 -0.76 36.92 0.1
# Descriptive statistics (psych) for overall_pick.
describe(new_df[["overall_pick"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1922 29.69 16.91 30 29.65 22.24 1 60 59 0.01 -1.19 0.39
# Descriptive statistics (psych) for minutes_played.
describe(new_df[["minutes_played"]])
## vars n mean sd median trimmed mad min max range skew
## X1 1 1669 8399.06 9845.87 4204 6637.42 5859.24 0 52139 52139 1.49
## kurtosis se
## X1 1.92 241
# Column-wise summary of the analysis variables, including NA counts.
new_df %>%
  summary()
## overall_pick win_shares minutes_played games
## Min. : 1.00 Min. : -1.70 Min. : 0 Min. : 1
## 1st Qu.:15.00 1st Qu.: 0.40 1st Qu.: 838 1st Qu.: 72
## Median :30.00 Median : 5.30 Median : 4204 Median : 235
## Mean :29.69 Mean : 17.87 Mean : 8399 Mean : 348
## 3rd Qu.:44.00 3rd Qu.: 24.50 3rd Qu.:13246 3rd Qu.: 584
## Max. :60.00 Max. :249.50 Max. :52139 Max. :1541
## NA's :253 NA's :253 NA's :253
## years_active box_plus_minus
## Min. : 1.000 Min. :-52.000
## 1st Qu.: 2.000 1st Qu.: -3.900
## Median : 5.000 Median : -2.000
## Mean : 6.333 Mean : -2.311
## 3rd Qu.:10.000 3rd Qu.: -0.300
## Max. :22.000 Max. : 51.100
## NA's :253 NA's :254
# Number of missing values in each analysis variable.
new_df %>%
  is.na() %>%
  colSums()
## overall_pick win_shares minutes_played games years_active
## 0 253 253 253 253
## box_plus_minus
## 254
# Histogram of win_shares using 100 bins.
ggplot(data = new_df, mapping = aes(x = win_shares)) +
  geom_histogram(bins = 100) +
  ggtitle("Amount of WinShares for Each Player") +
  xlab("Number of WinShares") +
  ylab("Count")
## Warning: Removed 253 rows containing non-finite values (`stat_bin()`).
The histogram shows that WinShares is skewed to the right, with a large number of players' WinShares at or around zero.
# Histogram of box_plus_minus using 50 bins.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_histogram() drops the missing box_plus_minus
# values silently instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = box_plus_minus)) +
  geom_histogram(bins = 50, na.rm = TRUE) +
  labs(
    title = "Plus/Minus for Each Player",
    x = "Plus/Minus",
    y = "Count"
  )
# Scatter plot of win_shares against games.
# The response (win_shares) goes on the y axis so the plot agrees with its
# title and with the other scatter plots in this report.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_point() drops rows with missing values silently
# instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = games, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Amount of WinShares Based on Games Played",
    x = "Games",
    y = "WinShares"
  )
# Scatter plot of win_shares against box_plus_minus.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_point() drops rows with missing values silently
# instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = box_plus_minus, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Amount of WinShares Based on Plus/Minus of Each Player",
    x = "Plus/Minus",
    y = "WinShares"
  )
# Scatter plot of win_shares against overall_pick.
# The axis labels were swapped: overall_pick is mapped to x and win_shares
# to y, so the labels now match the mapped variables.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_point() drops rows with missing values silently
# instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = overall_pick, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Amount of WinShares Based on Overall Pick of Players",
    x = "Overall Draft Picks",
    y = "WinShares"
  )
# Scatter plot of win_shares against years_active.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_point() drops rows with missing values silently
# instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = years_active, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Amount of WinShares Based on Number of Playing Years",
    x = "Years Played",
    y = "WinShares"
  )
# Scatter plot of win_shares against minutes_played.
# The y-axis label typo ("WInShares") is corrected.
# `na.rm` is a layer argument, not an aesthetic: inside aes() it had no
# effect. Passing it to geom_point() drops rows with missing values silently
# instead of emitting the "Removed ... rows" warning.
new_df %>%
  ggplot(aes(x = minutes_played, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Amount of WinShares Based on Career Minutes Played",
    x = "Career Minutes Played",
    y = "WinShares"
  )