Data Preparation

# load data
library(tidyverse)
library(DT)
library(psych)
# Read the NBA draft dataset straight from the project's GitHub repository.
df <- read.csv(
  file = "https://raw.githubusercontent.com/moham6839/Data606_ProjectProposal/main/nbaplayersdraft.csv"
)
# Show the raw data frame.
df
# Count the missing cells across every row and column.
df %>%
  is.na() %>%
  sum()
## [1] 4467
# Column-by-column summary: quartiles for numeric columns plus NA counts.
df %>%
  summary()
##        id              year           rank        overall_pick  
##  Min.   :   1.0   Min.   :1989   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 481.2   1st Qu.:1997   1st Qu.:15.00   1st Qu.:15.00  
##  Median : 961.5   Median :2005   Median :30.00   Median :30.00  
##  Mean   : 961.5   Mean   :2005   Mean   :29.69   Mean   :29.69  
##  3rd Qu.:1441.8   3rd Qu.:2013   3rd Qu.:44.00   3rd Qu.:44.00  
##  Max.   :1922.0   Max.   :2021   Max.   :60.00   Max.   :60.00  
##                                                                 
##      team              player            college           years_active   
##  Length:1922        Length:1922        Length:1922        Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 5.000  
##                                                           Mean   : 6.333  
##                                                           3rd Qu.:10.000  
##                                                           Max.   :22.000  
##                                                           NA's   :253     
##      games      minutes_played      points      total_rebounds 
##  Min.   :   1   Min.   :    0   Min.   :    0   Min.   :    0  
##  1st Qu.:  72   1st Qu.:  838   1st Qu.:  265   1st Qu.:  128  
##  Median : 235   Median : 4204   Median : 1552   Median :  656  
##  Mean   : 348   Mean   : 8399   Mean   : 3580   Mean   : 1497  
##  3rd Qu.: 584   3rd Qu.:13246   3rd Qu.: 5150   3rd Qu.: 2139  
##  Max.   :1541   Max.   :52139   Max.   :37062   Max.   :15091  
##  NA's   :253    NA's   :253     NA's   :253     NA's   :253    
##     assists        field_goal_percentage X3_point_percentage
##  Min.   :    0.0   Min.   :0.0000        Min.   :0.0000     
##  1st Qu.:   46.0   1st Qu.:0.4040        1st Qu.:0.2220     
##  Median :  257.0   Median :0.4350        Median :0.3170     
##  Mean   :  774.3   Mean   :0.4366        Mean   :0.2724     
##  3rd Qu.:  910.0   3rd Qu.:0.4740        3rd Qu.:0.3560     
##  Max.   :12091.0   Max.   :1.0000        Max.   :1.0000     
##  NA's   :253       NA's   :257           NA's   :377        
##  free_throw_percentage average_minutes_played points_per_game 
##  Min.   :0.0000        Min.   : 0.00          Min.   : 0.000  
##  1st Qu.:0.6590        1st Qu.:11.00          1st Qu.: 3.400  
##  Median :0.7360        Median :17.70          Median : 6.200  
##  Mean   :0.7168        Mean   :18.13          Mean   : 7.276  
##  3rd Qu.:0.7970        3rd Qu.:24.80          3rd Qu.:10.000  
##  Max.   :1.0000        Max.   :41.10          Max.   :27.200  
##  NA's   :289           NA's   :253            NA's   :253     
##  average_total_rebounds average_assists   win_shares    
##  Min.   : 0.000         Min.   :0.000   Min.   : -1.70  
##  1st Qu.: 1.700         1st Qu.:0.500   1st Qu.:  0.40  
##  Median : 2.800         Median :1.100   Median :  5.30  
##  Mean   : 3.194         Mean   :1.551   Mean   : 17.87  
##  3rd Qu.: 4.200         3rd Qu.:2.100   3rd Qu.: 24.50  
##  Max.   :13.300         Max.   :9.500   Max.   :249.50  
##  NA's   :253            NA's   :253     NA's   :253     
##  win_shares_per_48_minutes box_plus_minus    value_over_replacement
##  Min.   :-1.26400          Min.   :-52.000   Min.   : -8.500       
##  1st Qu.: 0.03000          1st Qu.: -3.900   1st Qu.: -0.400       
##  Median : 0.06900          Median : -2.000   Median :  0.000       
##  Mean   : 0.06169          Mean   : -2.311   Mean   :  4.403       
##  3rd Qu.: 0.10400          3rd Qu.: -0.300   3rd Qu.:  4.500       
##  Max.   : 1.44200          Max.   : 51.100   Max.   :142.600       
##  NA's   :254               NA's   :254       NA's   :253
# Compact overview of each column's type and its first few values.
glimpse(df)
## Rows: 1,922
## Columns: 24
## $ id                        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ year                      <int> 1989, 1989, 1989, 1989, 1989, 1989, 1989, 19…
## $ rank                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ overall_pick              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ team                      <chr> "SAC", "LAC", "SAS", "MIA", "CHH", "CHI", "I…
## $ player                    <chr> "Pervis Ellison", "Danny Ferry", "Sean Ellio…
## $ college                   <chr> "Louisville", "Duke", "Arizona", "Michigan",…
## $ years_active              <int> 11, 13, 12, 15, 11, 8, 12, 5, 12, 10, 13, 13…
## $ games                     <int> 474, 917, 742, 1000, 672, 438, 766, 281, 687…
## $ minutes_played            <int> 11593, 18133, 24502, 34985, 15370, 7406, 174…
## $ points                    <int> 4494, 6439, 10544, 18336, 5680, 2819, 6925, …
## $ total_rebounds            <int> 3170, 2550, 3204, 4387, 3381, 1460, 2342, 13…
## $ assists                   <int> 691, 1185, 1897, 2097, 639, 387, 1769, 175, …
## $ field_goal_percentage     <dbl> 0.510, 0.446, 0.465, 0.456, 0.472, 0.478, 0.…
## $ X3_point_percentage       <dbl> 0.050, 0.393, 0.375, 0.400, 0.135, 0.235, 0.…
## $ free_throw_percentage     <dbl> 0.689, 0.840, 0.799, 0.846, 0.716, 0.707, 0.…
## $ average_minutes_played    <dbl> 24.5, 19.8, 33.0, 35.0, 22.9, 16.9, 22.8, 19…
## $ points_per_game           <dbl> 9.5, 7.0, 14.2, 18.3, 8.5, 6.4, 9.0, 7.4, 5.…
## $ average_total_rebounds    <dbl> 6.7, 2.8, 4.3, 4.4, 5.0, 3.3, 3.1, 4.9, 3.3,…
## $ average_assists           <dbl> 1.5, 1.3, 2.6, 2.1, 1.0, 0.9, 2.3, 0.6, 0.6,…
## $ win_shares                <dbl> 21.8, 34.8, 55.7, 88.7, 22.5, 10.9, 24.6, 1.…
## $ win_shares_per_48_minutes <dbl> 0.090, 0.092, 0.109, 0.122, 0.070, 0.071, 0.…
## $ box_plus_minus            <dbl> -0.5, -0.9, 0.2, 0.8, -2.9, -3.4, -0.8, -5.0…
## $ value_over_replacement    <dbl> 4.4, 4.9, 13.5, 24.9, -3.7, -2.7, 5.3, -4.0,…
# Remove the redundant rank column; it carries the same values as overall_pick.
df <- df %>%
  select(-rank)
# Preview the first six rows of the trimmed data frame.
head(df, n = 6)
##   id year overall_pick team         player    college years_active games
## 1  1 1989            1  SAC Pervis Ellison Louisville           11   474
## 2  2 1989            2  LAC    Danny Ferry       Duke           13   917
## 3  3 1989            3  SAS   Sean Elliott    Arizona           12   742
## 4  4 1989            4  MIA      Glen Rice   Michigan           15  1000
## 5  5 1989            5  CHH      J.R. Reid        UNC           11   672
## 6  6 1989            6  CHI    Stacey King   Oklahoma            8   438
##   minutes_played points total_rebounds assists field_goal_percentage
## 1          11593   4494           3170     691                 0.510
## 2          18133   6439           2550    1185                 0.446
## 3          24502  10544           3204    1897                 0.465
## 4          34985  18336           4387    2097                 0.456
## 5          15370   5680           3381     639                 0.472
## 6           7406   2819           1460     387                 0.478
##   X3_point_percentage free_throw_percentage average_minutes_played
## 1               0.050                 0.689                   24.5
## 2               0.393                 0.840                   19.8
## 3               0.375                 0.799                   33.0
## 4               0.400                 0.846                   35.0
## 5               0.135                 0.716                   22.9
## 6               0.235                 0.707                   16.9
##   points_per_game average_total_rebounds average_assists win_shares
## 1             9.5                    6.7             1.5       21.8
## 2             7.0                    2.8             1.3       34.8
## 3            14.2                    4.3             2.6       55.7
## 4            18.3                    4.4             2.1       88.7
## 5             8.5                    5.0             1.0       22.5
## 6             6.4                    3.3             0.9       10.9
##   win_shares_per_48_minutes box_plus_minus value_over_replacement
## 1                     0.090           -0.5                    4.4
## 2                     0.092           -0.9                    4.9
## 3                     0.109            0.2                   13.5
## 4                     0.122            0.8                   24.9
## 5                     0.070           -2.9                   -3.7
## 6                     0.071           -3.4                   -2.7
# Keep only the response (win_shares) and the candidate explanatory variables.
new_df <- select(
  df,
  overall_pick, win_shares, minutes_played, games, years_active, box_plus_minus
)
# Preview the first six rows of the analysis data frame.
head(new_df, n = 6)
##   overall_pick win_shares minutes_played games years_active box_plus_minus
## 1            1       21.8          11593   474           11           -0.5
## 2            2       34.8          18133   917           13           -0.9
## 3            3       55.7          24502   742           12            0.2
## 4            4       88.7          34985  1000           15            0.8
## 5            5       22.5          15370   672           11           -2.9
## 6            6       10.9           7406   438            8           -3.4

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

What factors affect a player’s WinShares?

Cases

What are the cases, and how many are there?

There are 1,922 cases, with 24 columns. Each case is an NBA player who was drafted between 1989 and 2021.

Data collection

Describe the method of data collection.

According to the dataset's Kaggle page, the data was collected from Basketball Reference (https://www.basketball-reference.com/).

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

I found the dataset on Kaggle. Here is the link: https://www.kaggle.com/datasets/mattop/nba-draft-basketball-player-data-19892021

Dependent Variable

What is the response variable? Is it quantitative or qualitative?

The response variable will be WinShares, which is a quantitative variable.

Independent Variable(s)

My independent variables are those that could affect how high or low a player's WinShares are: overall_pick, games, years_active, minutes_played, and box_plus_minus. overall_pick is an ordinal categorical variable (draft position from 1 to 60, stored as an integer), while the other variables are numerical.

Relevant summary statistics

Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plots, boxplots, etc.). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

# Descriptive statistics for win_shares (the response variable).
describe(new_df[["win_shares"]])
##    vars    n  mean    sd median trimmed  mad  min   max range skew kurtosis
## X1    1 1669 17.87 27.99    5.3   11.77 8.01 -1.7 249.5 251.2 2.77    10.83
##      se
## X1 0.69
# Descriptive statistics for games.
describe(new_df[["games"]])
##    vars    n   mean    sd median trimmed    mad min  max range skew kurtosis
## X1    1 1669 348.04 324.9    235  307.41 292.07   1 1541  1540 0.94     0.03
##      se
## X1 7.95
# Descriptive statistics for years_active.
describe(new_df[["years_active"]])
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 1669 6.33 4.66      5    5.87 4.45   1  22    21 0.73    -0.44 0.11
# Descriptive statistics for box_plus_minus.
describe(new_df[["box_plus_minus"]])
##    vars    n  mean   sd median trimmed  mad min  max range  skew kurtosis  se
## X1    1 1668 -2.31 4.14     -2    -2.1 2.67 -52 51.1 103.1 -0.76    36.92 0.1
# Descriptive statistics for overall_pick.
describe(new_df[["overall_pick"]])
##    vars    n  mean    sd median trimmed   mad min max range skew kurtosis   se
## X1    1 1922 29.69 16.91     30   29.65 22.24   1  60    59 0.01    -1.19 0.39
# Descriptive statistics for minutes_played.
describe(new_df[["minutes_played"]])
##    vars    n    mean      sd median trimmed     mad min   max range skew
## X1    1 1669 8399.06 9845.87   4204 6637.42 5859.24   0 52139 52139 1.49
##    kurtosis  se
## X1     1.92 241
# Quartile summary and NA counts for the selected analysis variables.
new_df %>%
  summary()
##   overall_pick     win_shares     minutes_played      games     
##  Min.   : 1.00   Min.   : -1.70   Min.   :    0   Min.   :   1  
##  1st Qu.:15.00   1st Qu.:  0.40   1st Qu.:  838   1st Qu.:  72  
##  Median :30.00   Median :  5.30   Median : 4204   Median : 235  
##  Mean   :29.69   Mean   : 17.87   Mean   : 8399   Mean   : 348  
##  3rd Qu.:44.00   3rd Qu.: 24.50   3rd Qu.:13246   3rd Qu.: 584  
##  Max.   :60.00   Max.   :249.50   Max.   :52139   Max.   :1541  
##                  NA's   :253      NA's   :253     NA's   :253   
##   years_active    box_plus_minus   
##  Min.   : 1.000   Min.   :-52.000  
##  1st Qu.: 2.000   1st Qu.: -3.900  
##  Median : 5.000   Median : -2.000  
##  Mean   : 6.333   Mean   : -2.311  
##  3rd Qu.:10.000   3rd Qu.: -0.300  
##  Max.   :22.000   Max.   : 51.100  
##  NA's   :253      NA's   :254
# Number of missing values in each selected column.
new_df %>%
  is.na() %>%
  colSums()
##   overall_pick     win_shares minutes_played          games   years_active 
##              0            253            253            253            253 
## box_plus_minus 
##            254
# Histogram of win_shares across players, using 100 bins.
ggplot(new_df, aes(x = win_shares)) +
  geom_histogram(bins = 100) +
  labs(
    title = "Amount of WinShares for Each Player",
    x = "Number of WinShares",
    y = "Count"
  )
## Warning: Removed 253 rows containing non-finite values (`stat_bin()`).

The histogram shows that WinShares is strongly right-skewed, with a large number of players having WinShares at or near zero.

# Histogram of box_plus_minus across players, using 50 bins.
# na.rm is set on the layer rather than inside aes(): within aes() it is
# treated as an unused aesthetic and has no effect. On the layer it silently
# drops the 254 players with a missing box_plus_minus, so no warning is raised.
new_df %>%
  ggplot(aes(x = box_plus_minus)) +
  geom_histogram(bins = 50, na.rm = TRUE) +
  labs(title = "Plus/Minus for Each Player",
       x = "Plus/Minus",
       y = "Count")

# Scatter plot of win_shares against games played.
# win_shares is the response variable, so it is placed on the y-axis to match
# the plot title and the other scatter plots in this section.
# na.rm is set on the layer rather than inside aes() (where it has no effect),
# so the 253 players with missing values are dropped without a warning.
new_df %>%
  ggplot(aes(x = games, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Amount of WinShares Based on Games Played",
       x = "Games",
       y = "WinShares")

# Scatter plot of win_shares against box_plus_minus.
# na.rm is set on the layer rather than inside aes() (where it has no effect),
# so the 254 players with missing values are dropped without a warning.
new_df %>%
  ggplot(aes(x = box_plus_minus, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Amount of WinShares Based on Plus/Minus of Each Player",
       x = "Plus/Minus",
       y = "WinShares")

# Scatter plot of win_shares against overall draft pick.
# The axis labels now match the mapping: overall_pick is on x and win_shares
# is on y (the original labels were swapped).
# na.rm is set on the layer rather than inside aes() (where it has no effect),
# so the 253 players with a missing win_shares are dropped without a warning.
new_df %>%
  ggplot(aes(x = overall_pick, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Amount of WinShares Based on Overall Pick of Players",
       x = "Overall Draft Picks",
       y = "WinShares")

# Scatter plot of win_shares against years_active.
# na.rm is set on the layer rather than inside aes() (where it has no effect),
# so the 253 players with missing values are dropped without a warning.
new_df %>%
  ggplot(aes(x = years_active, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Amount of WinShares Based on Number of Playing Years",
       x = "Years Played",
       y = "WinShares")

# Scatter plot of win_shares against minutes_played.
# na.rm is set on the layer rather than inside aes() (where it has no effect),
# so the 253 players with missing values are dropped without a warning.
# The y-axis label typo ("WInShares") is corrected.
new_df %>%
  ggplot(aes(x = minutes_played, y = win_shares)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Amount of WinShares Based on Career Minutes Played",
       x = "Career Minutes Played",
       y = "WinShares")