NBA Data Dive - Random Sampling

# Load necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the NBA game records from disk into a data frame
nba_data <- read.csv("C:/Statistics/nba.csv")

# Fix the RNG state so the five draws below are identical on every run
set.seed(123)

# Each sample holds half as many rows as the full dataset (rounded)
sample_size <- round(nrow(nba_data) / 2)

# Draw five samples one after another from the same seeded RNG stream.
# Sampling is with replacement, so a row may appear more than once
# within a single sample.
df_1 <- sample_n(nba_data, sample_size, replace = TRUE)
df_2 <- sample_n(nba_data, sample_size, replace = TRUE)
df_3 <- sample_n(nba_data, sample_size, replace = TRUE)
df_4 <- sample_n(nba_data, sample_size, replace = TRUE)
df_5 <- sample_n(nba_data, sample_size, replace = TRUE)

Display Sample Data

# Display the first few rows of each sample
head(df_1)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1 doleami01 2007-01-30 MIA MIL  11   4   5   1  11 18.6 2006-07    false 2007
## 2 ellismo01 2016-01-28 IND ATL   5   6   5   1  25 29.9 2015-16    false 2016
## 3 boozeca01 2010-02-09 UTA LAC  14   4   4   0  34 37.5 2009-10    false 2010
## 4 freeljo01 2015-04-13 POR OKC   5   0   0   1  16 12.3 2014-15    false 2015
## 5 brandte01 2000-03-19 MIN MIL   3  13   4   0  28 35.9 1999-00    false 2000
## 6  macksa01 1996-04-02 HOU GSW   3   4   2   0  38 30.7 1995-96    false 1996
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1       573        5.19                 0.35 2005-02-05  17.0         4.84
## 2       750        2.99                 0.18 2006-11-18  27.3         2.81
## 3       512        3.19                 0.18 2008-03-15  37.9         3.01
## 4       158        3.02                 0.14 2013-04-16  11.0         2.88
## 5       628        3.02                 0.11 2001-02-23  36.1         2.91
## 6        60        5.00                 1.31 1996-04-21  23.5         3.69

head(df_2)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1 jacksji01 1998-04-02 GSW HOU   3  11   1   1  33 31.9 1997-98    false 1998
## 2 tollian01 2020-01-13 POR CHO  11   1   0   0  16 17.6 2019-20    false 2020
## 3 richmmi01 1991-01-31 GSW LAC   6   7   7   0  40 40.7 1990-91    false 1991
## 4  denglu01 2010-11-01 CHI POR   4   2   1   0  40 33.9 2010-11    false 2011
## 5 porzikr01 2017-11-05 NYK IND   8   1   0   6  40 34.3 2017-18    false 2018
## 6 pachuza01 2015-03-20 MIL BRK  21   7   1   0  22 29.2 2014-15    false 2015
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1       390        3.62                 0.19 2000-01-07  29.1         3.43
## 2       707        3.67                 0.06 2013-03-06  20.1         3.61
## 3       201        3.38                 0.07 1995-12-15  39.6         3.31
## 4       426        3.35                 0.04 2016-04-17  27.9         3.31
## 5       146        2.76                 0.31 2015-11-21  27.9         2.45
## 6       841        3.63                 0.02 2018-03-11  20.3         3.61

head(df_3)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1  lambje01 2017-11-03 CHO SAS   9   6   2   0  27 28.2 2017-18    false 2018
## 2 douglto01 2010-11-04 NYK CHI   1   4   4   0  30 28.3 2010-11    false 2011
## 3 mccloge01 1998-04-12 PHO VAN   8   6   2   0  25 25.3 1997-98    false 1998
## 4 bouchch01 2021-04-08 TOR CHI  19   1   1   1  38 36.3 2020-21    false 2021
## 5 douglto01 2010-11-04 NYK CHI   1   4   4   0  30 28.3 2010-11    false 2011
## 6 bowdlca01 2001-03-27 ATL BOS   4   0   1   3  15 13.6 2000-01    false 2001
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1       298        3.74                 0.49 2013-12-29  22.1         3.25
## 2        59        3.18                 0.06 2011-04-05  26.4         3.12
## 3       497        3.50                 0.35 1998-03-09  23.3         3.15
## 4       151        3.61                 0.78 2021-04-18  30.5         2.83
## 5        59        3.18                 0.06 2011-04-05  26.4         3.12
## 6        79        3.29                 0.06 2001-03-22  13.4         3.23

head(df_4)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1 cassesa01 2007-11-07 LAC IND   3   8   2   0  35 32.3 2007-08    false 2008
## 2  kiddja01 1996-01-12 DAL PHO  12  16   1   1  33 36.9 1995-96    false 1996
## 3 curryse01 2020-02-28 DAL MIA   2   2   0   0  37 30.7 2019-20    false 2020
## 4 mccolcj01 2018-01-31 POR CHI   5   2   1   0  50 40.6 2017-18    false 2018
## 5 mcdankj01 2017-03-23 BRK PHO   8   1   2   1  16 16.5 2016-17    false 2017
## 6 kofoeba01 1992-01-14 SEA CHH   5   9   0   0  15 18.7 1991-92    false 1992
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1      1056        3.32                 0.29 1995-05-30  32.2         3.03
## 2       110        3.03                 0.03 2008-04-16  30.6         3.00
## 3       263        3.07                 0.19 2019-12-12  27.8         2.88
## 4       335        3.34                 0.40 2015-12-27  33.7         2.94
## 5       143        3.95                 1.14 2014-11-29  16.7         2.81
## 6        86        6.75                 3.36 1988-12-03   9.7         3.39

head(df_5)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1 watsoja01 1995-02-01 UTA DEN   2   4   1   1  10 12.8 1994-95    false 1995
## 2 greenge01 2012-04-08 NJN CLE   5   1   0   0  32 24.6 2011-12    false 2012
## 3 washier01 1998-01-02 DEN HOU   3   1   2   0  21 17.2 1997-98    false 1998
## 4 mobleer01 1995-03-10 MIL WSB  12   2   0   4  12 15.8 1994-95    false 1995
## 5 jonesso01 2012-03-03 NOH IND   9   2   0   0  12 13.7 2011-12    false 2012
## 6 fieldke01 1985-03-25 MIL NYK   7   0   0   0  21 16.9 1984-85    false 1985
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1        34        3.19                 0.60 1994-11-14  10.9         2.59
## 2       209        3.82                 0.11 2016-04-01  24.9         3.71
## 3        25        2.67                 0.21 1999-04-10  16.3         2.46
## 4        28        3.18                 0.24 1995-04-07  14.7         2.94
## 5       281        3.39                 0.35 2007-04-17  12.4         3.04
## 6        48        3.16                 0.14 1986-03-03  19.2         3.02

Summary Statistics

# Generate summary statistics for each sample
# Summary statistics for key columns in the first sample
summary(df_1$PTS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   18.00   24.00   25.69   32.00   60.00

summary(df_1$AST)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.808   5.000  18.000

summary(df_1$Tm)

##    Length     Class      Mode 
##       852 character character

# Repeat for other samples
summary(df_2$PTS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   18.00   24.00   25.75   32.00   81.00

summary(df_2$AST)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.644   5.000  22.000

summary(df_2$Tm)

##    Length     Class      Mode 
##       852 character character

summary(df_3$PTS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   18.00   25.00   26.18   32.00   62.00

summary(df_3$AST)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.727   5.000  22.000

summary(df_3$Tm)

##    Length     Class      Mode 
##       852 character character

summary(df_4$PTS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   19.00   24.00   26.15   32.00   62.00

summary(df_4$AST)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.738   5.000  17.000

summary(df_4$Tm)

##    Length     Class      Mode 
##       852 character character

summary(df_5$PTS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   19.00   24.00   25.76   32.00   81.00

summary(df_5$AST)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.661   5.000  19.000

summary(df_5$Tm)

##    Length     Class      Mode 
##       852 character character

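Explanation of Key Metrics

Scoring Trend (avg_pts): Across the five samples, the mean points per player-game ranges from about 25.7 to 26.2 (roughly 25.9 on average), so the scoring level varies only slightly depending on the sample. The medians (24 to 25) sit below the means and the maxima reach 60 to 81, so the distribution is right-skewed: some players score much higher than the rest, while the middle half of the records falls between about 18 and 32 points.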

Assist Trend (avg_ast): The mean assists per player-game is about 3.7 (between 3.64 and 3.81 across the five samples), with a median of 3. This shows that while a few players are primary playmakers (single-game maxima of 17 to 22 assists), most records have few assists, potentially indicating a more isolated or specific role in the team dynamics.

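Diversity in Teams (unique_teams): Four of the five samples contain 38 unique team codes and one (df_3) contains 37, so the full dataset has at least 38. These are team abbreviations rather than franchises, so the same city can appear under several codes (for example CHA, CHH, and CHO). The broad distribution across teams helps ensure that the random samples are representative of a variety of team performances, reducing the chance of sample bias toward any specific teams.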

Subsample Analysis

# Subsample Analysis: Team Representation

df_1 %>% group_by(Tm) %>% summarise(count = n())

## # A tibble: 38 × 2
##    Tm    count
##    <chr> <int>
##  1 ATL      23
##  2 BOS      27
##  3 BRK      14
##  4 CHA      10
##  5 CHH       9
##  6 CHI      26
##  7 CHO      10
##  8 CLE      35
##  9 DAL      25
## 10 DEN      35
## # ℹ 28 more rows

df_2 %>% group_by(Tm) %>% summarise(count = n())

## # A tibble: 38 × 2
##    Tm    count
##    <chr> <int>
##  1 ATL      34
##  2 BOS      33
##  3 BRK      10
##  4 CHA      10
##  5 CHH       8
##  6 CHI      18
##  7 CHO       8
##  8 CLE      34
##  9 DAL      24
## 10 DEN      29
## # ℹ 28 more rows

df_3 %>% group_by(Tm) %>% summarise(count = n())

## # A tibble: 37 × 2
##    Tm    count
##    <chr> <int>
##  1 ATL      33
##  2 BOS      33
##  3 BRK      13
##  4 CHA       6
##  5 CHH       9
##  6 CHI      26
##  7 CHO       7
##  8 CLE      40
##  9 DAL      20
## 10 DEN      29
## # ℹ 27 more rows

df_4 %>% group_by(Tm) %>% summarise(count = n())

## # A tibble: 38 × 2
##    Tm    count
##    <chr> <int>
##  1 ATL      26
##  2 BOS      28
##  3 BRK      16
##  4 CHA      10
##  5 CHH      11
##  6 CHI      35
##  7 CHO       7
##  8 CLE      42
##  9 DAL      25
## 10 DEN      27
## # ℹ 28 more rows

df_5 %>% group_by(Tm) %>% summarise(count = n())

## # A tibble: 38 × 2
##    Tm    count
##    <chr> <int>
##  1 ATL      35
##  2 BOS      27
##  3 BRK      14
##  4 CHA       8
##  5 CHH       5
##  6 CHI      25
##  7 CHO       6
##  8 CLE      39
##  9 DAL      29
## 10 DEN      27
## # ℹ 28 more rows

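Insight: Each subsample shows how many sampled rows (player-game records) come from each team. Because the samples are drawn with replacement, the same record can be counted more than once. If some teams are overrepresented, it might suggest that further analysis or adjustments are needed to achieve a more balanced sample.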

# Draw one boxplot of points per sample; values plotted beyond the
# whiskers are the candidate outliers to inspect
boxplot(df_1[["PTS"]], main = "Points Distribution in df_1")

boxplot(df_2[["PTS"]], main = "Points Distribution in df_2")

boxplot(df_3[["PTS"]], main = "Points Distribution in df_3")

boxplot(df_4[["PTS"]], main = "Points Distribution in df_4")

boxplot(df_5[["PTS"]], main = "Points Distribution in df_5")

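Insight: Outliers are visible as points outside the whiskers in the boxplot. In general such points can represent either extremely high or extremely low performances. In these samples, given quartiles of about 18 and 32 points, only unusually high totals (such as the 60- to 81-point maxima) lie beyond the whiskers, while the lowest values (4 to 6 points) fall within them. These outliers might warrant a closer look to see if they are data errors or valid exceptional performances.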

# Summarise one sample so the subsamples can be compared side by side.
# For an ungrouped data frame this yields a single row with:
#   avg_pts      - mean of PTS, ignoring missing values
#   avg_ast      - mean of AST, ignoring missing values
#   unique_teams - number of distinct values in Tm
common_summary <- function(df) {
  summarise(
    df,
    avg_pts = mean(PTS, na.rm = TRUE),
    avg_ast = mean(AST, na.rm = TRUE),
    unique_teams = n_distinct(Tm)
  )
}

common_summary(df_1)

##    avg_pts  avg_ast unique_teams
## 1 25.69484 3.807512           38

common_summary(df_2)

##    avg_pts  avg_ast unique_teams
## 1 25.74765 3.644366           38

common_summary(df_3)

##   avg_pts  avg_ast unique_teams
## 1 26.1784 3.726526           37

common_summary(df_4)

##    avg_pts  avg_ast unique_teams
## 1 26.14554 3.738263           38

common_summary(df_5)

##    avg_pts  avg_ast unique_teams
## 1 25.75704 3.660798           38

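Average Points (avg_pts): The average points per player-game remains consistent across all subsamples, ranging from about 25.7 to 26.2 (roughly 25.9 overall). This stability is what we expect when every subsample is drawn at random from the same dataset, and it suggests that a 50% sample is large enough to estimate the overall scoring level reliably. It does not by itself show that players score at a similar level across different teams and seasons; that would require comparing those groups directly.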

Average Assists (avg_ast): The average assists across subsamples are also relatively stable, ranging from about 3.64 to 3.81 (roughly 3.7 overall), indicating that the typical contribution to playmaking is estimated consistently no matter which random subsample is used.

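Team Diversity (unique_teams): Four of the five samples contain 38 unique team codes and one (df_3) contains 37, showing that our random sampling method is capturing a diverse set of teams without obvious bias. The single code missing from df_3 is most likely a rarely occurring team that happened not to be drawn.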