# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the NBA dataset from disk
nba_data <- read.csv(file = "C:/Statistics/nba.csv")

# Fix the RNG seed so the five draws below are reproducible
set.seed(123)

# Every sample holds half as many rows as the full dataset (rounded)
sample_size <- round(nrow(nba_data) * 0.5)

# Draw five samples with replacement, so a row may appear more than once
# within a sample. All draws share one RNG stream: keep the calls in this
# order, otherwise each df_* receives different rows.
df_1 <- sample_n(nba_data, size = sample_size, replace = TRUE)
df_2 <- sample_n(nba_data, size = sample_size, replace = TRUE)
df_3 <- sample_n(nba_data, size = sample_size, replace = TRUE)
df_4 <- sample_n(nba_data, size = sample_size, replace = TRUE)
df_5 <- sample_n(nba_data, size = sample_size, replace = TRUE)
# Display the first few rows of each sample
head(df_1)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 doleami01 2007-01-30 MIA MIL 11 4 5 1 11 18.6 2006-07 false 2007
## 2 ellismo01 2016-01-28 IND ATL 5 6 5 1 25 29.9 2015-16 false 2016
## 3 boozeca01 2010-02-09 UTA LAC 14 4 4 0 34 37.5 2009-10 false 2010
## 4 freeljo01 2015-04-13 POR OKC 5 0 0 1 16 12.3 2014-15 false 2015
## 5 brandte01 2000-03-19 MIN MIL 3 13 4 0 28 35.9 1999-00 false 2000
## 6 macksa01 1996-04-02 HOU GSW 3 4 2 0 38 30.7 1995-96 false 1996
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 573 5.19 0.35 2005-02-05 17.0 4.84
## 2 750 2.99 0.18 2006-11-18 27.3 2.81
## 3 512 3.19 0.18 2008-03-15 37.9 3.01
## 4 158 3.02 0.14 2013-04-16 11.0 2.88
## 5 628 3.02 0.11 2001-02-23 36.1 2.91
## 6 60 5.00 1.31 1996-04-21 23.5 3.69
head(df_2)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 jacksji01 1998-04-02 GSW HOU 3 11 1 1 33 31.9 1997-98 false 1998
## 2 tollian01 2020-01-13 POR CHO 11 1 0 0 16 17.6 2019-20 false 2020
## 3 richmmi01 1991-01-31 GSW LAC 6 7 7 0 40 40.7 1990-91 false 1991
## 4 denglu01 2010-11-01 CHI POR 4 2 1 0 40 33.9 2010-11 false 2011
## 5 porzikr01 2017-11-05 NYK IND 8 1 0 6 40 34.3 2017-18 false 2018
## 6 pachuza01 2015-03-20 MIL BRK 21 7 1 0 22 29.2 2014-15 false 2015
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 390 3.62 0.19 2000-01-07 29.1 3.43
## 2 707 3.67 0.06 2013-03-06 20.1 3.61
## 3 201 3.38 0.07 1995-12-15 39.6 3.31
## 4 426 3.35 0.04 2016-04-17 27.9 3.31
## 5 146 2.76 0.31 2015-11-21 27.9 2.45
## 6 841 3.63 0.02 2018-03-11 20.3 3.61
head(df_3)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 lambje01 2017-11-03 CHO SAS 9 6 2 0 27 28.2 2017-18 false 2018
## 2 douglto01 2010-11-04 NYK CHI 1 4 4 0 30 28.3 2010-11 false 2011
## 3 mccloge01 1998-04-12 PHO VAN 8 6 2 0 25 25.3 1997-98 false 1998
## 4 bouchch01 2021-04-08 TOR CHI 19 1 1 1 38 36.3 2020-21 false 2021
## 5 douglto01 2010-11-04 NYK CHI 1 4 4 0 30 28.3 2010-11 false 2011
## 6 bowdlca01 2001-03-27 ATL BOS 4 0 1 3 15 13.6 2000-01 false 2001
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 298 3.74 0.49 2013-12-29 22.1 3.25
## 2 59 3.18 0.06 2011-04-05 26.4 3.12
## 3 497 3.50 0.35 1998-03-09 23.3 3.15
## 4 151 3.61 0.78 2021-04-18 30.5 2.83
## 5 59 3.18 0.06 2011-04-05 26.4 3.12
## 6 79 3.29 0.06 2001-03-22 13.4 3.23
head(df_4)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 cassesa01 2007-11-07 LAC IND 3 8 2 0 35 32.3 2007-08 false 2008
## 2 kiddja01 1996-01-12 DAL PHO 12 16 1 1 33 36.9 1995-96 false 1996
## 3 curryse01 2020-02-28 DAL MIA 2 2 0 0 37 30.7 2019-20 false 2020
## 4 mccolcj01 2018-01-31 POR CHI 5 2 1 0 50 40.6 2017-18 false 2018
## 5 mcdankj01 2017-03-23 BRK PHO 8 1 2 1 16 16.5 2016-17 false 2017
## 6 kofoeba01 1992-01-14 SEA CHH 5 9 0 0 15 18.7 1991-92 false 1992
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 1056 3.32 0.29 1995-05-30 32.2 3.03
## 2 110 3.03 0.03 2008-04-16 30.6 3.00
## 3 263 3.07 0.19 2019-12-12 27.8 2.88
## 4 335 3.34 0.40 2015-12-27 33.7 2.94
## 5 143 3.95 1.14 2014-11-29 16.7 2.81
## 6 86 6.75 3.36 1988-12-03 9.7 3.39
head(df_5)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 watsoja01 1995-02-01 UTA DEN 2 4 1 1 10 12.8 1994-95 false 1995
## 2 greenge01 2012-04-08 NJN CLE 5 1 0 0 32 24.6 2011-12 false 2012
## 3 washier01 1998-01-02 DEN HOU 3 1 2 0 21 17.2 1997-98 false 1998
## 4 mobleer01 1995-03-10 MIL WSB 12 2 0 4 12 15.8 1994-95 false 1995
## 5 jonesso01 2012-03-03 NOH IND 9 2 0 0 12 13.7 2011-12 false 2012
## 6 fieldke01 1985-03-25 MIL NYK 7 0 0 0 21 16.9 1984-85 false 1985
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 34 3.19 0.60 1994-11-14 10.9 2.59
## 2 209 3.82 0.11 2016-04-01 24.9 3.71
## 3 25 2.67 0.21 1999-04-10 16.3 2.46
## 4 28 3.18 0.24 1995-04-07 14.7 2.94
## 5 281 3.39 0.35 2007-04-17 12.4 3.04
## 6 48 3.16 0.14 1986-03-03 19.2 3.02
# Generate summary statistics for each sample
# Summary statistics for key columns in the first sample
summary(df_1$PTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 18.00 24.00 25.69 32.00 60.00
summary(df_1$AST)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.808 5.000 18.000
# NOTE(review): Tm is a character column, so summary() reports only its
# length, class, and mode rather than anything about the teams themselves;
# table(df_1$Tm) would give per-team counts instead.
summary(df_1$Tm)
## Length Class Mode
## 852 character character
# Repeat for other samples
summary(df_2$PTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 18.00 24.00 25.75 32.00 81.00
summary(df_2$AST)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.644 5.000 22.000
summary(df_2$Tm)
## Length Class Mode
## 852 character character
summary(df_3$PTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 18.00 25.00 26.18 32.00 62.00
summary(df_3$AST)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.727 5.000 22.000
summary(df_3$Tm)
## Length Class Mode
## 852 character character
summary(df_4$PTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 19.00 24.00 26.15 32.00 62.00
summary(df_4$AST)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.738 5.000 17.000
summary(df_4$Tm)
## Length Class Mode
## 852 character character
summary(df_5$PTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 19.00 24.00 25.76 32.00 81.00
summary(df_5$AST)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.661 5.000 19.000
summary(df_5$Tm)
## Length Class Mode
## 852 character character
Explanation of Key Metrics
Scoring Trend (avg_pts): Across the five samples, the mean points per player per game ranges from 25.69 to 26.18 (roughly 25.9 overall). The sample means are close to one another, with small variations depending on the sample. Within each sample the spread is wide (minimum 4–6, maximum 60–81), so some players score much higher than the mean while others score much lower.
Assist Trend (avg_ast): The mean assists per player per game ranges from 3.64 to 3.81 across the samples (roughly 3.7 overall), with a median of 3 in every sample. This shows that while a few players are primary playmakers (maxima of 17–22 assists), many players contribute fewer assists, potentially indicating a more isolated or specific role in the team dynamics.
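Diversity in Teams (unique_teams): The samples contain 37 or 38 distinct team codes (df_3 has 37; the other four have 38), showing a broad distribution across different teams. Different abbreviations are counted separately (for example, CHA, CHH, and CHO each appear as their own code). This breadth helps ensure that the random samples are representative of a variety of team performances, reducing the chance of sample bias toward any specific teams.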
# Subsample Analysis: Team Representation
df_1 %>% group_by(Tm) %>% summarise(count = n())
## # A tibble: 38 × 2
## Tm count
## <chr> <int>
## 1 ATL 23
## 2 BOS 27
## 3 BRK 14
## 4 CHA 10
## 5 CHH 9
## 6 CHI 26
## 7 CHO 10
## 8 CLE 35
## 9 DAL 25
## 10 DEN 35
## # ℹ 28 more rows
df_2 %>% group_by(Tm) %>% summarise(count = n())
## # A tibble: 38 × 2
## Tm count
## <chr> <int>
## 1 ATL 34
## 2 BOS 33
## 3 BRK 10
## 4 CHA 10
## 5 CHH 8
## 6 CHI 18
## 7 CHO 8
## 8 CLE 34
## 9 DAL 24
## 10 DEN 29
## # ℹ 28 more rows
df_3 %>% group_by(Tm) %>% summarise(count = n())
## # A tibble: 37 × 2
## Tm count
## <chr> <int>
## 1 ATL 33
## 2 BOS 33
## 3 BRK 13
## 4 CHA 6
## 5 CHH 9
## 6 CHI 26
## 7 CHO 7
## 8 CLE 40
## 9 DAL 20
## 10 DEN 29
## # ℹ 27 more rows
df_4 %>% group_by(Tm) %>% summarise(count = n())
## # A tibble: 38 × 2
## Tm count
## <chr> <int>
## 1 ATL 26
## 2 BOS 28
## 3 BRK 16
## 4 CHA 10
## 5 CHH 11
## 6 CHI 35
## 7 CHO 7
## 8 CLE 42
## 9 DAL 25
## 10 DEN 27
## # ℹ 28 more rows
df_5 %>% group_by(Tm) %>% summarise(count = n())
## # A tibble: 38 × 2
## Tm count
## <chr> <int>
## 1 ATL 35
## 2 BOS 27
## 3 BRK 14
## 4 CHA 8
## 5 CHH 5
## 6 CHI 25
## 7 CHO 6
## 8 CLE 39
## 9 DAL 29
## 10 DEN 27
## # ℹ 28 more rows
Insight: Each subsample shows how its rows (individual player-game records) are distributed across teams. If some teams are overrepresented, it might suggest that further analysis or adjustments are needed to achieve a more balanced sample.
# Identify anomalies by checking for outliers
# Draw one boxplot of PTS per sample, in order df_1..df_5, each titled with
# the name of the sample it came from. invisible() keeps the list returned
# by Map() from being printed.
invisible(Map(
  function(pts, sample_name) {
    boxplot(pts, main = paste("Points Distribution in", sample_name))
  },
  list(df_1$PTS, df_2$PTS, df_3$PTS, df_4$PTS, df_5$PTS),
  paste0("df_", 1:5)
))
Insight: Outliers are visible as points outside the whiskers in the
boxplot. These outliers could represent either extremely high or low
performances, which might warrant a closer look to see if they are data
errors or valid exceptional performances.
# Find common patterns across all subsamples
#
# common_summary() condenses a sample into three summary columns:
#   avg_pts      - mean of PTS, ignoring missing values
#   avg_ast      - mean of AST, ignoring missing values
#   unique_teams - number of distinct values in Tm
# `df` must provide the PTS, AST, and Tm columns; an ungrouped data frame
# yields a single-row result.
common_summary <- function(df) {
  summarise(
    df,
    avg_pts = mean(PTS, na.rm = TRUE),
    avg_ast = mean(AST, na.rm = TRUE),
    unique_teams = n_distinct(Tm)
  )
}
common_summary(df_1)
## avg_pts avg_ast unique_teams
## 1 25.69484 3.807512 38
common_summary(df_2)
## avg_pts avg_ast unique_teams
## 1 25.74765 3.644366 38
common_summary(df_3)
## avg_pts avg_ast unique_teams
## 1 26.1784 3.726526 37
common_summary(df_4)
## avg_pts avg_ast unique_teams
## 1 26.14554 3.738263 38
common_summary(df_5)
## avg_pts avg_ast unique_teams
## 1 25.75704 3.660798 38
Average Points (avg_pts): The average points per player per game remain consistent across all subsamples, ranging from 25.69 to 26.18 (roughly 25.9 overall). This suggests that the sample mean is a stable estimate of the general scoring level in the dataset; comparing scoring across specific teams or seasons would require grouping by those variables.
Average Assists (avg_ast): The average assists across subsamples are also relatively stable, ranging from 3.64 to 3.81 (roughly 3.7 overall), indicating that players tend to have a similar contribution to playmaking across the dataset.
Team Diversity (unique_teams): Four of the five samples contain 38 unique teams and one (df_3) contains 37, showing that our random sampling method is capturing a diverse set of teams without obviously favoring any of them.