# Set the default knitr chunk option `echo = TRUE`, so the source code of each
# chunk is shown alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

Load Libraries and Data

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(stats)
library(pwr)  # For power analysis

# Read the NBA data from its CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific.
nba_data <- utils::read.csv(file = "C:/Statistics/nba.csv")

# Print a compact per-column summary (name, type, leading values)
utils::str(object = nba_data)
## 'data.frame':    1703 obs. of  19 variables:
##  $ bbrID               : chr  "abdelal01" "abdulma02" "abdulta01" "abdursh01" ...
##  $ Date                : chr  "1993-03-16" "1991-04-02" "1998-04-19" "2001-11-23" ...
##  $ Tm                  : chr  "BOS" "DEN" "SAC" "ATL" ...
##  $ Opp                 : chr  "GSW" "DAL" "VAN" "DET" ...
##  $ TRB                 : int  10 2 2 12 2 13 10 14 2 10 ...
##  $ AST                 : int  2 6 3 5 0 3 1 1 8 3 ...
##  $ STL                 : int  0 4 1 2 0 0 0 1 5 1 ...
##  $ BLK                 : int  0 0 0 1 0 1 0 0 0 3 ...
##  $ PTS                 : int  25 30 31 50 25 17 18 19 31 17 ...
##  $ GmSc                : num  22.7 29.7 26.4 46 17.1 16.9 19.2 20.7 33.2 20.6 ...
##  $ Season              : chr  "1992-93" "1990-91" "1997-98" "2001-02" ...
##  $ Playoffs            : chr  "false" "false" "false" "false" ...
##  $ Year                : int  1993 1991 1998 2002 2019 2021 1990 2015 1988 2014 ...
##  $ GameIndex           : int  181 64 58 386 160 8 236 124 100 4 ...
##  $ GmScMovingZ         : num  4.13 3.82 4.11 4.06 3.37 2.58 4.27 4.15 3.16 4.68 ...
##  $ GmScMovingZTop2Delta: num  0.24 0.64 1.67 0.84 0.18 0.05 0.02 0.93 0.22 1.16 ...
##  $ Date2               : chr  "1991-12-04" "1995-12-07" "1998-01-14" "2003-11-28" ...
##  $ GmSc2               : num  18.6 40.1 16.9 34.3 16.6 16.8 19.6 18.5 42.3 29.5 ...
##  $ GmScMovingZ2        : num  3.89 3.18 2.44 3.22 3.19 2.53 4.25 3.22 2.94 3.52 ...

Hypothesis 1: Playoff Impact on Game Score

Null Hypothesis (H₀): There is no significant difference in the average Game Score (GmSc) between players who played in the playoffs and those who did not.

Alternative Hypothesis (H₁): There is a significant difference in the average GmSc between the two groups.

Neyman-Pearson Framework Considerations

  • Alpha Level (α): Set to 0.05 to control for Type I error.
  • Power Level (1 - β): Set to 0.8, which limits the Type II error rate (β) to 0.2.
  • Effect Size (Cohen’s d): Minimum effect size of 0.5 (moderate effect) chosen based on practical significance in performance difference.

Sample Size Calculation

# Power-analysis inputs for a two-sided, two-sample t-test
alpha <- 0.05        # significance level (Type I error rate)
power <- 0.8         # target power (1 - Type II error rate)
effect_size <- 0.5   # Cohen's d regarded as the smallest effect of interest

# Required number of observations *per group*; the result is not rounded
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample",
  alternative = "two.sided"
)[["n"]]
print(sample_size)
## [1] 63.76561

Check If We Have Enough Data

# Check whether each group is large enough for the planned t-test.
# Reads `nba_data` and `sample_size`; creates `playoff_gmsc`,
# `non_playoff_gmsc`, `actual_sample_playoff`, `actual_sample_non_playoff`
# and `sufficient_data`.

# Keep only rows where both the outcome and the grouping flag are present
nba_data <- nba_data %>%
  filter(!is.na(GmSc), !is.na(Playoffs))

# `Playoffs` is stored as character ("false", ...), so comparing it with the
# numbers 1 / 0 never matches and leaves both groups empty. Normalise the
# flag to lower-case text and accept either the "true"/"false" or the
# "1"/"0" encoding.
# NOTE(review): "true" is presumed from the "false" values shown by str();
# confirm with table(nba_data$Playoffs).
playoff_gmsc <- nba_data %>%
  filter(tolower(trimws(as.character(Playoffs))) %in% c("true", "1")) %>%
  pull(GmSc)
non_playoff_gmsc <- nba_data %>%
  filter(tolower(trimws(as.character(Playoffs))) %in% c("false", "0")) %>%
  pull(GmSc)

# Compare actual group sizes to the required per-group sample size
actual_sample_playoff <- length(playoff_gmsc)
actual_sample_non_playoff <- length(non_playoff_gmsc)

# Both comparisons are scalars, so combine them with the scalar `&&`
sufficient_data <- actual_sample_playoff >= sample_size &&
  actual_sample_non_playoff >= sample_size
sufficient_data
## [1] FALSE

Hypothesis Test

If the dataset has enough observations per group, we proceed with the t-test; otherwise, we explain why the test cannot be performed.

# Run the two-sample t-test (unequal variances) only when both groups passed
# the sample-size check; otherwise report that the test was skipped.
if (!sufficient_data) {
  print("Not enough data to perform hypothesis test. Consider increasing sample size.")
} else {
  hyp1_test <- t.test(
    x = playoff_gmsc,
    y = non_playoff_gmsc,
    var.equal = FALSE
  )
  print(hyp1_test)
}
## [1] "Not enough data to perform hypothesis test. Consider increasing sample size."

Interpretation

  • If p-value < α (0.05), reject H₀, indicating a significant difference in GmSc between playoff and non-playoff players.
  • If p-value ≥ α, fail to reject H₀, meaning we do not have enough evidence to conclude a significant difference.

Visualization: Boxplot of GmSc by Playoff Status

# Boxplot of Game Score for each playoff status.
# `Playoffs` holds text flags ("false", and presumably "true") rather than
# 0/1, so the axis label describes those values; the fill legend gets a
# readable title instead of the raw expression `factor(Playoffs)`.
ggplot(
  nba_data,
  aes(x = factor(Playoffs), y = GmSc, fill = factor(Playoffs))
) +
  geom_boxplot() +
  labs(
    title = "Game Score (GmSc) Distribution by Playoff Status",
    x = "Playoff Game (false = No, true = Yes)",
    y = "Game Score (GmSc)",
    fill = "Playoff Game"
  ) +
  theme_minimal()

Hypothesis 2: Correlation Between Rebounds and Points Scored

Null Hypothesis (H₀): There is no correlation between total rebounds (TRB) and points scored (PTS).

Alternative Hypothesis (H₁): There is a significant correlation between TRB and PTS.

# Drop rows that are missing either of the two variables being correlated
nba_data <- filter(nba_data, !is.na(TRB), !is.na(PTS))

# Pearson product-moment correlation test between rebounds and points
hyp2_test <- cor.test(
  x = nba_data$TRB,
  y = nba_data$PTS,
  method = "pearson"
)
print(hyp2_test)
## 
##  Pearson's product-moment correlation
## 
## data:  nba_data$TRB and nba_data$PTS
## t = 3.7819, df = 1701, p-value = 0.000161
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04400424 0.13821455
## sample estimates:
##        cor 
## 0.09131371
# Fisher z-test of "true correlation = 0", as a cross-check of the Pearson
# test. Reads `hyp2_test` and `nba_data`; creates `r`, `n`, `z`, `se_z`,
# `z_score` and `p_value_fisher`.
r <- unname(hyp2_test$estimate)  # drop the "cor" name from derived values
n <- nrow(nba_data)
z <- atanh(r)  # Fisher z transformation: 0.5 * log((1 + r) / (1 - r))
se_z <- 1 / sqrt(n - 3)
z_score <- z / se_z
# Two-sided p-value taken from the lower tail directly: `1 - pnorm(x)`
# rounds to exactly 0 once x is large, whereas `pnorm(-x)` stays accurate.
p_value_fisher <- 2 * pnorm(-abs(z_score))

cat("Fisher z-score:", z_score, "\n")
## Fisher z-score: 3.775478
cat("Fisher p-value:", p_value_fisher, "\n")
## Fisher p-value: 0.0001597012

Visualization: Scatter Plot of TRB vs. PTS

ggplot(nba_data, aes(x = TRB, y = PTS)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(title = "Correlation Between Total Rebounds and Points Scored",
       x = "Total Rebounds (TRB)",
       y = "Points Scored (PTS)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Interpretation of Scatter Plot and Correlation Test

  • The scatter plot shows a weak positive correlation between total rebounds (TRB) and points scored (PTS), with a slight upward trend in the fitted line.
  • The Pearson correlation coefficient is approximately 0.091, which indicates a weak correlation.
  • The p-value is very small (0.000161), which means the correlation is statistically significant at the 0.05 level.
  • The relationship is weak: the statistical significance suggests that players who grab more rebounds tend to score slightly more points, but the effect size is minimal (r² ≈ 0.008, i.e., less than 1% of the variance in points is associated with rebounds).

Conclusion

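Hypothesis 1: We set out to test whether playoff participation impacts Game Score. In the output shown above, the sample-size check returned FALSE, so the t-test was not run and no conclusion can be drawn for this hypothesis. Once the test is run, a p-value below 0.05 would lead us to reject the null hypothesis and conclude that there’s a significant difference in performance.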

Hypothesis 2: While the correlation between total rebounds and points is weak (r ≈ 0.091), both the Pearson and Fisher tests show the relationship is statistically significant. This highlights the power of large datasets to detect subtle trends and reminds us to distinguish between statistical significance and practical significance.