# Show each chunk's source code alongside its output in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(stats)
library(pwr) # For power analysis
# Read the NBA CSV from its local path into a base data frame
nba_data <- read.csv(file = "C:/Statistics/nba.csv")
# Print column names, types and leading values as a quick sanity check
str(object = nba_data)
## 'data.frame': 1703 obs. of 19 variables:
## $ bbrID : chr "abdelal01" "abdulma02" "abdulta01" "abdursh01" ...
## $ Date : chr "1993-03-16" "1991-04-02" "1998-04-19" "2001-11-23" ...
## $ Tm : chr "BOS" "DEN" "SAC" "ATL" ...
## $ Opp : chr "GSW" "DAL" "VAN" "DET" ...
## $ TRB : int 10 2 2 12 2 13 10 14 2 10 ...
## $ AST : int 2 6 3 5 0 3 1 1 8 3 ...
## $ STL : int 0 4 1 2 0 0 0 1 5 1 ...
## $ BLK : int 0 0 0 1 0 1 0 0 0 3 ...
## $ PTS : int 25 30 31 50 25 17 18 19 31 17 ...
## $ GmSc : num 22.7 29.7 26.4 46 17.1 16.9 19.2 20.7 33.2 20.6 ...
## $ Season : chr "1992-93" "1990-91" "1997-98" "2001-02" ...
## $ Playoffs : chr "false" "false" "false" "false" ...
## $ Year : int 1993 1991 1998 2002 2019 2021 1990 2015 1988 2014 ...
## $ GameIndex : int 181 64 58 386 160 8 236 124 100 4 ...
## $ GmScMovingZ : num 4.13 3.82 4.11 4.06 3.37 2.58 4.27 4.15 3.16 4.68 ...
## $ GmScMovingZTop2Delta: num 0.24 0.64 1.67 0.84 0.18 0.05 0.02 0.93 0.22 1.16 ...
## $ Date2 : chr "1991-12-04" "1995-12-07" "1998-01-14" "2003-11-28" ...
## $ GmSc2 : num 18.6 40.1 16.9 34.3 16.6 16.8 19.6 18.5 42.3 29.5 ...
## $ GmScMovingZ2 : num 3.89 3.18 2.44 3.22 3.19 2.53 4.25 3.22 2.94 3.52 ...
Null Hypothesis (H₀): The mean Game Score (GmSc) is the same for performances in playoff games and performances in non-playoff games.
Alternative Hypothesis (H₁): The mean GmSc differs between playoff and non-playoff games.
# A-priori power analysis: per-group n required by a two-sided,
# two-sample t-test under the design parameters below.
effect_size <- 0.5 # Cohen's d, conventionally a "medium" effect
alpha <- 0.05 # Type I error rate
power <- 0.8 # Target probability of detecting the effect
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample",
  alternative = "two.sided"
)[["n"]]
print(sample_size)
## [1] 63.76561
# Keep only rows where both the outcome (GmSc) and the grouping variable
# (Playoffs) are present.
nba_data <- nba_data %>%
  filter(!is.na(GmSc), !is.na(Playoffs))

# Split Game Score by playoff status.
# `Playoffs` is read from the CSV as the strings "true"/"false" (see the str()
# output), so comparing it with 1/0 matches no rows and leaves both groups
# empty. Normalise to lower-case text and match explicitly; this also keeps
# working if the column is ever logical or coded 1/0.
playoff_gmsc <- nba_data %>%
  filter(tolower(as.character(Playoffs)) %in% c("true", "1")) %>%
  pull(GmSc)
non_playoff_gmsc <- nba_data %>%
  filter(tolower(as.character(Playoffs)) %in% c("false", "0")) %>%
  pull(GmSc)

# Compare the observed group sizes with the per-group size required by the
# power analysis. Both operands are scalars, so use the scalar `&&`.
actual_sample_playoff <- length(playoff_gmsc)
actual_sample_non_playoff <- length(non_playoff_gmsc)
sufficient_data <- actual_sample_playoff >= sample_size &&
  actual_sample_non_playoff >= sample_size
sufficient_data
## [1] FALSE
If the dataset has enough observations per group, we proceed with the t-test; otherwise, we explain why the test cannot be performed.
# Run the two-sample t-test with unequal variances (Welch) only when both
# groups reached the required size; otherwise report that it was skipped.
if (!sufficient_data) {
  print("Not enough data to perform hypothesis test. Consider increasing sample size.")
} else {
  hyp1_test <- t.test(x = playoff_gmsc, y = non_playoff_gmsc, var.equal = FALSE)
  print(hyp1_test)
}
## [1] "Not enough data to perform hypothesis test. Consider increasing sample size."
# Box plot of Game Score by playoff status.
# The `Playoffs` column holds "false"/"true", so the axis label describes those
# levels (the previous "0 = No, 1 = Yes" did not match the tick labels), and
# the legend gets a readable title instead of the raw `factor(Playoffs)`.
ggplot(nba_data, aes(x = factor(Playoffs), y = GmSc, fill = factor(Playoffs))) +
  geom_boxplot() +
  labs(
    title = "Game Score (GmSc) Distribution by Playoff Status",
    x = "Playoff Game (false = No, true = Yes)",
    y = "Game Score (GmSc)",
    fill = "Playoffs"
  ) +
  theme_minimal()
Null Hypothesis (H₀): The population correlation between total rebounds (TRB) and points scored (PTS) is zero.
Alternative Hypothesis (H₁): The population correlation between TRB and PTS is not zero.
# Drop rows in which either variable is missing
nba_data <- nba_data %>%
  filter(!(is.na(TRB) | is.na(PTS)))
# Two-sided Pearson test of zero correlation between rebounds and points
hyp2_test <- cor.test(x = nba_data$TRB, y = nba_data$PTS, method = "pearson")
print(hyp2_test)
##
## Pearson's product-moment correlation
##
## data: nba_data$TRB and nba_data$PTS
## t = 3.7819, df = 1701, p-value = 0.000161
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04400424 0.13821455
## sample estimates:
## cor
## 0.09131371
# Fisher z-test of H0: rho = 0, as a cross-check on the cor.test() result.
r <- unname(hyp2_test$estimate) # sample Pearson correlation, name dropped
n <- nrow(nba_data) # rows with missing TRB/PTS were filtered out above
z <- atanh(r) # Fisher z transformation: 0.5 * log((1 + r) / (1 - r))
se_z <- 1 / sqrt(n - 3) # approximate standard error of z
z_score <- z / se_z
# Two-sided p-value taken from the upper tail directly: `1 - pnorm(x)` loses
# precision through cancellation and collapses to 0 for large |z|.
p_value_fisher <- 2 * pnorm(abs(z_score), lower.tail = FALSE)
cat("Fisher z-score:", z_score, "\n")
cat("Fisher p-value:", p_value_fisher, "\n")
## Fisher p-value: 0.0001597012
# Scatter plot of points against rebounds with a least-squares trend line
ggplot(data = nba_data, mapping = aes(x = TRB, y = PTS)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  theme_minimal() +
  ggtitle("Correlation Between Total Rebounds and Points Scored") +
  xlab("Total Rebounds (TRB)") +
  ylab("Points Scored (PTS)")
## `geom_smooth()` using formula = 'y ~ x'
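Hypothesis 1: We set out to test whether playoff participation affects Game Score. In the run shown above the sample-size check returned FALSE, so the t-test was not performed and no conclusion about H₀ can be drawn from this output. Had the test run, a p-value below 0.05 would lead us to reject the null hypothesis and conclude that mean GmSc differs between playoff and non-playoff games.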
Hypothesis 2: While the correlation between total rebounds and points is weak (r ≈ 0.091), both the Pearson and Fisher tests show the relationship is statistically significant. This highlights the power of large datasets to detect subtle trends and reminds us to distinguish between statistical significance and practical significance.