# Read the NBA CSV file into a data frame
nba_data <- read.csv(file = "C:/Statistics/nba.csv")
# Print a compact, column-by-column overview of the data
# (glimpse() and %>% are assumed to be attached already, e.g. via dplyr)
nba_data %>% glimpse()
## Rows: 1,703
## Columns: 19
## $ bbrID <chr> "abdelal01", "abdulma02", "abdulta01", "abdursh01…
## $ Date <chr> "1993-03-16", "1991-04-02", "1998-04-19", "2001-1…
## $ Tm <chr> "BOS", "DEN", "SAC", "ATL", "OKC", "MIA", "ORL", …
## $ Opp <chr> "GSW", "DAL", "VAN", "DET", "CHO", "PHI", "WSB", …
## $ TRB <int> 10, 2, 2, 12, 2, 13, 10, 14, 2, 10, 4, 5, 10, 2, …
## $ AST <int> 2, 6, 3, 5, 0, 3, 1, 1, 8, 3, 3, 9, 2, 2, 0, 2, 1…
## $ STL <int> 0, 4, 1, 2, 0, 0, 0, 1, 5, 1, 4, 1, 1, 0, 1, 1, 2…
## $ BLK <int> 0, 0, 0, 1, 0, 1, 0, 0, 0, 3, 0, 0, 3, 1, 1, 2, 1…
## $ PTS <int> 25, 30, 31, 50, 25, 17, 18, 19, 31, 17, 22, 41, 2…
## $ GmSc <dbl> 22.7, 29.7, 26.4, 46.0, 17.1, 16.9, 19.2, 20.7, 3…
## $ Season <chr> "1992-93", "1990-91", "1997-98", "2001-02", "2018…
## $ Playoffs <chr> "false", "false", "false", "false", "false", "fal…
## $ Year <int> 1993, 1991, 1998, 2002, 2019, 2021, 1990, 2015, 1…
## $ GameIndex <int> 181, 64, 58, 386, 160, 8, 236, 124, 100, 4, 4, 25…
## $ GmScMovingZ <dbl> 4.13, 3.82, 4.11, 4.06, 3.37, 2.58, 4.27, 4.15, 3…
## $ GmScMovingZTop2Delta <dbl> 0.24, 0.64, 1.67, 0.84, 0.18, 0.05, 0.02, 0.93, 0…
## $ Date2 <chr> "1991-12-04", "1995-12-07", "1998-01-14", "2003-1…
## $ GmSc2 <dbl> 18.6, 40.1, 16.9, 34.3, 16.6, 16.8, 19.6, 18.5, 4…
## $ GmScMovingZ2 <dbl> 3.89, 3.18, 2.44, 3.22, 3.19, 2.53, 4.25, 3.22, 2…
# Bin PTS and AST into three categories and store each as an ordered factor
# (Low < Medium < High). case_when() checks its conditions in order, so each
# branch only needs the upper bound of its bin.
nba_data <- nba_data %>%
  mutate(
    Scoring_Level = factor(
      case_when(
        PTS < 10 ~ "Low",
        PTS < 20 ~ "Medium",
        PTS >= 20 ~ "High"
      ),
      levels = c("Low", "Medium", "High"),
      ordered = TRUE
    ),
    Assist_Level = factor(
      case_when(
        AST < 3 ~ "Low",
        AST < 7 ~ "Medium",
        AST >= 7 ~ "High"
      ),
      levels = c("Low", "Medium", "High"),
      ordered = TRUE
    )
  )
# Boxplot of points scored, one box per scoring level; fill follows the level
ggplot(
  data = nba_data,
  mapping = aes(x = Scoring_Level, y = PTS, fill = Scoring_Level)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of Points Scored by Scoring Level",
    x = "Scoring Level",
    y = "Points Scored"
  )
# Violin plot of assists, one violin per assist level; fill follows the level
ggplot(
  data = nba_data,
  mapping = aes(x = Assist_Level, y = AST, fill = Assist_Level)
) +
  geom_violin() +
  theme_minimal() +
  labs(
    title = "Violin Plot of Assists by Assist Level",
    x = "Assist Level",
    y = "Assists"
  )
We convert the ordered scoring and assist levels to their numeric codes and compute the Pearson correlation between each coded level and the numeric variable it was derived from (PTS and AST, respectively).
# Replace each factor level by its position in the level order, stored as a
# double, so the levels can be passed to cor()
nba_data[["Scoring_Level_Num"]] <- as.numeric(nba_data[["Scoring_Level"]])
nba_data[["Assist_Level_Num"]] <- as.numeric(nba_data[["Assist_Level"]])
# Pearson correlation between each raw variable and its coded level
corr_pts <- with(nba_data, cor(PTS, Scoring_Level_Num, method = "pearson"))
corr_ast <- with(nba_data, cor(AST, Assist_Level_Num, method = "pearson"))
# Collect both coefficients in one table and print it
corr_results <- data.frame(
  Variable = c("PTS & Scoring Level", "AST & Assist Level"),
  Correlation = c(corr_pts, corr_ast)
)
print(corr_results)
## Variable Correlation
## 1 PTS & Scoring Level 0.6615882
## 2 AST & Assist Level 0.8892969
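# Interpretation:
# 1) Both correlations are positive, as expected, because each level is derived directly from its numeric variable.
# 2) AST & Assist Level is strongly correlated (about 0.89), while PTS & Scoring Level is only moderately correlated (about 0.66).
#    A likely reason (not verified here) is that most games in this dataset reach 20+ points, so Scoring Level varies little while PTS still varies widely.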
#Confidence Intervals: We define a function that computes a 95% confidence interval for the mean, and apply it to points scored and to assists.
# Compute a two-sided t-based confidence interval for the mean of a numeric
# vector.
#
# Arguments:
#   data        Numeric vector; missing values are ignored.
#   confidence  Confidence level, strictly between 0 and 1 (default 0.95).
#
# Returns:
#   Numeric vector of length 2: c(lower bound, upper bound). Both bounds are
#   NA when fewer than two non-missing values are available, because the
#   standard deviation is undefined in that case.
confidence_interval <- function(data, confidence = 0.95) {
  # Drop missing values once so the mean, the standard deviation and the
  # sample size all refer to the same observations. Using length(data) here
  # would count NAs and shrink the standard error.
  values <- data[!is.na(data)]
  n <- length(values)
  if (n < 2) {
    return(c(NA_real_, NA_real_))
  }
  mean_val <- mean(values)
  std_err <- sd(values) / sqrt(n)
  # Upper-tail t quantile for a two-sided interval with n - 1 degrees of freedom
  margin_of_error <- qt((1 + confidence) / 2, df = n - 1) * std_err
  c(mean_val - margin_of_error, mean_val + margin_of_error)
}
# 95% confidence intervals (the function's default level) for points and assists
ci_pts <- confidence_interval(nba_data[["PTS"]])
ci_ast <- confidence_interval(nba_data[["AST"]])
# Put the lower and upper bounds side by side, one row per variable, and print
ci_results <- data.frame(
  Variable = c("PTS (Points Scored)", "AST (Assists)"),
  CI_Lower = c(ci_pts[[1]], ci_ast[[1]]),
  CI_Upper = c(ci_pts[[2]], ci_ast[[2]])
)
print(ci_results)
## Variable CI_Lower CI_Upper
## 1 PTS (Points Scored) 25.571603 26.551708
## 2 AST (Assists) 3.584751 3.896165
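Conclusion: This analysis highlights the relationships between performance metrics (points and assists) and the categorized levels derived from them for NBA players. We visualized their distributions with box and violin plots, quantified the association between each metric and its level with Pearson correlations, and estimated the mean of each metric with 95% confidence intervals.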