# Read the NBA CSV file from disk into a data frame
nba_data <- utils::read.csv(file = "C:/Statistics/nba.csv")

# Print a compact per-column overview (name, type, leading values)
dplyr::glimpse(nba_data)
## Rows: 1,703
## Columns: 19
## $ bbrID                <chr> "abdelal01", "abdulma02", "abdulta01", "abdursh01…
## $ Date                 <chr> "1993-03-16", "1991-04-02", "1998-04-19", "2001-1…
## $ Tm                   <chr> "BOS", "DEN", "SAC", "ATL", "OKC", "MIA", "ORL", …
## $ Opp                  <chr> "GSW", "DAL", "VAN", "DET", "CHO", "PHI", "WSB", …
## $ TRB                  <int> 10, 2, 2, 12, 2, 13, 10, 14, 2, 10, 4, 5, 10, 2, …
## $ AST                  <int> 2, 6, 3, 5, 0, 3, 1, 1, 8, 3, 3, 9, 2, 2, 0, 2, 1…
## $ STL                  <int> 0, 4, 1, 2, 0, 0, 0, 1, 5, 1, 4, 1, 1, 0, 1, 1, 2…
## $ BLK                  <int> 0, 0, 0, 1, 0, 1, 0, 0, 0, 3, 0, 0, 3, 1, 1, 2, 1…
## $ PTS                  <int> 25, 30, 31, 50, 25, 17, 18, 19, 31, 17, 22, 41, 2…
## $ GmSc                 <dbl> 22.7, 29.7, 26.4, 46.0, 17.1, 16.9, 19.2, 20.7, 3…
## $ Season               <chr> "1992-93", "1990-91", "1997-98", "2001-02", "2018…
## $ Playoffs             <chr> "false", "false", "false", "false", "false", "fal…
## $ Year                 <int> 1993, 1991, 1998, 2002, 2019, 2021, 1990, 2015, 1…
## $ GameIndex            <int> 181, 64, 58, 386, 160, 8, 236, 124, 100, 4, 4, 25…
## $ GmScMovingZ          <dbl> 4.13, 3.82, 4.11, 4.06, 3.37, 2.58, 4.27, 4.15, 3…
## $ GmScMovingZTop2Delta <dbl> 0.24, 0.64, 1.67, 0.84, 0.18, 0.05, 0.02, 0.93, 0…
## $ Date2                <chr> "1991-12-04", "1995-12-07", "1998-01-14", "2003-1…
## $ GmSc2                <dbl> 18.6, 40.1, 16.9, 34.3, 16.6, 16.8, 19.6, 18.5, 4…
## $ GmScMovingZ2         <dbl> 3.89, 3.18, 2.44, 3.22, 3.19, 2.53, 4.25, 3.22, 2…

We create new categorical variables for scoring and assist levels based on thresholds.

# Bin points and assists into three ordered tiers (Low < Medium < High).
# Intervals are closed on the left: PTS -> [<10), [10, 20), [20+);
# AST -> [<3), [3, 7), [7+). Missing inputs stay missing.
# cut() with ordered_result = TRUE yields the ordered factor directly,
# so no separate character -> factor conversion step is needed.
nba_data <- nba_data %>%
  mutate(
    Scoring_Level = cut(
      PTS,
      breaks = c(-Inf, 10, 20, Inf),
      labels = c("Low", "Medium", "High"),
      right = FALSE,
      include.lowest = TRUE,
      ordered_result = TRUE
    ),
    Assist_Level = cut(
      AST,
      breaks = c(-Inf, 3, 7, Inf),
      labels = c("Low", "Medium", "High"),
      right = FALSE,
      include.lowest = TRUE,
      ordered_result = TRUE
    )
  )

# Box plot: distribution of points within each scoring tier,
# with one fill colour per tier
nba_data %>%
  ggplot(aes(x = Scoring_Level, y = PTS, fill = Scoring_Level)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Points Scored by Scoring Level",
    x = "Scoring Level",
    y = "Points Scored"
  ) +
  theme_minimal()

# Violin plot: distribution of assists within each assist tier,
# with one fill colour per tier
ggplot(
  data = nba_data,
  mapping = aes(x = Assist_Level, y = AST, fill = Assist_Level)
) +
  geom_violin() +
  labs(
    title = "Violin Plot of Assists by Assist Level",
    x = "Assist Level",
    y = "Assists"
  ) +
  theme_minimal()

Correlation Analysis

We convert the scoring and assist levels into numeric values and compute Pearson correlations with the original numerical features.

# Store the integer codes of each tier factor as numeric columns so
# they can be passed to cor()
nba_data[["Scoring_Level_Num"]] <- as.numeric(nba_data[["Scoring_Level"]])
nba_data[["Assist_Level_Num"]] <- as.numeric(nba_data[["Assist_Level"]])

# Pearson correlation between each raw statistic and its tier code
corr_pts <- with(
  nba_data,
  cor(PTS, Scoring_Level_Num, method = "pearson")
)
corr_ast <- with(
  nba_data,
  cor(AST, Assist_Level_Num, method = "pearson")
)

# Collect both coefficients in a small table and display it
corr_results <- data.frame(
  Variable = c("PTS & Scoring Level", "AST & Assist Level"),
  Correlation = c(corr_pts, corr_ast)
)

print(corr_results)
##              Variable Correlation
## 1 PTS & Scoring Level   0.6615882
## 2  AST & Assist Level   0.8892969

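#Interpretation: 1) The correlation between PTS and Scoring Level is positive but only moderate (about 0.66), even though the level is derived from PTS: collapsing points into three bins discards the variation in points within each bin.

#2) The correlation between AST and Assist Level is strong (about 0.89), so the three assist bins track the underlying assist counts closely.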

#Confidence Intervals: We define a function to compute the 95% confidence interval for both points scored and assists.

#' Two-sided t-based confidence interval for a sample mean
#'
#' Missing values are dropped before any computation, so the mean, the
#' standard deviation, the sample size in the standard error, and the
#' degrees of freedom all refer to the same set of non-missing observations.
#'
#' @param data A numeric vector; may contain `NA`.
#' @param confidence Confidence level of the interval (default 0.95).
#' @return A numeric vector of length 2: `c(lower, upper)`. Both entries are
#'   `NA` when fewer than two non-missing observations are available, because
#'   the sample standard deviation is undefined in that case.
confidence_interval <- function(data, confidence = 0.95) {
  observed <- data[!is.na(data)]
  n_obs <- length(observed)

  # With n < 2 there is no sd() and no valid t quantile (df would be <= 0)
  if (n_obs < 2) {
    return(c(NA_real_, NA_real_))
  }

  mean_val <- mean(observed)
  # Use the non-missing count here; length(data) would also count NAs and
  # make the interval too narrow
  std_err <- sd(observed) / sqrt(n_obs)
  margin_of_error <- qt((1 + confidence) / 2, df = n_obs - 1) * std_err

  c(mean_val - margin_of_error, mean_val + margin_of_error)
}

# Confidence intervals (default level) for mean points and mean assists
ci_pts <- confidence_interval(nba_data[["PTS"]])
ci_ast <- confidence_interval(nba_data[["AST"]])

# One row per statistic: element 1 is the lower bound, element 2 the upper
ci_results <- data.frame(
  Variable = c("PTS (Points Scored)", "AST (Assists)"),
  CI_Lower = c(ci_pts[[1]], ci_ast[[1]]),
  CI_Upper = c(ci_pts[[2]], ci_ast[[2]])
)

print(ci_results)
##              Variable  CI_Lower  CI_Upper
## 1 PTS (Points Scored) 25.571603 26.551708
## 2       AST (Assists)  3.584751  3.896165

Conclusion: This analysis highlights the relationships between performance metrics and categorized levels for NBA players. We also assessed their distributions and variability with statistical tools such as correlation and confidence intervals.