assignment

# Read the NBA game-level CSV into a data frame
nba_data <- read.csv(file = "C:/Statistics/nba.csv")

# Preview every column: name, type, and the first few values
# NOTE(review): glimpse() is not base R — presumably dplyr is attached in a
# setup step that is not shown here; confirm.
glimpse(x = nba_data)

## Rows: 1,703
## Columns: 19
## $ bbrID                <chr> "abdelal01", "abdulma02", "abdulta01", "abdursh01…
## $ Date                 <chr> "1993-03-16", "1991-04-02", "1998-04-19", "2001-1…
## $ Tm                   <chr> "BOS", "DEN", "SAC", "ATL", "OKC", "MIA", "ORL", …
## $ Opp                  <chr> "GSW", "DAL", "VAN", "DET", "CHO", "PHI", "WSB", …
## $ TRB                  <int> 10, 2, 2, 12, 2, 13, 10, 14, 2, 10, 4, 5, 10, 2, …
## $ AST                  <int> 2, 6, 3, 5, 0, 3, 1, 1, 8, 3, 3, 9, 2, 2, 0, 2, 1…
## $ STL                  <int> 0, 4, 1, 2, 0, 0, 0, 1, 5, 1, 4, 1, 1, 0, 1, 1, 2…
## $ BLK                  <int> 0, 0, 0, 1, 0, 1, 0, 0, 0, 3, 0, 0, 3, 1, 1, 2, 1…
## $ PTS                  <int> 25, 30, 31, 50, 25, 17, 18, 19, 31, 17, 22, 41, 2…
## $ GmSc                 <dbl> 22.7, 29.7, 26.4, 46.0, 17.1, 16.9, 19.2, 20.7, 3…
## $ Season               <chr> "1992-93", "1990-91", "1997-98", "2001-02", "2018…
## $ Playoffs             <chr> "false", "false", "false", "false", "false", "fal…
## $ Year                 <int> 1993, 1991, 1998, 2002, 2019, 2021, 1990, 2015, 1…
## $ GameIndex            <int> 181, 64, 58, 386, 160, 8, 236, 124, 100, 4, 4, 25…
## $ GmScMovingZ          <dbl> 4.13, 3.82, 4.11, 4.06, 3.37, 2.58, 4.27, 4.15, 3…
## $ GmScMovingZTop2Delta <dbl> 0.24, 0.64, 1.67, 0.84, 0.18, 0.05, 0.02, 0.93, 0…
## $ Date2                <chr> "1991-12-04", "1995-12-07", "1998-01-14", "2003-1…
## $ GmSc2                <dbl> 18.6, 40.1, 16.9, 34.3, 16.6, 16.8, 19.6, 18.5, 4…
## $ GmScMovingZ2         <dbl> 3.89, 3.18, 2.44, 3.22, 3.19, 2.53, 4.25, 3.22, 2…

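We create new categorical variables for scoring and assist levels based on thresholds: points are binned as Low (< 10), Medium (10–19), or High (≥ 20), and assists as Low (< 3), Medium (3–6), or High (≥ 7).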

# Bin points and assists into "Low" / "Medium" / "High" labels.
# case_when() takes the first condition that is TRUE for each row, so a later
# branch only ever applies to rows that the earlier branches did not match.
# Rows where the statistic is NA match no branch and stay NA.
nba_data <- mutate(
  nba_data,
  Scoring_Level = case_when(
    PTS < 10 ~ "Low",
    PTS < 20 ~ "Medium",
    PTS >= 20 ~ "High"
  ),
  Assist_Level = case_when(
    AST < 3 ~ "Low",
    AST < 7 ~ "Medium",
    AST >= 7 ~ "High"
  )
)

# Recode both level columns as ordered factors (Low < Medium < High)
nba_data[c("Scoring_Level", "Assist_Level")] <- lapply(
  nba_data[c("Scoring_Level", "Assist_Level")],
  factor,
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)

# Visualization: box plot of points scored within each scoring level

ggplot(
  data = nba_data,
  mapping = aes(x = Scoring_Level, y = PTS, fill = Scoring_Level)
) +
  geom_boxplot() +
  ggtitle("Boxplot of Points Scored by Scoring Level") +
  xlab("Scoring Level") +
  ylab("Points Scored") +
  theme_minimal()

# Visualization: violin plot of assists within each assist level
ggplot(
  data = nba_data,
  mapping = aes(x = Assist_Level, y = AST, fill = Assist_Level)
) +
  geom_violin() +
  ggtitle("Violin Plot of Assists by Assist Level") +
  xlab("Assist Level") +
  ylab("Assists") +
  theme_minimal()

Correlation Analysis

We convert the scoring and assist levels into numeric level codes (Low = 1, Medium = 2, High = 3) and compute the Pearson correlation between each code and the numeric variable it was derived from (PTS and AST, respectively).

# Store the numeric level codes of the two level columns alongside the data
nba_data[["Scoring_Level_Num"]] <- as.numeric(nba_data[["Scoring_Level"]])
nba_data[["Assist_Level_Num"]] <- as.numeric(nba_data[["Assist_Level"]])

# Pearson correlation between each raw statistic and its level code
corr_pts <- with(nba_data, cor(PTS, Scoring_Level_Num, method = "pearson"))
corr_ast <- with(nba_data, cor(AST, Assist_Level_Num, method = "pearson"))

# Collect both coefficients in one table and show it
corr_results <- data.frame(
  Variable = c("PTS & Scoring Level", "AST & Assist Level"),
  Correlation = c(corr_pts, corr_ast)
)

print(corr_results)

##              Variable Correlation
## 1 PTS & Scoring Level   0.6615882
## 2  AST & Assist Level   0.8892969

#Interpretation: 1) The correlation between PTS and Scoring Level is positive but only moderate (r ≈ 0.66), even though the level is derived from PTS; collapsing points into three bins discards the variation within each bin.

#2) The correlation between AST and Assist Level is strong (r ≈ 0.89), so the three assist bins preserve most of the linear variation in the raw assist counts.

#Confidence Intervals: We define a function that computes a t-based 95% confidence interval for the mean, and apply it to both points scored and assists.

# Compute a two-sided Student-t confidence interval for the mean of a
# numeric vector.
#
# Args:
#   data:       Numeric vector. Missing values (NA/NaN) are ignored.
#   confidence: Confidence level, strictly between 0 and 1. Defaults to 0.95.
#
# Returns:
#   An unnamed numeric vector c(lower, upper). Both bounds are NA when fewer
#   than two non-missing observations remain, because the standard error is
#   undefined in that case.
confidence_interval <- function(data, confidence = 0.95) {
  # Drop missing values once, so the mean, the standard deviation, the sample
  # size, and the degrees of freedom all describe the same observations.
  # (Counting NAs in the sample size would understate the standard error.)
  observed <- data[!is.na(data)]
  n_obs <- length(observed)

  if (n_obs < 2) {
    return(c(NA_real_, NA_real_))
  }

  mean_val <- mean(observed)
  std_err <- sd(observed) / sqrt(n_obs)
  margin_of_error <- qt((1 + confidence) / 2, df = n_obs - 1) * std_err

  c(mean_val - margin_of_error, mean_val + margin_of_error)
}

# Confidence intervals (default confidence level) for points and assists
ci_pts <- confidence_interval(nba_data[["PTS"]])
ci_ast <- confidence_interval(nba_data[["AST"]])

# One row per statistic: lower bound first, upper bound second
ci_results <- data.frame(
  Variable = c("PTS (Points Scored)", "AST (Assists)"),
  CI_Lower = c(ci_pts[[1]], ci_ast[[1]]),
  CI_Upper = c(ci_pts[[2]], ci_ast[[2]])
)

print(ci_results)

##              Variable  CI_Lower  CI_Upper
## 1 PTS (Points Scored) 25.571603 26.551708
## 2       AST (Assists)  3.584751  3.896165

Conclusion: This analysis highlights the relationships between performance metrics and categorized levels for NBA players. We also assessed their distributions and variability with statistical tools such as correlation and confidence intervals.

assignment_6

2025-02-20

We create new categorical variables for scoring and assist levels based on thresholds.

Correlation Analysis