library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

volleyball <- read_csv("ncaa_w_boxscores_2012-2019.csv")
## New names:
## • `` -> `...1`
## Rows: 781962 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): firstName, lastName, team, participated, started, opponent, url_t...
## dbl  (17): ...1, assists, attackAttempts, attackErrors, ballHandlingErrors, ...
## lgl   (1): home_team
## date  (1): dates
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 1: Keep only variables related to my research question
volleyball <- volleyball %>%
  select(
    team,
    serviceAces,
    serviceErrors,
    points
  ) %>%
  distinct()
# Step 2: Check dataset size
dim(volleyball)
## [1] 137125      4
# Check variable types
glimpse(volleyball)
## Rows: 137,125
## Columns: 4
## $ team          <chr> "navy", "navy", "navy", "navy", "navy", "navy", "navy", …
## $ serviceAces   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 4, 0, 1, 3, 0,…
## $ serviceErrors <dbl> 2, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 3, 1,…
## $ points        <dbl> 3.0, 7.5, 1.0, 7.5, 2.0, 1.0, 0.0, 9.5, 0.0, 4.5, 12.5, …
# Summary statistics
summary(volleyball)
##      team            serviceAces     serviceErrors        points     
##  Length:137125      Min.   : 0.000   Min.   : 0.000   Min.   : 0.00  
##  Class :character   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 5.00  
##  Mode  :character   Median : 1.000   Median : 1.000   Median :10.00  
##                     Mean   : 1.154   Mean   : 1.474   Mean   :10.85  
##                     3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.:15.50  
##                     Max.   :24.000   Max.   :30.000   Max.   :75.00
# Check for missing values
colSums(is.na(volleyball))
##          team   serviceAces serviceErrors        points 
##             0             0             0             0
# Create serving aggression metric
volleyball <- volleyball %>%
  mutate(
    ace_to_error_ratio = serviceAces / serviceErrors
  )

summary(volleyball$ace_to_error_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     0.8     Inf     2.0     Inf   15918
# Fix infinite values in serving aggression ratio
volleyball <- volleyball %>%
  mutate(
    ace_to_error_ratio = ifelse(
      serviceErrors == 0,
      NA,
      serviceAces / serviceErrors
    )
  )

summary(volleyball$ace_to_error_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.50    0.76    1.00   15.00   38198
# Histogram of service aces
ggplot(volleyball, aes(x = serviceAces)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of Service Aces",
    x = "Service Aces",
    y = "Frequency"
  )

# Boxplot of service errors
ggplot(volleyball, aes(y = serviceErrors)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Service Errors",
    y = "Service Errors"
  )

# Scatterplot of service aces vs service errors
ggplot(volleyball, aes(x = serviceAces, y = serviceErrors)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Service Aces vs Service Errors",
    x = "Service Aces",
    y = "Service Errors"
  )

# for checkpoint 8... Transform variables for modeling

volleyball <- volleyball %>%
  mutate(
    log_serviceAces = log1p(serviceAces),
    log_serviceErrors = log1p(serviceErrors),
    log_ace_to_error_ratio = log1p(ace_to_error_ratio)
  )

summary(volleyball$log_serviceAces)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.6931  0.6154  1.0986  3.2189
summary(volleyball$log_serviceErrors)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.6931  0.7604  1.0986  3.4340
summary(volleyball$log_ace_to_error_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.405   0.453   0.693   2.773   38198
# Standardized transformed variables using z-scores

volleyball <- volleyball %>%
  mutate(
    z_serviceAces = as.numeric(scale(log_serviceAces)),
    z_serviceErrors = as.numeric(scale(log_serviceErrors)),
    z_ace_to_error_ratio = as.numeric(scale(log_ace_to_error_ratio))
  )

summary(volleyball$z_serviceAces)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1258 -1.1258  0.1422  0.0000  0.8840  4.7628
summary(volleyball$z_serviceErrors)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.3763 -1.3763 -0.1217  0.0000  0.6122  4.8395
summary(volleyball$z_ace_to_error_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -1.009  -1.009  -0.107   0.000   0.533   5.162   38198
# new Histogram after transformation

ggplot(volleyball, aes(x = log_serviceAces)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Transformed Distribution of Service Aces",
    x = "Log Service Aces",
    y = "Frequency"
  )

# Boxplot after transformation

ggplot(volleyball, aes(y = log_serviceErrors)) +
  geom_boxplot() +
  labs(
    title = "Transformed Distribution of Service Errors",
    y = "Log Service Errors"
  )

# Scatterplot after transformation

ggplot(volleyball, aes(x = log_serviceAces, y = log_serviceErrors)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Transformed Service Aces vs Service Errors",
    x = "Log Service Aces",
    y = "Log Service Errors"
  )

# Correlation after transformation

cor(volleyball$log_serviceAces, volleyball$log_serviceErrors, use = "complete.obs")
## [1] 0.05167387
# Updated data dictionary check

glimpse(volleyball)
## Rows: 137,125
## Columns: 11
## $ team                   <chr> "navy", "navy", "navy", "navy", "navy", "navy",…
## $ serviceAces            <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 4, 0,…
## $ serviceErrors          <dbl> 2, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ points                 <dbl> 3.0, 7.5, 1.0, 7.5, 2.0, 1.0, 0.0, 9.5, 0.0, 4.…
## $ ace_to_error_ratio     <dbl> 0.5000000, NA, NA, 0.0000000, NA, 0.5000000, NA…
## $ log_serviceAces        <dbl> 0.6931472, 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ log_serviceErrors      <dbl> 1.0986123, 0.0000000, 0.0000000, 1.0986123, 0.0…
## $ log_ace_to_error_ratio <dbl> 0.4054651, NA, NA, 0.0000000, NA, 0.4054651, NA…
## $ z_serviceAces          <dbl> 0.1422276, -1.1258123, -1.1258123, -1.1258123, …
## $ z_serviceErrors        <dbl> 0.6122454, -1.3763340, -1.3763340, 0.6122454, -…
## $ z_ace_to_error_ratio   <dbl> -0.1068814, NA, NA, -1.0093554, NA, -0.1068814,…
# Create team-level dataset for modeling

team_model <- volleyball %>%
  group_by(team) %>%
  summarise(
    avg_serviceAces = mean(serviceAces, na.rm = TRUE),
    avg_serviceErrors = mean(serviceErrors, na.rm = TRUE),
    avg_ace_to_error_ratio = mean(ace_to_error_ratio, na.rm = TRUE),
    avg_points = mean(points, na.rm = TRUE),
    .groups = "drop"
  )

# Create proxy for team success using points
team_model <- team_model %>%
  mutate(
    winning_percentage = avg_points / max(avg_points)
  )

glimpse(team_model)
## Rows: 380
## Columns: 6
## $ team                   <chr> "abilene-christian", "air-force", "akron", "ala…
## $ avg_serviceAces        <dbl> 1.0421348, 1.0932401, 1.3094629, 1.1936170, 1.2…
## $ avg_serviceErrors      <dbl> 1.5000000, 1.6363636, 1.5754476, 1.7170213, 1.1…
## $ avg_ace_to_error_ratio <dbl> 0.7229943, 0.6892785, 0.8126174, 0.7669692, 0.8…
## $ avg_points             <dbl> 10.196629, 10.942890, 9.035806, 12.768085, 10.5…
## $ winning_percentage     <dbl> 0.7670831, 0.8232237, 0.6797554, 0.9605314, 0.7…
summary(team_model)
##      team           avg_serviceAces avg_serviceErrors avg_ace_to_error_ratio
##  Length:380         Min.   :0.000   Min.   :0.125     Min.   :0.0000        
##  Class :character   1st Qu.:1.043   1st Qu.:1.293     1st Qu.:0.6962        
##  Mode  :character   Median :1.125   Median :1.430     Median :0.7500        
##                     Mean   :1.077   Mean   :1.376     Mean   :0.7343        
##                     3rd Qu.:1.221   3rd Qu.:1.555     3rd Qu.:0.8036        
##                     Max.   :1.617   Max.   :2.048     Max.   :1.7500        
##    avg_points    winning_percentage
##  Min.   : 1.20   Min.   :0.09027   
##  1st Qu.:10.03   1st Qu.:0.75419   
##  Median :10.67   Median :0.80248   
##  Mean   :10.11   Mean   :0.76049   
##  3rd Qu.:11.29   3rd Qu.:0.84929   
##  Max.   :13.29   Max.   :1.00000
# Multiple Linear Regression Model

model1 <- lm(
  winning_percentage ~ avg_serviceAces + avg_serviceErrors + avg_ace_to_error_ratio,
  data = team_model
)

summary(model1)
## 
## Call:
## lm(formula = winning_percentage ~ avg_serviceAces + avg_serviceErrors + 
##     avg_ace_to_error_ratio, data = team_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32464 -0.05664  0.01004  0.06270  0.24754 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.15593    0.02824   5.522 6.25e-08 ***
## avg_serviceAces         0.41124    0.04959   8.292 2.00e-15 ***
## avg_serviceErrors       0.15513    0.02948   5.262 2.40e-07 ***
## avg_ace_to_error_ratio -0.07063    0.04934  -1.432    0.153    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09407 on 376 degrees of freedom
## Multiple R-squared:  0.6808, Adjusted R-squared:  0.6783 
## F-statistic: 267.3 on 3 and 376 DF,  p-value: < 2.2e-16
# Predicted vs Actual Values

team_model$predicted <- predict(model1)

ggplot(team_model, aes(x = winning_percentage, y = predicted)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Actual vs Predicted Winning Percentage",
    x = "Actual Winning Percentage",
    y = "Predicted Winning Percentage"
  )

# Residual Plot

ggplot(model1, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Residual Plot",
    x = "Fitted Values",
    y = "Residuals"
  )
## Warning: `fortify(<lm>)` was deprecated in ggplot2 4.0.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
##   Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Model 2: K-Means Clustering

library(dplyr) library(ggplot2)

Use transformed team-level features

cluster_data <- team_model %>% select( avg_serviceAces, avg_serviceErrors, avg_ace_to_error_ratio )

Run K-means with 3 clusters

set.seed(123)

kmeans_model <- kmeans(cluster_data, centers = 3, nstart = 25)

Add clusters back to data

team_model\(cluster <- as.factor(kmeans_model\)cluster)

Check cluster sizes

kmeans_model$size

Check cluster centers

kmeans_model$centers

Visualize clusters

ggplot(team_model, aes(x = avg_serviceAces, y = avg_serviceErrors, color = cluster)) + geom_point(size = 3, alpha = 0.7) + labs( title = “Team Clusters Based on Serving Performance”, x = “Average Service Aces”, y = “Average Service Errors” )

Average winning % by cluster

team_model %>% group_by(cluster) %>% summarise( avg_win_pct = mean(winning_percentage) )