library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
volleyball <- read_csv("ncaa_w_boxscores_2012-2019.csv")
## New names:
## • `` -> `...1`
## Rows: 781962 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): firstName, lastName, team, participated, started, opponent, url_t...
## dbl (17): ...1, assists, attackAttempts, attackErrors, ballHandlingErrors, ...
## lgl (1): home_team
## date (1): dates
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 1: Keep only variables related to my research question
volleyball <- volleyball %>%
select(
team,
serviceAces,
serviceErrors,
points
) %>%
distinct()
# Step 2: Check dataset size
dim(volleyball)
## [1] 137125 4
# Check variable types
glimpse(volleyball)
## Rows: 137,125
## Columns: 4
## $ team <chr> "navy", "navy", "navy", "navy", "navy", "navy", "navy", …
## $ serviceAces <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 4, 0, 1, 3, 0,…
## $ serviceErrors <dbl> 2, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 3, 1,…
## $ points <dbl> 3.0, 7.5, 1.0, 7.5, 2.0, 1.0, 0.0, 9.5, 0.0, 4.5, 12.5, …
# Summary statistics
summary(volleyball)
## team serviceAces serviceErrors points
## Length:137125 Min. : 0.000 Min. : 0.000 Min. : 0.00
## Class :character 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 5.00
## Mode :character Median : 1.000 Median : 1.000 Median :10.00
## Mean : 1.154 Mean : 1.474 Mean :10.85
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.:15.50
## Max. :24.000 Max. :30.000 Max. :75.00
# Check for missing values
colSums(is.na(volleyball))
## team serviceAces serviceErrors points
## 0 0 0 0
# Create serving aggression metric
volleyball <- volleyball %>%
mutate(
ace_to_error_ratio = serviceAces / serviceErrors
)
summary(volleyball$ace_to_error_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 0.8 Inf 2.0 Inf 15918
# Fix infinite values in serving aggression ratio
volleyball <- volleyball %>%
mutate(
ace_to_error_ratio = ifelse(
serviceErrors == 0,
NA,
serviceAces / serviceErrors
)
)
summary(volleyball$ace_to_error_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.50 0.76 1.00 15.00 38198
# Histogram of service aces
ggplot(volleyball, aes(x = serviceAces)) +
geom_histogram(bins = 30) +
labs(
title = "Distribution of Service Aces",
x = "Service Aces",
y = "Frequency"
)
# Boxplot of service errors
ggplot(volleyball, aes(y = serviceErrors)) +
geom_boxplot() +
labs(
title = "Distribution of Service Errors",
y = "Service Errors"
)
# Scatterplot of service aces vs service errors
ggplot(volleyball, aes(x = serviceAces, y = serviceErrors)) +
geom_point(alpha = 0.3) +
labs(
title = "Service Aces vs Service Errors",
x = "Service Aces",
y = "Service Errors"
)
# for checkpoint 8... Transform variables for modeling
volleyball <- volleyball %>%
mutate(
log_serviceAces = log1p(serviceAces),
log_serviceErrors = log1p(serviceErrors),
log_ace_to_error_ratio = log1p(ace_to_error_ratio)
)
summary(volleyball$log_serviceAces)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.6931 0.6154 1.0986 3.2189
summary(volleyball$log_serviceErrors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.6931 0.7604 1.0986 3.4340
summary(volleyball$log_ace_to_error_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.405 0.453 0.693 2.773 38198
# Standardized transformed variables using z-scores
volleyball <- volleyball %>%
mutate(
z_serviceAces = as.numeric(scale(log_serviceAces)),
z_serviceErrors = as.numeric(scale(log_serviceErrors)),
z_ace_to_error_ratio = as.numeric(scale(log_ace_to_error_ratio))
)
summary(volleyball$z_serviceAces)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.1258 -1.1258 0.1422 0.0000 0.8840 4.7628
summary(volleyball$z_serviceErrors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3763 -1.3763 -0.1217 0.0000 0.6122 4.8395
summary(volleyball$z_ace_to_error_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.009 -1.009 -0.107 0.000 0.533 5.162 38198
# new Histogram after transformation
ggplot(volleyball, aes(x = log_serviceAces)) +
geom_histogram(bins = 30) +
labs(
title = "Transformed Distribution of Service Aces",
x = "Log Service Aces",
y = "Frequency"
)
# Boxplot after transformation
ggplot(volleyball, aes(y = log_serviceErrors)) +
geom_boxplot() +
labs(
title = "Transformed Distribution of Service Errors",
y = "Log Service Errors"
)
# Scatterplot after transformation
ggplot(volleyball, aes(x = log_serviceAces, y = log_serviceErrors)) +
geom_point(alpha = 0.3) +
labs(
title = "Transformed Service Aces vs Service Errors",
x = "Log Service Aces",
y = "Log Service Errors"
)
# Correlation after transformation
cor(volleyball$log_serviceAces, volleyball$log_serviceErrors, use = "complete.obs")
## [1] 0.05167387
# Updated data dictionary check
glimpse(volleyball)
## Rows: 137,125
## Columns: 11
## $ team <chr> "navy", "navy", "navy", "navy", "navy", "navy",…
## $ serviceAces <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 4, 0,…
## $ serviceErrors <dbl> 2, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ points <dbl> 3.0, 7.5, 1.0, 7.5, 2.0, 1.0, 0.0, 9.5, 0.0, 4.…
## $ ace_to_error_ratio <dbl> 0.5000000, NA, NA, 0.0000000, NA, 0.5000000, NA…
## $ log_serviceAces <dbl> 0.6931472, 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ log_serviceErrors <dbl> 1.0986123, 0.0000000, 0.0000000, 1.0986123, 0.0…
## $ log_ace_to_error_ratio <dbl> 0.4054651, NA, NA, 0.0000000, NA, 0.4054651, NA…
## $ z_serviceAces <dbl> 0.1422276, -1.1258123, -1.1258123, -1.1258123, …
## $ z_serviceErrors <dbl> 0.6122454, -1.3763340, -1.3763340, 0.6122454, -…
## $ z_ace_to_error_ratio <dbl> -0.1068814, NA, NA, -1.0093554, NA, -0.1068814,…
# Create team-level dataset for modeling
team_model <- volleyball %>%
group_by(team) %>%
summarise(
avg_serviceAces = mean(serviceAces, na.rm = TRUE),
avg_serviceErrors = mean(serviceErrors, na.rm = TRUE),
avg_ace_to_error_ratio = mean(ace_to_error_ratio, na.rm = TRUE),
avg_points = mean(points, na.rm = TRUE),
.groups = "drop"
)
# Create proxy for team success using points
team_model <- team_model %>%
mutate(
winning_percentage = avg_points / max(avg_points)
)
glimpse(team_model)
## Rows: 380
## Columns: 6
## $ team <chr> "abilene-christian", "air-force", "akron", "ala…
## $ avg_serviceAces <dbl> 1.0421348, 1.0932401, 1.3094629, 1.1936170, 1.2…
## $ avg_serviceErrors <dbl> 1.5000000, 1.6363636, 1.5754476, 1.7170213, 1.1…
## $ avg_ace_to_error_ratio <dbl> 0.7229943, 0.6892785, 0.8126174, 0.7669692, 0.8…
## $ avg_points <dbl> 10.196629, 10.942890, 9.035806, 12.768085, 10.5…
## $ winning_percentage <dbl> 0.7670831, 0.8232237, 0.6797554, 0.9605314, 0.7…
summary(team_model)
## team avg_serviceAces avg_serviceErrors avg_ace_to_error_ratio
## Length:380 Min. :0.000 Min. :0.125 Min. :0.0000
## Class :character 1st Qu.:1.043 1st Qu.:1.293 1st Qu.:0.6962
## Mode :character Median :1.125 Median :1.430 Median :0.7500
## Mean :1.077 Mean :1.376 Mean :0.7343
## 3rd Qu.:1.221 3rd Qu.:1.555 3rd Qu.:0.8036
## Max. :1.617 Max. :2.048 Max. :1.7500
## avg_points winning_percentage
## Min. : 1.20 Min. :0.09027
## 1st Qu.:10.03 1st Qu.:0.75419
## Median :10.67 Median :0.80248
## Mean :10.11 Mean :0.76049
## 3rd Qu.:11.29 3rd Qu.:0.84929
## Max. :13.29 Max. :1.00000
# Multiple Linear Regression Model
model1 <- lm(
winning_percentage ~ avg_serviceAces + avg_serviceErrors + avg_ace_to_error_ratio,
data = team_model
)
summary(model1)
##
## Call:
## lm(formula = winning_percentage ~ avg_serviceAces + avg_serviceErrors +
## avg_ace_to_error_ratio, data = team_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32464 -0.05664 0.01004 0.06270 0.24754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15593 0.02824 5.522 6.25e-08 ***
## avg_serviceAces 0.41124 0.04959 8.292 2.00e-15 ***
## avg_serviceErrors 0.15513 0.02948 5.262 2.40e-07 ***
## avg_ace_to_error_ratio -0.07063 0.04934 -1.432 0.153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09407 on 376 degrees of freedom
## Multiple R-squared: 0.6808, Adjusted R-squared: 0.6783
## F-statistic: 267.3 on 3 and 376 DF, p-value: < 2.2e-16
# Predicted vs Actual Values
team_model$predicted <- predict(model1)
ggplot(team_model, aes(x = winning_percentage, y = predicted)) +
geom_point(alpha = 0.6) +
labs(
title = "Actual vs Predicted Winning Percentage",
x = "Actual Winning Percentage",
y = "Predicted Winning Percentage"
)
# Residual Plot
ggplot(model1, aes(x = .fitted, y = .resid)) +
geom_point(alpha = 0.6) +
labs(
title = "Residual Plot",
x = "Fitted Values",
y = "Residuals"
)
## Warning: `fortify(<lm>)` was deprecated in ggplot2 4.0.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
## Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(dplyr) library(ggplot2)
cluster_data <- team_model %>% select( avg_serviceAces, avg_serviceErrors, avg_ace_to_error_ratio )
set.seed(123)
kmeans_model <- kmeans(cluster_data, centers = 3, nstart = 25)
team_model\(cluster <- as.factor(kmeans_model\)cluster)
kmeans_model$size
kmeans_model$centers
ggplot(team_model, aes(x = avg_serviceAces, y = avg_serviceErrors, color = cluster)) + geom_point(size = 3, alpha = 0.7) + labs( title = “Team Clusters Based on Serving Performance”, x = “Average Service Aces”, y = “Average Service Errors” )
team_model %>% group_by(cluster) %>% summarise( avg_win_pct = mean(winning_percentage) )