library(tidyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(boot)
library(lindia)
## Warning: package 'lindia' was built under R version 4.3.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
The gender column is turned into a binary variable where 1 is associated with a male and 0 is associated with female. Here we add the column to the dataset so it can be used in the model.
volley_data$is_male <- ifelse(volley_data$gender == "M", 1, 0)
head(volley_data)
## circuit tournament country year date gender match_num
## 1 AVP Austin United States 2022 5/6/2022 M 1
## 2 AVP Austin United States 2022 5/6/2022 M 2
## 3 AVP Austin United States 2022 5/6/2022 M 3
## 4 AVP Austin United States 2022 5/6/2022 M 4
## 5 AVP Austin United States 2022 5/6/2022 M 5
## 6 AVP Austin United States 2022 5/6/2022 M 6
## w_player1 w_p1_birthdate w_p1_age w_p1_hgt w_p1_country
## 1 Trevor Crabb 9/15/1989 32.63792 76 United States
## 2 John Hyden 10/7/1972 49.57700 77 United States
## 3 Casey Patterson 4/20/1980 42.04244 78 United States
## 4 Chase Budinger 5/22/1988 33.95483 79 United States
## 5 Chaim Schalk 4/23/1986 36.03559 77 Canada
## 6 Billy Allen 10/15/1981 40.55578 74 United States
## w_player2 w_p2_birthdate w_p2_age w_p2_hgt w_p2_country w_rank
## 1 Tri Bourne 6/20/1989 32.87611 77 United States 1
## 2 Logan Webber 10/29/1995 26.51882 NA United States 8
## 3 Ed Ratledge 12/16/1976 45.38535 80 United States 5
## 4 Troy Field 12/21/1993 28.37235 NA United States 4
## 5 Theo Brunner 3/17/1985 37.13621 79 United States 3
## 6 Jeremy Casebeer 1/3/1989 33.33607 77 United States 6
## l_player1 l_p1_birthdate l_p1_age l_p1_hgt l_p1_country
## 1 Caleb Kwekel 7/27/2002 19.77550 NA United States
## 2 Billy Kolinske 8/6/1986 35.74812 78 United States
## 3 Piotr Marciniak 10/1/1986 35.59480 78 Poland
## 4 Dave Palm 10/10/1990 31.57016 NA United States
## 5 Jeff Samuels 6/1/1986 35.92882 NA United States
## 6 Paul Lotman 11/3/1985 36.50376 NA United States
## l_player2 l_p2_birthdate l_p2_age l_p2_hgt l_p2_country l_rank
## 1 Marty Lorenz 12/28/1990 31.35387 77 United States 16
## 2 Evan Cory 11/14/1997 24.47365 75 United States 9
## 3 Tim Bomgren 6/1/1987 34.92950 76 United States 12
## 4 Roberto Rodriguez 8/8/1986 35.74264 75 Puerto Rico 13
## 5 Raffe Paulis 5/13/1986 35.98083 74 United States 14
## 6 Skylar del Sol 10/10/1990 31.57016 NA United States 11
## score duration bracket round w_p1_tot_attacks
## 1 21-17, 22-20 0:49 Winner's Bracket Round 1 28
## 2 21-19, 21-17 0:44 Winner's Bracket Round 1 23
## 3 14-21, 21-19, 16-14 1:05 Winner's Bracket Round 1 34
## 4 21-19, 21-18 0:46 Winner's Bracket Round 1 18
## 5 21-16, 21-19 0:45 Winner's Bracket Round 1 25
## 6 16-21, 21-16, 15-9 0:59 Winner's Bracket Round 1 26
## w_p1_tot_kills w_p1_tot_errors w_p1_tot_hitpct w_p1_tot_aces
## 1 18 5 0.464 1
## 2 15 1 0.609 0
## 3 21 2 0.559 1
## 4 9 4 0.278 0
## 5 16 6 0.400 1
## 6 15 3 0.462 1
## w_p1_tot_serve_errors w_p1_tot_blocks w_p1_tot_digs w_p2_tot_attacks
## 1 0 1 11 21
## 2 2 0 9 27
## 3 2 0 17 39
## 4 0 5 1 28
## 5 4 0 18 14
## 6 1 0 7 14
## w_p2_tot_kills w_p2_tot_errors w_p2_tot_hitpct w_p2_tot_aces
## 1 11 2 0.429 4
## 2 17 1 0.593 1
## 3 17 8 0.231 0
## 4 17 3 0.500 3
## 5 11 0 0.786 0
## 6 9 1 0.571 5
## w_p2_tot_serve_errors w_p2_tot_blocks w_p2_tot_digs l_p1_tot_attacks
## 1 4 2 6 25
## 2 5 3 1 18
## 3 2 2 2 33
## 4 7 0 14 18
## 5 1 5 1 15
## 6 7 1 0 24
## l_p1_tot_kills l_p1_tot_errors l_p1_tot_hitpct l_p1_tot_aces
## 1 16 3 0.520 1
## 2 10 2 0.444 0
## 3 19 5 0.424 0
## 4 9 4 0.278 3
## 5 9 2 0.467 2
## 6 13 4 0.375 1
## l_p1_tot_serve_errors l_p1_tot_blocks l_p1_tot_digs l_p2_tot_attacks
## 1 2 0 5 19
## 2 1 2 2 25
## 3 1 2 1 35
## 4 2 5 1 22
## 5 2 2 0 34
## 6 7 1 5 24
## l_p2_tot_kills l_p2_tot_errors l_p2_tot_hitpct l_p2_tot_aces
## 1 5 3 0.105 4
## 2 17 4 0.520 0
## 3 19 4 0.429 2
## 4 7 5 0.091 1
## 5 11 4 0.206 0
## 6 17 3 0.583 0
## l_p2_tot_serve_errors l_p2_tot_blocks l_p2_tot_digs is_male
## 1 1 5 5 1
## 2 2 0 11 1
## 3 2 1 22 1
## 4 2 0 6 1
## 5 5 0 6 1
## 6 8 0 5 1
W_p1_hgt is chosen to be the explanatory variable. The standard regression has a linear relationship but in this case we must find a new function to model the non-linear regression.
volley_data|>
ggplot(mapping = aes(x = w_p1_hgt, y = is_male)) +
geom_jitter(width = 0.1, height = 0.1, shape = 'O', size = 3) +
geom_smooth(method = 'lm', se = FALSE) +
labs(title = "Modeling a Binary Response with OLS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1195 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1195 rows containing missing values (`geom_point()`).
We create the model and summarize to find the intercept and slope of the fitted regression.
model<- glm(is_male ~ w_p1_hgt, data= volley_data, family= binomial)
summary(model)
##
## Call:
## glm(formula = is_male ~ w_p1_hgt, family = binomial, data = volley_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.35971 2.25029 -28.16 <2e-16 ***
## w_p1_hgt 0.86555 0.03078 28.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4167.0 on 3010 degrees of freedom
## Residual deviance: 1896.3 on 3009 degrees of freedom
## (1195 observations deleted due to missingness)
## AIC: 1900.3
##
## Number of Fisher Scoring iterations: 6
Extracting these values, we can insert them into the sigmoid function to fit our regression line to the data.
intercept <- coef(model)[1]
slope <- coef(model)[2]
sigmoid <- function(x) {
1/ (1 + exp(-(intercept + slope *x)))
}
This is the final logistic regression model with the fitted sigmoid function. The value of the intercept of -63.36 mean that when height is zero, the predicted log- odds of the player being male are extremely low, and slope of .87 indicates that for each unit increase in height, the log- odds of the player being male increase by .87. Therefore, as height increases, the probability of is_male also increases.
volley_data |>
ggplot(mapping = aes(x = w_p1_hgt, y = is_male)) +
geom_jitter(width = 0.1, height = 0.1, shape = 'O', size = 3) +
geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid",
subtitle =paste("Estimation: B0 =",round(intercept, 2), "and B1 =", round(slope, 2))) +
scale_y_continuous(breaks = c(0, 1)) +
theme_minimal()
## Warning: Removed 1195 rows containing missing values (`geom_point()`).
model <- glm(is_male ~ w_p1_hgt, data = volley_data, family = binomial)
summary(model)
##
## Call:
## glm(formula = is_male ~ w_p1_hgt, family = binomial, data = volley_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.35971 2.25029 -28.16 <2e-16 ***
## w_p1_hgt 0.86555 0.03078 28.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4167.0 on 3010 degrees of freedom
## Residual deviance: 1896.3 on 3009 degrees of freedom
## (1195 observations deleted due to missingness)
## AIC: 1900.3
##
## Number of Fisher Scoring iterations: 6
coef_value <- coef(model)["w_p1_hgt"]
std_error <- summary(model)$coefficients["w_p1_hgt", "Std. Error"]
z <- qnorm(0.975)
lower <- coef_value - z * std_error
upper <- coef_value + z * std_error
ci <- c(lower, upper)
ci
## w_p1_hgt w_p1_hgt
## 0.8052247 0.9258707
We are 95% confident that for each unit increase in height, the log- odds of being male increases by an amount between .805 and .926. This suggests a positive association between height and the likelihood of being male, with the effect being statistically significant if the interval does not include zero.