library(tidyr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2)
library(boot)
library(lindia)

## Warning: package 'lindia' was built under R version 4.3.3

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.3.3

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.3.3

volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")

Logistic Regression

Binary variable= gender

Explanatory variable = winning player 1 height

The gender column is turned into a binary variable where 1 is associated with a male and 0 is associated with female. Here we add the column to the dataset so it can be used in the model.

volley_data$is_male <- ifelse(volley_data$gender == "M", 1, 0)
head(volley_data)

##   circuit tournament       country year     date gender match_num
## 1     AVP     Austin United States 2022 5/6/2022      M         1
## 2     AVP     Austin United States 2022 5/6/2022      M         2
## 3     AVP     Austin United States 2022 5/6/2022      M         3
## 4     AVP     Austin United States 2022 5/6/2022      M         4
## 5     AVP     Austin United States 2022 5/6/2022      M         5
## 6     AVP     Austin United States 2022 5/6/2022      M         6
##         w_player1 w_p1_birthdate w_p1_age w_p1_hgt  w_p1_country
## 1    Trevor Crabb      9/15/1989 32.63792       76 United States
## 2      John Hyden      10/7/1972 49.57700       77 United States
## 3 Casey Patterson      4/20/1980 42.04244       78 United States
## 4  Chase Budinger      5/22/1988 33.95483       79 United States
## 5    Chaim Schalk      4/23/1986 36.03559       77        Canada
## 6     Billy Allen     10/15/1981 40.55578       74 United States
##         w_player2 w_p2_birthdate w_p2_age w_p2_hgt  w_p2_country w_rank
## 1      Tri Bourne      6/20/1989 32.87611       77 United States      1
## 2    Logan Webber     10/29/1995 26.51882       NA United States      8
## 3     Ed Ratledge     12/16/1976 45.38535       80 United States      5
## 4      Troy Field     12/21/1993 28.37235       NA United States      4
## 5    Theo Brunner      3/17/1985 37.13621       79 United States      3
## 6 Jeremy Casebeer       1/3/1989 33.33607       77 United States      6
##         l_player1 l_p1_birthdate l_p1_age l_p1_hgt  l_p1_country
## 1    Caleb Kwekel      7/27/2002 19.77550       NA United States
## 2  Billy Kolinske       8/6/1986 35.74812       78 United States
## 3 Piotr Marciniak      10/1/1986 35.59480       78        Poland
## 4       Dave Palm     10/10/1990 31.57016       NA United States
## 5    Jeff Samuels       6/1/1986 35.92882       NA United States
## 6     Paul Lotman      11/3/1985 36.50376       NA United States
##           l_player2 l_p2_birthdate l_p2_age l_p2_hgt  l_p2_country l_rank
## 1      Marty Lorenz     12/28/1990 31.35387       77 United States     16
## 2         Evan Cory     11/14/1997 24.47365       75 United States      9
## 3       Tim Bomgren       6/1/1987 34.92950       76 United States     12
## 4 Roberto Rodriguez       8/8/1986 35.74264       75   Puerto Rico     13
## 5      Raffe Paulis      5/13/1986 35.98083       74 United States     14
## 6    Skylar del Sol     10/10/1990 31.57016       NA United States     11
##                 score duration          bracket   round w_p1_tot_attacks
## 1        21-17, 22-20     0:49 Winner's Bracket Round 1               28
## 2        21-19, 21-17     0:44 Winner's Bracket Round 1               23
## 3 14-21, 21-19, 16-14     1:05 Winner's Bracket Round 1               34
## 4        21-19, 21-18     0:46 Winner's Bracket Round 1               18
## 5        21-16, 21-19     0:45 Winner's Bracket Round 1               25
## 6  16-21, 21-16, 15-9     0:59 Winner's Bracket Round 1               26
##   w_p1_tot_kills w_p1_tot_errors w_p1_tot_hitpct w_p1_tot_aces
## 1             18               5           0.464             1
## 2             15               1           0.609             0
## 3             21               2           0.559             1
## 4              9               4           0.278             0
## 5             16               6           0.400             1
## 6             15               3           0.462             1
##   w_p1_tot_serve_errors w_p1_tot_blocks w_p1_tot_digs w_p2_tot_attacks
## 1                     0               1            11               21
## 2                     2               0             9               27
## 3                     2               0            17               39
## 4                     0               5             1               28
## 5                     4               0            18               14
## 6                     1               0             7               14
##   w_p2_tot_kills w_p2_tot_errors w_p2_tot_hitpct w_p2_tot_aces
## 1             11               2           0.429             4
## 2             17               1           0.593             1
## 3             17               8           0.231             0
## 4             17               3           0.500             3
## 5             11               0           0.786             0
## 6              9               1           0.571             5
##   w_p2_tot_serve_errors w_p2_tot_blocks w_p2_tot_digs l_p1_tot_attacks
## 1                     4               2             6               25
## 2                     5               3             1               18
## 3                     2               2             2               33
## 4                     7               0            14               18
## 5                     1               5             1               15
## 6                     7               1             0               24
##   l_p1_tot_kills l_p1_tot_errors l_p1_tot_hitpct l_p1_tot_aces
## 1             16               3           0.520             1
## 2             10               2           0.444             0
## 3             19               5           0.424             0
## 4              9               4           0.278             3
## 5              9               2           0.467             2
## 6             13               4           0.375             1
##   l_p1_tot_serve_errors l_p1_tot_blocks l_p1_tot_digs l_p2_tot_attacks
## 1                     2               0             5               19
## 2                     1               2             2               25
## 3                     1               2             1               35
## 4                     2               5             1               22
## 5                     2               2             0               34
## 6                     7               1             5               24
##   l_p2_tot_kills l_p2_tot_errors l_p2_tot_hitpct l_p2_tot_aces
## 1              5               3           0.105             4
## 2             17               4           0.520             0
## 3             19               4           0.429             2
## 4              7               5           0.091             1
## 5             11               4           0.206             0
## 6             17               3           0.583             0
##   l_p2_tot_serve_errors l_p2_tot_blocks l_p2_tot_digs is_male
## 1                     1               5             5       1
## 2                     2               0            11       1
## 3                     2               1            22       1
## 4                     2               0             6       1
## 5                     5               0             6       1
## 6                     8               0             5       1

W_p1_hgt is chosen to be the explanatory variable. The standard regression has a linear relationship but in this case we must find a new function to model the non-linear regression.

volley_data|>
  ggplot(mapping = aes(x = w_p1_hgt, y = is_male)) +
  geom_jitter(width = 0.1, height = 0.1, shape = 'O', size = 3) +
  geom_smooth(method = 'lm', se = FALSE) +
  labs(title = "Modeling a Binary Response with OLS") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1195 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1195 rows containing missing values (`geom_point()`).

We create the model and summarize to find the intercept and slope of the fitted regression.

model<- glm(is_male ~ w_p1_hgt, data= volley_data, family= binomial)
summary(model)

## 
## Call:
## glm(formula = is_male ~ w_p1_hgt, family = binomial, data = volley_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -63.35971    2.25029  -28.16   <2e-16 ***
## w_p1_hgt      0.86555    0.03078   28.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4167.0  on 3010  degrees of freedom
## Residual deviance: 1896.3  on 3009  degrees of freedom
##   (1195 observations deleted due to missingness)
## AIC: 1900.3
## 
## Number of Fisher Scoring iterations: 6

Extracting these values, we can insert them into the sigmoid function to fit our regression line to the data.

intercept <- coef(model)[1]
slope <- coef(model)[2]
sigmoid <- function(x) {
  1/ (1 + exp(-(intercept + slope *x)))
}

This is the final logistic regression model with the fitted sigmoid function. The value of the intercept of -63.36 mean that when height is zero, the predicted log- odds of the player being male are extremely low, and slope of .87 indicates that for each unit increase in height, the log- odds of the player being male increase by .87. Therefore, as height increases, the probability of is_male also increases.

volley_data |>
  ggplot(mapping = aes(x = w_p1_hgt, y = is_male)) +
  geom_jitter(width = 0.1, height = 0.1, shape = 'O', size = 3) +
  geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
  labs(title = "Modeling a Binary Response with Sigmoid",
       subtitle =paste("Estimation: B0 =",round(intercept, 2), "and B1 =", round(slope, 2))) +
  scale_y_continuous(breaks = c(0, 1)) +
  theme_minimal()

## Warning: Removed 1195 rows containing missing values (`geom_point()`).

Confidence Interval

model <- glm(is_male ~ w_p1_hgt, data = volley_data, family = binomial)
summary(model)

## 
## Call:
## glm(formula = is_male ~ w_p1_hgt, family = binomial, data = volley_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -63.35971    2.25029  -28.16   <2e-16 ***
## w_p1_hgt      0.86555    0.03078   28.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4167.0  on 3010  degrees of freedom
## Residual deviance: 1896.3  on 3009  degrees of freedom
##   (1195 observations deleted due to missingness)
## AIC: 1900.3
## 
## Number of Fisher Scoring iterations: 6

coef_value <- coef(model)["w_p1_hgt"]
std_error <- summary(model)$coefficients["w_p1_hgt", "Std. Error"]

z <- qnorm(0.975)
lower <- coef_value - z * std_error
upper <- coef_value + z * std_error
ci <- c(lower, upper)
ci

##  w_p1_hgt  w_p1_hgt 
## 0.8052247 0.9258707

We are 95% confident that for each unit increase in height, the log- odds of being male increases by an amount between .805 and .926. This suggests a positive association between height and the likelihood of being male, with the effect being statistically significant if the interval does not include zero.

Data Dive 10

Brianna Stopher

2024-11-04

Logistic Regression

Binary variable= gender

Explanatory variable = winning player 1 height

Confidence Interval