Data Dive 9

library(tidyr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2)
library(boot)
library(lindia)

## Warning: package 'lindia' was built under R version 4.3.3

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.3.3

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.3.3

volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")

Original Model

clean_data3 <- na.omit(volley_data[, c('w_p1_tot_kills', 'w_p1_hgt', 'w_p1_tot_aces', 'w_p1_tot_blocks', 'w_p1_country', 'w_player1')])

model_1 <- lm(w_p1_hgt ~ w_p1_tot_kills, data = clean_data3) |> summary()
model_1

## 
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills, data = clean_data3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.743 -3.243 -0.943  3.182  8.357 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    72.24306    0.56045 128.901  < 2e-16 ***
## w_p1_tot_kills  0.10000    0.03556   2.812  0.00517 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.518 on 386 degrees of freedom
## Multiple R-squared:  0.02008,    Adjusted R-squared:  0.01754 
## F-statistic:  7.91 on 1 and 386 DF,  p-value: 0.005168

clean_data3 |>
  ggplot(mapping = aes(x = w_p1_hgt, y = w_p1_tot_kills)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = 'darkblue')

## `geom_smooth()` using formula = 'y ~ x'

cor(clean_data3[c('w_p1_tot_kills', 'w_p1_tot_aces', 'w_p1_tot_blocks')])

##                 w_p1_tot_kills w_p1_tot_aces w_p1_tot_blocks
## w_p1_tot_kills      1.00000000   -0.02195759      0.02313137
## w_p1_tot_aces      -0.02195759    1.00000000     -0.03733099
## w_p1_tot_blocks     0.02313137   -0.03733099      1.00000000

Model + Aces

model_2 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_aces, data = clean_data3) |> summary()
model_2

## 
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_aces, data = clean_data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4116 -3.1642 -0.6714  2.9396  9.1608 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    72.85200    0.58389 124.769  < 2e-16 ***
## w_p1_tot_kills  0.09747    0.03512   2.775  0.00579 ** 
## w_p1_tot_aces  -0.46756    0.14272  -3.276  0.00115 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.474 on 385 degrees of freedom
## Multiple R-squared:  0.04666,    Adjusted R-squared:  0.0417 
## F-statistic: 9.421 on 2 and 385 DF,  p-value: 0.0001013

Model + Blocks

model_3 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_blocks, data = clean_data3) |> summary()
model_3

## 
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_blocks, data = clean_data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1618 -2.6244 -0.8787  3.2572  8.0926 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     71.57861    0.54486 131.371  < 2e-16 ***
## w_p1_tot_kills   0.09507    0.03392   2.803  0.00532 ** 
## w_p1_tot_blocks  0.56818    0.09049   6.279 9.21e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.355 on 385 degrees of freedom
## Multiple R-squared:  0.1111, Adjusted R-squared:  0.1065 
## F-statistic: 24.06 on 2 and 385 DF,  p-value: 1.425e-10

model_4 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_country, data = clean_data3) |> summary()
model_4

## 
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_country, data = clean_data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5998 -3.1108 -0.3851  3.0697  7.8236 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               68.06489    0.84969  80.106  < 2e-16 ***
## w_p1_tot_kills             0.07438    0.03427   2.170   0.0306 *  
## w_p1_countryCanada         4.67532    0.87654   5.334 1.65e-07 ***
## w_p1_countryPoland         5.55998    1.18828   4.679 4.00e-06 ***
## w_p1_countryUnited States  4.81393    0.76268   6.312 7.63e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.356 on 383 degrees of freedom
## Multiple R-squared:  0.1152, Adjusted R-squared:  0.1059 
## F-statistic: 12.46 on 4 and 383 DF,  p-value: 1.537e-09

Model * Blocks

model_5 <- lm(w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, data = clean_data3) |> summary()
model_5

## 
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, data = clean_data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3235 -2.5156 -0.8918  3.2405  8.1426 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    71.16554    0.65031 109.434  < 2e-16 ***
## w_p1_tot_kills                  0.12274    0.04142   2.963  0.00324 ** 
## w_p1_tot_blocks                 0.86975    0.27478   3.165  0.00167 ** 
## w_p1_tot_kills:w_p1_tot_blocks -0.01997    0.01718  -1.162  0.24583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.353 on 384 degrees of freedom
## Multiple R-squared:  0.1142, Adjusted R-squared:  0.1073 
## F-statistic: 16.51 on 3 and 384 DF,  p-value: 4.159e-10

Plotting R Squared

rsq = c(model_1$r.squared, model_2$r.squared, model_3$r.squared, model_4$r.squared, model_5$r.squared)
rsqadj = c(model_1$adj.r.squared, 
      model_2$adj.r.squared, 
      model_3$adj.r.squared,
      model_4$adj.r.squared,
      model_5$adj.r.squared)
list_name = c('kills', '+aces', '+blocks', '+country', '*blocks')
list = c(1,2,3,4,5)

ggplot() +
  geom_line(mapping = aes(x = list, y = rsq, color = 'r_sq')) +
  geom_line(mapping = aes(x = list, y = rsqadj, color = 'r_sq_adj')) +
  #scale_x_discrete(labels = list_name)
  labs(title = "R squared vs R squared adjusted for different lr models",
       x = "tests:: 1: kills, 2: +aces, 3: +blocks, 4: +country, 5: *blocks",
       y = "value") +
  theme_minimal()

All variables look to improve the model, with the least of which with blocks as an interaction variable. There was about a .1 change in r squared value when these additional variables are included.

Evaluating the Model

Total number of kills and blocks having a positive relation with player height makes sense. We would assume that these things move proportionally. Meaning, players that are taller often earn more kills and blocks. Using diagnostic plots we can further evaluate this.

Residuals

model = lm(w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, clean_data3)

gg_resfitted(model) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

plot<- gg_resX(model, plot.all= FALSE)
plot$w_p1_tot_kills +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

plot$w_p1_tot_blocks +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.05

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.05

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.0446e-15

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4

Histogram

gg_reshist(model)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

x <- model$residuals
x_val <- seq(min(x), max(x), length= 40)
fun <- dnorm(x_val, mean = mean(x), sd = sd(x))
hist(x, breaks= 50)
lines(x_val, fun*18000000)

This is not a normal distribution, but instead a bimodal distribution. This means that the data points are distributed across two separate values. In our case this makes sense because players may have more kills than blocks, or more blocks than kills and the number is hardly ever equal.

QQ Plot

gg_qqplot(model)

The data hardly fits the theoretical distribution and lies mostly above and below the line. It looks like there are an equal number of values above and below the line which goes along with our findings in the histogram.

Cooks D

gg_cooksd(model, threshold = 'matlab')

The highest columns are ‘highly influential rows’. These are most likely players that have significantly more kills than blocks or vice versa.