library(tidyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(boot)
library(lindia)
## Warning: package 'lindia' was built under R version 4.3.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
clean_data3 <- na.omit(volley_data[, c('w_p1_tot_kills', 'w_p1_hgt', 'w_p1_tot_aces', 'w_p1_tot_blocks', 'w_p1_country', 'w_player1')])
model_1 <- lm(w_p1_hgt ~ w_p1_tot_kills, data = clean_data3) |> summary()
model_1
##
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills, data = clean_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.743 -3.243 -0.943 3.182 8.357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.24306 0.56045 128.901 < 2e-16 ***
## w_p1_tot_kills 0.10000 0.03556 2.812 0.00517 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.518 on 386 degrees of freedom
## Multiple R-squared: 0.02008, Adjusted R-squared: 0.01754
## F-statistic: 7.91 on 1 and 386 DF, p-value: 0.005168
clean_data3 |>
ggplot(mapping = aes(x = w_p1_hgt, y = w_p1_tot_kills)) +
geom_point(size = 2) +
geom_smooth(method = "lm", se = FALSE, color = 'darkblue')
## `geom_smooth()` using formula = 'y ~ x'
cor(clean_data3[c('w_p1_tot_kills', 'w_p1_tot_aces', 'w_p1_tot_blocks')])
## w_p1_tot_kills w_p1_tot_aces w_p1_tot_blocks
## w_p1_tot_kills 1.00000000 -0.02195759 0.02313137
## w_p1_tot_aces -0.02195759 1.00000000 -0.03733099
## w_p1_tot_blocks 0.02313137 -0.03733099 1.00000000
model_2 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_aces, data = clean_data3) |> summary()
model_2
##
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_aces, data = clean_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4116 -3.1642 -0.6714 2.9396 9.1608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.85200 0.58389 124.769 < 2e-16 ***
## w_p1_tot_kills 0.09747 0.03512 2.775 0.00579 **
## w_p1_tot_aces -0.46756 0.14272 -3.276 0.00115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.474 on 385 degrees of freedom
## Multiple R-squared: 0.04666, Adjusted R-squared: 0.0417
## F-statistic: 9.421 on 2 and 385 DF, p-value: 0.0001013
model_3 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_blocks, data = clean_data3) |> summary()
model_3
##
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_tot_blocks, data = clean_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1618 -2.6244 -0.8787 3.2572 8.0926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.57861 0.54486 131.371 < 2e-16 ***
## w_p1_tot_kills 0.09507 0.03392 2.803 0.00532 **
## w_p1_tot_blocks 0.56818 0.09049 6.279 9.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.355 on 385 degrees of freedom
## Multiple R-squared: 0.1111, Adjusted R-squared: 0.1065
## F-statistic: 24.06 on 2 and 385 DF, p-value: 1.425e-10
model_4 <- lm(w_p1_hgt ~ w_p1_tot_kills + w_p1_country, data = clean_data3) |> summary()
model_4
##
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills + w_p1_country, data = clean_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5998 -3.1108 -0.3851 3.0697 7.8236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.06489 0.84969 80.106 < 2e-16 ***
## w_p1_tot_kills 0.07438 0.03427 2.170 0.0306 *
## w_p1_countryCanada 4.67532 0.87654 5.334 1.65e-07 ***
## w_p1_countryPoland 5.55998 1.18828 4.679 4.00e-06 ***
## w_p1_countryUnited States 4.81393 0.76268 6.312 7.63e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.356 on 383 degrees of freedom
## Multiple R-squared: 0.1152, Adjusted R-squared: 0.1059
## F-statistic: 12.46 on 4 and 383 DF, p-value: 1.537e-09
model_5 <- lm(w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, data = clean_data3) |> summary()
model_5
##
## Call:
## lm(formula = w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, data = clean_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3235 -2.5156 -0.8918 3.2405 8.1426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.16554 0.65031 109.434 < 2e-16 ***
## w_p1_tot_kills 0.12274 0.04142 2.963 0.00324 **
## w_p1_tot_blocks 0.86975 0.27478 3.165 0.00167 **
## w_p1_tot_kills:w_p1_tot_blocks -0.01997 0.01718 -1.162 0.24583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.353 on 384 degrees of freedom
## Multiple R-squared: 0.1142, Adjusted R-squared: 0.1073
## F-statistic: 16.51 on 3 and 384 DF, p-value: 4.159e-10
rsq = c(model_1$r.squared, model_2$r.squared, model_3$r.squared, model_4$r.squared, model_5$r.squared)
rsqadj = c(model_1$adj.r.squared,
model_2$adj.r.squared,
model_3$adj.r.squared,
model_4$adj.r.squared,
model_5$adj.r.squared)
list_name = c('kills', '+aces', '+blocks', '+country', '*blocks')
list = c(1,2,3,4,5)
ggplot() +
geom_line(mapping = aes(x = list, y = rsq, color = 'r_sq')) +
geom_line(mapping = aes(x = list, y = rsqadj, color = 'r_sq_adj')) +
#scale_x_discrete(labels = list_name)
labs(title = "R squared vs R squared adjusted for different lr models",
x = "tests:: 1: kills, 2: +aces, 3: +blocks, 4: +country, 5: *blocks",
y = "value") +
theme_minimal()
All variables look to improve the model, with the least of which with blocks as an interaction variable. There was about a .1 change in r squared value when these additional variables are included.
Total number of kills and blocks having a positive relation with player height makes sense. We would assume that these things move proportionally. Meaning, players that are taller often earn more kills and blocks. Using diagnostic plots we can further evaluate this.
model = lm(w_p1_hgt ~ w_p1_tot_kills * w_p1_tot_blocks, clean_data3)
gg_resfitted(model) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot<- gg_resX(model, plot.all= FALSE)
plot$w_p1_tot_kills +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot$w_p1_tot_blocks +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.0446e-15
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4
gg_reshist(model)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
x <- model$residuals
x_val <- seq(min(x), max(x), length= 40)
fun <- dnorm(x_val, mean = mean(x), sd = sd(x))
hist(x, breaks= 50)
lines(x_val, fun*18000000)
This is not a normal distribution, but instead a bimodal distribution. This means that the data points are distributed across two separate values. In our case this makes sense because players may have more kills than blocks, or more blocks than kills and the number is hardly ever equal.
gg_qqplot(model)
The data hardly fits the theoretical distribution and lies mostly above and below the line. It looks like there are an equal number of values above and below the line which goes along with our findings in the histogram.
Cooks D
gg_cooksd(model, threshold = 'matlab')
The highest columns are ‘highly influential rows’. These are most likely players that have significantly more kills than blocks or vice versa.