library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggplot2)
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
The dataset that I have chosen includes all sorts of data surrounding beach volleyball in the year 2022. This dataset includes data from tournaments held all over the world and gives us information on each player playing in a game. There is information on players such as their name, age, height, and home country, but we also have their statistics from the game such as kills, aces, digs, blocks, etc. The dataset is sorted by date and we have data from May 2022 to September 2022. By looking at the data we can make simple conclusions such as most beach volleyball games in this dataset are played in the US through the AVP circuit. There are close to equal numbers of mens and womens matches, but analysis has been done to show there are differences among countries.
The dataset was found at kaggle.com and the documentation can be found at Beach Volleyball (kaggle.com)
My main goal in this project is to find what game-time statistics have the highest impact on winning percentage for mens/womens and what country has the most success with these skills.
This will require analysis of each gamedat statistic (aces, blocks, digs, kills) to see how they differ between the winning and losing teams.
Then we will need to separate these into men and women/ different countries to be able to see how the statistics vary between the groups.
country_gender <- volley_data |>
group_by(gender,country) |>
summarise(count=n(),.groups = 'drop')
country_gender |>
ggplot(aes(x=country,y=count,fill=gender)) +
geom_bar(stat = 'identity') +
theme_economist()
This data is interesting because it shows the distribution of gender by country. This helps to show that some countries have an equal number of mens/womens games while others may only have one or the other. This is interesting to look at since I am wanting to dive into winning percentage by gender, so we know some may be null.
killdata <- volley_data |>
group_by(gender) |>
reframe(kill_perc = round(w_p1_tot_kills/w_p1_tot_attacks,2))
killdata |>
ggplot() +
geom_histogram(mapping = aes(x = kill_perc, fill = gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3709 rows containing non-finite values (`stat_bin()`).
This second plot is good to look at but is just a start to seeing a correlation between kill percentage and winning percentage. In this plot we are only looking at one of the winning players kill percentage in each match. We need to do a bit more analysis, but it is interesting that men have a higher kill percentage and it could use more investigation as for which country is the highest.
Winning player 1 closest to the mean age have the highest kill percentage.
mens_data <- volley_data |>
filter(gender=='M') |>
mutate(w_p1_kill_pct = (w_p1_tot_kills-w_p1_tot_errors)/(w_p1_tot_attacks))|>
filter(!is.na(w_p1_age))
men_mean_age <- mean(mens_data$w_p1_age,na.rm=TRUE)
mens_data <- mens_data |>
mutate(close_to_mean = ifelse(w_p1_age<men_mean_age+5 & w_p1_age>men_mean_age-5,'Close_to_mean','Other'))
mens_data |>
ggplot(aes(x=close_to_mean,y=w_p1_kill_pct,fill=close_to_mean))+
geom_boxplot() +
theme_economist()
## Warning: Removed 1919 rows containing non-finite values (`stat_boxplot()`).
The box plot above does not show conclusive evidence to prove that age correlates to kill percentage. Players that are within 5 years of the mean age have about the same kill percentage as those outside of that range.
The winning team in a game has a higher kill percentage than the losing team.
Game_mens_data_1 <- mens_data |>
filter(!is.na(w_p1_tot_kills) | !is.na(w_p1_tot_errors) | !is.na(w_p1_tot_attacks))|>
mutate(team_kill_pct = ((w_p1_tot_kills+w_p2_tot_kills)-(w_p1_tot_errors+w_p2_tot_errors))/(w_p1_tot_attacks+w_p2_tot_attacks),
tag='Win')|>
select(tag,team_kill_pct)
Game_mens_data_2 <- mens_data |>
filter(!is.na(l_p1_tot_kills) | !is.na(l_p1_tot_errors) | !is.na(l_p1_tot_attacks))|>
mutate(team_kill_pct = ((l_p1_tot_kills+l_p2_tot_kills)-(l_p1_tot_errors+l_p2_tot_errors))/(l_p1_tot_attacks+l_p2_tot_attacks),
tag='Loss') |>
select(tag,team_kill_pct)
Game_mens_data <- rbind(Game_mens_data_1,Game_mens_data_2)
Game_mens_data |>
ggplot(aes(x=tag,y=team_kill_pct,fill=tag))+
geom_boxplot() +
theme_economist()
This box plot shows what we would expect- teams that win typically have a higher average kill percentage. Besides the outliers, this point is clearly shown by the data and we can conclude confidently that winning teams have a higher kill percentage than losing teams.