library(tidyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
gender_data <- volley_data |>
group_by(gender) |>
summarise(mean_w1age = mean(w_p1_age, na.rm= TRUE),
mean_w2age = mean(w_p2_age, na.rm= TRUE),
mean_l1age = mean(l_p1_age, na.rm= TRUE),
mean_l2age = mean(l_p2_age, na.rm= TRUE),
mean_age = ((mean_w1age + mean_w2age + mean_l1age + mean_l2age) /4))
print(gender_data)
## # A tibble: 2 × 6
## gender mean_w1age mean_w2age mean_l1age mean_l2age mean_age
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M 28.8 28.6 28.7 28.2 28.6
## 2 W 27.3 26.5 27.1 26.6 26.9
genderplot <- ggplot(gender_data,aes(x = gender, y = mean_age, fill = gender)) +
geom_bar(stat = "identity") +
theme_minimal()
print(genderplot)
gender_data_count <- volley_data |>
group_by(gender) |>
summarise(count= n()) |>
mutate(probability= count/ sum(count))
gender_data_count
## # A tibble: 2 × 3
## gender count probability
## <chr> <int> <dbl>
## 1 M 2220 0.528
## 2 W 1986 0.472
gender_data_count$tag <- cut(gender_data_count$count, breaks= c(0, 2000, Inf), labels= c("Low probability", "High probability"))
gender_data_count
## # A tibble: 2 × 4
## gender count probability tag
## <chr> <int> <dbl> <fct>
## 1 M 2220 0.528 High probability
## 2 W 1986 0.472 Low probability
This table groups the data by gender and then summarizes each line by player age. We then also have the average of all men and women and can see that the average of the mens age is about 28.6 while the average of women is about 26.9.
Hypothesis- There is a higher probability of male than female because more male tournaments are hosted in beach volleyball than womens tournaments.
country_age <- volley_data |>
group_by(country) |>
summarise(mean_w1agec = mean(w_p1_age, na.rm= TRUE),
mean_w2agec = mean(w_p2_age, na.rm= TRUE),
mean_l1agec = mean(l_p1_age, na.rm= TRUE),
mean_l2agec = mean(l_p2_age, na.rm= TRUE),
mean_age = ((mean_w1agec + mean_w2agec + mean_l1agec + mean_l2agec) /4)) |>
arrange(desc(mean_age))
print(country_age)
## # A tibble: 24 × 6
## country mean_w1agec mean_w2agec mean_l1agec mean_l2agec mean_age
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 United States 30.3 29.8 30.6 29.9 30.1
## 2 Czech Republic 29.1 29.1 28.8 28.9 29.0
## 3 Latvia 29.1 28.5 28.6 28.0 28.5
## 4 Mexico 29.2 28.1 28.9 27.6 28.5
## 5 Switzerland 28.9 28.7 28.5 27.6 28.4
## 6 Germany 28.9 28.2 28.3 27.5 28.2
## 7 Qatar 28.9 27.1 28.0 26.8 27.7
## 8 Brazil 27.8 27.5 28.0 26.8 27.5
## 9 Korea 24.8 28.8 26.7 29.0 27.4
## 10 T<fc>rkiye 27.7 26.2 27.4 26.6 27.0
## # ℹ 14 more rows
countryplot <- ggplot(country_age,aes(x = country, y = mean_age, fill = country)) +
geom_bar(stat = "identity") +
theme_minimal()
print(countryplot)
country_age_count <- volley_data |>
group_by(country) |>
summarise(count= n()) |>
mutate(probability= count/ sum(count))
country_age_count
## # A tibble: 24 × 3
## country count probability
## <chr> <int> <dbl>
## 1 Australia 74 0.0176
## 2 Austria 78 0.0185
## 3 Belgium 79 0.0188
## 4 Brazil 119 0.0283
## 5 Czech Republic 87 0.0207
## 6 France 38 0.00903
## 7 Germany 87 0.0207
## 8 Greece 119 0.0283
## 9 Hungary 80 0.0190
## 10 Italy 523 0.124
## # ℹ 14 more rows
country_age_count$tag <- cut(country_age_count$count, breaks= c(0, 200, Inf), labels= c("Low probability", "High probability"))
country_age_count
## # A tibble: 24 × 4
## country count probability tag
## <chr> <int> <dbl> <fct>
## 1 Australia 74 0.0176 Low probability
## 2 Austria 78 0.0185 Low probability
## 3 Belgium 79 0.0188 Low probability
## 4 Brazil 119 0.0283 Low probability
## 5 Czech Republic 87 0.0207 Low probability
## 6 France 38 0.00903 Low probability
## 7 Germany 87 0.0207 Low probability
## 8 Greece 119 0.0283 Low probability
## 9 Hungary 80 0.0190 Low probability
## 10 Italy 523 0.124 High probability
## # ℹ 14 more rows
The data is now grouped by country and still summarized by each player. Here country refers to the country in which the games were played and the players age is the 4 different players that participated in that match. This table is a bit more informative in that we can see that the average age from different countries varies quite a bit. For example, the average age in Austria is around 25 while the average in Czech Republic is about 29. I also found an average of all four players to make this table more easy to draw conclusions from which is what is displayed in the graph.
Hypothesis- We could hypothesize that the countries with the highest probability are where most players are from and therefore they host the most beach volleyball tournaments.
bracket_age <- volley_data |>
group_by(bracket) |>
summarise(mean_w1ageb= mean(w_p1_age, na.rm= TRUE),
mean_w2ageb= mean(w_p2_age, na.rm= TRUE),
mean_l1ageb= mean(l_p1_age, na.rm= TRUE),
mean_l2ageb= mean(l_p1_age, na.rm= TRUE),
mean_age = ((mean_w1ageb + mean_w2ageb + mean_l1ageb + mean_l2ageb) /4))
print(bracket_age)
## # A tibble: 23 × 6
## bracket mean_w1ageb mean_w2ageb mean_l1ageb mean_l2ageb mean_age
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17th Place 27.9 27.6 26.9 26.9 27.3
## 2 3rd Place 33.4 32.6 29.6 29.6 31.3
## 3 5th Place 35.2 27.6 30.3 30.3 30.9
## 4 Bronze Medal 26.4 26.5 27.2 27.2 26.8
## 5 Contender's Bracket 30.9 29.8 30.6 30.6 30.5
## 6 Finals 30.9 32.1 31.8 31.8 31.7
## 7 Gold Medal 27.2 26.4 27.3 27.3 27.1
## 8 Lucky Losers 29.1 29.8 28.2 28.2 28.8
## 9 Pool A 27.2 26.9 26.6 26.6 26.9
## 10 Pool B 27.5 25.9 26.8 26.8 26.8
## # ℹ 13 more rows
bracketplot <- ggplot(bracket_age,aes(x = bracket, y = mean_age, fill = bracket)) +
geom_bar(stat = "identity") +
theme_minimal()
print(bracketplot)
bracket_age_count <- volley_data |>
group_by(bracket) |>
summarise(count= n()) |>
mutate(probability= count/ sum(count))
bracket_age_count
## # A tibble: 23 × 3
## bracket count probability
## <chr> <int> <dbl>
## 1 17th Place 24 0.00571
## 2 3rd Place 2 0.000476
## 3 5th Place 2 0.000476
## 4 Bronze Medal 64 0.0152
## 5 Contender's Bracket 400 0.0951
## 6 Finals 26 0.00618
## 7 Gold Medal 64 0.0152
## 8 Lucky Losers 8 0.00190
## 9 Pool A 274 0.0651
## 10 Pool B 274 0.0651
## # ℹ 13 more rows
bracket_age_count$tag <- cut(bracket_age_count$count, breaks= c(0, 1000, Inf), labels= c("Low probability", "High probability"))
bracket_age_count
## # A tibble: 23 × 4
## bracket count probability tag
## <chr> <int> <dbl> <fct>
## 1 17th Place 24 0.00571 Low probability
## 2 3rd Place 2 0.000476 Low probability
## 3 5th Place 2 0.000476 Low probability
## 4 Bronze Medal 64 0.0152 Low probability
## 5 Contender's Bracket 400 0.0951 Low probability
## 6 Finals 26 0.00618 Low probability
## 7 Gold Medal 64 0.0152 Low probability
## 8 Lucky Losers 8 0.00190 Low probability
## 9 Pool A 274 0.0651 Low probability
## 10 Pool B 274 0.0651 Low probability
## # ℹ 13 more rows
Finally, the data was grouped by bracket and the table was created to show average player ages. This table helps us to see if there is a trend of player age and success in tournaments. Unfortunately since different tournaments classify their tournament brackets by different things, this table is not as conclusive as I had hoped. We can see some differences, for example pool H has an average age of about 25.6 while Finals looks to be closer to 31.6. We would need more investigation here to draw any real conclusions but we could hypothesize that maybe this is somewhat of a trend.
Hypothesis- The highest probability brackets are winners and qualifier by far and this could be because in certain circuits these are the only brackets. For example, in AVP circuit they may only have two brackets, while in FIVB there may be others.
country_gender <- volley_data |>
group_by(country) |>
count(gender) |>
pluck("country")
print(country_gender)
## [1] "Australia" "Australia" "Austria" "Austria"
## [5] "Belgium" "Belgium" "Brazil" "Brazil"
## [9] "Czech Republic" "Czech Republic" "France" "Germany"
## [13] "Germany" "Greece" "Greece" "Hungary"
## [17] "Hungary" "Italy" "Italy" "Korea"
## [21] "Latvia" "Latvia" "Lithuania" "Lithuania"
## [25] "Mexico" "Mexico" "Morocco" "Morocco"
## [29] "Poland" "Poland" "Portugal" "Portugal"
## [33] "Qatar" "Qatar" "Slovenia" "Slovenia"
## [37] "Spain" "Spain" "Switzerland" "Switzerland"
## [41] "T<fc>rkiye" "T<fc>rkiye" "Thailand" "Thailand"
## [45] "United States" "United States"
country_gen <- volley_data |>
filter(country%in%country_gender) |>
ggplot() +
geom_bar(mapping = aes(x = country , fill = gender)) +
theme_minimal() +
scale_fill_brewer(palette = 'Dark2')
country_gen
volley_data |>
group_by(country) |>
count(gender)
## # A tibble: 46 × 3
## # Groups: country [24]
## country gender n
## <chr> <chr> <int>
## 1 Australia M 39
## 2 Australia W 35
## 3 Austria M 40
## 4 Austria W 38
## 5 Belgium M 39
## 6 Belgium W 40
## 7 Brazil M 61
## 8 Brazil W 58
## 9 Czech Republic M 44
## 10 Czech Republic W 43
## # ℹ 36 more rows
The smallest group here is the women in Korea. In this data set we only have 20 instances of this. We could conclude that if a player were to be chosen by random, there is only a 20/4,206 chance that they would be a woman playing in Korea. This table is interesting because if we think about what it really means, when we sort by country, each entry that we get is a match played in that country. Therefore a count of 20 for gender in Korea really means that there were 20 womens matches played in Korea. This does not mean that there are 20 Korean women in this data. To find this we would have to look at the players individual birth countries. So for this data our chances of picking a woman’s match that was played in Korea is 20/4,206 and picking a woman that played in Korea is 80/16,824.
There are two combinations that do not exist in these two columns. We have no counts for womens matches in France and no counts for mens matches in Korea. These combinations are missing because there is not a tournament hosted in France that had a womens bracket and no tournament in Korea that had a mens bracket. Seeing how few instances these countries have recorded in general, they must have only a tournament or two hosted by one of the recorded circuits.