library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
##
## interval
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
volley_data <- read.csv("C:\\Users\\brian\\Downloads\\bvb_matches_2022.csv")
volley_data$newdate <- format(as.Date(volley_data$date, format ="%m/%d/%Y"), "%Y/%m/%d")
tourns_ <- volley_data |>
select(newdate, tournament, country) |>
group_by(newdate) |>
summarise(count = n(), .groups = 'drop')
tourns_$newdate <- as.Date(tourns_$newdate)
tourns_ts <- tourns_ %>%
as_tsibble(index = newdate)
ggplot(tourns_ts, aes(x = newdate, y = count)) +
geom_line() +
labs(title = "Match Count Over Time (Daily)",
x = "Date",
y = "Count of Matches") +
theme_minimal()
tourns_weekly <- tourns_ts %>%
index_by(week = ~ floor_date(.x, "week")) %>%
summarise(count = sum(count))
ggplot(tourns_weekly, aes(x = week, y = count)) +
geom_line() +
labs(title = "Match Count Over Time (Weekly)",
x = "Week",
y = "Count of Matches") +
theme_minimal()
tourns_monthly <- tourns_ts %>%
index_by(month = ~ floor_date(.x, "month")) %>%
summarise(count = sum(count))
# Plotting the monthly data
ggplot(tourns_monthly, aes(x = month, y = count)) +
geom_line() +
labs(title = "Match Count Over Time (Monthly)",
x = "Month",
y = "Count of Matches") +
theme_minimal()
What stands out immediately is that the most beach volleyball matches are played in the summer months. We would expect this since it is an outdoor sport and it is not popular in the winter and cold. Therefore it is not surprising that the data ranges from April to October. While some of these months are not super warm for us here in the US, they may be warmer in other countries and this may be what those data points are coming from.
plot_with_regression <- function(data, title, x_label, y_label) {
lm_model <- lm(count ~ as.numeric(newdate), data = data)
ggplot(data, aes(x = newdate, y = count)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE,
color = "red", linetype = "dashed") +
labs(title = title,
x = x_label,
y = y_label) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
ggplot(tourns_ts, aes(x = newdate, y = count)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Match Count Over Time (Daily) with Linear Regression",
x = "Date",
y = "Count of Matches") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(tourns_weekly, aes(x = week, y = count)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Match Count Over Time (Weekly) with Linear Regression",
x = "Week",
y = "Count of Matches") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(tourns_monthly, aes(x = month, y = count)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Match Count Over Time (Monthly) with Linear Regression",
x = "Month",
y = "Count of Matches") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
It seems that the trend is upward throughout the months from April to October. I think this is reasonable for what we would conclude with just using reasoning.
ggplot(tourns_ts, aes(x = newdate, y = count)) +
geom_point() +
geom_smooth(method = "loess", color = "blue") +
labs(title = "Match Count Over Time with Loess Smoothing",
x = "Date",
y = "Count of Matches") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The trend lines show an increase in matches in June and a decrease at some point in August. Again, this is to be expected since this is when beach volleyball is in its prime season.
acf(tourns_ts$count, lag.max = 365, main = "ACF for Daily Match Count")
pacf(tourns_ts$count, lag.max = 365, main = "PACF for Daily Match Count")
None of the spikes seem to be super significant. I don’t see anything that indicates any seasonality.