Data Dive week 12
#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(dplyr)
# time series toolkits
library(xts)
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(wikipediatrend)
## Warning: package 'wikipediatrend' was built under R version 4.3.2
##
## [wikipediatrend]
##
## Note:
##
## - Data before 2016-01-01
## * is provided by petermeissner.de and
## * was prepared in a project commissioned by the Hertie School of Governance (Prof. Dr. Simon Munzert)
## * and supported by the Daimler and Benz Foundation.
##
## - Data from 2016-01-01 onwards
## * is provided by the Wikipedia Foundation
## * via its pageviews package and API.
##
#importing the dataset
data <-read.csv('C:/Downloads/final_dataset.csv')
colnames(data)
## [1] "X" "Date" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
This dataset does not contain any time or date related field, so we
get time series data from the ‘wikipediatrend’ library which aligns
closely to our dataset.
wiki_topic <- "English_Premier_League"
hits <- wp_trend(page = c("English_Premier_League", "Premier_League", "EPL", "Football_in_England"),
from = "2014-01-01",
to = "2020-12-31",
lang = "en",
warn = TRUE)
hits_summary <- hits %>%
group_by(date) %>%
summarise(views = sum(views)) %>%
ungroup() %>%
mutate(article = wiki_topic)
hits_summary$date <- as.Date(hits_summary$date)
merged_data <- merge(data
, hits_summary, by.x = "Date", by.y = "date", all.x = TRUE)
colnames(merged_data)
## [1] "Date" "X" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
## [41] "views" "article"
Now we have a date column in our merged dataset
merged_data$Date <- as.Date(merged_data$Date, format = "%y/%m/%d")
creating a tsibble of our merged data
tsib <- merged_data %>%
filter(!is.na(Date)) %>%
mutate(MatchID = row_number()) %>%
as_tsibble(index = Date, key = MatchID) %>%
select(Date, FTHG)
Plotting the tsibble
ggplot(tsib, aes(x = Date, y = FTHG)) +
geom_line() +
labs(title = "Total Goals Scored Over Time", x = "Date", y = "Total Goals")

Linear model with the tsibble
lm_model <- lm(FTHG ~ Date, data = as.data.frame(tsib))
tsib <- tsib %>%
mutate(fitted_values = predict(lm_model))
summary(lm_model)
##
## Call:
## lm(formula = FTHG ~ Date, data = as.data.frame(tsib))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5510 -0.5494 -0.5049 0.4939 7.4878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.606e+00 8.259e-02 19.447 <2e-16 ***
## Date -4.872e-06 4.770e-06 -1.021 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.296 on 6633 degrees of freedom
## Multiple R-squared: 0.0001573, Adjusted R-squared: 6.529e-06
## F-statistic: 1.043 on 1 and 6633 DF, p-value: 0.3071
ggplot(tsib, aes(x = Date)) +
geom_line(aes(y = FTHG), color = "blue", linetype = "solid", size = 1) +
geom_line(aes(y = fitted_values), color = "red", linetype = "dashed", size = 1) +
labs(title = "Observed vs. Fitted Values", x = "Date", y = "Values") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The linear regression model suggests that there is no significant
linear relationship between the Date variable and the number of goals
scored FTHG. The p-value for the Date coefficient is high, indicating
that the slope is not significantly different from zero.
acf_values <- acf(tsib$FTHG, lag.max = 30, plot = FALSE)
acf_values
##
## Autocorrelations of series 'tsib$FTHG', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.003 -0.018 -0.006 -0.021 -0.007 0.000 0.032 -0.006 -0.001 -0.008
## 11 12 13 14 15 16 17 18 19 20 21
## 0.026 0.006 0.007 0.013 -0.029 -0.002 0.005 -0.009 0.011 0.022 0.011
## 22 23 24 25 26 27 28 29 30
## -0.019 -0.019 -0.020 0.004 -0.001 0.004 0.000 -0.014 -0.013
summary(acf_values)
## Length Class Mode
## acf 31 -none- numeric
## type 1 -none- character
## n.used 1 -none- numeric
## lag 31 -none- numeric
## series 1 -none- character
## snames 0 -none- NULL
plot(acf_values, main = "Autocorrelation Function (ACF)")

The ACF values appear to be close to zero for many lags indicating a
relatively weak autocorrelation.
pacf_values <- pacf(tsib$FTHG, lag.max = 30, plot = FALSE)
pacf_values
##
## Partial autocorrelations of series 'tsib$FTHG', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## -0.003 -0.018 -0.006 -0.021 -0.007 -0.001 0.032 -0.006 0.000 -0.008 0.028
## 12 13 14 15 16 17 18 19 20 21 22
## 0.006 0.008 0.012 -0.028 -0.002 0.005 -0.011 0.010 0.021 0.011 -0.017
## 23 24 25 26 27 28 29 30
## -0.019 -0.020 0.003 -0.002 0.002 -0.001 -0.011 -0.013
summary(pacf_values)
## Length Class Mode
## acf 30 -none- numeric
## type 1 -none- character
## n.used 1 -none- numeric
## lag 30 -none- numeric
## series 1 -none- character
## snames 0 -none- NULL
plot(pacf_values, main = "Partial Autocorrelation Function (PACF)")
