library(haven)
library(dplyr)
library(ggplot2)
library(lmtest)
library(forecast)
library(sandwich)
library(nlme)
dat <- read_dta("all.dta")
dat <- dat %>%
mutate(date = as.Date(paste(year, month, day, sep = "-"))) %>%
filter(!is.na(date))
Vyberieme hráča s najvyšším počtom pozorovaní, aby sme mali dostatočne dlhý časový rad.
player_pick <- dat %>% count(player, sort = TRUE) %>% slice(1) %>% pull(player)
d <- dat %>%
filter(player == player_pick) %>%
arrange(date) %>%
filter(!is.na(rankindex), !is.na(b365), !is.na(rankindexdiff)) %>%
droplevels()
# DÔLEŽITÉ: AR(1) v nlme vyžaduje unikátne hodnoty "času".
# Keď má hráč viac zápasov v rovnaký deň, date nie je unikátny.
# Preto vytvoríme jednoduchý časový index 1..T:
d <- d %>% mutate(t = row_number())
cat("Použitý hráč:", player_pick, "\n")
Použitý hráč: Djokovic N.
cat("Počet pozorovaní:", nrow(d), "\n")
Počet pozorovaní: 675
cat("Rozsah dátumov:", format(min(d$date)), "–", format(max(d$date)), "\n")
Rozsah dátumov: 2007-01-02 – 2021-11-20
Použijeme jednoduchý model:
\[ rankindex_t = \beta_0 + \beta_1 b365_t + \beta_2 rankindexdiff_t + u_t \]
m <- lm(rankindex ~ b365 + rankindexdiff, data = d)
summary(m)
Call:
lm(formula = rankindex ~ b365 + rankindexdiff, data = d)
Residuals:
Min 1Q Median 3Q Max
-3.1134 -0.5423 0.1437 0.6627 1.6056
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.01219 0.14410 41.723 <2e-16 ***
b365 0.09303 0.06475 1.437 0.151
rankindexdiff 0.26588 0.02218 11.986 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.963 on 672 degrees of freedom
Multiple R-squared: 0.2141, Adjusted R-squared: 0.2118
F-statistic: 91.53 on 2 and 672 DF, p-value: < 2.2e-16
d <- d %>% mutate(e = resid(m))
ggplot(d, aes(x = date, y = e)) +
geom_line() +
labs(
title = "Rezíduá OLS v čase",
x = "Dátum", y = "rezíduá"
)
d_lag <- d %>%
mutate(e_lag1 = dplyr::lag(e, 1)) %>%
filter(!is.na(e_lag1))
ggplot(d_lag, aes(x = e_lag1, y = e)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Lag-plot rezíduí: e(t) vs e(t-1)",
x = "e(t-1)", y = "e(t)"
)
(Voliteľne) pomocná regresia:
summary(lm(e ~ e_lag1, data = d_lag))
Call:
lm(formula = e ~ e_lag1, data = d_lag)
Residuals:
Min 1Q Median 3Q Max
-2.10462 -0.32983 0.05773 0.37198 1.66452
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.005759 0.021145 0.272 0.785
e_lag1 0.815752 0.022026 37.035 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5489 on 672 degrees of freedom
Multiple R-squared: 0.6712, Adjusted R-squared: 0.6707
F-statistic: 1372 on 1 and 672 DF, p-value: < 2.2e-16
Acf(resid(m), main = "ACF rezíduí (OLS)")
dwtest(m)
Durbin-Watson test
data: m
DW = 0.35882, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
bgtest(m, order = 1)
Breusch-Godfrey test for serial correlation of order up to 1
data: m
LM test = 499.02, df = 1, p-value < 2.2e-16
bgtest(m, order = 2)
Breusch-Godfrey test for serial correlation of order up to 2
data: m
LM test = 542.46, df = 2, p-value < 2.2e-16
bgtest(m, order = 3)
Breusch-Godfrey test for serial correlation of order up to 3
data: m
LM test = 565.28, df = 3, p-value < 2.2e-16
bgtest(m, order = 4)
Breusch-Godfrey test for serial correlation of order up to 4
data: m
LM test = 576.46, df = 4, p-value < 2.2e-16