options(repos = c(CRAN = "https://cloud.r-project.org"))
Sys.unsetenv("RSPM")  

install.packages(c("rsconnect", "rmarkdown", "knitr"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
need <- c("haven", "dplyr", "lmtest", "sandwich", "broom")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)

library(haven); library(dplyr); library(lmtest); library(sandwich); library(broom)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
df <- haven::read_dta("all.dta") %>% as.data.frame()

names(df)[1:30]
##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"
dim(df)
## [1] 112138     30
# Ľahký wrangling (kategórie ako faktory)
df <- df %>% mutate(
  gender    = factor(gender),
  tysurface = factor(tysurface),
  year      = as.integer(year)
)

# Zvolené premenné (ak niektorá chýba, automaticky sa vynechá)
vars_needed <- c("b365","rankdiff","s1","s2","s3","swon","gender","tysurface","year")
vars_needed <- intersect(vars_needed, names(df))

df_model <- df %>% dplyr::select(all_of(vars_needed)) %>% na.omit()
dim(df_model); head(df_model)
## [1] 38582     9
##    b365 rankdiff s1 s2 s3 swon gender tysurface year
## 4  2.62      -13  6  6  3    1      0         5 2012
## 7  3.75      174  3  6  3    1      1         5 2016
## 9  4.00      291  7  4  4    1      1         1 2008
## 24 2.37       -4  4  6  3    1      1         5 2015
## 25 8.00      227  7  2  3    1      1         5 2015
## 26 1.30      -50  6  6  3    1      1         5 2014

1 Špecifikácia a OLS odhad

# Dynamicky poskladaná formula
rhs <- c()
for (v in c("rankdiff","s1","s2","s3","swon","gender","tysurface","year")) {
  if (v %in% names(df_model)) rhs <- c(rhs, v)
}

stopifnot("b365" %in% names(df_model))
form <- as.formula(paste("b365 ~", paste(rhs, collapse = " + ")))
form
## b365 ~ rankdiff + s1 + s2 + s3 + swon + gender + tysurface + 
##     year
# OLS
m_ols <- lm(form, data = df_model)
summary(m_ols)
## 
## Call:
## lm(formula = form, data = df_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3216 -0.6884 -0.3176  0.2827 25.9022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.860e+01  3.368e+00   8.490  < 2e-16 ***
## rankdiff     5.320e-03  6.313e-05  84.268  < 2e-16 ***
## s1          -5.370e-02  4.790e-03 -11.209  < 2e-16 ***
## s2          -4.656e-02  4.726e-03  -9.851  < 2e-16 ***
## s3          -6.735e-02  5.470e-03 -12.312  < 2e-16 ***
## swon        -2.908e-01  2.069e-02 -14.053  < 2e-16 ***
## gender1      1.206e-01  1.419e-02   8.498  < 2e-16 ***
## tysurface2  -4.251e-02  7.190e-02  -0.591    0.554    
## tysurface3  -6.302e-02  7.516e-02  -0.838    0.402    
## tysurface4  -4.067e-01  5.633e-01  -0.722    0.470    
## tysurface5  -1.078e-02  7.145e-02  -0.151    0.880    
## year        -1.244e-02  1.678e-03  -7.415 1.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.369 on 38570 degrees of freedom
## Multiple R-squared:  0.2042, Adjusted R-squared:  0.204 
## F-statistic: 899.9 on 11 and 38570 DF,  p-value: < 2.2e-16

2 Test heteroskedasticity

# Breusch–Pagan
bp <- lmtest::bptest(m_ols)
bp
## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 625.62, df = 11, p-value < 2.2e-16
# White 
make_white_rhs <- function(vars) {
  keep <- intersect(vars, names(df_model))
  if (length(keep) == 0) return(~ 1)
  terms <- unlist(lapply(keep, function(v) c(v, paste0("I(", v, "^2)"))))
  as.formula(paste("~", paste(terms, collapse = " + ")))
}
white_rhs <- make_white_rhs(c("rankdiff","s1","s2","s3","swon"))
white <- lmtest::bptest(m_ols, varformula = white_rhs, studentize = TRUE, data = df_model)
white
## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 1073.8, df = 10, p-value < 2.2e-16

3 Korekcia heteroskedasticity (WLS/FGLS)

aux_form <- as.formula(paste("log(residuals(m_ols)^2) ~", paste(rhs, collapse = " + ")))
aux <- lm(aux_form, data = df_model)

sigma_hat2 <- exp(fitted(aux))     # odhad var(e|X)
w <- 1 / sigma_hat2                # WLS váhy (menšia váha pri väčšej variancii)

# WLS/FGLS model
m_wls <- lm(form, data = df_model, weights = w)
summary(m_wls)
## 
## Call:
## lm(formula = form, data = df_model, weights = w)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -2.874 -1.464 -0.688  0.738 45.040 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.586e+01  2.770e+00   5.725 1.04e-08 ***
## rankdiff     1.824e-03  3.117e-05  58.517  < 2e-16 ***
## s1          -3.408e-02  4.037e-03  -8.442  < 2e-16 ***
## s2          -3.465e-02  3.974e-03  -8.719  < 2e-16 ***
## s3          -4.019e-02  5.564e-03  -7.224 5.15e-13 ***
## swon        -3.380e-01  1.834e-02 -18.428  < 2e-16 ***
## gender1      8.945e-02  1.153e-02   7.757 8.89e-15 ***
## tysurface2  -1.138e-03  7.308e-02  -0.016    0.988    
## tysurface3  -7.731e-03  7.486e-02  -0.103    0.918    
## tysurface4   2.939e-03  5.143e-01   0.006    0.995    
## tysurface5   1.186e-02  7.284e-02   0.163    0.871    
## year        -6.270e-03  1.379e-03  -4.545 5.50e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.502 on 38570 degrees of freedom
## Multiple R-squared:  0.1244, Adjusted R-squared:  0.1241 
## F-statistic: 497.9 on 11 and 38570 DF,  p-value: < 2.2e-16

4 Diagnostika: Pred vs Po korekcii

Na porovnanie použijeme Residuals vs Fitted — priamo ukazuje, či sa rozptyl mení s predikovanou hodnotou (lievik = heteroskedasticita).

4.1 Graf 1 — Pred korekciou (OLS)

par(mfrow=c(1,1))
plot(m_ols, which = 1)

Vysvetlivka (Graf 1 – OLS):
- Čo skúma: vzťah medzi predikovanými hodnotami (X) a reziduami (Y).
- Čo hľadám: náhodný „mrak“ okolo 0 bez trendu. Ak sa rozptyl mení s X (tvar lievika), ide o heteroskedasticitu.
- Dôsledok: OLS koeficienty sú stále nestranné, ale štandardné chyby sú skreslené

4.2 Graf 2 — Po korekcii (WLS/FGLS)

par(mfrow=c(1,1))
plot(m_wls, which = 1)

Vysvetlivka (Graf 2 – WLS/FGLS):
- Čo skúma: to isté čo Graf 1, ale po aplikácii váženia podľa odhadnutej variability chýb.
- Čo hľadám: rozptyl bodov by mal byť stabilnejší naprieč fitted hodnotami (menej „lievikovitý“).
- Dôsledok: Po korekcii sú odhady efektívnejšie a intervaly/spoľahlivostné testy sú spoľahlivejšie.

5 Zhrnutie testov

data.frame(
  Test = c("Breusch–Pagan", "White"),
  p_value = signif(c(bp$p.value, white$p.value), 4),
  stringsAsFactors = FALSE
)
##            Test    p_value
## 1 Breusch–Pagan 4.609e-127
## 2         White 2.396e-224