install.packages("knitr")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("haven")

names(data)

## NULL

library(haven)   
library(dplyr)   
data <- read_dta("all.dta")
str(data)

## tibble [112,138 × 30] (S3: tbl_df/tbl/data.frame)
##  $ player       : chr [1:112138] "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
##   ..- attr(*, "format.stata")= chr "%24s"
##  $ rank         : num [1:112138] 37 6 37 59 74 25 196 47 396 29 ...
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s1           : num [1:112138] 6 6 6 6 6 7 3 6 7 6 ...
##   ..- attr(*, "label")= chr "number of games won in 1st set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s2           : num [1:112138] 6 3 2 6 6 6 6 6 4 6 ...
##   ..- attr(*, "label")= chr "number of games won in 2nd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s3           : num [1:112138] NA NA NA 3 NA NA 3 NA 4 NA ...
##   ..- attr(*, "label")= chr "number of games won in 3rd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ swon         : num [1:112138] 2 0 0 1 0 2 1 2 1 2 ...
##   ..- attr(*, "label")= chr "number of sets won"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ b365         : num [1:112138] 1.5 1.53 1.53 2.62 1.83 ...
##   ..- attr(*, "label")= chr "bet365 odds"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ gender       : num [1:112138] 1 1 1 0 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "male=1, female=0"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tylocation   : num [1:112138] 149 100 101 100 123 123 98 151 172 140 ...
##   ..- attr(*, "label")= chr "type of location"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tytournament : num [1:112138] 279 257 229 195 34 34 220 285 235 266 ...
##   ..- attr(*, "label")= chr "type of tournament"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tyseries     : num [1:112138] 1 5 1 4 7 7 1 4 4 1 ...
##   ..- attr(*, "label")= chr "type of series"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tycourt      : num [1:112138] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "type of court (outdoors=1, indoors=0)"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tysurface    : num [1:112138] 5 5 5 5 5 5 5 1 1 5 ...
##   ..- attr(*, "label")= chr "type of surface (carpet=1, clay=2, grass=3, greenset=4, hard=5)"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tyround      : num [1:112138] 2 5 2 2 2 1 2 2 1 2 ...
##   ..- attr(*, "label")= chr "round of match"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ year         : num [1:112138] 2021 2008 2019 2012 2019 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ month        : num [1:112138] 9 3 9 2 10 11 2 10 2 2 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ day          : num [1:112138] 30 1 19 22 29 9 18 24 26 10 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankdiff     : num [1:112138] -10 -36 -39 -13 24 -680 174 -31 291 -16 ...
##   ..- attr(*, "label")= chr "rank of player-rank of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s1diff       : num [1:112138] 3 -1 -1 -1 -1 1 -3 5 1 4 ...
##   ..- attr(*, "label")= chr "s1 of player-s1 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s2diff       : num [1:112138] 6 -3 -4 6 -1 2 5 4 -2 6 ...
##   ..- attr(*, "label")= chr "s2 of player-s2 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s3diff       : num [1:112138] NA NA NA -3 NA NA -3 NA -2 NA ...
##   ..- attr(*, "label")= chr "s3 of player-s3 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ swondiff     : num [1:112138] 2 -2 -2 -1 -2 2 -1 2 -1 2 ...
##   ..- attr(*, "label")= chr "swin of player-swin of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ b365diff     : num [1:112138] -1.12 -0.84 -0.84 1.18 0 ...
##   ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s1w          : num [1:112138] 1 0 0 0 0 1 0 1 1 1 ...
##   ..- attr(*, "label")= chr "won set1=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s2w          : num [1:112138] 1 0 0 1 0 1 1 1 0 1 ...
##   ..- attr(*, "label")= chr "won set2=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s3w          : num [1:112138] NA NA NA 0 NA NA 0 NA 0 NA ...
##   ..- attr(*, "label")= chr "won set3=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindex    : num [1:112138] 2.79 5.42 2.79 2.12 1.79 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankop       : num [1:112138] 47 42 76 72 50 705 22 78 105 45 ...
##   ..- attr(*, "label")= chr "rank of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindexop  : num [1:112138] 2.45 2.61 1.75 1.83 2.36 ...
##   ..- attr(*, "label")= chr "rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindexdiff: num [1:112138] 0.345 2.807 1.038 0.287 -0.566 ...
##   ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"

head(data)

# Hráči, ktorí boli svetovou 1.
rank1_players <- data %>%
filter(rank == 1)
print(rank1_players)

## # A tibble: 1,163 × 30
##    player      rank    s1    s2    s3  swon  b365 gender tylocation tytournament
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>        <dbl>
##  1 Djokovic …     1     6     4     7     2  1.04      1         19          291
##  2 Djokovic …     1     6     6     3     1  1.40      1        163          193
##  3 Wozniacki…     1     4     2    NA     0  2.37      0         75          280
##  4 Djokovic …     1     7     6    NA     2  1.03      1        123           34
##  5 Nadal R.       1     6     6    NA     2  1.01      1        123           31
##  6 Jankovic …     1     0     6     6     2  1.53      0        108          174
##  7 Nadal R.       1     7     3     6     2  1.03      1        138            1
##  8 Djokovic …     1     3     6     7     2  1.16      1        123           34
##  9 Djokovic …     1     7     6    NA     2  1.12      1        166          106
## 10 Wozniacki…     1     4     3    NA     0  1.20      0         50          280
## # ℹ 1,153 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## #   tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## #   s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## #   s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## #   rankindexop <dbl>, rankindexdiff <dbl>

write.csv(rank1_players, "rank1_players.csv", row.names = FALSE)

library(dplyr)
rank1_players <- data %>%
filter(rank == 1)  
rank1_unique <- rank1_players %>%
distinct(player, year, .keep_all = TRUE)
print(rank1_unique)

## # A tibble: 57 × 30
##    player      rank    s1    s2    s3  swon  b365 gender tylocation tytournament
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>        <dbl>
##  1 Djokovic …     1     6     4     7     2  1.04      1         19          291
##  2 Djokovic …     1     6     6     3     1  1.40      1        163          193
##  3 Wozniacki…     1     4     2    NA     0  2.37      0         75          280
##  4 Djokovic …     1     7     6    NA     2  1.03      1        123           34
##  5 Nadal R.       1     6     6    NA     2  1.01      1        123           31
##  6 Jankovic …     1     0     6     6     2  1.53      0        108          174
##  7 Nadal R.       1     7     3     6     2  1.03      1        138            1
##  8 Djokovic …     1     7     6    NA     2  1.12      1        166          106
##  9 Wozniacki…     1     4     3    NA     0  1.20      0         50          280
## 10 Murray A.      1     6     6     6     2  1.28      1         87          193
## # ℹ 47 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## #   tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## #   s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## #   s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## #   rankindexop <dbl>, rankindexdiff <dbl>

1 Cieľ

Krátka analýza zameraná iba na hráčov s rankom 1

1.1 Načítanie dát s autodetekciou stĺpcov

library(tidyverse)
library(haven)
library(lubridate)
library(janitor)
library(stringr)

# 1) načítaj dáta
path <- c(params$data_path, "./all.dta", "/mnt/data/all.dta") %>% purrr::keep(file.exists) %>% dplyr::first()
stopifnot(!is.null(path))
raw <- read_dta(path)
df <- labelled::to_factor(raw) %>% clean_names()

# 2) autodetekcia mien stĺpcov
nm <- names(df)
# hráč
guess_player <- c(params$player_var, nm[str_detect(nm, "player|hrac|nick|name|id")]) %>% purrr::discard(is.null) %>% dplyr::first()
# rank
guess_rank   <- c(params$rank_var,   nm[str_detect(nm, "(^|_)rank($|_)|rating_rank|ranking")]) %>% purrr::discard(is.null) %>% dplyr::first()
# dátum
date_candidates <- nm[vapply(df, inherits, logical(1), what = c("Date","POSIXct","POSIXt"))]
if (length(date_candidates) == 0) {
  likely_date_names <- nm[str_detect(nm, "date|datum|day|time|datetime|match_date|played|created")]
  for (v in likely_date_names){
    dt <- suppressWarnings(ymd(df[[v]]))
    if (sum(!is.na(dt)) > nrow(df)*0.2) { df[[v]] <- dt; date_candidates <- c(date_candidates, v); break }
  }
}
guess_date <- dplyr::first(purrr::discard(c(params$date_var, date_candidates), is.null))

# sets won / lost (autodetekcia) – v datasete mám len swon
guess_sw <- dplyr::first(purrr::discard(c(
  params$sets_won_var,
  nm[stringr::str_detect(nm, "(^|_)(swon|sets?_won|won_sets|set_won|setsfor|sets_for)($|_)")]
), is.null))
# sets_lost neexistuje – vypočítame neskôr pomocou swon → lost

guess_sl <- c(
  params$sets_lost_var,
  nm[str_detect(nm, "(^|_)(sets?_lost|lost_sets|set_lost|sl|setsagainst|sets_against)($|_)")]
) %>% purrr::discard(is.null) %>% dplyr::first()

# 3) validácia a zostavenie core dát
if (is.null(guess_player) || is.null(guess_rank)) {
  stop("Nepodarilo sa nájsť stĺpce player/rank. Zadaj mená do params$player_var a params$rank_var.")
}

# vyber povinné stĺpce
core <- df %>% dplyr::select(player = dplyr::all_of(guess_player), rank = dplyr::all_of(guess_rank))

# doplň date stĺpec — robustne, aj keď sa nenašiel
if (!is.null(guess_date) && guess_date %in% names(df)) {
  if (inherits(df[[guess_date]], "Date")) {
    core$date <- df[[guess_date]]
  } else {
    core$date <- suppressWarnings(as.Date(df[[guess_date]]))
  }
} else {
  core$date <- seq_len(nrow(core)) # náhradný „čas“ podľa poradia riadkov
}

# doplň sets_won (povinné) a z neho dopočítaj sets_lost podľa pravidla
if (!is.null(guess_sw) && guess_sw %in% names(df)) {
  core$sets_won <- suppressWarnings(readr::parse_number(as.character(df[[guess_sw]])))
} else {
  stop("Nenašiel som stĺpec s vyhratými setmi (swon). Nastav params$sets_won_var = 'swon'.")
}
# Pravidlo: swon == 0 alebo 1 → prehral 2 sety; swon >= 2 → prehral 0
core$sets_lost <- dplyr::case_when(
  is.na(core$sets_won) ~ NA_real_,
  core$sets_won %in% c(0,1) ~ 2,
  core$sets_won >= 2 ~ 0,
  TRUE ~ NA_real_
)

# typy/ranky/filtre
if (!is.numeric(core$rank)) core <- core %>% mutate(rank = readr::parse_number(as.character(rank)))
core <- core %>% filter(!is.na(player), !is.na(rank)) %>% arrange(player, date) %>% mutate(is_r1 = rank == 1)

# info
list(
  n_rows = nrow(core), n_players = n_distinct(core$player),
  vars = list(player = guess_player, rank = guess_rank, date = guess_date)
)

## $n_rows
## [1] 111941
## 
## $n_players
## [1] 1999
## 
## $vars
## $vars$player
## [1] "player"
## 
## $vars$rank
## [1] "rank"
## 
## $vars$date
## [1] NA

1.2 Koľko hráčov malo niekedy rank 1

players_total <- n_distinct(core$player)
players_r1    <- core %>% filter(is_r1) %>% distinct(player) %>% nrow()
share_r1      <- players_r1 / players_total

p1 <- tibble(metric = c("hráči s rank1", "ostatní"), 
             value = c(players_r1, players_total - players_r1)) %>%
  ggplot(aes(x = metric, y = value)) +
  geom_col() +
  geom_text(aes(label = value), vjust = -0.3) +
  labs(title = "Počet hráčov, ktorí niekedy dosiahli rank 1",
       x = NULL, y = "Počet hráčov") +
  theme_minimal()
print(p1)

Interpretácia: Stĺpec „hráči s rank1“ ukazuje, koľko hráčov sa aspoň raz dostalo na prvé miesto rebríčku. Druhý stĺpec zobrazuje všetkých ostatných.

1.3 2) Percento hráčov, ktorí dosiahli rank 1

p2 <- tibble(group = c("Rank 1 niekedy", "Nikdy"), share = c(share_r1, 1 - share_r1)) %>%
  ggplot(aes(x = group, y = share)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(label = scales::percent(share, accuracy = 0.1)), vjust = -0.3) +
  labs(title = "Podiel hráčov, ktorí niekedy dosiahli rank 1",
       x = NULL, y = "% hráčov") +
  theme_minimal()
print(p2)

Interpretácia: Výška stĺpca vyjadruje, koľko % všetkých hráčov získalo rank 1 aspoň raz.

1.4 Koľko setov vyhrali a prehrali hráči, keď mali rank 1

r1_sets <- core %>% filter(is_r1) %>%
  group_by(player) %>%
  summarise(sets_won = sum(sets_won, na.rm = TRUE),
            sets_lost = sum(sets_lost, na.rm = TRUE), .groups = "drop")

long_r1 <- r1_sets %>% pivot_longer(c(sets_won, sets_lost), names_to = "type", values_to = "sets")

p3 <- long_r1 %>%
  group_by(type) %>% summarise(total = sum(sets), .groups = "drop") %>%
  ggplot(aes(x = type, y = total)) +
  geom_col() +
  geom_text(aes(label = total), vjust = -0.3) +
  labs(title = "Súčet vyhratých a prehratých setov počas ranku 1 (všetci hráči)",
       subtitle = "Pravidlo: swon 0/1 ⇒ prehrané 2 sety; swon ≥ 2 ⇒ prehrané 0",
       x = NULL, y = "Počet setov") +
  theme_minimal()
print(p3)

Interpretácia: Vidíme celkový „objem“ setov, ktoré hráči odohrali v tom období, keď mali rank 1, rozdelené na výhry a prehry.

1.5 Rank 1 hráči podľa čistého skóre setov

r1_leaderboard <- r1_sets %>% mutate(net_sets = sets_won - sets_lost,
                                     win_rate = sets_won/pmax(1, (sets_won + sets_lost)))

p4 <- r1_leaderboard %>%
  slice_max(net_sets, n = 10, with_ties = FALSE) %>%
  mutate(player = forcats::fct_reorder(player, net_sets)) %>%
  ggplot(aes(x = player, y = net_sets)) +
  geom_col() + coord_flip() +
  labs(title = "Top 10 hráčov podľa čistého skóre setov (rank 1)",
       subtitle = "net_sets = sets_won − sets_lost",
       x = NULL, y = "Čisté sety") +
  theme_minimal()
print(p4)

Interpretácia: Rebríček zvýrazňuje, kto mal počas ranku 1 najvýraznejšiu dominanciu v setoch (čisté skóre).

need <- c("haven", "dplyr", "lmtest", "sandwich", "broom")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)

library(haven); library(dplyr); library(lmtest); library(sandwich); library(broom)

df <- haven::read_dta("all.dta") %>% as.data.frame()

names(df)[1:30]

##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"

dim(df)

## [1] 112138     30

# Ľahký wrangling (kategórie ako faktory)
df <- df %>% mutate(
  gender    = factor(gender),
  tysurface = factor(tysurface),
  year      = as.integer(year)
)

# Zvolené premenné (ak niektorá chýba, automaticky sa vynechá)
vars_needed <- c("b365","rankdiff","s1","s2","s3","swon","gender","tysurface","year")
vars_needed <- intersect(vars_needed, names(df))

df_model <- df %>% dplyr::select(all_of(vars_needed)) %>% na.omit()
dim(df_model); head(df_model)

## [1] 38582     9

2 Špecifikácia a OLS odhad

# Dynamicky poskladaná formula
rhs <- c()
for (v in c("rankdiff","s1","s2","s3","swon","gender","tysurface","year")) {
  if (v %in% names(df_model)) rhs <- c(rhs, v)
}

stopifnot("b365" %in% names(df_model))
form <- as.formula(paste("b365 ~", paste(rhs, collapse = " + ")))
form

## b365 ~ rankdiff + s1 + s2 + s3 + swon + gender + tysurface + 
##     year

# OLS
m_ols <- lm(form, data = df_model)
summary(m_ols)

## 
## Call:
## lm(formula = form, data = df_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3216 -0.6884 -0.3176  0.2827 25.9022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.860e+01  3.368e+00   8.490  < 2e-16 ***
## rankdiff     5.320e-03  6.313e-05  84.268  < 2e-16 ***
## s1          -5.370e-02  4.790e-03 -11.209  < 2e-16 ***
## s2          -4.656e-02  4.726e-03  -9.851  < 2e-16 ***
## s3          -6.735e-02  5.470e-03 -12.312  < 2e-16 ***
## swon        -2.908e-01  2.069e-02 -14.053  < 2e-16 ***
## gender1      1.206e-01  1.419e-02   8.498  < 2e-16 ***
## tysurface2  -4.251e-02  7.190e-02  -0.591    0.554    
## tysurface3  -6.302e-02  7.516e-02  -0.838    0.402    
## tysurface4  -4.067e-01  5.633e-01  -0.722    0.470    
## tysurface5  -1.078e-02  7.145e-02  -0.151    0.880    
## year        -1.244e-02  1.678e-03  -7.415 1.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.369 on 38570 degrees of freedom
## Multiple R-squared:  0.2042, Adjusted R-squared:  0.204 
## F-statistic: 899.9 on 11 and 38570 DF,  p-value: < 2.2e-16

3 Test heteroskedasticity

# Breusch–Pagan
bp <- lmtest::bptest(m_ols)
bp

## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 625.62, df = 11, p-value < 2.2e-16

# White 
make_white_rhs <- function(vars) {
  keep <- intersect(vars, names(df_model))
  if (length(keep) == 0) return(~ 1)
  terms <- unlist(lapply(keep, function(v) c(v, paste0("I(", v, "^2)"))))
  as.formula(paste("~", paste(terms, collapse = " + ")))
}
white_rhs <- make_white_rhs(c("rankdiff","s1","s2","s3","swon"))
white <- lmtest::bptest(m_ols, varformula = white_rhs, studentize = TRUE, data = df_model)
white

## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 1073.8, df = 10, p-value < 2.2e-16

4 Korekcia heteroskedasticity (WLS/FGLS)

aux_form <- as.formula(paste("log(residuals(m_ols)^2) ~", paste(rhs, collapse = " + ")))
aux <- lm(aux_form, data = df_model)

sigma_hat2 <- exp(fitted(aux))     # odhad var(e|X)
w <- 1 / sigma_hat2                # WLS váhy (menšia váha pri väčšej variancii)

# WLS/FGLS model
m_wls <- lm(form, data = df_model, weights = w)
summary(m_wls)

## 
## Call:
## lm(formula = form, data = df_model, weights = w)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -2.874 -1.464 -0.688  0.738 45.040 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.586e+01  2.770e+00   5.725 1.04e-08 ***
## rankdiff     1.824e-03  3.117e-05  58.517  < 2e-16 ***
## s1          -3.408e-02  4.037e-03  -8.442  < 2e-16 ***
## s2          -3.465e-02  3.974e-03  -8.719  < 2e-16 ***
## s3          -4.019e-02  5.564e-03  -7.224 5.15e-13 ***
## swon        -3.380e-01  1.834e-02 -18.428  < 2e-16 ***
## gender1      8.945e-02  1.153e-02   7.757 8.89e-15 ***
## tysurface2  -1.138e-03  7.308e-02  -0.016    0.988    
## tysurface3  -7.731e-03  7.486e-02  -0.103    0.918    
## tysurface4   2.939e-03  5.143e-01   0.006    0.995    
## tysurface5   1.186e-02  7.284e-02   0.163    0.871    
## year        -6.270e-03  1.379e-03  -4.545 5.50e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.502 on 38570 degrees of freedom
## Multiple R-squared:  0.1244, Adjusted R-squared:  0.1241 
## F-statistic: 497.9 on 11 and 38570 DF,  p-value: < 2.2e-16

5 Diagnostika: Pred vs Po korekcii

Na porovnanie použijeme Residuals vs Fitted — priamo ukazuje, či sa rozptyl mení s predikovanou hodnotou (lievik = heteroskedasticita).

5.1 Graf 1 — Pred korekciou (OLS)

par(mfrow=c(1,1))
plot(m_ols, which = 1)

Vysvetlivka (Graf 1 – OLS):
- Čo skúma: vzťah medzi predikovanými hodnotami (X) a reziduami (Y).
- Čo hľadám: náhodný „mrak“ okolo 0 bez trendu. Ak sa rozptyl mení s X (tvar lievika), ide o heteroskedasticitu.
- Dôsledok: OLS koeficienty sú stále nestranné, ale štandardné chyby sú skreslené

5.2 Graf 2 — Po korekcii (WLS/FGLS)

par(mfrow=c(1,1))
plot(m_wls, which = 1)

Vysvetlivka (Graf 2 – WLS/FGLS):
- Čo skúma: to isté čo Graf 1, ale po aplikácii váženia podľa odhadnutej variability chýb.
- Čo hľadám: rozptyl bodov by mal byť stabilnejší naprieč fitted hodnotami (menej „lievikovitý“).
- Dôsledok: Po korekcii sú odhady efektívnejšie a intervaly/spoľahlivostné testy sú spoľahlivejšie.

6 Zhrnutie testov

data.frame(
  Test = c("Breusch–Pagan", "White"),
  p_value = signif(c(bp$p.value, white$p.value), 4),
  stringsAsFactors = FALSE
)

library(haven)
library(dplyr)
library(ggplot2)
library(lmtest)
library(forecast)
library(sandwich)
library(nlme)

7 1. Dáta a príprava časového radu

dat <- read_dta("all.dta")

dat <- dat %>%
  mutate(date = as.Date(paste(year, month, day, sep = "-"))) %>%
  filter(!is.na(date))

Vyberieme hráča s najvyšším počtom pozorovaní, aby sme mali dostatočne dlhý časový rad.

player_pick <- dat %>% count(player, sort = TRUE) %>% slice(1) %>% pull(player)

d <- dat %>%
  filter(player == player_pick) %>%
  arrange(date) %>%
  filter(!is.na(rankindex), !is.na(b365), !is.na(rankindexdiff)) %>%
  droplevels()

# DÔLEŽITÉ: AR(1) v nlme vyžaduje unikátne hodnoty "času".
# Keď má hráč viac zápasov v rovnaký deň, date nie je unikátny.
# Preto vytvoríme jednoduchý časový index 1..T:
d <- d %>% mutate(t = row_number())

cat("Použitý hráč:", player_pick, "\n")

## Použitý hráč: Djokovic N.

cat("Počet pozorovaní:", nrow(d), "\n")

## Počet pozorovaní: 675

cat("Rozsah dátumov:", format(min(d$date)), "–", format(max(d$date)), "\n")

## Rozsah dátumov: 2007-01-02 – 2021-11-20

8 2. Základný model (OLS)

Použijeme jednoduchý model:

\[ rankindex_t = \beta_0 + \beta_1 b365_t + \beta_2 rankindexdiff_t + u_t \]

m <- lm(rankindex ~ b365 + rankindexdiff, data = d)
summary(m)

## 
## Call:
## lm(formula = rankindex ~ b365 + rankindexdiff, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1134 -0.5423  0.1437  0.6627  1.6056 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.01219    0.14410  41.723   <2e-16 ***
## b365           0.09303    0.06475   1.437    0.151    
## rankindexdiff  0.26588    0.02218  11.986   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.963 on 672 degrees of freedom
## Multiple R-squared:  0.2141, Adjusted R-squared:  0.2118 
## F-statistic: 91.53 on 2 and 672 DF,  p-value: < 2.2e-16

9 3. Detekcia autokorelácie

9.1 3.1 Graf rezíduí v čase

d <- d %>% mutate(e = resid(m))

ggplot(d, aes(x = date, y = e)) +
  geom_line() +
  labs(
    title = "Rezíduá OLS v čase",
    x = "Dátum", y = "rezíduá"
  )

9.2 3.2 Lag-plot rezíduí: e(t) vs e(t−1)

d_lag <- d %>%
  mutate(e_lag1 = dplyr::lag(e, 1)) %>%
  filter(!is.na(e_lag1))

ggplot(d_lag, aes(x = e_lag1, y = e)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Lag-plot rezíduí: e(t) vs e(t-1)",
    x = "e(t-1)", y = "e(t)"
  )

## `geom_smooth()` using formula = 'y ~ x'

(Voliteľne) pomocná regresia:

summary(lm(e ~ e_lag1, data = d_lag))

## 
## Call:
## lm(formula = e ~ e_lag1, data = d_lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10462 -0.32983  0.05773  0.37198  1.66452 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.005759   0.021145   0.272    0.785    
## e_lag1      0.815752   0.022026  37.035   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5489 on 672 degrees of freedom
## Multiple R-squared:  0.6712, Adjusted R-squared:  0.6707 
## F-statistic:  1372 on 1 and 672 DF,  p-value: < 2.2e-16

9.3 3.3 Korelogram (ACF) rezíduí

Acf(resid(m), main = "ACF rezíduí (OLS)")

9.4 3.4 Durbin–Watson test

dwtest(m)

## 
##  Durbin-Watson test
## 
## data:  m
## DW = 0.35882, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

9.5 3.5 Breusch–Godfrey test (AR(1) až AR(4))

bgtest(m, order = 1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  m
## LM test = 499.02, df = 1, p-value < 2.2e-16

bgtest(m, order = 2)

## 
##  Breusch-Godfrey test for serial correlation of order up to 2
## 
## data:  m
## LM test = 542.46, df = 2, p-value < 2.2e-16

bgtest(m, order = 3)

## 
##  Breusch-Godfrey test for serial correlation of order up to 3
## 
## data:  m
## LM test = 565.28, df = 3, p-value < 2.2e-16

bgtest(m, order = 4)

## 
##  Breusch-Godfrey test for serial correlation of order up to 4
## 
## data:  m
## LM test = 576.46, df = 4, p-value < 2.2e-16

req <- c("haven","dplyr","ggplot2","car","MASS")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)

library(haven)
library(dplyr)
library(ggplot2)
library(car)
library(MASS)

9.6 1) Načítanie dát `all.dta`

paths <- c("all.dta", file.path(getwd(), "all.dta"))
path <- paths[file.exists(paths)][1]
stopifnot(!is.na(path), file.exists(path))

df <- read_dta(path) |> as.data.frame()

# základný sanity check
cat("Rows:", nrow(df), " Cols:", ncol(df), "\n")

## Rows: 112138  Cols: 30

str(df[1:min(10,ncol(df))])

## 'data.frame':    112138 obs. of  10 variables:
##  $ player      : chr  "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
##   ..- attr(*, "format.stata")= chr "%24s"
##  $ rank        : num  37 6 37 59 74 25 196 47 396 29 ...
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s1          : num  6 6 6 6 6 7 3 6 7 6 ...
##   ..- attr(*, "label")= chr "number of games won in 1st set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s2          : num  6 3 2 6 6 6 6 6 4 6 ...
##   ..- attr(*, "label")= chr "number of games won in 2nd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s3          : num  NA NA NA 3 NA NA 3 NA 4 NA ...
##   ..- attr(*, "label")= chr "number of games won in 3rd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ swon        : num  2 0 0 1 0 2 1 2 1 2 ...
##   ..- attr(*, "label")= chr "number of sets won"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ b365        : num  1.5 1.53 1.53 2.62 1.83 ...
##   ..- attr(*, "label")= chr "bet365 odds"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ gender      : num  1 1 1 0 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "male=1, female=0"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tylocation  : num  149 100 101 100 123 123 98 151 172 140 ...
##   ..- attr(*, "label")= chr "type of location"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tytournament: num  279 257 229 195 34 34 220 285 235 266 ...
##   ..- attr(*, "label")= chr "type of tournament"
##   ..- attr(*, "format.stata")= chr "%12.0g"

9.7 2) Príprava premenných pre model

num <- df |> dplyr::select(where(is.numeric))

# vyhodiť konštanty / nulový rozptyl
nzv <- sapply(num, function(x) isTRUE(sd(x, na.rm=TRUE) > 0))
num <- num[, nzv, drop = FALSE]

num <- na.omit(num)

cat("Po čistení: Rows:", nrow(num), " Cols:", ncol(num), "\n")

## Po čistení: Rows: 38582  Cols: 29

9.8 3) Korelačná matica (heatmap)

cor_mat <- cor(num)

# prehľad top korelácií (absolútne)
cm <- cor_mat
diag(cm) <- NA
top_pairs <- as.data.frame(as.table(cm)) |>
  dplyr::mutate(abs_cor = abs(Freq)) |>
  dplyr::filter(!is.na(abs_cor)) |>
  dplyr::arrange(desc(abs_cor)) |>
  dplyr::slice(1:20)

top_pairs

# jednoduchý heatmap bez ďalších balíkov
ord <- order(colnames(cor_mat))
image(
  1:ncol(cor_mat), 1:ncol(cor_mat),
  cor_mat[ord, ord],
  axes = FALSE, xlab = "", ylab = ""
)
axis(1, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
axis(2, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
title("Korelačná matica (numerické premenné)")

9.9 4) Model + multikolinearita

Keďže v dátach sa často nachádzajú konštrukčné identity (napr. rankdiff = rank - rankop), OLS môže mať presnú kolinearitu.

Preto spravíme: 1. zvolíme cieľovú premennú y automaticky ako prvý numerický stĺpec (ak chceš inú, zmeň to), 2. odstránime presne lineárne závislé prediktory pomocou QR (alias()/rank), 3. potom vypočítame VIF a condition number.

y_name <- names(num)[1]          # <-- zmeň si sem cieľovú premennú ak treba
x_names <- setdiff(names(num), y_name)

y <- num[[y_name]]
X <- num[, x_names, drop = FALSE]

# odstránenie perfektnej multikolinearity (závislé stĺpce)
mm <- model.matrix(~ . , data = X)  # pridá intercept
qr_mm <- qr(mm)
rank_mm <- qr_mm$rank
p_mm <- ncol(mm)

if(rank_mm < p_mm){
  indep_idx <- qr_mm$pivot[1:rank_mm]
  mm2 <- mm[, indep_idx, drop = FALSE]
  # späť na dátový rámec bez interceptu
  keep_cols <- colnames(mm2)
  keep_cols <- setdiff(keep_cols, "(Intercept)")
  X2 <- as.data.frame(mm2[, keep_cols, drop = FALSE])
  cat("Odstránil som", ncol(X) - ncol(X2), "perfektne závislých prediktorov.\n")
} else {
  X2 <- X
  cat("Perfektná kolinearita nebola zistená.\n")
}

## Odstránil som 1 perfektne závislých prediktorov.

dat2 <- cbind.data.frame(y = y, X2)
names(dat2)[1] <- y_name

9.9.1 4.1 OLS

f <- as.formula(paste(y_name, "~", paste(names(X2), collapse = " + ")))
ols <- lm(f, data = dat2)
summary(ols)

## 
## Call:
## lm(formula = f, data = dat2)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.458e-11 -2.300e-14 -5.000e-15  1.200e-14  1.023e-10 
## 
## Coefficients:
##                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  -1.131e-13  1.689e-12 -6.700e-02 0.946606    
## s1           -4.314e-14  3.412e-15 -1.264e+01  < 2e-16 ***
## s2           -2.755e-15  3.376e-15 -8.160e-01 0.414420    
## s3            3.265e-14  3.146e-15  1.038e+01  < 2e-16 ***
## swon         -4.524e-15  2.938e-13 -1.500e-02 0.987713    
## b365          1.674e-13  4.456e-15  3.757e+01  < 2e-16 ***
## gender        8.085e-14  7.982e-15  1.013e+01  < 2e-16 ***
## tylocation   -2.455e-16  7.205e-17 -3.408e+00 0.000656 ***
## tytournament -3.363e-17  3.514e-17 -9.570e-01 0.338571    
## tyseries      1.454e-14  7.681e-16  1.893e+01  < 2e-16 ***
## tycourt      -4.671e-14  9.442e-15 -4.947e+00 7.58e-07 ***
## tysurface    -2.182e-15  2.502e-15 -8.720e-01 0.383068    
## tyround      -3.155e-14  1.858e-15 -1.698e+01  < 2e-16 ***
## year          1.040e-15  8.012e-16  1.298e+00 0.194406    
## month         4.831e-15  1.187e-15  4.071e+00 4.69e-05 ***
## day           5.416e-16  4.098e-16  1.322e+00 0.186253    
## rankdiff      1.000e+00  5.791e-17  1.727e+16  < 2e-16 ***
## s1diff        1.155e-15  3.136e-15  3.680e-01 0.712781    
## s2diff        1.301e-14  3.106e-15  4.189e+00 2.81e-05 ***
## s3diff       -4.414e-15  2.984e-15 -1.479e+00 0.139113    
## swondiff     -1.090e-13  2.076e-13 -5.250e-01 0.599628    
## b365diff     -1.600e-13  2.995e-15 -5.341e+01  < 2e-16 ***
## s1w          -1.131e-13  2.078e-13 -5.440e-01 0.586401    
## s2w          -1.653e-13  2.077e-13 -7.960e-01 0.426262    
## s3w           4.666e-14  2.939e-13  1.590e-01 0.873861    
## rankindex    -3.905e-13  4.039e-15 -9.667e+01  < 2e-16 ***
## rankop        1.000e+00  8.264e-17  1.210e+16  < 2e-16 ***
## rankindexop  -3.680e-16  4.039e-15 -9.100e-02 0.927408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.556e-13 on 38554 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.432e+31 on 27 and 38554 DF,  p-value: < 2.2e-16

9.9.2 4.2 VIF (Variance Inflation Factor)

v <- car::vif(ols)
v <- sort(v, decreasing = TRUE)
head(v, 30)

##     swondiff         swon          s3w          s1w          s2w       s3diff 
##  3873.433541  1941.018847  1938.255665   969.267107   968.358139     8.093307 
##       s2diff       s1diff     b365diff       rankop         b365     rankdiff 
##     7.808480     7.692521     5.486488     4.484770     4.195253     3.755672 
##  rankindexop    rankindex           s2           s1           s3     tyseries 
##     3.571906     3.571855     3.318751     3.294786     3.178247     1.449856 
##       gender      tyround      tycourt   tylocation    tysurface tytournament 
##     1.419780     1.264489     1.186184     1.149751     1.124088     1.117263 
##        month          day         year 
##     1.116594     1.026043     1.018728

9.9.3 4.3 Condition number (číslo podmienenosti)

Xmm <- model.matrix(ols)[, -1, drop = FALSE]  # bez interceptu
kappa_val <- kappa(Xmm, exact = TRUE)
kappa_val

## [1] 212047

10 1. Príprava dát a definícia binárnej premennej

# načítanie dát zo Stata súboru
tenis <- haven::read_dta("all.dta")

# rýchla kontrola
dim(tenis)

## [1] 112138     30

names(tenis)

##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"

head(tenis)

# binárna vysvetľovaná premenná: výhra hráča 1
# predpoklad: swondiff > 0 znamená, že hráč 1 vyhral zápas
tenis <- tenis %>%
  mutate(
    win = ifelse(swondiff > 0, 1, 0),
    win = as.numeric(win),
    gender    = factor(gender, labels = c("Zena", "Muz")),
    tysurface = factor(tysurface),
    tyround   = factor(tyround)
  )

table(tenis$win)

## 
##     0     1 
## 56069 56069

# dataset pre model – vyhodíme riadky s NA v relevantných premenných
data_model <- tenis %>%
  dplyr::select(win, rankindexdiff, b365diff, gender, tysurface, tyround) %>%
  tidyr::drop_na()

dim(tenis)      # pôvodný počet riadkov

## [1] 112138     31

dim(data_model) # počet riadkov použitých v modeloch

## [1] 111744      6

str(data_model)

## tibble [111,744 × 6] (S3: tbl_df/tbl/data.frame)
##  $ win          : num [1:111744] 1 0 0 0 0 1 0 1 0 1 ...
##  $ rankindexdiff: num [1:111744] 0.345 2.807 1.038 0.287 -0.566 ...
##   ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ b365diff     : num [1:111744] -1.12 -0.84 -0.84 1.18 0 ...
##   ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ gender       : Factor w/ 2 levels "Zena","Muz": 2 2 2 1 2 2 2 2 2 2 ...
##  $ tysurface    : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 1 1 5 ...
##  $ tyround      : Factor w/ 9 levels "1","2","3","4",..: 2 5 2 2 2 1 2 2 1 2 ...

11 2. Lineárny model (LPM)

# lineárny pravdepodobnostný model
lpm <- lm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data = data_model
)

summary(lpm)

## 
## Call:
## lm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, data = data_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0455 -0.4422  0.0000  0.4422  2.0455 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.000e-01  1.380e-02   36.22   <2e-16 ***
## rankindexdiff  2.721e-02  1.092e-03   24.92   <2e-16 ***
## b365diff      -4.654e-02  6.640e-04  -70.10   <2e-16 ***
## genderMuz     -9.723e-15  2.758e-03    0.00        1    
## tysurface2    -1.606e-13  1.382e-02    0.00        1    
## tysurface3    -1.592e-13  1.447e-02    0.00        1    
## tysurface4    -1.652e-13  6.513e-02    0.00        1    
## tysurface5    -1.657e-13  1.372e-02    0.00        1    
## tyround2      -8.770e-15  3.299e-03    0.00        1    
## tyround3       1.140e-15  6.189e-03    0.00        1    
## tyround4       3.245e-15  1.581e-02    0.00        1    
## tyround5      -5.509e-16  4.622e-03    0.00        1    
## tyround6       3.374e-15  1.403e-02    0.00        1    
## tyround7      -2.889e-16  6.147e-03    0.00        1    
## tyround8       3.321e-16  8.354e-03    0.00        1    
## tyround9       1.083e-14  2.296e-01    0.00        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4591 on 111728 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  0.1569 
## F-statistic:  1387 on 15 and 111728 DF,  p-value: < 2.2e-16

# test heteroskedasticity
bptest(lpm)

## 
##  studentized Breusch-Pagan test
## 
## data:  lpm
## BP = 218, df = 15, p-value < 2.2e-16

# robustné (White/HC) štandardné chyby
vcov_lpm_robust <- sandwich::vcovHC(lpm, type = "HC1")
lpm_robust <- lmtest::coeftest(lpm, vcov. = vcov_lpm_robust)
lpm_robust

## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value Pr(>|t|)    
## (Intercept)    5.0000e-01  1.3638e-02  36.663   <2e-16 ***
## rankindexdiff  2.7207e-02  1.2054e-03  22.570   <2e-16 ***
## b365diff      -4.6545e-02  8.1881e-04 -56.844   <2e-16 ***
## genderMuz     -9.7233e-15  2.7582e-03   0.000        1    
## tysurface2    -1.6060e-13  1.3655e-02   0.000        1    
## tysurface3    -1.5924e-13  1.4345e-02   0.000        1    
## tysurface4    -1.6517e-13  6.4979e-02   0.000        1    
## tysurface5    -1.6570e-13  1.3554e-02   0.000        1    
## tyround2      -8.7700e-15  3.2761e-03   0.000        1    
## tyround3       1.1399e-15  6.1574e-03   0.000        1    
## tyround4       3.2452e-15  1.4954e-02   0.000        1    
## tyround5      -5.5089e-16  4.6409e-03   0.000        1    
## tyround6       3.3743e-15  1.4166e-02   0.000        1    
## tyround7      -2.8886e-16  6.2336e-03   0.000        1    
## tyround8       3.3213e-16  8.4209e-03   0.000        1    
## tyround9       1.0825e-14  2.5838e-01   0.000        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# predikované pravdepodobnosti z LPM
data_model$phat_lpm <- fitted(lpm)

summary(data_model$phat_lpm)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0455  0.4047  0.5000  0.5000  0.5953  3.0455

12 3. Nelineárny model – logit

# logit model
logit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "logit")
)

summary(logit_mod)

## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "logit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.260e-15  6.870e-02   0.000    1.000    
## rankindexdiff -8.541e-03  5.807e-03  -1.471    0.141    
## b365diff      -4.205e-01  5.072e-03 -82.912   <2e-16 ***
## genderMuz     -1.356e-16  1.328e-02   0.000    1.000    
## tysurface2     5.486e-15  6.874e-02   0.000    1.000    
## tysurface3     5.412e-15  7.171e-02   0.000    1.000    
## tysurface4     1.080e-14  3.243e-01   0.000    1.000    
## tysurface5     5.388e-15  6.829e-02   0.000    1.000    
## tyround2      -1.366e-16  1.601e-02   0.000    1.000    
## tyround3      -1.481e-16  3.068e-02   0.000    1.000    
## tyround4       5.645e-16  8.024e-02   0.000    1.000    
## tyround5      -1.132e-16  2.211e-02   0.000    1.000    
## tyround6      -7.832e-16  6.671e-02   0.000    1.000    
## tyround7       3.868e-17  2.924e-02   0.000    1.000    
## tyround8      -2.613e-16  3.966e-02   0.000    1.000    
## tyround9       3.047e-14  1.002e+00   0.000    1.000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 131840  on 111728  degrees of freedom
## AIC: 131872
## 
## Number of Fisher Scoring iterations: 5

# pseudo R2 (McFadden)
logLik_null <- logLik(glm(win ~ 1, data = data_model, family = binomial(link = "logit")))
logLik_full <- logLik(logit_mod)
pseudoR2_logit <- 1 - as.numeric(logLik_full / logLik_null)
pseudoR2_logit

## [1] 0.1489228

# predikované pravdepodobnosti z logitu
data_model$phat_logit <- fitted(logit_mod)
summary(data_model$phat_logit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3638  0.5000  0.5000  0.6362  1.0000

# jednoduchý graf: pravdepodobnosť výhry a rankindexdiff
ggplot(data_model, aes(x = rankindexdiff, y = phat_logit)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE) +
  labs(
    x = "rankindexdiff",
    y = "P( win = 1 ) – logit",
    title = "Predikované pravdepodobnosti z logit modelu"
  )

13 4. Nelineárny model – probit

# probit model
probit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "probit")
)

summary(probit_mod)

## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "probit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.878e-14  4.114e-02   0.000   1.0000    
## rankindexdiff  7.128e-03  3.483e-03   2.047   0.0407 *  
## b365diff      -2.317e-01  2.871e-03 -80.702   <2e-16 ***
## genderMuz      2.195e-14  8.029e-03   0.000   1.0000    
## tysurface2     4.372e-14  4.116e-02   0.000   1.0000    
## tysurface3     4.266e-15  4.299e-02   0.000   1.0000    
## tysurface4     1.968e-14  1.940e-01   0.000   1.0000    
## tysurface5     5.593e-15  4.089e-02   0.000   1.0000    
## tyround2       4.514e-14  9.660e-03   0.000   1.0000    
## tyround3       9.323e-16  1.845e-02   0.000   1.0000    
## tyround4       1.219e-14  4.800e-02   0.000   1.0000    
## tyround5      -8.008e-16  1.339e-02   0.000   1.0000    
## tyround6       8.777e-15  4.044e-02   0.000   1.0000    
## tyround7      -3.018e-16  1.774e-02   0.000   1.0000    
## tyround8       1.457e-16  2.403e-02   0.000   1.0000    
## tyround9       1.936e-14  6.272e-01   0.000   1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 132406  on 111728  degrees of freedom
## AIC: 132438
## 
## Number of Fisher Scoring iterations: 5

# pseudo R2 (McFadden) pre probit
logLik_full_probit <- logLik(probit_mod)
pseudoR2_probit <- 1 - as.numeric(logLik_full_probit / logLik_null)
pseudoR2_probit

## [1] 0.1452715

# predikované pravdepodobnosti z probitu
data_model$phat_probit <- fitted(probit_mod)
summary(data_model$phat_probit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3745  0.5000  0.5000  0.6255  1.0000

14 5. Porovnanie modelov a klasifikácia

# pomocná funkcia na klasifikačnú tabuľku a percento správnych predikcií
class_table <- function(y, p, cut = 0.5) {
  yhat <- ifelse(p >= cut, 1, 0)
  tab <- table(skutocnost = y, predikcia = yhat)
  acc <- mean(yhat == y)
  list(tab = tab, acc = acc)
}

# LPM (pravdepodobnosti orezané na [0,1])
phat_lpm_trim <- pmin(pmax(data_model$phat_lpm, 0), 1)
ct_lpm <- class_table(data_model$win, phat_lpm_trim, cut = 0.5)

# LOGIT
ct_logit <- class_table(data_model$win, data_model$phat_logit, cut = 0.5)

# PROBIT
ct_probit <- class_table(data_model$win, data_model$phat_probit, cut = 0.5)

ct_lpm$tab

##           predikcia
## skutocnost     0     1
##          0 37889 17983
##          1 17983 37889

ct_lpm$acc

## [1] 0.6781393

ct_logit$tab

##           predikcia
## skutocnost     0     1
##          0 38271 17601
##          1 17601 38271

ct_logit$acc

## [1] 0.6849764

ct_probit$tab

##           predikcia
## skutocnost     0     1
##          0 38240 17632
##          1 17632 38240

ct_probit$acc

## [1] 0.6844215

Ekonometria na FM UK

Igor Németh

1 Cieľ

1.1 Načítanie dát s autodetekciou stĺpcov

1.2 Koľko hráčov malo niekedy rank 1

1.3 2) Percento hráčov, ktorí dosiahli rank 1

1.4 Koľko setov vyhrali a prehrali hráči, keď mali rank 1

1.5 Rank 1 hráči podľa čistého skóre setov

2 Špecifikácia a OLS odhad

3 Test heteroskedasticity

4 Korekcia heteroskedasticity (WLS/FGLS)

5 Diagnostika: Pred vs Po korekcii

5.1 Graf 1 — Pred korekciou (OLS)

5.2 Graf 2 — Po korekcii (WLS/FGLS)

6 Zhrnutie testov

7 1. Dáta a príprava časového radu

8 2. Základný model (OLS)

9 3. Detekcia autokorelácie

9.1 3.1 Graf rezíduí v čase

9.2 3.2 Lag-plot rezíduí: e(t) vs e(t−1)

9.3 3.3 Korelogram (ACF) rezíduí

9.4 3.4 Durbin–Watson test

9.5 3.5 Breusch–Godfrey test (AR(1) až AR(4))

9.6 1) Načítanie dát `all.dta`

9.7 2) Príprava premenných pre model

9.8 3) Korelačná matica (heatmap)

9.9 4) Model + multikolinearita

9.9.1 4.1 OLS

9.9.2 4.2 VIF (Variance Inflation Factor)

9.9.3 4.3 Condition number (číslo podmienenosti)

10 1. Príprava dát a definícia binárnej premennej

11 2. Lineárny model (LPM)

12 3. Nelineárny model – logit

13 4. Nelineárny model – probit

14 5. Porovnanie modelov a klasifikácia

15 Literatúra

Ekonometria na FM UK

Igor Németh

1 Cieľ

1.1 Načítanie dát s autodetekciou stĺpcov

1.2 Koľko hráčov malo niekedy rank 1

1.3 2) Percento hráčov, ktorí dosiahli rank 1

1.4 Koľko setov vyhrali a prehrali hráči, keď mali rank 1

1.5 Rank 1 hráči podľa čistého skóre setov

2 Špecifikácia a OLS odhad

3 Test heteroskedasticity

4 Korekcia heteroskedasticity (WLS/FGLS)

5 Diagnostika: Pred vs Po korekcii

5.1 Graf 1 — Pred korekciou (OLS)

5.2 Graf 2 — Po korekcii (WLS/FGLS)

6 Zhrnutie testov

7 1. Dáta a príprava časového radu

8 2. Základný model (OLS)

9 3. Detekcia autokorelácie

9.1 3.1 Graf rezíduí v čase

9.2 3.2 Lag-plot rezíduí: e(t) vs e(t−1)

9.3 3.3 Korelogram (ACF) rezíduí

9.4 3.4 Durbin–Watson test

9.5 3.5 Breusch–Godfrey test (AR(1) až AR(4))

9.6 1) Načítanie dát all.dta

9.7 2) Príprava premenných pre model

9.8 3) Korelačná matica (heatmap)

9.9 4) Model + multikolinearita

9.9.1 4.1 OLS

9.9.2 4.2 VIF (Variance Inflation Factor)

9.9.3 4.3 Condition number (číslo podmienenosti)

10 1. Príprava dát a definícia binárnej premennej

11 2. Lineárny model (LPM)

12 3. Nelineárny model – logit

13 4. Nelineárny model – probit

14 5. Porovnanie modelov a klasifikácia

15 Literatúra

9.6 1) Načítanie dát `all.dta`