install.packages("knitr")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("haven")
names(data)
## NULL
library(haven)   
library(dplyr)   
data <- read_dta("all.dta")
str(data)
## tibble [112,138 × 30] (S3: tbl_df/tbl/data.frame)
##  $ player       : chr [1:112138] "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
##   ..- attr(*, "format.stata")= chr "%24s"
##  $ rank         : num [1:112138] 37 6 37 59 74 25 196 47 396 29 ...
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s1           : num [1:112138] 6 6 6 6 6 7 3 6 7 6 ...
##   ..- attr(*, "label")= chr "number of games won in 1st set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s2           : num [1:112138] 6 3 2 6 6 6 6 6 4 6 ...
##   ..- attr(*, "label")= chr "number of games won in 2nd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s3           : num [1:112138] NA NA NA 3 NA NA 3 NA 4 NA ...
##   ..- attr(*, "label")= chr "number of games won in 3rd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ swon         : num [1:112138] 2 0 0 1 0 2 1 2 1 2 ...
##   ..- attr(*, "label")= chr "number of sets won"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ b365         : num [1:112138] 1.5 1.53 1.53 2.62 1.83 ...
##   ..- attr(*, "label")= chr "bet365 odds"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ gender       : num [1:112138] 1 1 1 0 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "male=1, female=0"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tylocation   : num [1:112138] 149 100 101 100 123 123 98 151 172 140 ...
##   ..- attr(*, "label")= chr "type of location"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tytournament : num [1:112138] 279 257 229 195 34 34 220 285 235 266 ...
##   ..- attr(*, "label")= chr "type of tournament"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tyseries     : num [1:112138] 1 5 1 4 7 7 1 4 4 1 ...
##   ..- attr(*, "label")= chr "type of series"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tycourt      : num [1:112138] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "type of court (outdoors=1, indoors=0)"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tysurface    : num [1:112138] 5 5 5 5 5 5 5 1 1 5 ...
##   ..- attr(*, "label")= chr "type of surface (carpet=1, clay=2, grass=3, greenset=4, hard=5)"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tyround      : num [1:112138] 2 5 2 2 2 1 2 2 1 2 ...
##   ..- attr(*, "label")= chr "round of match"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ year         : num [1:112138] 2021 2008 2019 2012 2019 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ month        : num [1:112138] 9 3 9 2 10 11 2 10 2 2 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ day          : num [1:112138] 30 1 19 22 29 9 18 24 26 10 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankdiff     : num [1:112138] -10 -36 -39 -13 24 -680 174 -31 291 -16 ...
##   ..- attr(*, "label")= chr "rank of player-rank of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s1diff       : num [1:112138] 3 -1 -1 -1 -1 1 -3 5 1 4 ...
##   ..- attr(*, "label")= chr "s1 of player-s1 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s2diff       : num [1:112138] 6 -3 -4 6 -1 2 5 4 -2 6 ...
##   ..- attr(*, "label")= chr "s2 of player-s2 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s3diff       : num [1:112138] NA NA NA -3 NA NA -3 NA -2 NA ...
##   ..- attr(*, "label")= chr "s3 of player-s3 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ swondiff     : num [1:112138] 2 -2 -2 -1 -2 2 -1 2 -1 2 ...
##   ..- attr(*, "label")= chr "swin of player-swin of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ b365diff     : num [1:112138] -1.12 -0.84 -0.84 1.18 0 ...
##   ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s1w          : num [1:112138] 1 0 0 0 0 1 0 1 1 1 ...
##   ..- attr(*, "label")= chr "won set1=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s2w          : num [1:112138] 1 0 0 1 0 1 1 1 0 1 ...
##   ..- attr(*, "label")= chr "won set2=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ s3w          : num [1:112138] NA NA NA 0 NA NA 0 NA 0 NA ...
##   ..- attr(*, "label")= chr "won set3=1, lost=0"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindex    : num [1:112138] 2.79 5.42 2.79 2.12 1.79 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankop       : num [1:112138] 47 42 76 72 50 705 22 78 105 45 ...
##   ..- attr(*, "label")= chr "rank of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindexop  : num [1:112138] 2.45 2.61 1.75 1.83 2.36 ...
##   ..- attr(*, "label")= chr "rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rankindexdiff: num [1:112138] 0.345 2.807 1.038 0.287 -0.566 ...
##   ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
head(data)
# Hráči, ktorí boli svetovou 1.
rank1_players <- data %>%
filter(rank == 1)
print(rank1_players)
## # A tibble: 1,163 × 30
##    player      rank    s1    s2    s3  swon  b365 gender tylocation tytournament
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>        <dbl>
##  1 Djokovic …     1     6     4     7     2  1.04      1         19          291
##  2 Djokovic …     1     6     6     3     1  1.40      1        163          193
##  3 Wozniacki…     1     4     2    NA     0  2.37      0         75          280
##  4 Djokovic …     1     7     6    NA     2  1.03      1        123           34
##  5 Nadal R.       1     6     6    NA     2  1.01      1        123           31
##  6 Jankovic …     1     0     6     6     2  1.53      0        108          174
##  7 Nadal R.       1     7     3     6     2  1.03      1        138            1
##  8 Djokovic …     1     3     6     7     2  1.16      1        123           34
##  9 Djokovic …     1     7     6    NA     2  1.12      1        166          106
## 10 Wozniacki…     1     4     3    NA     0  1.20      0         50          280
## # ℹ 1,153 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## #   tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## #   s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## #   s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## #   rankindexop <dbl>, rankindexdiff <dbl>
write.csv(rank1_players, "rank1_players.csv", row.names = FALSE)
library(dplyr)
rank1_players <- data %>%
filter(rank == 1)  
rank1_unique <- rank1_players %>%
distinct(player, year, .keep_all = TRUE)
print(rank1_unique)
## # A tibble: 57 × 30
##    player      rank    s1    s2    s3  swon  b365 gender tylocation tytournament
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>        <dbl>
##  1 Djokovic …     1     6     4     7     2  1.04      1         19          291
##  2 Djokovic …     1     6     6     3     1  1.40      1        163          193
##  3 Wozniacki…     1     4     2    NA     0  2.37      0         75          280
##  4 Djokovic …     1     7     6    NA     2  1.03      1        123           34
##  5 Nadal R.       1     6     6    NA     2  1.01      1        123           31
##  6 Jankovic …     1     0     6     6     2  1.53      0        108          174
##  7 Nadal R.       1     7     3     6     2  1.03      1        138            1
##  8 Djokovic …     1     7     6    NA     2  1.12      1        166          106
##  9 Wozniacki…     1     4     3    NA     0  1.20      0         50          280
## 10 Murray A.      1     6     6     6     2  1.28      1         87          193
## # ℹ 47 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## #   tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## #   s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## #   s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## #   rankindexop <dbl>, rankindexdiff <dbl>

1 Cieľ

Krátka analýza zameraná iba na hráčov s rankom 1

1.1 Načítanie dát s autodetekciou stĺpcov

library(tidyverse)
library(haven)
library(lubridate)
library(janitor)
library(stringr)

# 1) načítaj dáta
path <- c(params$data_path, "./all.dta", "/mnt/data/all.dta") %>% purrr::keep(file.exists) %>% dplyr::first()
stopifnot(!is.null(path))
raw <- read_dta(path)
df <- labelled::to_factor(raw) %>% clean_names()

# 2) autodetekcia mien stĺpcov
nm <- names(df)
# hráč
guess_player <- c(params$player_var, nm[str_detect(nm, "player|hrac|nick|name|id")]) %>% purrr::discard(is.null) %>% dplyr::first()
# rank
guess_rank   <- c(params$rank_var,   nm[str_detect(nm, "(^|_)rank($|_)|rating_rank|ranking")]) %>% purrr::discard(is.null) %>% dplyr::first()
# dátum
date_candidates <- nm[vapply(df, inherits, logical(1), what = c("Date","POSIXct","POSIXt"))]
if (length(date_candidates) == 0) {
  likely_date_names <- nm[str_detect(nm, "date|datum|day|time|datetime|match_date|played|created")]
  for (v in likely_date_names){
    dt <- suppressWarnings(ymd(df[[v]]))
    if (sum(!is.na(dt)) > nrow(df)*0.2) { df[[v]] <- dt; date_candidates <- c(date_candidates, v); break }
  }
}
guess_date <- dplyr::first(purrr::discard(c(params$date_var, date_candidates), is.null))

# sets won / lost (autodetekcia) – v datasete mám len swon
guess_sw <- dplyr::first(purrr::discard(c(
  params$sets_won_var,
  nm[stringr::str_detect(nm, "(^|_)(swon|sets?_won|won_sets|set_won|setsfor|sets_for)($|_)")]
), is.null))
# sets_lost neexistuje – vypočítame neskôr pomocou swon → lost

guess_sl <- c(
  params$sets_lost_var,
  nm[str_detect(nm, "(^|_)(sets?_lost|lost_sets|set_lost|sl|setsagainst|sets_against)($|_)")]
) %>% purrr::discard(is.null) %>% dplyr::first()

# 3) validácia a zostavenie core dát
if (is.null(guess_player) || is.null(guess_rank)) {
  stop("Nepodarilo sa nájsť stĺpce player/rank. Zadaj mená do params$player_var a params$rank_var.")
}

# vyber povinné stĺpce
core <- df %>% dplyr::select(player = dplyr::all_of(guess_player), rank = dplyr::all_of(guess_rank))

# doplň date stĺpec — robustne, aj keď sa nenašiel
if (!is.null(guess_date) && guess_date %in% names(df)) {
  if (inherits(df[[guess_date]], "Date")) {
    core$date <- df[[guess_date]]
  } else {
    core$date <- suppressWarnings(as.Date(df[[guess_date]]))
  }
} else {
  core$date <- seq_len(nrow(core)) # náhradný „čas“ podľa poradia riadkov
}

# doplň sets_won (povinné) a z neho dopočítaj sets_lost podľa pravidla
if (!is.null(guess_sw) && guess_sw %in% names(df)) {
  core$sets_won <- suppressWarnings(readr::parse_number(as.character(df[[guess_sw]])))
} else {
  stop("Nenašiel som stĺpec s vyhratými setmi (swon). Nastav params$sets_won_var = 'swon'.")
}
# Pravidlo: swon == 0 alebo 1 → prehral 2 sety; swon >= 2 → prehral 0
core$sets_lost <- dplyr::case_when(
  is.na(core$sets_won) ~ NA_real_,
  core$sets_won %in% c(0,1) ~ 2,
  core$sets_won >= 2 ~ 0,
  TRUE ~ NA_real_
)

# typy/ranky/filtre
if (!is.numeric(core$rank)) core <- core %>% mutate(rank = readr::parse_number(as.character(rank)))
core <- core %>% filter(!is.na(player), !is.na(rank)) %>% arrange(player, date) %>% mutate(is_r1 = rank == 1)

# info
list(
  n_rows = nrow(core), n_players = n_distinct(core$player),
  vars = list(player = guess_player, rank = guess_rank, date = guess_date)
)
## $n_rows
## [1] 111941
## 
## $n_players
## [1] 1999
## 
## $vars
## $vars$player
## [1] "player"
## 
## $vars$rank
## [1] "rank"
## 
## $vars$date
## [1] NA

1.2 Koľko hráčov malo niekedy rank 1

players_total <- n_distinct(core$player)
players_r1    <- core %>% filter(is_r1) %>% distinct(player) %>% nrow()
share_r1      <- players_r1 / players_total

p1 <- tibble(metric = c("hráči s rank1", "ostatní"), 
             value = c(players_r1, players_total - players_r1)) %>%
  ggplot(aes(x = metric, y = value)) +
  geom_col() +
  geom_text(aes(label = value), vjust = -0.3) +
  labs(title = "Počet hráčov, ktorí niekedy dosiahli rank 1",
       x = NULL, y = "Počet hráčov") +
  theme_minimal()
print(p1)

Interpretácia: Stĺpec „hráči s rank1“ ukazuje, koľko hráčov sa aspoň raz dostalo na prvé miesto rebríčku. Druhý stĺpec zobrazuje všetkých ostatných.

1.3 2) Percento hráčov, ktorí dosiahli rank 1

p2 <- tibble(group = c("Rank 1 niekedy", "Nikdy"), share = c(share_r1, 1 - share_r1)) %>%
  ggplot(aes(x = group, y = share)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(label = scales::percent(share, accuracy = 0.1)), vjust = -0.3) +
  labs(title = "Podiel hráčov, ktorí niekedy dosiahli rank 1",
       x = NULL, y = "% hráčov") +
  theme_minimal()
print(p2)

Interpretácia: Výška stĺpca vyjadruje, koľko % všetkých hráčov získalo rank 1 aspoň raz.

1.4 Koľko setov vyhrali a prehrali hráči, keď mali rank 1

r1_sets <- core %>% filter(is_r1) %>%
  group_by(player) %>%
  summarise(sets_won = sum(sets_won, na.rm = TRUE),
            sets_lost = sum(sets_lost, na.rm = TRUE), .groups = "drop")

long_r1 <- r1_sets %>% pivot_longer(c(sets_won, sets_lost), names_to = "type", values_to = "sets")

p3 <- long_r1 %>%
  group_by(type) %>% summarise(total = sum(sets), .groups = "drop") %>%
  ggplot(aes(x = type, y = total)) +
  geom_col() +
  geom_text(aes(label = total), vjust = -0.3) +
  labs(title = "Súčet vyhratých a prehratých setov počas ranku 1 (všetci hráči)",
       subtitle = "Pravidlo: swon 0/1 ⇒ prehrané 2 sety; swon ≥ 2 ⇒ prehrané 0",
       x = NULL, y = "Počet setov") +
  theme_minimal()
print(p3)

Interpretácia: Vidíme celkový „objem“ setov, ktoré hráči odohrali v tom období, keď mali rank 1, rozdelené na výhry a prehry.

1.5 Rank 1 hráči podľa čistého skóre setov

r1_leaderboard <- r1_sets %>% mutate(net_sets = sets_won - sets_lost,
                                     win_rate = sets_won/pmax(1, (sets_won + sets_lost)))

p4 <- r1_leaderboard %>%
  slice_max(net_sets, n = 10, with_ties = FALSE) %>%
  mutate(player = forcats::fct_reorder(player, net_sets)) %>%
  ggplot(aes(x = player, y = net_sets)) +
  geom_col() + coord_flip() +
  labs(title = "Top 10 hráčov podľa čistého skóre setov (rank 1)",
       subtitle = "net_sets = sets_won − sets_lost",
       x = NULL, y = "Čisté sety") +
  theme_minimal()
print(p4)

Interpretácia: Rebríček zvýrazňuje, kto mal počas ranku 1 najvýraznejšiu dominanciu v setoch (čisté skóre).


need <- c("haven", "dplyr", "lmtest", "sandwich", "broom")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)

library(haven); library(dplyr); library(lmtest); library(sandwich); library(broom)
df <- haven::read_dta("all.dta") %>% as.data.frame()

names(df)[1:30]
##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"
dim(df)
## [1] 112138     30
# Ľahký wrangling (kategórie ako faktory)
df <- df %>% mutate(
  gender    = factor(gender),
  tysurface = factor(tysurface),
  year      = as.integer(year)
)

# Zvolené premenné (ak niektorá chýba, automaticky sa vynechá)
vars_needed <- c("b365","rankdiff","s1","s2","s3","swon","gender","tysurface","year")
vars_needed <- intersect(vars_needed, names(df))

df_model <- df %>% dplyr::select(all_of(vars_needed)) %>% na.omit()
dim(df_model); head(df_model)
## [1] 38582     9

2 Špecifikácia a OLS odhad

# Dynamicky poskladaná formula
rhs <- c()
for (v in c("rankdiff","s1","s2","s3","swon","gender","tysurface","year")) {
  if (v %in% names(df_model)) rhs <- c(rhs, v)
}

stopifnot("b365" %in% names(df_model))
form <- as.formula(paste("b365 ~", paste(rhs, collapse = " + ")))
form
## b365 ~ rankdiff + s1 + s2 + s3 + swon + gender + tysurface + 
##     year
# OLS
m_ols <- lm(form, data = df_model)
summary(m_ols)
## 
## Call:
## lm(formula = form, data = df_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3216 -0.6884 -0.3176  0.2827 25.9022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.860e+01  3.368e+00   8.490  < 2e-16 ***
## rankdiff     5.320e-03  6.313e-05  84.268  < 2e-16 ***
## s1          -5.370e-02  4.790e-03 -11.209  < 2e-16 ***
## s2          -4.656e-02  4.726e-03  -9.851  < 2e-16 ***
## s3          -6.735e-02  5.470e-03 -12.312  < 2e-16 ***
## swon        -2.908e-01  2.069e-02 -14.053  < 2e-16 ***
## gender1      1.206e-01  1.419e-02   8.498  < 2e-16 ***
## tysurface2  -4.251e-02  7.190e-02  -0.591    0.554    
## tysurface3  -6.302e-02  7.516e-02  -0.838    0.402    
## tysurface4  -4.067e-01  5.633e-01  -0.722    0.470    
## tysurface5  -1.078e-02  7.145e-02  -0.151    0.880    
## year        -1.244e-02  1.678e-03  -7.415 1.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.369 on 38570 degrees of freedom
## Multiple R-squared:  0.2042, Adjusted R-squared:  0.204 
## F-statistic: 899.9 on 11 and 38570 DF,  p-value: < 2.2e-16

3 Test heteroskedasticity

# Breusch–Pagan
bp <- lmtest::bptest(m_ols)
bp
## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 625.62, df = 11, p-value < 2.2e-16
# White 
make_white_rhs <- function(vars) {
  keep <- intersect(vars, names(df_model))
  if (length(keep) == 0) return(~ 1)
  terms <- unlist(lapply(keep, function(v) c(v, paste0("I(", v, "^2)"))))
  as.formula(paste("~", paste(terms, collapse = " + ")))
}
white_rhs <- make_white_rhs(c("rankdiff","s1","s2","s3","swon"))
white <- lmtest::bptest(m_ols, varformula = white_rhs, studentize = TRUE, data = df_model)
white
## 
##  studentized Breusch-Pagan test
## 
## data:  m_ols
## BP = 1073.8, df = 10, p-value < 2.2e-16

4 Korekcia heteroskedasticity (WLS/FGLS)

aux_form <- as.formula(paste("log(residuals(m_ols)^2) ~", paste(rhs, collapse = " + ")))
aux <- lm(aux_form, data = df_model)

sigma_hat2 <- exp(fitted(aux))     # odhad var(e|X)
w <- 1 / sigma_hat2                # WLS váhy (menšia váha pri väčšej variancii)

# WLS/FGLS model
m_wls <- lm(form, data = df_model, weights = w)
summary(m_wls)
## 
## Call:
## lm(formula = form, data = df_model, weights = w)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -2.874 -1.464 -0.688  0.738 45.040 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.586e+01  2.770e+00   5.725 1.04e-08 ***
## rankdiff     1.824e-03  3.117e-05  58.517  < 2e-16 ***
## s1          -3.408e-02  4.037e-03  -8.442  < 2e-16 ***
## s2          -3.465e-02  3.974e-03  -8.719  < 2e-16 ***
## s3          -4.019e-02  5.564e-03  -7.224 5.15e-13 ***
## swon        -3.380e-01  1.834e-02 -18.428  < 2e-16 ***
## gender1      8.945e-02  1.153e-02   7.757 8.89e-15 ***
## tysurface2  -1.138e-03  7.308e-02  -0.016    0.988    
## tysurface3  -7.731e-03  7.486e-02  -0.103    0.918    
## tysurface4   2.939e-03  5.143e-01   0.006    0.995    
## tysurface5   1.186e-02  7.284e-02   0.163    0.871    
## year        -6.270e-03  1.379e-03  -4.545 5.50e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.502 on 38570 degrees of freedom
## Multiple R-squared:  0.1244, Adjusted R-squared:  0.1241 
## F-statistic: 497.9 on 11 and 38570 DF,  p-value: < 2.2e-16

5 Diagnostika: Pred vs Po korekcii

Na porovnanie použijeme Residuals vs Fitted — priamo ukazuje, či sa rozptyl mení s predikovanou hodnotou (lievik = heteroskedasticita).

5.1 Graf 1 — Pred korekciou (OLS)

par(mfrow=c(1,1))
plot(m_ols, which = 1)

Vysvetlivka (Graf 1 – OLS):
- Čo skúma: vzťah medzi predikovanými hodnotami (X) a reziduami (Y).
- Čo hľadám: náhodný „mrak“ okolo 0 bez trendu. Ak sa rozptyl mení s X (tvar lievika), ide o heteroskedasticitu.
- Dôsledok: OLS koeficienty sú stále nestranné, ale štandardné chyby sú skreslené

5.2 Graf 2 — Po korekcii (WLS/FGLS)

par(mfrow=c(1,1))
plot(m_wls, which = 1)

Vysvetlivka (Graf 2 – WLS/FGLS):
- Čo skúma: to isté čo Graf 1, ale po aplikácii váženia podľa odhadnutej variability chýb.
- Čo hľadám: rozptyl bodov by mal byť stabilnejší naprieč fitted hodnotami (menej „lievikovitý“).
- Dôsledok: Po korekcii sú odhady efektívnejšie a intervaly/spoľahlivostné testy sú spoľahlivejšie.

6 Zhrnutie testov

data.frame(
  Test = c("Breusch–Pagan", "White"),
  p_value = signif(c(bp$p.value, white$p.value), 4),
  stringsAsFactors = FALSE
)
library(haven)
library(dplyr)
library(ggplot2)
library(lmtest)
library(forecast)
library(sandwich)
library(nlme)

7 1. Dáta a príprava časového radu

dat <- read_dta("all.dta")

dat <- dat %>%
  mutate(date = as.Date(paste(year, month, day, sep = "-"))) %>%
  filter(!is.na(date))

Vyberieme hráča s najvyšším počtom pozorovaní, aby sme mali dostatočne dlhý časový rad.

player_pick <- dat %>% count(player, sort = TRUE) %>% slice(1) %>% pull(player)

d <- dat %>%
  filter(player == player_pick) %>%
  arrange(date) %>%
  filter(!is.na(rankindex), !is.na(b365), !is.na(rankindexdiff)) %>%
  droplevels()

# DÔLEŽITÉ: AR(1) v nlme vyžaduje unikátne hodnoty "času".
# Keď má hráč viac zápasov v rovnaký deň, date nie je unikátny.
# Preto vytvoríme jednoduchý časový index 1..T:
d <- d %>% mutate(t = row_number())

cat("Použitý hráč:", player_pick, "\n")
## Použitý hráč: Djokovic N.
cat("Počet pozorovaní:", nrow(d), "\n")
## Počet pozorovaní: 675
cat("Rozsah dátumov:", format(min(d$date)), "–", format(max(d$date)), "\n")
## Rozsah dátumov: 2007-01-02 – 2021-11-20

8 2. Základný model (OLS)

Použijeme jednoduchý model:

\[ rankindex_t = \beta_0 + \beta_1 b365_t + \beta_2 rankindexdiff_t + u_t \]

m <- lm(rankindex ~ b365 + rankindexdiff, data = d)
summary(m)
## 
## Call:
## lm(formula = rankindex ~ b365 + rankindexdiff, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1134 -0.5423  0.1437  0.6627  1.6056 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.01219    0.14410  41.723   <2e-16 ***
## b365           0.09303    0.06475   1.437    0.151    
## rankindexdiff  0.26588    0.02218  11.986   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.963 on 672 degrees of freedom
## Multiple R-squared:  0.2141, Adjusted R-squared:  0.2118 
## F-statistic: 91.53 on 2 and 672 DF,  p-value: < 2.2e-16

9 3. Detekcia autokorelácie

9.1 3.1 Graf rezíduí v čase

d <- d %>% mutate(e = resid(m))

ggplot(d, aes(x = date, y = e)) +
  geom_line() +
  labs(
    title = "Rezíduá OLS v čase",
    x = "Dátum", y = "rezíduá"
  )

9.2 3.2 Lag-plot rezíduí: e(t) vs e(t−1)

d_lag <- d %>%
  mutate(e_lag1 = dplyr::lag(e, 1)) %>%
  filter(!is.na(e_lag1))

ggplot(d_lag, aes(x = e_lag1, y = e)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Lag-plot rezíduí: e(t) vs e(t-1)",
    x = "e(t-1)", y = "e(t)"
  )
## `geom_smooth()` using formula = 'y ~ x'

(Voliteľne) pomocná regresia:

summary(lm(e ~ e_lag1, data = d_lag))
## 
## Call:
## lm(formula = e ~ e_lag1, data = d_lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10462 -0.32983  0.05773  0.37198  1.66452 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.005759   0.021145   0.272    0.785    
## e_lag1      0.815752   0.022026  37.035   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5489 on 672 degrees of freedom
## Multiple R-squared:  0.6712, Adjusted R-squared:  0.6707 
## F-statistic:  1372 on 1 and 672 DF,  p-value: < 2.2e-16

9.3 3.3 Korelogram (ACF) rezíduí

Acf(resid(m), main = "ACF rezíduí (OLS)")

9.4 3.4 Durbin–Watson test

dwtest(m)
## 
##  Durbin-Watson test
## 
## data:  m
## DW = 0.35882, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

9.5 3.5 Breusch–Godfrey test (AR(1) až AR(4))

bgtest(m, order = 1)
## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  m
## LM test = 499.02, df = 1, p-value < 2.2e-16
bgtest(m, order = 2)
## 
##  Breusch-Godfrey test for serial correlation of order up to 2
## 
## data:  m
## LM test = 542.46, df = 2, p-value < 2.2e-16
bgtest(m, order = 3)
## 
##  Breusch-Godfrey test for serial correlation of order up to 3
## 
## data:  m
## LM test = 565.28, df = 3, p-value < 2.2e-16
bgtest(m, order = 4)
## 
##  Breusch-Godfrey test for serial correlation of order up to 4
## 
## data:  m
## LM test = 576.46, df = 4, p-value < 2.2e-16
req <- c("haven","dplyr","ggplot2","car","MASS")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)

library(haven)
library(dplyr)
library(ggplot2)
library(car)
library(MASS)

9.6 1) Načítanie dát all.dta

paths <- c("all.dta", file.path(getwd(), "all.dta"))
path <- paths[file.exists(paths)][1]
stopifnot(!is.na(path), file.exists(path))

df <- read_dta(path) |> as.data.frame()

# základný sanity check
cat("Rows:", nrow(df), " Cols:", ncol(df), "\n")
## Rows: 112138  Cols: 30
str(df[1:min(10,ncol(df))])
## 'data.frame':    112138 obs. of  10 variables:
##  $ player      : chr  "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
##   ..- attr(*, "format.stata")= chr "%24s"
##  $ rank        : num  37 6 37 59 74 25 196 47 396 29 ...
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s1          : num  6 6 6 6 6 7 3 6 7 6 ...
##   ..- attr(*, "label")= chr "number of games won in 1st set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s2          : num  6 3 2 6 6 6 6 6 4 6 ...
##   ..- attr(*, "label")= chr "number of games won in 2nd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s3          : num  NA NA NA 3 NA NA 3 NA 4 NA ...
##   ..- attr(*, "label")= chr "number of games won in 3rd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ swon        : num  2 0 0 1 0 2 1 2 1 2 ...
##   ..- attr(*, "label")= chr "number of sets won"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ b365        : num  1.5 1.53 1.53 2.62 1.83 ...
##   ..- attr(*, "label")= chr "bet365 odds"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ gender      : num  1 1 1 0 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "male=1, female=0"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tylocation  : num  149 100 101 100 123 123 98 151 172 140 ...
##   ..- attr(*, "label")= chr "type of location"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tytournament: num  279 257 229 195 34 34 220 285 235 266 ...
##   ..- attr(*, "label")= chr "type of tournament"
##   ..- attr(*, "format.stata")= chr "%12.0g"

9.7 2) Príprava premenných pre model

num <- df |> dplyr::select(where(is.numeric))

# vyhodiť konštanty / nulový rozptyl
nzv <- sapply(num, function(x) isTRUE(sd(x, na.rm=TRUE) > 0))
num <- num[, nzv, drop = FALSE]

num <- na.omit(num)

cat("Po čistení: Rows:", nrow(num), " Cols:", ncol(num), "\n")
## Po čistení: Rows: 38582  Cols: 29

9.8 3) Korelačná matica (heatmap)

cor_mat <- cor(num)

# prehľad top korelácií (absolútne)
cm <- cor_mat
diag(cm) <- NA
top_pairs <- as.data.frame(as.table(cm)) |>
  dplyr::mutate(abs_cor = abs(Freq)) |>
  dplyr::filter(!is.na(abs_cor)) |>
  dplyr::arrange(desc(abs_cor)) |>
  dplyr::slice(1:20)

top_pairs
# jednoduchý heatmap bez ďalších balíkov
ord <- order(colnames(cor_mat))
image(
  1:ncol(cor_mat), 1:ncol(cor_mat),
  cor_mat[ord, ord],
  axes = FALSE, xlab = "", ylab = ""
)
axis(1, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
axis(2, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
title("Korelačná matica (numerické premenné)")

9.9 4) Model + multikolinearita

Keďže v dátach sa často nachádzajú konštrukčné identity (napr. rankdiff = rank - rankop), OLS môže mať presnú kolinearitu.

Preto spravíme: 1. zvolíme cieľovú premennú y automaticky ako prvý numerický stĺpec (ak chceš inú, zmeň to), 2. odstránime presne lineárne závislé prediktory pomocou QR (alias()/rank), 3. potom vypočítame VIF a condition number.

y_name <- names(num)[1]          # <-- zmeň si sem cieľovú premennú ak treba
x_names <- setdiff(names(num), y_name)

y <- num[[y_name]]
X <- num[, x_names, drop = FALSE]

# odstránenie perfektnej multikolinearity (závislé stĺpce)
mm <- model.matrix(~ . , data = X)  # pridá intercept
qr_mm <- qr(mm)
rank_mm <- qr_mm$rank
p_mm <- ncol(mm)

if(rank_mm < p_mm){
  indep_idx <- qr_mm$pivot[1:rank_mm]
  mm2 <- mm[, indep_idx, drop = FALSE]
  # späť na dátový rámec bez interceptu
  keep_cols <- colnames(mm2)
  keep_cols <- setdiff(keep_cols, "(Intercept)")
  X2 <- as.data.frame(mm2[, keep_cols, drop = FALSE])
  cat("Odstránil som", ncol(X) - ncol(X2), "perfektne závislých prediktorov.\n")
} else {
  X2 <- X
  cat("Perfektná kolinearita nebola zistená.\n")
}
## Odstránil som 1 perfektne závislých prediktorov.
dat2 <- cbind.data.frame(y = y, X2)
names(dat2)[1] <- y_name

9.9.1 4.1 OLS

f <- as.formula(paste(y_name, "~", paste(names(X2), collapse = " + ")))
ols <- lm(f, data = dat2)
summary(ols)
## 
## Call:
## lm(formula = f, data = dat2)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.458e-11 -2.300e-14 -5.000e-15  1.200e-14  1.023e-10 
## 
## Coefficients:
##                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  -1.131e-13  1.689e-12 -6.700e-02 0.946606    
## s1           -4.314e-14  3.412e-15 -1.264e+01  < 2e-16 ***
## s2           -2.755e-15  3.376e-15 -8.160e-01 0.414420    
## s3            3.265e-14  3.146e-15  1.038e+01  < 2e-16 ***
## swon         -4.524e-15  2.938e-13 -1.500e-02 0.987713    
## b365          1.674e-13  4.456e-15  3.757e+01  < 2e-16 ***
## gender        8.085e-14  7.982e-15  1.013e+01  < 2e-16 ***
## tylocation   -2.455e-16  7.205e-17 -3.408e+00 0.000656 ***
## tytournament -3.363e-17  3.514e-17 -9.570e-01 0.338571    
## tyseries      1.454e-14  7.681e-16  1.893e+01  < 2e-16 ***
## tycourt      -4.671e-14  9.442e-15 -4.947e+00 7.58e-07 ***
## tysurface    -2.182e-15  2.502e-15 -8.720e-01 0.383068    
## tyround      -3.155e-14  1.858e-15 -1.698e+01  < 2e-16 ***
## year          1.040e-15  8.012e-16  1.298e+00 0.194406    
## month         4.831e-15  1.187e-15  4.071e+00 4.69e-05 ***
## day           5.416e-16  4.098e-16  1.322e+00 0.186253    
## rankdiff      1.000e+00  5.791e-17  1.727e+16  < 2e-16 ***
## s1diff        1.155e-15  3.136e-15  3.680e-01 0.712781    
## s2diff        1.301e-14  3.106e-15  4.189e+00 2.81e-05 ***
## s3diff       -4.414e-15  2.984e-15 -1.479e+00 0.139113    
## swondiff     -1.090e-13  2.076e-13 -5.250e-01 0.599628    
## b365diff     -1.600e-13  2.995e-15 -5.341e+01  < 2e-16 ***
## s1w          -1.131e-13  2.078e-13 -5.440e-01 0.586401    
## s2w          -1.653e-13  2.077e-13 -7.960e-01 0.426262    
## s3w           4.666e-14  2.939e-13  1.590e-01 0.873861    
## rankindex    -3.905e-13  4.039e-15 -9.667e+01  < 2e-16 ***
## rankop        1.000e+00  8.264e-17  1.210e+16  < 2e-16 ***
## rankindexop  -3.680e-16  4.039e-15 -9.100e-02 0.927408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.556e-13 on 38554 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.432e+31 on 27 and 38554 DF,  p-value: < 2.2e-16

9.9.2 4.2 VIF (Variance Inflation Factor)

v <- car::vif(ols)
v <- sort(v, decreasing = TRUE)
head(v, 30)
##     swondiff         swon          s3w          s1w          s2w       s3diff 
##  3873.433541  1941.018847  1938.255665   969.267107   968.358139     8.093307 
##       s2diff       s1diff     b365diff       rankop         b365     rankdiff 
##     7.808480     7.692521     5.486488     4.484770     4.195253     3.755672 
##  rankindexop    rankindex           s2           s1           s3     tyseries 
##     3.571906     3.571855     3.318751     3.294786     3.178247     1.449856 
##       gender      tyround      tycourt   tylocation    tysurface tytournament 
##     1.419780     1.264489     1.186184     1.149751     1.124088     1.117263 
##        month          day         year 
##     1.116594     1.026043     1.018728

9.9.3 4.3 Condition number (číslo podmienenosti)

Xmm <- model.matrix(ols)[, -1, drop = FALSE]  # bez interceptu
kappa_val <- kappa(Xmm, exact = TRUE)
kappa_val
## [1] 212047

10 1. Príprava dát a definícia binárnej premennej

# načítanie dát zo Stata súboru
tenis <- haven::read_dta("all.dta")

# rýchla kontrola
dim(tenis)
## [1] 112138     30
names(tenis)
##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"
head(tenis)
# binárna vysvetľovaná premenná: výhra hráča 1
# predpoklad: swondiff > 0 znamená, že hráč 1 vyhral zápas
tenis <- tenis %>%
  mutate(
    win = ifelse(swondiff > 0, 1, 0),
    win = as.numeric(win),
    gender    = factor(gender, labels = c("Zena", "Muz")),
    tysurface = factor(tysurface),
    tyround   = factor(tyround)
  )

table(tenis$win)
## 
##     0     1 
## 56069 56069
# dataset pre model – vyhodíme riadky s NA v relevantných premenných
data_model <- tenis %>%
  dplyr::select(win, rankindexdiff, b365diff, gender, tysurface, tyround) %>%
  tidyr::drop_na()

dim(tenis)      # pôvodný počet riadkov
## [1] 112138     31
dim(data_model) # počet riadkov použitých v modeloch
## [1] 111744      6
str(data_model)
## tibble [111,744 × 6] (S3: tbl_df/tbl/data.frame)
##  $ win          : num [1:111744] 1 0 0 0 0 1 0 1 0 1 ...
##  $ rankindexdiff: num [1:111744] 0.345 2.807 1.038 0.287 -0.566 ...
##   ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ b365diff     : num [1:111744] -1.12 -0.84 -0.84 1.18 0 ...
##   ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ gender       : Factor w/ 2 levels "Zena","Muz": 2 2 2 1 2 2 2 2 2 2 ...
##  $ tysurface    : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 1 1 5 ...
##  $ tyround      : Factor w/ 9 levels "1","2","3","4",..: 2 5 2 2 2 1 2 2 1 2 ...

11 2. Lineárny model (LPM)

# lineárny pravdepodobnostný model
lpm <- lm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data = data_model
)

summary(lpm)
## 
## Call:
## lm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, data = data_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0455 -0.4422  0.0000  0.4422  2.0455 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.000e-01  1.380e-02   36.22   <2e-16 ***
## rankindexdiff  2.721e-02  1.092e-03   24.92   <2e-16 ***
## b365diff      -4.654e-02  6.640e-04  -70.10   <2e-16 ***
## genderMuz     -9.723e-15  2.758e-03    0.00        1    
## tysurface2    -1.606e-13  1.382e-02    0.00        1    
## tysurface3    -1.592e-13  1.447e-02    0.00        1    
## tysurface4    -1.652e-13  6.513e-02    0.00        1    
## tysurface5    -1.657e-13  1.372e-02    0.00        1    
## tyround2      -8.770e-15  3.299e-03    0.00        1    
## tyround3       1.140e-15  6.189e-03    0.00        1    
## tyround4       3.245e-15  1.581e-02    0.00        1    
## tyround5      -5.509e-16  4.622e-03    0.00        1    
## tyround6       3.374e-15  1.403e-02    0.00        1    
## tyround7      -2.889e-16  6.147e-03    0.00        1    
## tyround8       3.321e-16  8.354e-03    0.00        1    
## tyround9       1.083e-14  2.296e-01    0.00        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4591 on 111728 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  0.1569 
## F-statistic:  1387 on 15 and 111728 DF,  p-value: < 2.2e-16
# test heteroskedasticity
bptest(lpm)
## 
##  studentized Breusch-Pagan test
## 
## data:  lpm
## BP = 218, df = 15, p-value < 2.2e-16
# robustné (White/HC) štandardné chyby
vcov_lpm_robust <- sandwich::vcovHC(lpm, type = "HC1")
lpm_robust <- lmtest::coeftest(lpm, vcov. = vcov_lpm_robust)
lpm_robust
## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value Pr(>|t|)    
## (Intercept)    5.0000e-01  1.3638e-02  36.663   <2e-16 ***
## rankindexdiff  2.7207e-02  1.2054e-03  22.570   <2e-16 ***
## b365diff      -4.6545e-02  8.1881e-04 -56.844   <2e-16 ***
## genderMuz     -9.7233e-15  2.7582e-03   0.000        1    
## tysurface2    -1.6060e-13  1.3655e-02   0.000        1    
## tysurface3    -1.5924e-13  1.4345e-02   0.000        1    
## tysurface4    -1.6517e-13  6.4979e-02   0.000        1    
## tysurface5    -1.6570e-13  1.3554e-02   0.000        1    
## tyround2      -8.7700e-15  3.2761e-03   0.000        1    
## tyround3       1.1399e-15  6.1574e-03   0.000        1    
## tyround4       3.2452e-15  1.4954e-02   0.000        1    
## tyround5      -5.5089e-16  4.6409e-03   0.000        1    
## tyround6       3.3743e-15  1.4166e-02   0.000        1    
## tyround7      -2.8886e-16  6.2336e-03   0.000        1    
## tyround8       3.3213e-16  8.4209e-03   0.000        1    
## tyround9       1.0825e-14  2.5838e-01   0.000        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# predikované pravdepodobnosti z LPM
data_model$phat_lpm <- fitted(lpm)

summary(data_model$phat_lpm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0455  0.4047  0.5000  0.5000  0.5953  3.0455

12 3. Nelineárny model – logit

# logit model
logit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "logit")
)

summary(logit_mod)
## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "logit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.260e-15  6.870e-02   0.000    1.000    
## rankindexdiff -8.541e-03  5.807e-03  -1.471    0.141    
## b365diff      -4.205e-01  5.072e-03 -82.912   <2e-16 ***
## genderMuz     -1.356e-16  1.328e-02   0.000    1.000    
## tysurface2     5.486e-15  6.874e-02   0.000    1.000    
## tysurface3     5.412e-15  7.171e-02   0.000    1.000    
## tysurface4     1.080e-14  3.243e-01   0.000    1.000    
## tysurface5     5.388e-15  6.829e-02   0.000    1.000    
## tyround2      -1.366e-16  1.601e-02   0.000    1.000    
## tyround3      -1.481e-16  3.068e-02   0.000    1.000    
## tyround4       5.645e-16  8.024e-02   0.000    1.000    
## tyround5      -1.132e-16  2.211e-02   0.000    1.000    
## tyround6      -7.832e-16  6.671e-02   0.000    1.000    
## tyround7       3.868e-17  2.924e-02   0.000    1.000    
## tyround8      -2.613e-16  3.966e-02   0.000    1.000    
## tyround9       3.047e-14  1.002e+00   0.000    1.000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 131840  on 111728  degrees of freedom
## AIC: 131872
## 
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden)
logLik_null <- logLik(glm(win ~ 1, data = data_model, family = binomial(link = "logit")))
logLik_full <- logLik(logit_mod)
pseudoR2_logit <- 1 - as.numeric(logLik_full / logLik_null)
pseudoR2_logit
## [1] 0.1489228
# predikované pravdepodobnosti z logitu
data_model$phat_logit <- fitted(logit_mod)
summary(data_model$phat_logit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3638  0.5000  0.5000  0.6362  1.0000
# jednoduchý graf: pravdepodobnosť výhry a rankindexdiff
ggplot(data_model, aes(x = rankindexdiff, y = phat_logit)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE) +
  labs(
    x = "rankindexdiff",
    y = "P( win = 1 ) – logit",
    title = "Predikované pravdepodobnosti z logit modelu"
  )

13 4. Nelineárny model – probit

# probit model
probit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "probit")
)

summary(probit_mod)
## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "probit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.878e-14  4.114e-02   0.000   1.0000    
## rankindexdiff  7.128e-03  3.483e-03   2.047   0.0407 *  
## b365diff      -2.317e-01  2.871e-03 -80.702   <2e-16 ***
## genderMuz      2.195e-14  8.029e-03   0.000   1.0000    
## tysurface2     4.372e-14  4.116e-02   0.000   1.0000    
## tysurface3     4.266e-15  4.299e-02   0.000   1.0000    
## tysurface4     1.968e-14  1.940e-01   0.000   1.0000    
## tysurface5     5.593e-15  4.089e-02   0.000   1.0000    
## tyround2       4.514e-14  9.660e-03   0.000   1.0000    
## tyround3       9.323e-16  1.845e-02   0.000   1.0000    
## tyround4       1.219e-14  4.800e-02   0.000   1.0000    
## tyround5      -8.008e-16  1.339e-02   0.000   1.0000    
## tyround6       8.777e-15  4.044e-02   0.000   1.0000    
## tyround7      -3.018e-16  1.774e-02   0.000   1.0000    
## tyround8       1.457e-16  2.403e-02   0.000   1.0000    
## tyround9       1.936e-14  6.272e-01   0.000   1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 132406  on 111728  degrees of freedom
## AIC: 132438
## 
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden) pre probit
logLik_full_probit <- logLik(probit_mod)
pseudoR2_probit <- 1 - as.numeric(logLik_full_probit / logLik_null)
pseudoR2_probit
## [1] 0.1452715
# predikované pravdepodobnosti z probitu
data_model$phat_probit <- fitted(probit_mod)
summary(data_model$phat_probit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3745  0.5000  0.5000  0.6255  1.0000

14 5. Porovnanie modelov a klasifikácia

# pomocná funkcia na klasifikačnú tabuľku a percento správnych predikcií
class_table <- function(y, p, cut = 0.5) {
  yhat <- ifelse(p >= cut, 1, 0)
  tab <- table(skutocnost = y, predikcia = yhat)
  acc <- mean(yhat == y)
  list(tab = tab, acc = acc)
}

# LPM (pravdepodobnosti orezané na [0,1])
phat_lpm_trim <- pmin(pmax(data_model$phat_lpm, 0), 1)
ct_lpm <- class_table(data_model$win, phat_lpm_trim, cut = 0.5)

# LOGIT
ct_logit <- class_table(data_model$win, data_model$phat_logit, cut = 0.5)

# PROBIT
ct_probit <- class_table(data_model$win, data_model$phat_probit, cut = 0.5)

ct_lpm$tab
##           predikcia
## skutocnost     0     1
##          0 37889 17983
##          1 17983 37889
ct_lpm$acc
## [1] 0.6781393
ct_logit$tab
##           predikcia
## skutocnost     0     1
##          0 38271 17601
##          1 17601 38271
ct_logit$acc
## [1] 0.6849764
ct_probit$tab
##           predikcia
## skutocnost     0     1
##          0 38240 17632
##          1 17632 38240
ct_probit$acc
## [1] 0.6844215

15 Literatúra