install.packages("knitr")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("haven")
names(data)
## NULL
library(haven)
library(dplyr)
data <- read_dta("all.dta")
str(data)
## tibble [112,138 × 30] (S3: tbl_df/tbl/data.frame)
## $ player : chr [1:112138] "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
## ..- attr(*, "format.stata")= chr "%24s"
## $ rank : num [1:112138] 37 6 37 59 74 25 196 47 396 29 ...
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s1 : num [1:112138] 6 6 6 6 6 7 3 6 7 6 ...
## ..- attr(*, "label")= chr "number of games won in 1st set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s2 : num [1:112138] 6 3 2 6 6 6 6 6 4 6 ...
## ..- attr(*, "label")= chr "number of games won in 2nd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s3 : num [1:112138] NA NA NA 3 NA NA 3 NA 4 NA ...
## ..- attr(*, "label")= chr "number of games won in 3rd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ swon : num [1:112138] 2 0 0 1 0 2 1 2 1 2 ...
## ..- attr(*, "label")= chr "number of sets won"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ b365 : num [1:112138] 1.5 1.53 1.53 2.62 1.83 ...
## ..- attr(*, "label")= chr "bet365 odds"
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ gender : num [1:112138] 1 1 1 0 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "male=1, female=0"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ tylocation : num [1:112138] 149 100 101 100 123 123 98 151 172 140 ...
## ..- attr(*, "label")= chr "type of location"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tytournament : num [1:112138] 279 257 229 195 34 34 220 285 235 266 ...
## ..- attr(*, "label")= chr "type of tournament"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tyseries : num [1:112138] 1 5 1 4 7 7 1 4 4 1 ...
## ..- attr(*, "label")= chr "type of series"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tycourt : num [1:112138] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "type of court (outdoors=1, indoors=0)"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tysurface : num [1:112138] 5 5 5 5 5 5 5 1 1 5 ...
## ..- attr(*, "label")= chr "type of surface (carpet=1, clay=2, grass=3, greenset=4, hard=5)"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tyround : num [1:112138] 2 5 2 2 2 1 2 2 1 2 ...
## ..- attr(*, "label")= chr "round of match"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ year : num [1:112138] 2021 2008 2019 2012 2019 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ month : num [1:112138] 9 3 9 2 10 11 2 10 2 2 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ day : num [1:112138] 30 1 19 22 29 9 18 24 26 10 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rankdiff : num [1:112138] -10 -36 -39 -13 24 -680 174 -31 291 -16 ...
## ..- attr(*, "label")= chr "rank of player-rank of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s1diff : num [1:112138] 3 -1 -1 -1 -1 1 -3 5 1 4 ...
## ..- attr(*, "label")= chr "s1 of player-s1 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s2diff : num [1:112138] 6 -3 -4 6 -1 2 5 4 -2 6 ...
## ..- attr(*, "label")= chr "s2 of player-s2 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s3diff : num [1:112138] NA NA NA -3 NA NA -3 NA -2 NA ...
## ..- attr(*, "label")= chr "s3 of player-s3 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ swondiff : num [1:112138] 2 -2 -2 -1 -2 2 -1 2 -1 2 ...
## ..- attr(*, "label")= chr "swin of player-swin of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ b365diff : num [1:112138] -1.12 -0.84 -0.84 1.18 0 ...
## ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s1w : num [1:112138] 1 0 0 0 0 1 0 1 1 1 ...
## ..- attr(*, "label")= chr "won set1=1, lost=0"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s2w : num [1:112138] 1 0 0 1 0 1 1 1 0 1 ...
## ..- attr(*, "label")= chr "won set2=1, lost=0"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ s3w : num [1:112138] NA NA NA 0 NA NA 0 NA 0 NA ...
## ..- attr(*, "label")= chr "won set3=1, lost=0"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rankindex : num [1:112138] 2.79 5.42 2.79 2.12 1.79 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rankop : num [1:112138] 47 42 76 72 50 705 22 78 105 45 ...
## ..- attr(*, "label")= chr "rank of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rankindexop : num [1:112138] 2.45 2.61 1.75 1.83 2.36 ...
## ..- attr(*, "label")= chr "rankindex of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ rankindexdiff: num [1:112138] 0.345 2.807 1.038 0.287 -0.566 ...
## ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
head(data)
# Hráči, ktorí boli svetovou 1.
rank1_players <- data %>%
filter(rank == 1)
print(rank1_players)
## # A tibble: 1,163 × 30
## player rank s1 s2 s3 swon b365 gender tylocation tytournament
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Djokovic … 1 6 4 7 2 1.04 1 19 291
## 2 Djokovic … 1 6 6 3 1 1.40 1 163 193
## 3 Wozniacki… 1 4 2 NA 0 2.37 0 75 280
## 4 Djokovic … 1 7 6 NA 2 1.03 1 123 34
## 5 Nadal R. 1 6 6 NA 2 1.01 1 123 31
## 6 Jankovic … 1 0 6 6 2 1.53 0 108 174
## 7 Nadal R. 1 7 3 6 2 1.03 1 138 1
## 8 Djokovic … 1 3 6 7 2 1.16 1 123 34
## 9 Djokovic … 1 7 6 NA 2 1.12 1 166 106
## 10 Wozniacki… 1 4 3 NA 0 1.20 0 50 280
## # ℹ 1,153 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## # tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## # s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## # s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## # rankindexop <dbl>, rankindexdiff <dbl>
write.csv(rank1_players, "rank1_players.csv", row.names = FALSE)
library(dplyr)
rank1_players <- data %>%
filter(rank == 1)
rank1_unique <- rank1_players %>%
distinct(player, year, .keep_all = TRUE)
print(rank1_unique)
## # A tibble: 57 × 30
## player rank s1 s2 s3 swon b365 gender tylocation tytournament
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Djokovic … 1 6 4 7 2 1.04 1 19 291
## 2 Djokovic … 1 6 6 3 1 1.40 1 163 193
## 3 Wozniacki… 1 4 2 NA 0 2.37 0 75 280
## 4 Djokovic … 1 7 6 NA 2 1.03 1 123 34
## 5 Nadal R. 1 6 6 NA 2 1.01 1 123 31
## 6 Jankovic … 1 0 6 6 2 1.53 0 108 174
## 7 Nadal R. 1 7 3 6 2 1.03 1 138 1
## 8 Djokovic … 1 7 6 NA 2 1.12 1 166 106
## 9 Wozniacki… 1 4 3 NA 0 1.20 0 50 280
## 10 Murray A. 1 6 6 6 2 1.28 1 87 193
## # ℹ 47 more rows
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## # tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## # s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## # s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## # rankindexop <dbl>, rankindexdiff <dbl>
Krátka analýza zameraná iba na hráčov s rankom 1
library(tidyverse)
library(haven)
library(lubridate)
library(janitor)
library(stringr)
# 1) načítaj dáta
path <- c(params$data_path, "./all.dta", "/mnt/data/all.dta") %>% purrr::keep(file.exists) %>% dplyr::first()
stopifnot(!is.null(path))
raw <- read_dta(path)
df <- labelled::to_factor(raw) %>% clean_names()
# 2) autodetekcia mien stĺpcov
nm <- names(df)
# hráč
guess_player <- c(params$player_var, nm[str_detect(nm, "player|hrac|nick|name|id")]) %>% purrr::discard(is.null) %>% dplyr::first()
# rank
guess_rank <- c(params$rank_var, nm[str_detect(nm, "(^|_)rank($|_)|rating_rank|ranking")]) %>% purrr::discard(is.null) %>% dplyr::first()
# dátum
date_candidates <- nm[vapply(df, inherits, logical(1), what = c("Date","POSIXct","POSIXt"))]
if (length(date_candidates) == 0) {
likely_date_names <- nm[str_detect(nm, "date|datum|day|time|datetime|match_date|played|created")]
for (v in likely_date_names){
dt <- suppressWarnings(ymd(df[[v]]))
if (sum(!is.na(dt)) > nrow(df)*0.2) { df[[v]] <- dt; date_candidates <- c(date_candidates, v); break }
}
}
guess_date <- dplyr::first(purrr::discard(c(params$date_var, date_candidates), is.null))
# sets won / lost (autodetekcia) – v datasete mám len swon
guess_sw <- dplyr::first(purrr::discard(c(
params$sets_won_var,
nm[stringr::str_detect(nm, "(^|_)(swon|sets?_won|won_sets|set_won|setsfor|sets_for)($|_)")]
), is.null))
# sets_lost neexistuje – vypočítame neskôr pomocou swon → lost
guess_sl <- c(
params$sets_lost_var,
nm[str_detect(nm, "(^|_)(sets?_lost|lost_sets|set_lost|sl|setsagainst|sets_against)($|_)")]
) %>% purrr::discard(is.null) %>% dplyr::first()
# 3) validácia a zostavenie core dát
if (is.null(guess_player) || is.null(guess_rank)) {
stop("Nepodarilo sa nájsť stĺpce player/rank. Zadaj mená do params$player_var a params$rank_var.")
}
# vyber povinné stĺpce
core <- df %>% dplyr::select(player = dplyr::all_of(guess_player), rank = dplyr::all_of(guess_rank))
# doplň date stĺpec — robustne, aj keď sa nenašiel
if (!is.null(guess_date) && guess_date %in% names(df)) {
if (inherits(df[[guess_date]], "Date")) {
core$date <- df[[guess_date]]
} else {
core$date <- suppressWarnings(as.Date(df[[guess_date]]))
}
} else {
core$date <- seq_len(nrow(core)) # náhradný „čas“ podľa poradia riadkov
}
# doplň sets_won (povinné) a z neho dopočítaj sets_lost podľa pravidla
if (!is.null(guess_sw) && guess_sw %in% names(df)) {
core$sets_won <- suppressWarnings(readr::parse_number(as.character(df[[guess_sw]])))
} else {
stop("Nenašiel som stĺpec s vyhratými setmi (swon). Nastav params$sets_won_var = 'swon'.")
}
# Pravidlo: swon == 0 alebo 1 → prehral 2 sety; swon >= 2 → prehral 0
core$sets_lost <- dplyr::case_when(
is.na(core$sets_won) ~ NA_real_,
core$sets_won %in% c(0,1) ~ 2,
core$sets_won >= 2 ~ 0,
TRUE ~ NA_real_
)
# typy/ranky/filtre
if (!is.numeric(core$rank)) core <- core %>% mutate(rank = readr::parse_number(as.character(rank)))
core <- core %>% filter(!is.na(player), !is.na(rank)) %>% arrange(player, date) %>% mutate(is_r1 = rank == 1)
# info
list(
n_rows = nrow(core), n_players = n_distinct(core$player),
vars = list(player = guess_player, rank = guess_rank, date = guess_date)
)
## $n_rows
## [1] 111941
##
## $n_players
## [1] 1999
##
## $vars
## $vars$player
## [1] "player"
##
## $vars$rank
## [1] "rank"
##
## $vars$date
## [1] NA
players_total <- n_distinct(core$player)
players_r1 <- core %>% filter(is_r1) %>% distinct(player) %>% nrow()
share_r1 <- players_r1 / players_total
p1 <- tibble(metric = c("hráči s rank1", "ostatní"),
value = c(players_r1, players_total - players_r1)) %>%
ggplot(aes(x = metric, y = value)) +
geom_col() +
geom_text(aes(label = value), vjust = -0.3) +
labs(title = "Počet hráčov, ktorí niekedy dosiahli rank 1",
x = NULL, y = "Počet hráčov") +
theme_minimal()
print(p1)
Interpretácia: Stĺpec „hráči s rank1“ ukazuje, koľko hráčov sa aspoň raz dostalo na prvé miesto rebríčku. Druhý stĺpec zobrazuje všetkých ostatných.
p2 <- tibble(group = c("Rank 1 niekedy", "Nikdy"), share = c(share_r1, 1 - share_r1)) %>%
ggplot(aes(x = group, y = share)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = scales::percent(share, accuracy = 0.1)), vjust = -0.3) +
labs(title = "Podiel hráčov, ktorí niekedy dosiahli rank 1",
x = NULL, y = "% hráčov") +
theme_minimal()
print(p2)
Interpretácia: Výška stĺpca vyjadruje, koľko % všetkých hráčov získalo rank 1 aspoň raz.
r1_sets <- core %>% filter(is_r1) %>%
group_by(player) %>%
summarise(sets_won = sum(sets_won, na.rm = TRUE),
sets_lost = sum(sets_lost, na.rm = TRUE), .groups = "drop")
long_r1 <- r1_sets %>% pivot_longer(c(sets_won, sets_lost), names_to = "type", values_to = "sets")
p3 <- long_r1 %>%
group_by(type) %>% summarise(total = sum(sets), .groups = "drop") %>%
ggplot(aes(x = type, y = total)) +
geom_col() +
geom_text(aes(label = total), vjust = -0.3) +
labs(title = "Súčet vyhratých a prehratých setov počas ranku 1 (všetci hráči)",
subtitle = "Pravidlo: swon 0/1 ⇒ prehrané 2 sety; swon ≥ 2 ⇒ prehrané 0",
x = NULL, y = "Počet setov") +
theme_minimal()
print(p3)
Interpretácia: Vidíme celkový „objem“ setov, ktoré hráči odohrali v tom období, keď mali rank 1, rozdelené na výhry a prehry.
r1_leaderboard <- r1_sets %>% mutate(net_sets = sets_won - sets_lost,
win_rate = sets_won/pmax(1, (sets_won + sets_lost)))
p4 <- r1_leaderboard %>%
slice_max(net_sets, n = 10, with_ties = FALSE) %>%
mutate(player = forcats::fct_reorder(player, net_sets)) %>%
ggplot(aes(x = player, y = net_sets)) +
geom_col() + coord_flip() +
labs(title = "Top 10 hráčov podľa čistého skóre setov (rank 1)",
subtitle = "net_sets = sets_won − sets_lost",
x = NULL, y = "Čisté sety") +
theme_minimal()
print(p4)
Interpretácia: Rebríček zvýrazňuje, kto mal počas ranku 1 najvýraznejšiu dominanciu v setoch (čisté skóre).
need <- c("haven", "dplyr", "lmtest", "sandwich", "broom")
to_install <- setdiff(need, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install, dependencies = TRUE)
library(haven); library(dplyr); library(lmtest); library(sandwich); library(broom)
df <- haven::read_dta("all.dta") %>% as.data.frame()
names(df)[1:30]
## [1] "player" "rank" "s1" "s2"
## [5] "s3" "swon" "b365" "gender"
## [9] "tylocation" "tytournament" "tyseries" "tycourt"
## [13] "tysurface" "tyround" "year" "month"
## [17] "day" "rankdiff" "s1diff" "s2diff"
## [21] "s3diff" "swondiff" "b365diff" "s1w"
## [25] "s2w" "s3w" "rankindex" "rankop"
## [29] "rankindexop" "rankindexdiff"
dim(df)
## [1] 112138 30
# Ľahký wrangling (kategórie ako faktory)
df <- df %>% mutate(
gender = factor(gender),
tysurface = factor(tysurface),
year = as.integer(year)
)
# Zvolené premenné (ak niektorá chýba, automaticky sa vynechá)
vars_needed <- c("b365","rankdiff","s1","s2","s3","swon","gender","tysurface","year")
vars_needed <- intersect(vars_needed, names(df))
df_model <- df %>% dplyr::select(all_of(vars_needed)) %>% na.omit()
dim(df_model); head(df_model)
## [1] 38582 9
# Dynamicky poskladaná formula
rhs <- c()
for (v in c("rankdiff","s1","s2","s3","swon","gender","tysurface","year")) {
if (v %in% names(df_model)) rhs <- c(rhs, v)
}
stopifnot("b365" %in% names(df_model))
form <- as.formula(paste("b365 ~", paste(rhs, collapse = " + ")))
form
## b365 ~ rankdiff + s1 + s2 + s3 + swon + gender + tysurface +
## year
# OLS
m_ols <- lm(form, data = df_model)
summary(m_ols)
##
## Call:
## lm(formula = form, data = df_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3216 -0.6884 -0.3176 0.2827 25.9022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.860e+01 3.368e+00 8.490 < 2e-16 ***
## rankdiff 5.320e-03 6.313e-05 84.268 < 2e-16 ***
## s1 -5.370e-02 4.790e-03 -11.209 < 2e-16 ***
## s2 -4.656e-02 4.726e-03 -9.851 < 2e-16 ***
## s3 -6.735e-02 5.470e-03 -12.312 < 2e-16 ***
## swon -2.908e-01 2.069e-02 -14.053 < 2e-16 ***
## gender1 1.206e-01 1.419e-02 8.498 < 2e-16 ***
## tysurface2 -4.251e-02 7.190e-02 -0.591 0.554
## tysurface3 -6.302e-02 7.516e-02 -0.838 0.402
## tysurface4 -4.067e-01 5.633e-01 -0.722 0.470
## tysurface5 -1.078e-02 7.145e-02 -0.151 0.880
## year -1.244e-02 1.678e-03 -7.415 1.24e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.369 on 38570 degrees of freedom
## Multiple R-squared: 0.2042, Adjusted R-squared: 0.204
## F-statistic: 899.9 on 11 and 38570 DF, p-value: < 2.2e-16
# Breusch–Pagan
bp <- lmtest::bptest(m_ols)
bp
##
## studentized Breusch-Pagan test
##
## data: m_ols
## BP = 625.62, df = 11, p-value < 2.2e-16
# White
make_white_rhs <- function(vars) {
keep <- intersect(vars, names(df_model))
if (length(keep) == 0) return(~ 1)
terms <- unlist(lapply(keep, function(v) c(v, paste0("I(", v, "^2)"))))
as.formula(paste("~", paste(terms, collapse = " + ")))
}
white_rhs <- make_white_rhs(c("rankdiff","s1","s2","s3","swon"))
white <- lmtest::bptest(m_ols, varformula = white_rhs, studentize = TRUE, data = df_model)
white
##
## studentized Breusch-Pagan test
##
## data: m_ols
## BP = 1073.8, df = 10, p-value < 2.2e-16
aux_form <- as.formula(paste("log(residuals(m_ols)^2) ~", paste(rhs, collapse = " + ")))
aux <- lm(aux_form, data = df_model)
sigma_hat2 <- exp(fitted(aux)) # odhad var(e|X)
w <- 1 / sigma_hat2 # WLS váhy (menšia váha pri väčšej variancii)
# WLS/FGLS model
m_wls <- lm(form, data = df_model, weights = w)
summary(m_wls)
##
## Call:
## lm(formula = form, data = df_model, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.874 -1.464 -0.688 0.738 45.040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.586e+01 2.770e+00 5.725 1.04e-08 ***
## rankdiff 1.824e-03 3.117e-05 58.517 < 2e-16 ***
## s1 -3.408e-02 4.037e-03 -8.442 < 2e-16 ***
## s2 -3.465e-02 3.974e-03 -8.719 < 2e-16 ***
## s3 -4.019e-02 5.564e-03 -7.224 5.15e-13 ***
## swon -3.380e-01 1.834e-02 -18.428 < 2e-16 ***
## gender1 8.945e-02 1.153e-02 7.757 8.89e-15 ***
## tysurface2 -1.138e-03 7.308e-02 -0.016 0.988
## tysurface3 -7.731e-03 7.486e-02 -0.103 0.918
## tysurface4 2.939e-03 5.143e-01 0.006 0.995
## tysurface5 1.186e-02 7.284e-02 0.163 0.871
## year -6.270e-03 1.379e-03 -4.545 5.50e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.502 on 38570 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1241
## F-statistic: 497.9 on 11 and 38570 DF, p-value: < 2.2e-16
Na porovnanie použijeme Residuals vs Fitted — priamo ukazuje, či sa rozptyl mení s predikovanou hodnotou (lievik = heteroskedasticita).
par(mfrow=c(1,1))
plot(m_ols, which = 1)
Vysvetlivka (Graf 1 – OLS):
- Čo skúma: vzťah medzi predikovanými hodnotami (X) a reziduami (Y).
- Čo hľadám: náhodný „mrak“ okolo 0 bez trendu. Ak sa rozptyl mení s X (tvar lievika), ide o heteroskedasticitu.
- Dôsledok: OLS koeficienty sú stále nestranné, ale štandardné chyby sú skreslené
par(mfrow=c(1,1))
plot(m_wls, which = 1)
Vysvetlivka (Graf 2 – WLS/FGLS):
- Čo skúma: to isté čo Graf 1, ale po aplikácii váženia podľa odhadnutej variability chýb.
- Čo hľadám: rozptyl bodov by mal byť stabilnejší naprieč fitted hodnotami (menej „lievikovitý“).
- Dôsledok: Po korekcii sú odhady efektívnejšie a intervaly/spoľahlivostné testy sú spoľahlivejšie.
data.frame(
Test = c("Breusch–Pagan", "White"),
p_value = signif(c(bp$p.value, white$p.value), 4),
stringsAsFactors = FALSE
)
library(haven)
library(dplyr)
library(ggplot2)
library(lmtest)
library(forecast)
library(sandwich)
library(nlme)
dat <- read_dta("all.dta")
dat <- dat %>%
mutate(date = as.Date(paste(year, month, day, sep = "-"))) %>%
filter(!is.na(date))
Vyberieme hráča s najvyšším počtom pozorovaní, aby sme mali dostatočne dlhý časový rad.
player_pick <- dat %>% count(player, sort = TRUE) %>% slice(1) %>% pull(player)
d <- dat %>%
filter(player == player_pick) %>%
arrange(date) %>%
filter(!is.na(rankindex), !is.na(b365), !is.na(rankindexdiff)) %>%
droplevels()
# DÔLEŽITÉ: AR(1) v nlme vyžaduje unikátne hodnoty "času".
# Keď má hráč viac zápasov v rovnaký deň, date nie je unikátny.
# Preto vytvoríme jednoduchý časový index 1..T:
d <- d %>% mutate(t = row_number())
cat("Použitý hráč:", player_pick, "\n")
## Použitý hráč: Djokovic N.
cat("Počet pozorovaní:", nrow(d), "\n")
## Počet pozorovaní: 675
cat("Rozsah dátumov:", format(min(d$date)), "–", format(max(d$date)), "\n")
## Rozsah dátumov: 2007-01-02 – 2021-11-20
Použijeme jednoduchý model:
\[ rankindex_t = \beta_0 + \beta_1 b365_t + \beta_2 rankindexdiff_t + u_t \]
m <- lm(rankindex ~ b365 + rankindexdiff, data = d)
summary(m)
##
## Call:
## lm(formula = rankindex ~ b365 + rankindexdiff, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1134 -0.5423 0.1437 0.6627 1.6056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.01219 0.14410 41.723 <2e-16 ***
## b365 0.09303 0.06475 1.437 0.151
## rankindexdiff 0.26588 0.02218 11.986 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.963 on 672 degrees of freedom
## Multiple R-squared: 0.2141, Adjusted R-squared: 0.2118
## F-statistic: 91.53 on 2 and 672 DF, p-value: < 2.2e-16
d <- d %>% mutate(e = resid(m))
ggplot(d, aes(x = date, y = e)) +
geom_line() +
labs(
title = "Rezíduá OLS v čase",
x = "Dátum", y = "rezíduá"
)
d_lag <- d %>%
mutate(e_lag1 = dplyr::lag(e, 1)) %>%
filter(!is.na(e_lag1))
ggplot(d_lag, aes(x = e_lag1, y = e)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Lag-plot rezíduí: e(t) vs e(t-1)",
x = "e(t-1)", y = "e(t)"
)
## `geom_smooth()` using formula = 'y ~ x'
(Voliteľne) pomocná regresia:
summary(lm(e ~ e_lag1, data = d_lag))
##
## Call:
## lm(formula = e ~ e_lag1, data = d_lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10462 -0.32983 0.05773 0.37198 1.66452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.005759 0.021145 0.272 0.785
## e_lag1 0.815752 0.022026 37.035 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5489 on 672 degrees of freedom
## Multiple R-squared: 0.6712, Adjusted R-squared: 0.6707
## F-statistic: 1372 on 1 and 672 DF, p-value: < 2.2e-16
Acf(resid(m), main = "ACF rezíduí (OLS)")
dwtest(m)
##
## Durbin-Watson test
##
## data: m
## DW = 0.35882, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
bgtest(m, order = 1)
##
## Breusch-Godfrey test for serial correlation of order up to 1
##
## data: m
## LM test = 499.02, df = 1, p-value < 2.2e-16
bgtest(m, order = 2)
##
## Breusch-Godfrey test for serial correlation of order up to 2
##
## data: m
## LM test = 542.46, df = 2, p-value < 2.2e-16
bgtest(m, order = 3)
##
## Breusch-Godfrey test for serial correlation of order up to 3
##
## data: m
## LM test = 565.28, df = 3, p-value < 2.2e-16
bgtest(m, order = 4)
##
## Breusch-Godfrey test for serial correlation of order up to 4
##
## data: m
## LM test = 576.46, df = 4, p-value < 2.2e-16
req <- c("haven","dplyr","ggplot2","car","MASS")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)
library(haven)
library(dplyr)
library(ggplot2)
library(car)
library(MASS)
all.dtapaths <- c("all.dta", file.path(getwd(), "all.dta"))
path <- paths[file.exists(paths)][1]
stopifnot(!is.na(path), file.exists(path))
df <- read_dta(path) |> as.data.frame()
# základný sanity check
cat("Rows:", nrow(df), " Cols:", ncol(df), "\n")
## Rows: 112138 Cols: 30
str(df[1:min(10,ncol(df))])
## 'data.frame': 112138 obs. of 10 variables:
## $ player : chr "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
## ..- attr(*, "format.stata")= chr "%24s"
## $ rank : num 37 6 37 59 74 25 196 47 396 29 ...
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s1 : num 6 6 6 6 6 7 3 6 7 6 ...
## ..- attr(*, "label")= chr "number of games won in 1st set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s2 : num 6 3 2 6 6 6 6 6 4 6 ...
## ..- attr(*, "label")= chr "number of games won in 2nd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s3 : num NA NA NA 3 NA NA 3 NA 4 NA ...
## ..- attr(*, "label")= chr "number of games won in 3rd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ swon : num 2 0 0 1 0 2 1 2 1 2 ...
## ..- attr(*, "label")= chr "number of sets won"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ b365 : num 1.5 1.53 1.53 2.62 1.83 ...
## ..- attr(*, "label")= chr "bet365 odds"
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ gender : num 1 1 1 0 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "male=1, female=0"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ tylocation : num 149 100 101 100 123 123 98 151 172 140 ...
## ..- attr(*, "label")= chr "type of location"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tytournament: num 279 257 229 195 34 34 220 285 235 266 ...
## ..- attr(*, "label")= chr "type of tournament"
## ..- attr(*, "format.stata")= chr "%12.0g"
num <- df |> dplyr::select(where(is.numeric))
# vyhodiť konštanty / nulový rozptyl
nzv <- sapply(num, function(x) isTRUE(sd(x, na.rm=TRUE) > 0))
num <- num[, nzv, drop = FALSE]
num <- na.omit(num)
cat("Po čistení: Rows:", nrow(num), " Cols:", ncol(num), "\n")
## Po čistení: Rows: 38582 Cols: 29
cor_mat <- cor(num)
# prehľad top korelácií (absolútne)
cm <- cor_mat
diag(cm) <- NA
top_pairs <- as.data.frame(as.table(cm)) |>
dplyr::mutate(abs_cor = abs(Freq)) |>
dplyr::filter(!is.na(abs_cor)) |>
dplyr::arrange(desc(abs_cor)) |>
dplyr::slice(1:20)
top_pairs
# jednoduchý heatmap bez ďalších balíkov
ord <- order(colnames(cor_mat))
image(
1:ncol(cor_mat), 1:ncol(cor_mat),
cor_mat[ord, ord],
axes = FALSE, xlab = "", ylab = ""
)
axis(1, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
axis(2, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
title("Korelačná matica (numerické premenné)")
Keďže v dátach sa často nachádzajú konštrukčné identity (napr. rankdiff = rank - rankop),
OLS môže mať presnú kolinearitu.
Preto spravíme:
1. zvolíme cieľovú premennú y automaticky ako prvý numerický stĺpec (ak chceš inú, zmeň to),
2. odstránime presne lineárne závislé prediktory pomocou QR (alias()/rank),
3. potom vypočítame VIF a condition number.
y_name <- names(num)[1] # <-- zmeň si sem cieľovú premennú ak treba
x_names <- setdiff(names(num), y_name)
y <- num[[y_name]]
X <- num[, x_names, drop = FALSE]
# odstránenie perfektnej multikolinearity (závislé stĺpce)
mm <- model.matrix(~ . , data = X) # pridá intercept
qr_mm <- qr(mm)
rank_mm <- qr_mm$rank
p_mm <- ncol(mm)
if(rank_mm < p_mm){
indep_idx <- qr_mm$pivot[1:rank_mm]
mm2 <- mm[, indep_idx, drop = FALSE]
# späť na dátový rámec bez interceptu
keep_cols <- colnames(mm2)
keep_cols <- setdiff(keep_cols, "(Intercept)")
X2 <- as.data.frame(mm2[, keep_cols, drop = FALSE])
cat("Odstránil som", ncol(X) - ncol(X2), "perfektne závislých prediktorov.\n")
} else {
X2 <- X
cat("Perfektná kolinearita nebola zistená.\n")
}
## Odstránil som 1 perfektne závislých prediktorov.
dat2 <- cbind.data.frame(y = y, X2)
names(dat2)[1] <- y_name
f <- as.formula(paste(y_name, "~", paste(names(X2), collapse = " + ")))
ols <- lm(f, data = dat2)
summary(ols)
##
## Call:
## lm(formula = f, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.458e-11 -2.300e-14 -5.000e-15 1.200e-14 1.023e-10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.131e-13 1.689e-12 -6.700e-02 0.946606
## s1 -4.314e-14 3.412e-15 -1.264e+01 < 2e-16 ***
## s2 -2.755e-15 3.376e-15 -8.160e-01 0.414420
## s3 3.265e-14 3.146e-15 1.038e+01 < 2e-16 ***
## swon -4.524e-15 2.938e-13 -1.500e-02 0.987713
## b365 1.674e-13 4.456e-15 3.757e+01 < 2e-16 ***
## gender 8.085e-14 7.982e-15 1.013e+01 < 2e-16 ***
## tylocation -2.455e-16 7.205e-17 -3.408e+00 0.000656 ***
## tytournament -3.363e-17 3.514e-17 -9.570e-01 0.338571
## tyseries 1.454e-14 7.681e-16 1.893e+01 < 2e-16 ***
## tycourt -4.671e-14 9.442e-15 -4.947e+00 7.58e-07 ***
## tysurface -2.182e-15 2.502e-15 -8.720e-01 0.383068
## tyround -3.155e-14 1.858e-15 -1.698e+01 < 2e-16 ***
## year 1.040e-15 8.012e-16 1.298e+00 0.194406
## month 4.831e-15 1.187e-15 4.071e+00 4.69e-05 ***
## day 5.416e-16 4.098e-16 1.322e+00 0.186253
## rankdiff 1.000e+00 5.791e-17 1.727e+16 < 2e-16 ***
## s1diff 1.155e-15 3.136e-15 3.680e-01 0.712781
## s2diff 1.301e-14 3.106e-15 4.189e+00 2.81e-05 ***
## s3diff -4.414e-15 2.984e-15 -1.479e+00 0.139113
## swondiff -1.090e-13 2.076e-13 -5.250e-01 0.599628
## b365diff -1.600e-13 2.995e-15 -5.341e+01 < 2e-16 ***
## s1w -1.131e-13 2.078e-13 -5.440e-01 0.586401
## s2w -1.653e-13 2.077e-13 -7.960e-01 0.426262
## s3w 4.666e-14 2.939e-13 1.590e-01 0.873861
## rankindex -3.905e-13 4.039e-15 -9.667e+01 < 2e-16 ***
## rankop 1.000e+00 8.264e-17 1.210e+16 < 2e-16 ***
## rankindexop -3.680e-16 4.039e-15 -9.100e-02 0.927408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.556e-13 on 38554 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.432e+31 on 27 and 38554 DF, p-value: < 2.2e-16
v <- car::vif(ols)
v <- sort(v, decreasing = TRUE)
head(v, 30)
## swondiff swon s3w s1w s2w s3diff
## 3873.433541 1941.018847 1938.255665 969.267107 968.358139 8.093307
## s2diff s1diff b365diff rankop b365 rankdiff
## 7.808480 7.692521 5.486488 4.484770 4.195253 3.755672
## rankindexop rankindex s2 s1 s3 tyseries
## 3.571906 3.571855 3.318751 3.294786 3.178247 1.449856
## gender tyround tycourt tylocation tysurface tytournament
## 1.419780 1.264489 1.186184 1.149751 1.124088 1.117263
## month day year
## 1.116594 1.026043 1.018728
Xmm <- model.matrix(ols)[, -1, drop = FALSE] # bez interceptu
kappa_val <- kappa(Xmm, exact = TRUE)
kappa_val
## [1] 212047
# načítanie dát zo Stata súboru
tenis <- haven::read_dta("all.dta")
# rýchla kontrola
dim(tenis)
## [1] 112138 30
names(tenis)
## [1] "player" "rank" "s1" "s2"
## [5] "s3" "swon" "b365" "gender"
## [9] "tylocation" "tytournament" "tyseries" "tycourt"
## [13] "tysurface" "tyround" "year" "month"
## [17] "day" "rankdiff" "s1diff" "s2diff"
## [21] "s3diff" "swondiff" "b365diff" "s1w"
## [25] "s2w" "s3w" "rankindex" "rankop"
## [29] "rankindexop" "rankindexdiff"
head(tenis)
# binárna vysvetľovaná premenná: výhra hráča 1
# predpoklad: swondiff > 0 znamená, že hráč 1 vyhral zápas
tenis <- tenis %>%
mutate(
win = ifelse(swondiff > 0, 1, 0),
win = as.numeric(win),
gender = factor(gender, labels = c("Zena", "Muz")),
tysurface = factor(tysurface),
tyround = factor(tyround)
)
table(tenis$win)
##
## 0 1
## 56069 56069
# dataset pre model – vyhodíme riadky s NA v relevantných premenných
data_model <- tenis %>%
dplyr::select(win, rankindexdiff, b365diff, gender, tysurface, tyround) %>%
tidyr::drop_na()
dim(tenis) # pôvodný počet riadkov
## [1] 112138 31
dim(data_model) # počet riadkov použitých v modeloch
## [1] 111744 6
str(data_model)
## tibble [111,744 × 6] (S3: tbl_df/tbl/data.frame)
## $ win : num [1:111744] 1 0 0 0 0 1 0 1 0 1 ...
## $ rankindexdiff: num [1:111744] 0.345 2.807 1.038 0.287 -0.566 ...
## ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ b365diff : num [1:111744] -1.12 -0.84 -0.84 1.18 0 ...
## ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ gender : Factor w/ 2 levels "Zena","Muz": 2 2 2 1 2 2 2 2 2 2 ...
## $ tysurface : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 1 1 5 ...
## $ tyround : Factor w/ 9 levels "1","2","3","4",..: 2 5 2 2 2 1 2 2 1 2 ...
# lineárny pravdepodobnostný model
lpm <- lm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model
)
summary(lpm)
##
## Call:
## lm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, data = data_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0455 -0.4422 0.0000 0.4422 2.0455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.000e-01 1.380e-02 36.22 <2e-16 ***
## rankindexdiff 2.721e-02 1.092e-03 24.92 <2e-16 ***
## b365diff -4.654e-02 6.640e-04 -70.10 <2e-16 ***
## genderMuz -9.723e-15 2.758e-03 0.00 1
## tysurface2 -1.606e-13 1.382e-02 0.00 1
## tysurface3 -1.592e-13 1.447e-02 0.00 1
## tysurface4 -1.652e-13 6.513e-02 0.00 1
## tysurface5 -1.657e-13 1.372e-02 0.00 1
## tyround2 -8.770e-15 3.299e-03 0.00 1
## tyround3 1.140e-15 6.189e-03 0.00 1
## tyround4 3.245e-15 1.581e-02 0.00 1
## tyround5 -5.509e-16 4.622e-03 0.00 1
## tyround6 3.374e-15 1.403e-02 0.00 1
## tyround7 -2.889e-16 6.147e-03 0.00 1
## tyround8 3.321e-16 8.354e-03 0.00 1
## tyround9 1.083e-14 2.296e-01 0.00 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4591 on 111728 degrees of freedom
## Multiple R-squared: 0.157, Adjusted R-squared: 0.1569
## F-statistic: 1387 on 15 and 111728 DF, p-value: < 2.2e-16
# test heteroskedasticity
bptest(lpm)
##
## studentized Breusch-Pagan test
##
## data: lpm
## BP = 218, df = 15, p-value < 2.2e-16
# robustné (White/HC) štandardné chyby
vcov_lpm_robust <- sandwich::vcovHC(lpm, type = "HC1")
lpm_robust <- lmtest::coeftest(lpm, vcov. = vcov_lpm_robust)
lpm_robust
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0000e-01 1.3638e-02 36.663 <2e-16 ***
## rankindexdiff 2.7207e-02 1.2054e-03 22.570 <2e-16 ***
## b365diff -4.6545e-02 8.1881e-04 -56.844 <2e-16 ***
## genderMuz -9.7233e-15 2.7582e-03 0.000 1
## tysurface2 -1.6060e-13 1.3655e-02 0.000 1
## tysurface3 -1.5924e-13 1.4345e-02 0.000 1
## tysurface4 -1.6517e-13 6.4979e-02 0.000 1
## tysurface5 -1.6570e-13 1.3554e-02 0.000 1
## tyround2 -8.7700e-15 3.2761e-03 0.000 1
## tyround3 1.1399e-15 6.1574e-03 0.000 1
## tyround4 3.2452e-15 1.4954e-02 0.000 1
## tyround5 -5.5089e-16 4.6409e-03 0.000 1
## tyround6 3.3743e-15 1.4166e-02 0.000 1
## tyround7 -2.8886e-16 6.2336e-03 0.000 1
## tyround8 3.3213e-16 8.4209e-03 0.000 1
## tyround9 1.0825e-14 2.5838e-01 0.000 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# predikované pravdepodobnosti z LPM
data_model$phat_lpm <- fitted(lpm)
summary(data_model$phat_lpm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0455 0.4047 0.5000 0.5000 0.5953 3.0455
# logit model
logit_mod <- glm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model,
family = binomial(link = "logit")
)
summary(logit_mod)
##
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, family = binomial(link = "logit"), data = data_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.260e-15 6.870e-02 0.000 1.000
## rankindexdiff -8.541e-03 5.807e-03 -1.471 0.141
## b365diff -4.205e-01 5.072e-03 -82.912 <2e-16 ***
## genderMuz -1.356e-16 1.328e-02 0.000 1.000
## tysurface2 5.486e-15 6.874e-02 0.000 1.000
## tysurface3 5.412e-15 7.171e-02 0.000 1.000
## tysurface4 1.080e-14 3.243e-01 0.000 1.000
## tysurface5 5.388e-15 6.829e-02 0.000 1.000
## tyround2 -1.366e-16 1.601e-02 0.000 1.000
## tyround3 -1.481e-16 3.068e-02 0.000 1.000
## tyround4 5.645e-16 8.024e-02 0.000 1.000
## tyround5 -1.132e-16 2.211e-02 0.000 1.000
## tyround6 -7.832e-16 6.671e-02 0.000 1.000
## tyround7 3.868e-17 2.924e-02 0.000 1.000
## tyround8 -2.613e-16 3.966e-02 0.000 1.000
## tyround9 3.047e-14 1.002e+00 0.000 1.000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 154910 on 111743 degrees of freedom
## Residual deviance: 131840 on 111728 degrees of freedom
## AIC: 131872
##
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden)
logLik_null <- logLik(glm(win ~ 1, data = data_model, family = binomial(link = "logit")))
logLik_full <- logLik(logit_mod)
pseudoR2_logit <- 1 - as.numeric(logLik_full / logLik_null)
pseudoR2_logit
## [1] 0.1489228
# predikované pravdepodobnosti z logitu
data_model$phat_logit <- fitted(logit_mod)
summary(data_model$phat_logit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3638 0.5000 0.5000 0.6362 1.0000
# jednoduchý graf: pravdepodobnosť výhry a rankindexdiff
ggplot(data_model, aes(x = rankindexdiff, y = phat_logit)) +
geom_point(alpha = 0.2) +
geom_smooth(se = FALSE) +
labs(
x = "rankindexdiff",
y = "P( win = 1 ) – logit",
title = "Predikované pravdepodobnosti z logit modelu"
)
# probit model
probit_mod <- glm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model,
family = binomial(link = "probit")
)
summary(probit_mod)
##
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, family = binomial(link = "probit"), data = data_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.878e-14 4.114e-02 0.000 1.0000
## rankindexdiff 7.128e-03 3.483e-03 2.047 0.0407 *
## b365diff -2.317e-01 2.871e-03 -80.702 <2e-16 ***
## genderMuz 2.195e-14 8.029e-03 0.000 1.0000
## tysurface2 4.372e-14 4.116e-02 0.000 1.0000
## tysurface3 4.266e-15 4.299e-02 0.000 1.0000
## tysurface4 1.968e-14 1.940e-01 0.000 1.0000
## tysurface5 5.593e-15 4.089e-02 0.000 1.0000
## tyround2 4.514e-14 9.660e-03 0.000 1.0000
## tyround3 9.323e-16 1.845e-02 0.000 1.0000
## tyround4 1.219e-14 4.800e-02 0.000 1.0000
## tyround5 -8.008e-16 1.339e-02 0.000 1.0000
## tyround6 8.777e-15 4.044e-02 0.000 1.0000
## tyround7 -3.018e-16 1.774e-02 0.000 1.0000
## tyround8 1.457e-16 2.403e-02 0.000 1.0000
## tyround9 1.936e-14 6.272e-01 0.000 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 154910 on 111743 degrees of freedom
## Residual deviance: 132406 on 111728 degrees of freedom
## AIC: 132438
##
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden) pre probit
logLik_full_probit <- logLik(probit_mod)
pseudoR2_probit <- 1 - as.numeric(logLik_full_probit / logLik_null)
pseudoR2_probit
## [1] 0.1452715
# predikované pravdepodobnosti z probitu
data_model$phat_probit <- fitted(probit_mod)
summary(data_model$phat_probit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3745 0.5000 0.5000 0.6255 1.0000
# pomocná funkcia na klasifikačnú tabuľku a percento správnych predikcií
class_table <- function(y, p, cut = 0.5) {
yhat <- ifelse(p >= cut, 1, 0)
tab <- table(skutocnost = y, predikcia = yhat)
acc <- mean(yhat == y)
list(tab = tab, acc = acc)
}
# LPM (pravdepodobnosti orezané na [0,1])
phat_lpm_trim <- pmin(pmax(data_model$phat_lpm, 0), 1)
ct_lpm <- class_table(data_model$win, phat_lpm_trim, cut = 0.5)
# LOGIT
ct_logit <- class_table(data_model$win, data_model$phat_logit, cut = 0.5)
# PROBIT
ct_probit <- class_table(data_model$win, data_model$phat_probit, cut = 0.5)
ct_lpm$tab
## predikcia
## skutocnost 0 1
## 0 37889 17983
## 1 17983 37889
ct_lpm$acc
## [1] 0.6781393
ct_logit$tab
## predikcia
## skutocnost 0 1
## 0 38271 17601
## 1 17601 38271
ct_logit$acc
## [1] 0.6849764
ct_probit$tab
## predikcia
## skutocnost 0 1
## 0 38240 17632
## 1 17632 38240
ct_probit$acc
## [1] 0.6844215