1. Príprava dát a definícia binárnej premennej
# načítanie dát zo Stata súboru
tenis <- haven::read_dta("all.dta")
# rýchla kontrola
dim(tenis)
## [1] 112138 30
names(tenis)
## [1] "player" "rank" "s1" "s2"
## [5] "s3" "swon" "b365" "gender"
## [9] "tylocation" "tytournament" "tyseries" "tycourt"
## [13] "tysurface" "tyround" "year" "month"
## [17] "day" "rankdiff" "s1diff" "s2diff"
## [21] "s3diff" "swondiff" "b365diff" "s1w"
## [25] "s2w" "s3w" "rankindex" "rankop"
## [29] "rankindexop" "rankindexdiff"
head(tenis)
## # A tibble: 6 × 30
## player rank s1 s2 s3 swon b365 gender tylocation tytournament
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Krajinovic… 37 6 6 NA 2 1.5 1 149 279
## 2 Roddick A. 6 6 3 NA 0 1.53 1 100 257
## 3 Simon G. 37 6 2 NA 0 1.53 1 101 229
## 4 Parmentier… 59 6 6 3 1 2.62 0 100 195
## 5 Seppi A. 74 6 6 NA 0 1.83 1 123 34
## 6 Ljubicic I. 25 7 6 NA 2 1.14 1 123 34
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## # tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## # s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## # s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## # rankindexop <dbl>, rankindexdiff <dbl>
# binárna vysvetľovaná premenná: výhra hráča 1
# predpoklad: swondiff > 0 znamená, že hráč 1 vyhral zápas
tenis <- tenis %>%
mutate(
win = ifelse(swondiff > 0, 1, 0),
win = as.numeric(win),
gender = factor(gender, labels = c("Zena", "Muz")),
tysurface = factor(tysurface),
tyround = factor(tyround)
)
table(tenis$win)
##
## 0 1
## 56069 56069
# dataset pre model – vyhodíme riadky s NA v relevantných premenných
data_model <- tenis %>%
select(win, rankindexdiff, b365diff, gender, tysurface, tyround) %>%
tidyr::drop_na()
dim(tenis) # pôvodný počet riadkov
## [1] 112138 31
dim(data_model) # počet riadkov použitých v modeloch
## [1] 111744 6
str(data_model)
## tibble [111,744 × 6] (S3: tbl_df/tbl/data.frame)
## $ win : num [1:111744] 1 0 0 0 0 1 0 1 0 1 ...
## $ rankindexdiff: num [1:111744] 0.345 2.807 1.038 0.287 -0.566 ...
## ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ b365diff : num [1:111744] -1.12 -0.84 -0.84 1.18 0 ...
## ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ gender : Factor w/ 2 levels "Zena","Muz": 2 2 2 1 2 2 2 2 2 2 ...
## $ tysurface : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 1 1 5 ...
## $ tyround : Factor w/ 9 levels "1","2","3","4",..: 2 5 2 2 2 1 2 2 1 2 ...
2. Lineárny model (LPM)
# lineárny pravdepodobnostný model
lpm <- lm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model
)
summary(lpm)
##
## Call:
## lm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, data = data_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0455 -0.4422 0.0000 0.4422 2.0455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.000e-01 1.380e-02 36.22 <2e-16 ***
## rankindexdiff 2.721e-02 1.092e-03 24.92 <2e-16 ***
## b365diff -4.654e-02 6.640e-04 -70.10 <2e-16 ***
## genderMuz 1.984e-16 2.758e-03 0.00 1
## tysurface2 3.962e-15 1.382e-02 0.00 1
## tysurface3 3.950e-15 1.447e-02 0.00 1
## tysurface4 4.010e-15 6.513e-02 0.00 1
## tysurface5 4.055e-15 1.372e-02 0.00 1
## tyround2 2.247e-16 3.299e-03 0.00 1
## tyround3 5.728e-17 6.189e-03 0.00 1
## tyround4 -6.942e-17 1.581e-02 0.00 1
## tyround5 5.164e-17 4.622e-03 0.00 1
## tyround6 2.742e-16 1.403e-02 0.00 1
## tyround7 3.424e-17 6.147e-03 0.00 1
## tyround8 3.013e-17 8.354e-03 0.00 1
## tyround9 -2.221e-15 2.296e-01 0.00 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4591 on 111728 degrees of freedom
## Multiple R-squared: 0.157, Adjusted R-squared: 0.1569
## F-statistic: 1387 on 15 and 111728 DF, p-value: < 2.2e-16
# test heteroskedasticity
bptest(lpm)
##
## studentized Breusch-Pagan test
##
## data: lpm
## BP = 218, df = 15, p-value < 2.2e-16
# robustné (White/HC) štandardné chyby
vcov_lpm_robust <- sandwich::vcovHC(lpm, type = "HC1")
lpm_robust <- lmtest::coeftest(lpm, vcov. = vcov_lpm_robust)
lpm_robust
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0000e-01 1.3638e-02 36.663 <2e-16 ***
## rankindexdiff 2.7207e-02 1.2054e-03 22.570 <2e-16 ***
## b365diff -4.6545e-02 8.1881e-04 -56.844 <2e-16 ***
## genderMuz 1.9835e-16 2.7582e-03 0.000 1
## tysurface2 3.9619e-15 1.3655e-02 0.000 1
## tysurface3 3.9503e-15 1.4345e-02 0.000 1
## tysurface4 4.0100e-15 6.4979e-02 0.000 1
## tysurface5 4.0546e-15 1.3554e-02 0.000 1
## tyround2 2.2467e-16 3.2761e-03 0.000 1
## tyround3 5.7285e-17 6.1574e-03 0.000 1
## tyround4 -6.9423e-17 1.4954e-02 0.000 1
## tyround5 5.1637e-17 4.6409e-03 0.000 1
## tyround6 2.7422e-16 1.4166e-02 0.000 1
## tyround7 3.4245e-17 6.2336e-03 0.000 1
## tyround8 3.0128e-17 8.4209e-03 0.000 1
## tyround9 -2.2206e-15 2.5838e-01 0.000 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# predikované pravdepodobnosti z LPM
data_model$phat_lpm <- fitted(lpm)
summary(data_model$phat_lpm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0455 0.4047 0.5000 0.5000 0.5953 3.0455
3. Nelineárny model – logit
# logit model
logit_mod <- glm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model,
family = binomial(link = "logit")
)
summary(logit_mod)
##
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, family = binomial(link = "logit"), data = data_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.244e-16 6.870e-02 0.000 1.000
## rankindexdiff -8.541e-03 5.807e-03 -1.471 0.141
## b365diff -4.205e-01 5.072e-03 -82.912 <2e-16 ***
## genderMuz -1.126e-16 1.328e-02 0.000 1.000
## tysurface2 2.474e-17 6.874e-02 0.000 1.000
## tysurface3 2.475e-16 7.171e-02 0.000 1.000
## tysurface4 -2.559e-16 3.243e-01 0.000 1.000
## tysurface5 1.385e-16 6.829e-02 0.000 1.000
## tyround2 6.545e-17 1.601e-02 0.000 1.000
## tyround3 9.412e-17 3.068e-02 0.000 1.000
## tyround4 1.202e-15 8.024e-02 0.000 1.000
## tyround5 6.239e-17 2.211e-02 0.000 1.000
## tyround6 3.188e-16 6.671e-02 0.000 1.000
## tyround7 -1.481e-16 2.924e-02 0.000 1.000
## tyround8 1.925e-16 3.966e-02 0.000 1.000
## tyround9 -1.290e-14 1.002e+00 0.000 1.000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 154910 on 111743 degrees of freedom
## Residual deviance: 131840 on 111728 degrees of freedom
## AIC: 131872
##
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden)
logLik_null <- logLik(glm(win ~ 1, data = data_model, family = binomial(link = "logit")))
logLik_full <- logLik(logit_mod)
pseudoR2_logit <- 1 - as.numeric(logLik_full / logLik_null)
pseudoR2_logit
## [1] 0.1489228
# predikované pravdepodobnosti z logitu
data_model$phat_logit <- fitted(logit_mod)
summary(data_model$phat_logit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3638 0.5000 0.5000 0.6362 1.0000
# jednoduchý graf: pravdepodobnosť výhry a rankindexdiff
ggplot(data_model, aes(x = rankindexdiff, y = phat_logit)) +
geom_point(alpha = 0.2) +
geom_smooth(se = FALSE) +
labs(
x = "rankindexdiff",
y = "P( win = 1 ) – logit",
title = "Predikované pravdepodobnosti z logit modelu"
)

4. Nelineárny model – probit
# probit model
probit_mod <- glm(
win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
data = data_model,
family = binomial(link = "probit")
)
summary(probit_mod)
##
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface +
## tyround, family = binomial(link = "probit"), data = data_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.808e-14 4.114e-02 0.000 1.0000
## rankindexdiff 7.128e-03 3.483e-03 2.047 0.0407 *
## b365diff -2.317e-01 2.871e-03 -80.702 <2e-16 ***
## genderMuz 2.211e-14 8.029e-03 0.000 1.0000
## tysurface2 4.317e-14 4.116e-02 0.000 1.0000
## tysurface3 3.005e-15 4.299e-02 0.000 1.0000
## tysurface4 1.774e-14 1.940e-01 0.000 1.0000
## tysurface5 4.323e-15 4.089e-02 0.000 1.0000
## tyround2 4.575e-14 9.660e-03 0.000 1.0000
## tyround3 2.017e-15 1.845e-02 0.000 1.0000
## tyround4 1.280e-14 4.800e-02 0.000 1.0000
## tyround5 -5.608e-16 1.339e-02 0.000 1.0000
## tyround6 9.318e-15 4.044e-02 0.000 1.0000
## tyround7 -1.642e-16 1.774e-02 0.000 1.0000
## tyround8 7.386e-16 2.403e-02 0.000 1.0000
## tyround9 2.702e-14 6.272e-01 0.000 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 154910 on 111743 degrees of freedom
## Residual deviance: 132406 on 111728 degrees of freedom
## AIC: 132438
##
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden) pre probit
logLik_full_probit <- logLik(probit_mod)
pseudoR2_probit <- 1 - as.numeric(logLik_full_probit / logLik_null)
pseudoR2_probit
## [1] 0.1452715
# predikované pravdepodobnosti z probitu
data_model$phat_probit <- fitted(probit_mod)
summary(data_model$phat_probit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3745 0.5000 0.5000 0.6255 1.0000
5. Porovnanie modelov a klasifikácia
# pomocná funkcia na klasifikačnú tabuľku a percento správnych predikcií
class_table <- function(y, p, cut = 0.5) {
yhat <- ifelse(p >= cut, 1, 0)
tab <- table(skutocnost = y, predikcia = yhat)
acc <- mean(yhat == y)
list(tab = tab, acc = acc)
}
# LPM (pravdepodobnosti orezané na [0,1])
phat_lpm_trim <- pmin(pmax(data_model$phat_lpm, 0), 1)
ct_lpm <- class_table(data_model$win, phat_lpm_trim, cut = 0.5)
# LOGIT
ct_logit <- class_table(data_model$win, data_model$phat_logit, cut = 0.5)
# PROBIT
ct_probit <- class_table(data_model$win, data_model$phat_probit, cut = 0.5)
ct_lpm$tab
## predikcia
## skutocnost 0 1
## 0 37889 17983
## 1 17983 37889
ct_lpm$acc
## [1] 0.6781393
ct_logit$tab
## predikcia
## skutocnost 0 1
## 0 38271 17601
## 1 17601 38271
ct_logit$acc
## [1] 0.6849764
ct_probit$tab
## predikcia
## skutocnost 0 1
## 0 38240 17632
## 1 17632 38240
ct_probit$acc
## [1] 0.6844215