1. Príprava dát a definícia binárnej premennej

# načítanie dát zo Stata súboru
tenis <- haven::read_dta("all.dta")

# rýchla kontrola
dim(tenis)
## [1] 112138     30
names(tenis)
##  [1] "player"        "rank"          "s1"            "s2"           
##  [5] "s3"            "swon"          "b365"          "gender"       
##  [9] "tylocation"    "tytournament"  "tyseries"      "tycourt"      
## [13] "tysurface"     "tyround"       "year"          "month"        
## [17] "day"           "rankdiff"      "s1diff"        "s2diff"       
## [21] "s3diff"        "swondiff"      "b365diff"      "s1w"          
## [25] "s2w"           "s3w"           "rankindex"     "rankop"       
## [29] "rankindexop"   "rankindexdiff"
head(tenis)
## # A tibble: 6 × 30
##   player       rank    s1    s2    s3  swon  b365 gender tylocation tytournament
##   <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>        <dbl>
## 1 Krajinovic…    37     6     6    NA     2  1.5       1        149          279
## 2 Roddick A.      6     6     3    NA     0  1.53      1        100          257
## 3 Simon G.       37     6     2    NA     0  1.53      1        101          229
## 4 Parmentier…    59     6     6     3     1  2.62      0        100          195
## 5 Seppi A.       74     6     6    NA     0  1.83      1        123           34
## 6 Ljubicic I.    25     7     6    NA     2  1.14      1        123           34
## # ℹ 20 more variables: tyseries <dbl>, tycourt <dbl>, tysurface <dbl>,
## #   tyround <dbl>, year <dbl>, month <dbl>, day <dbl>, rankdiff <dbl>,
## #   s1diff <dbl>, s2diff <dbl>, s3diff <dbl>, swondiff <dbl>, b365diff <dbl>,
## #   s1w <dbl>, s2w <dbl>, s3w <dbl>, rankindex <dbl>, rankop <dbl>,
## #   rankindexop <dbl>, rankindexdiff <dbl>
# binárna vysvetľovaná premenná: výhra hráča 1
# predpoklad: swondiff > 0 znamená, že hráč 1 vyhral zápas
tenis <- tenis %>%
  mutate(
    win = ifelse(swondiff > 0, 1, 0),
    win = as.numeric(win),
    gender    = factor(gender, labels = c("Zena", "Muz")),
    tysurface = factor(tysurface),
    tyround   = factor(tyround)
  )

table(tenis$win)
## 
##     0     1 
## 56069 56069
# dataset pre model – vyhodíme riadky s NA v relevantných premenných
data_model <- tenis %>%
  select(win, rankindexdiff, b365diff, gender, tysurface, tyround) %>%
  tidyr::drop_na()

dim(tenis)      # pôvodný počet riadkov
## [1] 112138     31
dim(data_model) # počet riadkov použitých v modeloch
## [1] 111744      6
str(data_model)
## tibble [111,744 × 6] (S3: tbl_df/tbl/data.frame)
##  $ win          : num [1:111744] 1 0 0 0 0 1 0 1 0 1 ...
##  $ rankindexdiff: num [1:111744] 0.345 2.807 1.038 0.287 -0.566 ...
##   ..- attr(*, "label")= chr "rankindex of player-rankindex of opponent"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ b365diff     : num [1:111744] -1.12 -0.84 -0.84 1.18 0 ...
##   ..- attr(*, "label")= chr "b365 of player-b365 of oppone"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ gender       : Factor w/ 2 levels "Zena","Muz": 2 2 2 1 2 2 2 2 2 2 ...
##  $ tysurface    : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 1 1 5 ...
##  $ tyround      : Factor w/ 9 levels "1","2","3","4",..: 2 5 2 2 2 1 2 2 1 2 ...

2. Lineárny model (LPM)

# lineárny pravdepodobnostný model
lpm <- lm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data = data_model
)

summary(lpm)
## 
## Call:
## lm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, data = data_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0455 -0.4422  0.0000  0.4422  2.0455 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.000e-01  1.380e-02   36.22   <2e-16 ***
## rankindexdiff  2.721e-02  1.092e-03   24.92   <2e-16 ***
## b365diff      -4.654e-02  6.640e-04  -70.10   <2e-16 ***
## genderMuz      1.984e-16  2.758e-03    0.00        1    
## tysurface2     3.962e-15  1.382e-02    0.00        1    
## tysurface3     3.950e-15  1.447e-02    0.00        1    
## tysurface4     4.010e-15  6.513e-02    0.00        1    
## tysurface5     4.055e-15  1.372e-02    0.00        1    
## tyround2       2.247e-16  3.299e-03    0.00        1    
## tyround3       5.728e-17  6.189e-03    0.00        1    
## tyround4      -6.942e-17  1.581e-02    0.00        1    
## tyround5       5.164e-17  4.622e-03    0.00        1    
## tyround6       2.742e-16  1.403e-02    0.00        1    
## tyround7       3.424e-17  6.147e-03    0.00        1    
## tyround8       3.013e-17  8.354e-03    0.00        1    
## tyround9      -2.221e-15  2.296e-01    0.00        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4591 on 111728 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  0.1569 
## F-statistic:  1387 on 15 and 111728 DF,  p-value: < 2.2e-16
# test heteroskedasticity
bptest(lpm)
## 
##  studentized Breusch-Pagan test
## 
## data:  lpm
## BP = 218, df = 15, p-value < 2.2e-16
# robustné (White/HC) štandardné chyby
vcov_lpm_robust <- sandwich::vcovHC(lpm, type = "HC1")
lpm_robust <- lmtest::coeftest(lpm, vcov. = vcov_lpm_robust)
lpm_robust
## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value Pr(>|t|)    
## (Intercept)    5.0000e-01  1.3638e-02  36.663   <2e-16 ***
## rankindexdiff  2.7207e-02  1.2054e-03  22.570   <2e-16 ***
## b365diff      -4.6545e-02  8.1881e-04 -56.844   <2e-16 ***
## genderMuz      1.9835e-16  2.7582e-03   0.000        1    
## tysurface2     3.9619e-15  1.3655e-02   0.000        1    
## tysurface3     3.9503e-15  1.4345e-02   0.000        1    
## tysurface4     4.0100e-15  6.4979e-02   0.000        1    
## tysurface5     4.0546e-15  1.3554e-02   0.000        1    
## tyround2       2.2467e-16  3.2761e-03   0.000        1    
## tyround3       5.7285e-17  6.1574e-03   0.000        1    
## tyround4      -6.9423e-17  1.4954e-02   0.000        1    
## tyround5       5.1637e-17  4.6409e-03   0.000        1    
## tyround6       2.7422e-16  1.4166e-02   0.000        1    
## tyround7       3.4245e-17  6.2336e-03   0.000        1    
## tyround8       3.0128e-17  8.4209e-03   0.000        1    
## tyround9      -2.2206e-15  2.5838e-01   0.000        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# predikované pravdepodobnosti z LPM
data_model$phat_lpm <- fitted(lpm)

summary(data_model$phat_lpm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0455  0.4047  0.5000  0.5000  0.5953  3.0455

3. Nelineárny model – logit

# logit model
logit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "logit")
)

summary(logit_mod)
## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "logit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.244e-16  6.870e-02   0.000    1.000    
## rankindexdiff -8.541e-03  5.807e-03  -1.471    0.141    
## b365diff      -4.205e-01  5.072e-03 -82.912   <2e-16 ***
## genderMuz     -1.126e-16  1.328e-02   0.000    1.000    
## tysurface2     2.474e-17  6.874e-02   0.000    1.000    
## tysurface3     2.475e-16  7.171e-02   0.000    1.000    
## tysurface4    -2.559e-16  3.243e-01   0.000    1.000    
## tysurface5     1.385e-16  6.829e-02   0.000    1.000    
## tyround2       6.545e-17  1.601e-02   0.000    1.000    
## tyround3       9.412e-17  3.068e-02   0.000    1.000    
## tyround4       1.202e-15  8.024e-02   0.000    1.000    
## tyround5       6.239e-17  2.211e-02   0.000    1.000    
## tyround6       3.188e-16  6.671e-02   0.000    1.000    
## tyround7      -1.481e-16  2.924e-02   0.000    1.000    
## tyround8       1.925e-16  3.966e-02   0.000    1.000    
## tyround9      -1.290e-14  1.002e+00   0.000    1.000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 131840  on 111728  degrees of freedom
## AIC: 131872
## 
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden)
logLik_null <- logLik(glm(win ~ 1, data = data_model, family = binomial(link = "logit")))
logLik_full <- logLik(logit_mod)
pseudoR2_logit <- 1 - as.numeric(logLik_full / logLik_null)
pseudoR2_logit
## [1] 0.1489228
# predikované pravdepodobnosti z logitu
data_model$phat_logit <- fitted(logit_mod)
summary(data_model$phat_logit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3638  0.5000  0.5000  0.6362  1.0000
# jednoduchý graf: pravdepodobnosť výhry a rankindexdiff
ggplot(data_model, aes(x = rankindexdiff, y = phat_logit)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE) +
  labs(
    x = "rankindexdiff",
    y = "P( win = 1 ) – logit",
    title = "Predikované pravdepodobnosti z logit modelu"
  )

4. Nelineárny model – probit

# probit model
probit_mod <- glm(
  win ~ rankindexdiff + b365diff + gender + tysurface + tyround,
  data   = data_model,
  family = binomial(link = "probit")
)

summary(probit_mod)
## 
## Call:
## glm(formula = win ~ rankindexdiff + b365diff + gender + tysurface + 
##     tyround, family = binomial(link = "probit"), data = data_model)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.808e-14  4.114e-02   0.000   1.0000    
## rankindexdiff  7.128e-03  3.483e-03   2.047   0.0407 *  
## b365diff      -2.317e-01  2.871e-03 -80.702   <2e-16 ***
## genderMuz      2.211e-14  8.029e-03   0.000   1.0000    
## tysurface2     4.317e-14  4.116e-02   0.000   1.0000    
## tysurface3     3.005e-15  4.299e-02   0.000   1.0000    
## tysurface4     1.774e-14  1.940e-01   0.000   1.0000    
## tysurface5     4.323e-15  4.089e-02   0.000   1.0000    
## tyround2       4.575e-14  9.660e-03   0.000   1.0000    
## tyround3       2.017e-15  1.845e-02   0.000   1.0000    
## tyround4       1.280e-14  4.800e-02   0.000   1.0000    
## tyround5      -5.608e-16  1.339e-02   0.000   1.0000    
## tyround6       9.318e-15  4.044e-02   0.000   1.0000    
## tyround7      -1.642e-16  1.774e-02   0.000   1.0000    
## tyround8       7.386e-16  2.403e-02   0.000   1.0000    
## tyround9       2.702e-14  6.272e-01   0.000   1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 154910  on 111743  degrees of freedom
## Residual deviance: 132406  on 111728  degrees of freedom
## AIC: 132438
## 
## Number of Fisher Scoring iterations: 5
# pseudo R2 (McFadden) pre probit
logLik_full_probit <- logLik(probit_mod)
pseudoR2_probit <- 1 - as.numeric(logLik_full_probit / logLik_null)
pseudoR2_probit
## [1] 0.1452715
# predikované pravdepodobnosti z probitu
data_model$phat_probit <- fitted(probit_mod)
summary(data_model$phat_probit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3745  0.5000  0.5000  0.6255  1.0000

5. Porovnanie modelov a klasifikácia

# pomocná funkcia na klasifikačnú tabuľku a percento správnych predikcií
class_table <- function(y, p, cut = 0.5) {
  yhat <- ifelse(p >= cut, 1, 0)
  tab <- table(skutocnost = y, predikcia = yhat)
  acc <- mean(yhat == y)
  list(tab = tab, acc = acc)
}

# LPM (pravdepodobnosti orezané na [0,1])
phat_lpm_trim <- pmin(pmax(data_model$phat_lpm, 0), 1)
ct_lpm <- class_table(data_model$win, phat_lpm_trim, cut = 0.5)

# LOGIT
ct_logit <- class_table(data_model$win, data_model$phat_logit, cut = 0.5)

# PROBIT
ct_probit <- class_table(data_model$win, data_model$phat_probit, cut = 0.5)

ct_lpm$tab
##           predikcia
## skutocnost     0     1
##          0 37889 17983
##          1 17983 37889
ct_lpm$acc
## [1] 0.6781393
ct_logit$tab
##           predikcia
## skutocnost     0     1
##          0 38271 17601
##          1 17601 38271
ct_logit$acc
## [1] 0.6849764
ct_probit$tab
##           predikcia
## skutocnost     0     1
##          0 38240 17632
##          1 17632 38240
ct_probit$acc
## [1] 0.6844215