Multikolinearita

1 1) Načítanie dát all.dta
2 2) Príprava premenných pre model
3 3) Korelačná matica (heatmap)
4 4) Model + multikolinearita

req <- c("haven","dplyr","ggplot2","car","MASS")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)

library(haven)
library(dplyr)
library(ggplot2)
library(car)
library(MASS)

1 1) Načítanie dát `all.dta`

paths <- c("all.dta", file.path(getwd(), "all.dta"))
path <- paths[file.exists(paths)][1]
stopifnot(!is.na(path), file.exists(path))

df <- read_dta(path) |> as.data.frame()

# základný sanity check
cat("Rows:", nrow(df), " Cols:", ncol(df), "\n")

## Rows: 112138  Cols: 30

str(df[1:min(10,ncol(df))])

## 'data.frame':    112138 obs. of  10 variables:
##  $ player      : chr  "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
##   ..- attr(*, "format.stata")= chr "%24s"
##  $ rank        : num  37 6 37 59 74 25 196 47 396 29 ...
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s1          : num  6 6 6 6 6 7 3 6 7 6 ...
##   ..- attr(*, "label")= chr "number of games won in 1st set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s2          : num  6 3 2 6 6 6 6 6 4 6 ...
##   ..- attr(*, "label")= chr "number of games won in 2nd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ s3          : num  NA NA NA 3 NA NA 3 NA 4 NA ...
##   ..- attr(*, "label")= chr "number of games won in 3rd set"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ swon        : num  2 0 0 1 0 2 1 2 1 2 ...
##   ..- attr(*, "label")= chr "number of sets won"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ b365        : num  1.5 1.53 1.53 2.62 1.83 ...
##   ..- attr(*, "label")= chr "bet365 odds"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ gender      : num  1 1 1 0 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "male=1, female=0"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tylocation  : num  149 100 101 100 123 123 98 151 172 140 ...
##   ..- attr(*, "label")= chr "type of location"
##   ..- attr(*, "format.stata")= chr "%12.0g"
##  $ tytournament: num  279 257 229 195 34 34 220 285 235 266 ...
##   ..- attr(*, "label")= chr "type of tournament"
##   ..- attr(*, "format.stata")= chr "%12.0g"

2 2) Príprava premenných pre model

num <- df |> dplyr::select(where(is.numeric))

# vyhodiť konštanty / nulový rozptyl
nzv <- sapply(num, function(x) isTRUE(sd(x, na.rm=TRUE) > 0))
num <- num[, nzv, drop = FALSE]

num <- na.omit(num)

cat("Po čistení: Rows:", nrow(num), " Cols:", ncol(num), "\n")

## Po čistení: Rows: 38582  Cols: 29

3 3) Korelačná matica (heatmap)

cor_mat <- cor(num)

# prehľad top korelácií (absolútne)
cm <- cor_mat
diag(cm) <- NA
top_pairs <- as.data.frame(as.table(cm)) |>
  dplyr::mutate(abs_cor = abs(Freq)) |>
  dplyr::filter(!is.na(abs_cor)) |>
  dplyr::arrange(desc(abs_cor)) |>
  dplyr::slice(1:20)

top_pairs

##        Var1     Var2       Freq   abs_cor
## 1  swondiff     swon  0.9997414 0.9997414
## 2      swon swondiff  0.9997414 0.9997414
## 3       s3w swondiff  0.9997413 0.9997413
## 4  swondiff      s3w  0.9997413 0.9997413
## 5       s3w     swon  0.9994828 0.9994828
## 6      swon      s3w  0.9994828 0.9994828
## 7       s2w      s1w -0.9994816 0.9994816
## 8       s1w      s2w -0.9994816 0.9994816
## 9       s3w   s3diff  0.9060014 0.9060014
## 10   s3diff      s3w  0.9060014 0.9060014
## 11 swondiff   s3diff  0.9057535 0.9057535
## 12   s3diff swondiff  0.9057535 0.9057535
## 13   s3diff     swon  0.9055193 0.9055193
## 14     swon   s3diff  0.9055193 0.9055193
## 15      s2w   s2diff  0.9034477 0.9034477
## 16   s2diff      s2w  0.9034477 0.9034477
## 17      s1w   s2diff -0.9031024 0.9031024
## 18   s2diff      s1w -0.9031024 0.9031024
## 19      s1w   s1diff  0.9015631 0.9015631
## 20   s1diff      s1w  0.9015631 0.9015631

# jednoduchý heatmap bez ďalších balíkov
ord <- order(colnames(cor_mat))
image(
  1:ncol(cor_mat), 1:ncol(cor_mat),
  cor_mat[ord, ord],
  axes = FALSE, xlab = "", ylab = ""
)
axis(1, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
axis(2, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
title("Korelačná matica (numerické premenné)")

4 4) Model + multikolinearita

Keďže v dátach sa často nachádzajú konštrukčné identity (napr. rankdiff = rank - rankop), OLS môže mať presnú kolinearitu.

Preto spravíme: 1. zvolíme cieľovú premennú y automaticky ako prvý numerický stĺpec (ak chceš inú, zmeň to), 2. odstránime presne lineárne závislé prediktory pomocou QR (alias()/rank), 3. potom vypočítame VIF a condition number.

y_name <- names(num)[1]          # <-- zmeň si sem cieľovú premennú ak treba
x_names <- setdiff(names(num), y_name)

y <- num[[y_name]]
X <- num[, x_names, drop = FALSE]

# odstránenie perfektnej multikolinearity (závislé stĺpce)
mm <- model.matrix(~ . , data = X)  # pridá intercept
qr_mm <- qr(mm)
rank_mm <- qr_mm$rank
p_mm <- ncol(mm)

if(rank_mm < p_mm){
  indep_idx <- qr_mm$pivot[1:rank_mm]
  mm2 <- mm[, indep_idx, drop = FALSE]
  # späť na dátový rámec bez interceptu
  keep_cols <- colnames(mm2)
  keep_cols <- setdiff(keep_cols, "(Intercept)")
  X2 <- as.data.frame(mm2[, keep_cols, drop = FALSE])
  cat("Odstránil som", ncol(X) - ncol(X2), "perfektne závislých prediktorov.\n")
} else {
  X2 <- X
  cat("Perfektná kolinearita nebola zistená.\n")
}

## Odstránil som 1 perfektne závislých prediktorov.

dat2 <- cbind.data.frame(y = y, X2)
names(dat2)[1] <- y_name

4.1 4.1 OLS

f <- as.formula(paste(y_name, "~", paste(names(X2), collapse = " + ")))
ols <- lm(f, data = dat2)
summary(ols)

## 
## Call:
## lm(formula = f, data = dat2)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.066e-12 -9.100e-15  1.200e-15  1.050e-14  1.050e-12 
## 
## Coefficients:
##                Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  -7.695e-14  1.634e-13 -4.710e-01 0.637752    
## s1            1.024e-15  3.302e-16  3.100e+00 0.001935 ** 
## s2            1.128e-15  3.266e-16  3.452e+00 0.000556 ***
## s3            1.157e-14  3.044e-16  3.802e+01  < 2e-16 ***
## swon          1.283e-14  2.843e-14  4.510e-01 0.651654    
## b365         -1.883e-14  4.312e-16 -4.368e+01  < 2e-16 ***
## gender        2.974e-15  7.724e-16  3.851e+00 0.000118 ***
## tylocation    5.528e-17  6.972e-18  7.928e+00 2.29e-15 ***
## tytournament -1.084e-17  3.401e-18 -3.188e+00 0.001436 ** 
## tyseries      5.503e-16  7.433e-17  7.404e+00 1.34e-13 ***
## tycourt       1.377e-14  9.137e-16  1.507e+01  < 2e-16 ***
## tysurface    -5.788e-16  2.421e-16 -2.391e+00 0.016810 *  
## tyround       1.284e-14  1.798e-16  7.139e+01  < 2e-16 ***
## year         -9.638e-17  7.753e-17 -1.243e+00 0.213837    
## month        -8.511e-16  1.148e-16 -7.411e+00 1.28e-13 ***
## day           3.995e-16  3.965e-17  1.008e+01  < 2e-16 ***
## rankdiff      1.000e+00  5.604e-18  1.785e+17  < 2e-16 ***
## s1diff        4.062e-16  3.035e-16  1.339e+00 0.180735    
## s2diff       -4.455e-16  3.005e-16 -1.482e+00 0.138222    
## s3diff       -1.145e-16  2.888e-16 -3.970e-01 0.691721    
## swondiff     -2.561e-15  2.009e-14 -1.280e-01 0.898527    
## b365diff      1.185e-14  2.899e-16  4.089e+01  < 2e-16 ***
## s1w           8.584e-15  2.011e-14  4.270e-01 0.669514    
## s2w           1.315e-14  2.010e-14  6.540e-01 0.513037    
## s3w           1.058e-14  2.844e-14  3.720e-01 0.709854    
## rankindex     3.124e-14  3.909e-16  7.992e+01  < 2e-16 ***
## rankop        1.000e+00  7.996e-18  1.251e+17  < 2e-16 ***
## rankindexop   2.107e-15  3.909e-16  5.390e+00 7.10e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.344e-14 on 38554 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.598e+33 on 27 and 38554 DF,  p-value: < 2.2e-16

4.2 4.2 VIF (Variance Inflation Factor)

v <- car::vif(ols)
v <- sort(v, decreasing = TRUE)
head(v, 30)

##     swondiff         swon          s3w          s1w          s2w       s3diff 
##  3873.433541  1941.018847  1938.255665   969.267107   968.358139     8.093307 
##       s2diff       s1diff     b365diff       rankop         b365     rankdiff 
##     7.808480     7.692521     5.486488     4.484770     4.195253     3.755672 
##  rankindexop    rankindex           s2           s1           s3     tyseries 
##     3.571906     3.571855     3.318751     3.294786     3.178247     1.449856 
##       gender      tyround      tycourt   tylocation    tysurface tytournament 
##     1.419780     1.264489     1.186184     1.149751     1.124088     1.117263 
##        month          day         year 
##     1.116594     1.026043     1.018728

4.3 4.3 Condition number (číslo podmienenosti)

Xmm <- model.matrix(ols)[, -1, drop = FALSE]  # bez interceptu
kappa_val <- kappa(Xmm, exact = TRUE)
kappa_val

## [1] 212047

Multikolinearita – Németh