req <- c("haven","dplyr","ggplot2","car","MASS")
to_install <- req[!sapply(req, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)
library(haven)
library(dplyr)
library(ggplot2)
library(car)
library(MASS)
all.dtapaths <- c("all.dta", file.path(getwd(), "all.dta"))
path <- paths[file.exists(paths)][1]
stopifnot(!is.na(path), file.exists(path))
df <- read_dta(path) |> as.data.frame()
# základný sanity check
cat("Rows:", nrow(df), " Cols:", ncol(df), "\n")
## Rows: 112138 Cols: 30
str(df[1:min(10,ncol(df))])
## 'data.frame': 112138 obs. of 10 variables:
## $ player : chr "Krajinovic F." "Roddick A." "Simon G." "Parmentier P." ...
## ..- attr(*, "format.stata")= chr "%24s"
## $ rank : num 37 6 37 59 74 25 196 47 396 29 ...
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s1 : num 6 6 6 6 6 7 3 6 7 6 ...
## ..- attr(*, "label")= chr "number of games won in 1st set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s2 : num 6 3 2 6 6 6 6 6 4 6 ...
## ..- attr(*, "label")= chr "number of games won in 2nd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ s3 : num NA NA NA 3 NA NA 3 NA 4 NA ...
## ..- attr(*, "label")= chr "number of games won in 3rd set"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ swon : num 2 0 0 1 0 2 1 2 1 2 ...
## ..- attr(*, "label")= chr "number of sets won"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ b365 : num 1.5 1.53 1.53 2.62 1.83 ...
## ..- attr(*, "label")= chr "bet365 odds"
## ..- attr(*, "format.stata")= chr "%10.0g"
## $ gender : num 1 1 1 0 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "male=1, female=0"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ tylocation : num 149 100 101 100 123 123 98 151 172 140 ...
## ..- attr(*, "label")= chr "type of location"
## ..- attr(*, "format.stata")= chr "%12.0g"
## $ tytournament: num 279 257 229 195 34 34 220 285 235 266 ...
## ..- attr(*, "label")= chr "type of tournament"
## ..- attr(*, "format.stata")= chr "%12.0g"
num <- df |> dplyr::select(where(is.numeric))
# vyhodiť konštanty / nulový rozptyl
nzv <- sapply(num, function(x) isTRUE(sd(x, na.rm=TRUE) > 0))
num <- num[, nzv, drop = FALSE]
num <- na.omit(num)
cat("Po čistení: Rows:", nrow(num), " Cols:", ncol(num), "\n")
## Po čistení: Rows: 38582 Cols: 29
cor_mat <- cor(num)
# prehľad top korelácií (absolútne)
cm <- cor_mat
diag(cm) <- NA
top_pairs <- as.data.frame(as.table(cm)) |>
dplyr::mutate(abs_cor = abs(Freq)) |>
dplyr::filter(!is.na(abs_cor)) |>
dplyr::arrange(desc(abs_cor)) |>
dplyr::slice(1:20)
top_pairs
## Var1 Var2 Freq abs_cor
## 1 swondiff swon 0.9997414 0.9997414
## 2 swon swondiff 0.9997414 0.9997414
## 3 s3w swondiff 0.9997413 0.9997413
## 4 swondiff s3w 0.9997413 0.9997413
## 5 s3w swon 0.9994828 0.9994828
## 6 swon s3w 0.9994828 0.9994828
## 7 s2w s1w -0.9994816 0.9994816
## 8 s1w s2w -0.9994816 0.9994816
## 9 s3w s3diff 0.9060014 0.9060014
## 10 s3diff s3w 0.9060014 0.9060014
## 11 swondiff s3diff 0.9057535 0.9057535
## 12 s3diff swondiff 0.9057535 0.9057535
## 13 s3diff swon 0.9055193 0.9055193
## 14 swon s3diff 0.9055193 0.9055193
## 15 s2w s2diff 0.9034477 0.9034477
## 16 s2diff s2w 0.9034477 0.9034477
## 17 s1w s2diff -0.9031024 0.9031024
## 18 s2diff s1w -0.9031024 0.9031024
## 19 s1w s1diff 0.9015631 0.9015631
## 20 s1diff s1w 0.9015631 0.9015631
# jednoduchý heatmap bez ďalších balíkov
ord <- order(colnames(cor_mat))
image(
1:ncol(cor_mat), 1:ncol(cor_mat),
cor_mat[ord, ord],
axes = FALSE, xlab = "", ylab = ""
)
axis(1, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
axis(2, at = 1:ncol(cor_mat), labels = colnames(cor_mat)[ord], las = 2, cex.axis = 0.6)
title("Korelačná matica (numerické premenné)")
Keďže v dátach sa často nachádzajú konštrukčné
identity (napr. rankdiff = rank - rankop), OLS
môže mať presnú kolinearitu.
Preto spravíme: 1. zvolíme cieľovú premennú y
automaticky ako prvý numerický stĺpec (ak chceš inú, zmeň to), 2.
odstránime presne lineárne závislé prediktory pomocou QR
(alias()/rank), 3. potom vypočítame VIF a condition
number.
y_name <- names(num)[1] # <-- zmeň si sem cieľovú premennú ak treba
x_names <- setdiff(names(num), y_name)
y <- num[[y_name]]
X <- num[, x_names, drop = FALSE]
# odstránenie perfektnej multikolinearity (závislé stĺpce)
mm <- model.matrix(~ . , data = X) # pridá intercept
qr_mm <- qr(mm)
rank_mm <- qr_mm$rank
p_mm <- ncol(mm)
if(rank_mm < p_mm){
indep_idx <- qr_mm$pivot[1:rank_mm]
mm2 <- mm[, indep_idx, drop = FALSE]
# späť na dátový rámec bez interceptu
keep_cols <- colnames(mm2)
keep_cols <- setdiff(keep_cols, "(Intercept)")
X2 <- as.data.frame(mm2[, keep_cols, drop = FALSE])
cat("Odstránil som", ncol(X) - ncol(X2), "perfektne závislých prediktorov.\n")
} else {
X2 <- X
cat("Perfektná kolinearita nebola zistená.\n")
}
## Odstránil som 1 perfektne závislých prediktorov.
dat2 <- cbind.data.frame(y = y, X2)
names(dat2)[1] <- y_name
f <- as.formula(paste(y_name, "~", paste(names(X2), collapse = " + ")))
ols <- lm(f, data = dat2)
summary(ols)
##
## Call:
## lm(formula = f, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.066e-12 -9.100e-15 1.200e-15 1.050e-14 1.050e-12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.695e-14 1.634e-13 -4.710e-01 0.637752
## s1 1.024e-15 3.302e-16 3.100e+00 0.001935 **
## s2 1.128e-15 3.266e-16 3.452e+00 0.000556 ***
## s3 1.157e-14 3.044e-16 3.802e+01 < 2e-16 ***
## swon 1.283e-14 2.843e-14 4.510e-01 0.651654
## b365 -1.883e-14 4.312e-16 -4.368e+01 < 2e-16 ***
## gender 2.974e-15 7.724e-16 3.851e+00 0.000118 ***
## tylocation 5.528e-17 6.972e-18 7.928e+00 2.29e-15 ***
## tytournament -1.084e-17 3.401e-18 -3.188e+00 0.001436 **
## tyseries 5.503e-16 7.433e-17 7.404e+00 1.34e-13 ***
## tycourt 1.377e-14 9.137e-16 1.507e+01 < 2e-16 ***
## tysurface -5.788e-16 2.421e-16 -2.391e+00 0.016810 *
## tyround 1.284e-14 1.798e-16 7.139e+01 < 2e-16 ***
## year -9.638e-17 7.753e-17 -1.243e+00 0.213837
## month -8.511e-16 1.148e-16 -7.411e+00 1.28e-13 ***
## day 3.995e-16 3.965e-17 1.008e+01 < 2e-16 ***
## rankdiff 1.000e+00 5.604e-18 1.785e+17 < 2e-16 ***
## s1diff 4.062e-16 3.035e-16 1.339e+00 0.180735
## s2diff -4.455e-16 3.005e-16 -1.482e+00 0.138222
## s3diff -1.145e-16 2.888e-16 -3.970e-01 0.691721
## swondiff -2.561e-15 2.009e-14 -1.280e-01 0.898527
## b365diff 1.185e-14 2.899e-16 4.089e+01 < 2e-16 ***
## s1w 8.584e-15 2.011e-14 4.270e-01 0.669514
## s2w 1.315e-14 2.010e-14 6.540e-01 0.513037
## s3w 1.058e-14 2.844e-14 3.720e-01 0.709854
## rankindex 3.124e-14 3.909e-16 7.992e+01 < 2e-16 ***
## rankop 1.000e+00 7.996e-18 1.251e+17 < 2e-16 ***
## rankindexop 2.107e-15 3.909e-16 5.390e+00 7.10e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.344e-14 on 38554 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.598e+33 on 27 and 38554 DF, p-value: < 2.2e-16
v <- car::vif(ols)
v <- sort(v, decreasing = TRUE)
head(v, 30)
## swondiff swon s3w s1w s2w s3diff
## 3873.433541 1941.018847 1938.255665 969.267107 968.358139 8.093307
## s2diff s1diff b365diff rankop b365 rankdiff
## 7.808480 7.692521 5.486488 4.484770 4.195253 3.755672
## rankindexop rankindex s2 s1 s3 tyseries
## 3.571906 3.571855 3.318751 3.294786 3.178247 1.449856
## gender tyround tycourt tylocation tysurface tytournament
## 1.419780 1.264489 1.186184 1.149751 1.124088 1.117263
## month day year
## 1.116594 1.026043 1.018728
Xmm <- model.matrix(ols)[, -1, drop = FALSE] # bez interceptu
kappa_val <- kappa(Xmm, exact = TRUE)
kappa_val
## [1] 212047