# ============================================================
# Cvičenie 10 – Multikolinearita v regresných modeloch
# Dáta: EuStockMarkets (stock indexy)
# Model: FTSE ~ DAX + SMI + CAC
# ============================================================

# Balík na VIF
# Ak ho ešte nemáš: install.packages("car")
library(car)
## Loading required package: carData
# 1. Load the data
# EuStockMarkets is provided by the "datasets" package; it is converted to a
# data frame so that its columns can be referenced by name in model formulas.
data(list = "EuStockMarkets", package = "datasets")
stocks <- as.data.frame(x = EuStockMarkets)

# Preview the first rows
head(x = stocks)
##       DAX    SMI    CAC   FTSE
## 1 1628.75 1678.1 1772.8 2443.6
## 2 1613.63 1688.5 1750.5 2460.2
## 3 1606.51 1678.6 1718.0 2448.2
## 4 1621.04 1684.1 1708.1 2470.4
## 5 1618.16 1686.6 1723.1 2484.7
## 6 1610.61 1671.6 1714.3 2466.8
# 2. Baseline regression model
# FTSE_t = β0 + β1*DAX_t + β2*SMI_t + β3*CAC_t + u_t
model <- lm(
  formula = FTSE ~ DAX + SMI + CAC,
  data = stocks
)
summary(object = model)
## 
## Call:
## lm(formula = FTSE ~ DAX + SMI + CAC, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -534.61  -76.61   12.18   84.13  386.73 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1988.54565   18.75930 106.003   <2e-16 ***
## DAX           -0.02123    0.02578  -0.823     0.41    
## SMI            0.70758    0.01347  52.541   <2e-16 ***
## CAC           -0.34029    0.01988 -17.120   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121.5 on 1856 degrees of freedom
## Multiple R-squared:  0.9845, Adjusted R-squared:  0.9845 
## F-statistic: 3.941e+04 on 3 and 1856 DF,  p-value: < 2.2e-16
# 3. Correlation matrix of the explanatory variables
# Keep only the three regressors; pairwise correlations are rounded to three
# decimals for display.
xvars <- stocks[c("DAX", "SMI", "CAC")]
round(cor(xvars), digits = 3)
##       DAX   SMI   CAC
## DAX 1.000 0.991 0.966
## SMI 0.991 1.000 0.947
## CAC 0.966 0.947 1.000
# Scatterplot matrix – visual check of the pairwise relationships
# between the regressors
pairs(x = xvars, main = "Scatterplotová matica – DAX, SMI, CAC")

# 4. VIF – variance inflation factor of each regressor in the baseline model
car::vif(model)
##      DAX      SMI      CAC 
## 98.46480 63.14756 16.75008
# 5. Condition number of the baseline model
# Computed as sqrt(lambda_max / lambda_min) of X'X, where X holds the raw
# regressors (not centred, not scaled) without the intercept column.
# NOTE: this quantity depends on the units of the regressors, so it is only
# comparable between design matrices expressed on the same scale.
X   <- model.matrix(model)[, -1, drop = FALSE]  # drop intercept, stay a matrix
XtX <- crossprod(X)                             # X'X without forming t(X)
eig <- eigen(XtX, symmetric = TRUE)             # X'X is symmetric by construction

condition_number <- sqrt(max(eig$values) / min(eig$values))
condition_number  # larger values indicate stronger multicollinearity
## [1] 53.55796
# 6. Remedies for multicollinearity
# 6.1 Refit the model leaving out one regressor at a time

# Without DAX
model_no_DAX <- lm(
  formula = FTSE ~ SMI + CAC,
  data = stocks
)
summary(object = model_no_DAX)
## 
## Call:
## lm(formula = FTSE ~ SMI + CAC, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -533.97  -76.18   12.70   84.55  383.04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.993e+03  1.796e+01  110.98   <2e-16 ***
## SMI          6.974e-01  5.266e-03  132.42   <2e-16 ***
## CAC         -3.509e-01  1.509e-02  -23.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121.5 on 1857 degrees of freedom
## Multiple R-squared:  0.9845, Adjusted R-squared:  0.9845 
## F-statistic: 5.913e+04 on 2 and 1857 DF,  p-value: < 2.2e-16
# Without SMI
model_no_SMI <- lm(
  formula = FTSE ~ DAX + CAC,
  data = stocks
)
summary(object = model_no_SMI)
## 
## Call:
## lm(formula = FTSE ~ DAX + CAC, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -645.35 -136.66   50.85  140.69  489.10 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1961.84473   29.56711   66.35   <2e-16 ***
## DAX            1.22542    0.01590   77.09   <2e-16 ***
## CAC           -0.67210    0.02972  -22.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 191.6 on 1857 degrees of freedom
## Multiple R-squared:  0.9616, Adjusted R-squared:  0.9615 
## F-statistic: 2.323e+04 on 2 and 1857 DF,  p-value: < 2.2e-16
# Without CAC
model_no_CAC <- lm(
  formula = FTSE ~ DAX + SMI,
  data = stocks
)
summary(object = model_no_CAC)
## 
## Call:
## lm(formula = FTSE ~ DAX + SMI, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -589.54  -76.07   -2.32   81.31  428.57 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1709.91642   10.03651  170.37   <2e-16 ***
## DAX           -0.30843    0.02106  -14.65   <2e-16 ***
## SMI            0.78083    0.01374   56.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130.7 on 1857 degrees of freedom
## Multiple R-squared:  0.9821, Adjusted R-squared:  0.9821 
## F-statistic: 5.096e+04 on 2 and 1857 DF,  p-value: < 2.2e-16
# 7. Z-scaling of the regressors (centring + scaling to unit variance)
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() strips those so that each new column
# of `stocks` is an ordinary numeric vector rather than an embedded matrix.
stocks$DAX_c <- as.numeric(scale(stocks$DAX, center = TRUE, scale = TRUE))
stocks$SMI_c <- as.numeric(scale(stocks$SMI, center = TRUE, scale = TRUE))
stocks$CAC_c <- as.numeric(scale(stocks$CAC, center = TRUE, scale = TRUE))

# Same specification as the baseline model, but on the standardised regressors
model_centered <- lm(FTSE ~ DAX_c + SMI_c + CAC_c, data = stocks)
summary(model_centered)
## 
## Call:
## lm(formula = FTSE ~ DAX_c + SMI_c + CAC_c, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -534.61  -76.61   12.18   84.13  386.73 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 3565.643      2.818 1265.485   <2e-16 ***
## DAX_c        -23.028     27.966   -0.823     0.41    
## SMI_c       1176.716     22.396   52.541   <2e-16 ***
## CAC_c       -197.476     11.535  -17.120   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121.5 on 1856 degrees of freedom
## Multiple R-squared:  0.9845, Adjusted R-squared:  0.9845 
## F-statistic: 3.941e+04 on 3 and 1856 DF,  p-value: < 2.2e-16
# VIF after z-scaling
car::vif(model_centered)
##    DAX_c    SMI_c    CAC_c 
## 98.46480 63.14756 16.75008
# Condition number after z-scaling
# Computed as sqrt(lambda_max / lambda_min) of Xc'Xc, where Xc holds the
# regressors of model_centered without the intercept column.
Xc   <- model.matrix(model_centered)[, -1, drop = FALSE]  # drop intercept
XtXc <- crossprod(Xc)                                     # Xc'Xc
eigc <- eigen(XtXc, symmetric = TRUE)                     # symmetric by construction

condition_number_scaled <- sqrt(max(eigc$values) / min(eigc$values))
condition_number_scaled
## [1] 21.71815
# 8. Simple rescaling – express DAX in thousands of index points
# Only DAX changes its unit of measurement; SMI and CAC stay as they are.
stocks[["DAX_1000"]] <- stocks[["DAX"]] / 1000

# Baseline specification with DAX replaced by its rescaled version
model_DAX_1000 <- lm(
  formula = FTSE ~ DAX_1000 + SMI + CAC,
  data = stocks
)
summary(object = model_DAX_1000)
## 
## Call:
## lm(formula = FTSE ~ DAX_1000 + SMI + CAC, data = stocks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -534.61  -76.61   12.18   84.13  386.73 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1988.54565   18.75930 106.003   <2e-16 ***
## DAX_1000     -21.22824   25.78049  -0.823     0.41    
## SMI            0.70758    0.01347  52.541   <2e-16 ***
## CAC           -0.34029    0.01988 -17.120   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121.5 on 1856 degrees of freedom
## Multiple R-squared:  0.9845, Adjusted R-squared:  0.9845 
## F-statistic: 3.941e+04 on 3 and 1856 DF,  p-value: < 2.2e-16
# VIF after rescaling DAX
car::vif(model_DAX_1000)
## DAX_1000      SMI      CAC 
## 98.46480 63.14756 16.75008
# Condition number after rescaling DAX
# Computed as sqrt(lambda_max / lambda_min) of Xr'Xr, where Xr holds the
# regressors of model_DAX_1000 without the intercept column.
# NOTE: the columns of Xr are not on a common scale (DAX_1000 was divided by
# 1000, SMI and CAC were not), and this measure is sensitive to such unit
# differences.
Xr   <- model.matrix(model_DAX_1000)[, -1, drop = FALSE]  # drop intercept
XtXr <- crossprod(Xr)                                     # Xr'Xr
eigr <- eigen(XtXr, symmetric = TRUE)                     # symmetric by construction

condition_number_rescaled <- sqrt(max(eigr$values) / min(eigr$values))
condition_number_rescaled
## [1] 38442.15