Popis

Tento R Markdown súbor aplikuje štruktúru cvičenia 5 (lineárna regresia + diagnostika) na dataset nhlplayoffs.csv. Cieľ: modelovať win_loss_percentage pomocou troch vysvetľujúcich premenných: goals_scored, goals_against, a games.

Balíčky

# Nainštaluj balíčky, ak ešte nie sú nainštalované
needed <- c("tidyverse", "broom", "lmtest", "car", "tseries", "sandwich", "ggplot2")
new <- needed[!(needed %in% installed.packages()[, "Package"]) ]
if(length(new)) install.packages(new)

library(tidyverse)
library(broom)
library(lmtest)
library(car)
library(tseries)
library(sandwich)
library(ggplot2)

Načítanie dát

# Uprav cestu podľa potreby. Tu sa predpokladá "/mnt/data/nhlplayoffs.csv".
nhl <- read_csv("nhlplayoffs.csv")

# Zobrazenie základných informácií
glimpse(nhl)
## Rows: 1,009
## Columns: 13
## $ rank                <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ team                <chr> "Colorado Avalanche", "Tampa Bay Lightning", "New …
## $ year                <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 20…
## $ games               <dbl> 20, 23, 20, 16, 14, 12, 12, 10, 7, 7, 7, 7, 7, 6, …
## $ wins                <dbl> 16, 14, 10, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 2, 2, 0,…
## $ losses              <dbl> 4, 9, 10, 8, 7, 6, 7, 6, 4, 4, 4, 4, 4, 4, 4, 4, 7…
## $ ties                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shootout_wins       <dbl> 5, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0,…
## $ shootout_losses     <dbl> 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 4,…
## $ win_loss_percentage <dbl> 0.800, 0.609, 0.500, 0.500, 0.500, 0.500, 0.417, 0…
## $ goals_scored        <dbl> 85, 67, 62, 65, 37, 40, 35, 23, 20, 17, 24, 14, 29…
## $ goals_against       <dbl> 55, 61, 58, 59, 40, 38, 39, 32, 24, 27, 23, 15, 28…
## $ goal_differential   <dbl> 30, 6, 4, 6, -3, 2, -4, -9, -4, -10, 1, -1, 1, -6,…
summary(nhl)
##       rank            team                year          games       
##  Min.   : 1.000   Length:1009        Min.   :1918   Min.   : 2.000  
##  1st Qu.: 3.000   Class :character   1st Qu.:1972   1st Qu.: 5.000  
##  Median : 6.000   Mode  :character   Median :1990   Median : 7.000  
##  Mean   : 7.067                      Mean   :1986   Mean   : 9.364  
##  3rd Qu.:11.000                      3rd Qu.:2007   3rd Qu.:12.000  
##  Max.   :24.000                      Max.   :2022   Max.   :27.000  
##       wins            losses            ties         shootout_wins    
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.: 1.000   1st Qu.: 4.000   1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median : 3.000   Median : 4.000   Median :0.00000   Median : 1.0000  
##  Mean   : 4.657   Mean   : 4.657   Mean   :0.04955   Mean   : 0.9326  
##  3rd Qu.: 7.000   3rd Qu.: 6.000   3rd Qu.:0.00000   3rd Qu.: 1.0000  
##  Max.   :18.000   Max.   :12.000   Max.   :4.00000   Max.   :10.0000  
##  shootout_losses  win_loss_percentage  goals_scored   goals_against  
##  Min.   :0.0000   Min.   :0.0000      Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:0.3330      1st Qu.:11.00   1st Qu.:16.00  
##  Median :1.0000   Median :0.4290      Median :20.00   Median :22.00  
##  Mean   :0.9326   Mean   :0.4112      Mean   :26.63   Mean   :26.63  
##  3rd Qu.:1.0000   3rd Qu.:0.5450      3rd Qu.:37.00   3rd Qu.:35.00  
##  Max.   :4.0000   Max.   :1.0000      Max.   :98.00   Max.   :91.00  
##  goal_differential
##  Min.   :-27      
##  1st Qu.: -6      
##  Median : -2      
##  Mean   :  0      
##  3rd Qu.:  3      
##  Max.   : 49
# Prvých 10 riadkov
head(nhl, 10)
## # A tibble: 10 × 13
##     rank team        year games  wins losses  ties shootout_wins shootout_losses
##    <dbl> <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl>         <dbl>           <dbl>
##  1     1 Colorado …  2022    20    16      4     0             5               1
##  2     2 Tampa Bay…  2022    23    14      9     0             1               2
##  3     3 New York …  2022    20    10     10     0             1               2
##  4     4 Edmonton …  2022    16     8      8     0             1               2
##  5     5 Carolina …  2022    14     7      7     0             1               0
##  6     6 St. Louis…  2022    12     6      6     0             1               1
##  7     7 Calgary F…  2022    12     5      7     0             1               1
##  8     8 Florida P…  2022    10     4      6     0             2               0
##  9     9 Boston Br…  2022     7     3      4     0             0               0
## 10    10 Los Angel…  2022     7     3      4     0             1               0
## # ℹ 4 more variables: win_loss_percentage <dbl>, goals_scored <dbl>,
## #   goals_against <dbl>, goal_differential <dbl>

Predspracovanie

# Vyberieme len premenné relevantné pre model
nhl_model <- nhl %>%
  select(win_loss_percentage, goals_scored, goals_against, games, team, year) %>%
  mutate(across(c(win_loss_percentage, goals_scored, goals_against, games), as.numeric))

# Kontrola NA
nhl_model %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 6
##   win_loss_percentage goals_scored goals_against games  team  year
##                 <int>        <int>         <int> <int> <int> <int>
## 1                   0            0             0     0     0     0
# Ak by boli NA, jednoduchá imputácia priemerom (tu len ako postup)
if(any(is.na(nhl_model))) {
  nhl_model <- nhl_model %>%
    mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
}

Deskriptívna štatistika a korelácie

# Základné štatistiky
nhl_model %>% select(win_loss_percentage, goals_scored, goals_against, games) %>% summary()
##  win_loss_percentage  goals_scored   goals_against       games       
##  Min.   :0.0000      Min.   : 0.00   Min.   : 0.00   Min.   : 2.000  
##  1st Qu.:0.3330      1st Qu.:11.00   1st Qu.:16.00   1st Qu.: 5.000  
##  Median :0.4290      Median :20.00   Median :22.00   Median : 7.000  
##  Mean   :0.4112      Mean   :26.63   Mean   :26.63   Mean   : 9.364  
##  3rd Qu.:0.5450      3rd Qu.:37.00   3rd Qu.:35.00   3rd Qu.:12.000  
##  Max.   :1.0000      Max.   :98.00   Max.   :91.00   Max.   :27.000
# Korelačná matica
cor_mat <- nhl_model %>% select(win_loss_percentage, goals_scored, goals_against, games) %>% cor()
cor_mat
##                     win_loss_percentage goals_scored goals_against     games
## win_loss_percentage           1.0000000    0.7009174     0.5133309 0.6829670
## goals_scored                  0.7009174    1.0000000     0.9088340 0.9401903
## goals_against                 0.5133309    0.9088340     1.0000000 0.8948069
## games                         0.6829670    0.9401903     0.8948069 1.0000000
# Heatmap korelácií
library(ggplot2)
cor_df <- as.data.frame(as.table(cor_mat))
ggplot(cor_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = round(Freq, 2))) +
  labs(title = "Korelačná matica") +
  theme_minimal()

Odhad regresného modelu

model <- lm(win_loss_percentage ~ goals_scored + goals_against + games, data = nhl_model)
summary(model)
## 
## Call:
## lm(formula = win_loss_percentage ~ goals_scored + goals_against + 
##     games, data = nhl_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30101 -0.06681  0.01477  0.07457  0.65960 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.2767744  0.0095021   29.13  < 2e-16 ***
## goals_scored   0.0100937  0.0006642   15.20  < 2e-16 ***
## goals_against -0.0113519  0.0006818  -16.65  < 2e-16 ***
## games          0.0179319  0.0022057    8.13 1.26e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1325 on 1005 degrees of freedom
## Multiple R-squared:  0.6052, Adjusted R-squared:  0.604 
## F-statistic: 513.5 on 3 and 1005 DF,  p-value: < 2.2e-16
# Harmonizovaný tidy výstup
tidy(model)
## # A tibble: 4 × 5
##   term          estimate std.error statistic   p.value
##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)     0.277   0.00950      29.1  1.00e-135
## 2 goals_scored    0.0101  0.000664     15.2  4.15e- 47
## 3 goals_against  -0.0114  0.000682    -16.7  3.65e- 55
## 4 games           0.0179  0.00221       8.13 1.26e- 15
glance(model)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1     0.605         0.604 0.132      513. 3.12e-202     3   610. -1210. -1185.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Diagnostika rezíduí

# Rezidua a fitted hodnoty
nhl_model <- nhl_model %>%
  mutate(fitted = fitted(model), residuals = residuals(model), std_resid = rstandard(model))

# Graf: rezidua vs fitted
ggplot(nhl_model, aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Rezidua vs Fitted hodnoty", x = "Fitted", y = "Rezidua") +
  theme_minimal()

# Histogram rezíduí + normálna krivka
ggplot(nhl_model, aes(x = residuals)) +
  geom_histogram(aes(y = ..density..), bins = 30) +
  stat_function(fun = dnorm, args = list(mean = mean(nhl_model$residuals), sd = sd(nhl_model$residuals))) +
  labs(title = "Histogram rezíduí s normálnou krivkou") +
  theme_minimal()

# QQ-plot
qqPlot(model, main = "QQ-plot rezíduí")

## [1] 996 998
# Jarque-Bera test (normálnosť)
jarque.test <- tseries::jarque.bera.test(nhl_model$residuals)
jarque.test
## 
##  Jarque Bera Test
## 
## data:  nhl_model$residuals
## X-squared = 162.83, df = 2, p-value < 2.2e-16

Test heteroskedasticity

# Breusch-Pagan test
bptest_res <- bptest(model)
bptest_res
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 72.531, df = 3, p-value = 1.225e-15
# White test (approx pomocou bptest na kvadratické termíny)
# Pridáme kvadratické a interakčné termíny pre aproximáciu White testu
model_white <- lm(residuals(model)^2 ~ fitted(model) + I(fitted(model)^2), data = nhl_model)
summary(model_white)
## 
## Call:
## lm(formula = residuals(model)^2 ~ fitted(model) + I(fitted(model)^2), 
##     data = nhl_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.03452 -0.01482 -0.00935  0.00232  0.41837 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.064579   0.007487   8.625  < 2e-16 ***
## fitted(model)      -0.206015   0.032159  -6.406 2.29e-10 ***
## I(fitted(model)^2)  0.192016   0.030854   6.223 7.13e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03406 on 1006 degrees of freedom
## Multiple R-squared:  0.03925,    Adjusted R-squared:  0.03734 
## F-statistic: 20.55 on 2 and 1006 DF,  p-value: 1.795e-09

Autokorelácia

# Durbin-Watson test
dw_res <- lmtest::dwtest(model)
dw_res
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.4977, p-value = 5.168e-16
## alternative hypothesis: true autocorrelation is greater than 0

Multikolinearita

vifs <- car::vif(model)
vifs
##  goals_scored goals_against         games 
##     10.735066      6.249835      9.372463

Robustné štandardné chyby

coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC1"))
## 
## t test of coefficients:
## 
##                  Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)    0.27677444  0.01235335  22.405 < 2.2e-16 ***
## goals_scored   0.01009367  0.00071421  14.133 < 2.2e-16 ***
## goals_against -0.01135187  0.00085493 -13.278 < 2.2e-16 ***
## games          0.01793191  0.00200022   8.965 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Test špecifikácie modelu (RESET)

# Ramsey RESET test
reset_res <- lmtest::resettest(model)
reset_res
## 
##  RESET test
## 
## data:  model
## RESET = 195.4, df1 = 2, df2 = 1003, p-value < 2.2e-16

Identifikácia vplyvných pozorovaní

# Cook's distance
nhl_model$cooks <- cooks.distance(model)
# Najviac vplyvné pozorovania
top_influential <- nhl_model %>%
  mutate(row = row_number()) %>%
  arrange(desc(cooks)) %>%
  slice(1:10) %>%
  select(row, team, year, cooks, residuals, fitted)

top_influential
## # A tibble: 10 × 6
##      row team                 year  cooks residuals fitted
##    <int> <chr>               <dbl>  <dbl>     <dbl>  <dbl>
##  1  1004 Ottawa Senators      1921 0.0358     0.617  0.383
##  2   996 Montreal Canadiens   1925 0.0294     0.660  0.340
##  3   998 Montreal Canadiens   1924 0.0294     0.660  0.340
##  4   649 New York Islanders   1981 0.0288    -0.201  1.03 
##  5   975 Boston Bruins        1929 0.0266     0.577  0.423
##  6   857 Detroit Red Wings    1952 0.0247     0.394  0.606
##  7   617 New York Islanders   1983 0.0237    -0.233  0.983
##  8   825 Montreal Canadiens   1960 0.0204     0.412  0.588
##  9   431 San Jose Sharks      1995 0.0189     0.237  0.127
## 10   587 Chicago Black Hawks  1985 0.0171     0.156  0.444
# Graf Cook's distance
plot(model, which = 4)

Zhrnutie a interpretácia (krátke poznámky)

# Koeficienty modelu:
summary(model)$coefficients
##                  Estimate   Std. Error    t value      Pr(>|t|)
## (Intercept)    0.27677444 0.0095021309  29.127618 1.003985e-135
## goals_scored   0.01009367 0.0006641658  15.197515  4.154912e-47
## goals_against -0.01135187 0.0006817521 -16.651024  3.654125e-55
## games          0.01793191 0.0022056752   8.129897  1.256273e-15
cat("\nInterpretácia: \n")
## 
## Interpretácia:
cat(" - Koeficient pri goals_scored ukazuje, ako sa mení win_loss_percentage pri náraste počtu nastrieľaných gólov o 1, pri zachovaní ostatných premenných.\n")
##  - Koeficient pri goals_scored ukazuje, ako sa mení win_loss_percentage pri náraste počtu nastrieľaných gólov o 1, pri zachovaní ostatných premenných.
cat(" - Koeficient pri goals_against udáva efekt inkasovaných gólov.\n")
##  - Koeficient pri goals_against udáva efekt inkasovaných gólov.
cat(" - Premenná games zachytáva rozsah alebo počet odohraných zápasov v play-off (ak má zmysel v dátach).\n")
##  - Premenná games zachytáva rozsah alebo počet odohraných zápasov v play-off (ak má zmysel v dátach).
cat("\nDiagnostika: \n")
## 
## Diagnostika:
cat(" - Ak Jarque-Bera test zamieta normalitu, rezidua nie sú normálne rozdelené.\n")
##  - Ak Jarque-Bera test zamieta normalitu, rezidua nie sú normálne rozdelené.
cat(" - Ak Breusch-Pagan test indikuje heteroskedasticitu, použitie robustných SE (HC1) je odporúčané.\n")
##  - Ak Breusch-Pagan test indikuje heteroskedasticitu, použitie robustných SE (HC1) je odporúčané.
cat(" - Vysoké VIF (>5 alebo >10) naznačujú multikolinearitu.\n")
##  - Vysoké VIF (>5 alebo >10) naznačujú multikolinearitu.
cat(" - Ak RESET test je signifikantný, môže ísť o nesprávnu špecifikáciu (chýbajúce nelineárne termíny alebo interakcie).\n")
##  - Ak RESET test je signifikantný, môže ísť o nesprávnu špecifikáciu (chýbajúce nelineárne termíny alebo interakcie).

Pokus s rozšírením modelu (interakcie / kvadratické členy) — ak treba

model2 <- lm(win_loss_percentage ~ goals_scored + goals_against + games + I(goals_scored^2) + I(goals_against^2) + goals_scored:goals_against, data = nhl_model)
summary(model2)
## 
## Call:
## lm(formula = win_loss_percentage ~ goals_scored + goals_against + 
##     games + I(goals_scored^2) + I(goals_against^2) + goals_scored:goals_against, 
##     data = nhl_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25371 -0.06076  0.00143  0.06160  0.61379 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.651e-01  1.222e-02  21.686  < 2e-16 ***
## goals_scored                3.214e-02  1.099e-03  29.256  < 2e-16 ***
## goals_against              -2.733e-02  1.357e-03 -20.142  < 2e-16 ***
## games                       9.425e-03  1.851e-03   5.093 4.21e-07 ***
## I(goals_scored^2)          -3.649e-05  2.613e-05  -1.397    0.163    
## I(goals_against^2)          4.626e-04  4.815e-05   9.607  < 2e-16 ***
## goals_scored:goals_against -4.675e-04  6.775e-05  -6.900 9.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1063 on 1002 degrees of freedom
## Multiple R-squared:  0.7467, Adjusted R-squared:  0.7452 
## F-statistic: 492.3 on 6 and 1002 DF,  p-value: < 2.2e-16
# Porovnanie AIC/BIC
AIC(model, model2)
##        df       AIC
## model   5 -1209.875
## model2  8 -1651.846
BIC(model, model2)
##        df       BIC
## model   5 -1185.292
## model2  8 -1612.512

Uloženie výsledkov (voliteľné)

# Uložiť súhrn modelu ako CSV
write_csv(broom::tidy(model), "model_coefficients.csv")
write_csv(broom::glance(model), "model_glance.csv")