# 📌 **1. Načítanie dát**


# Read the playoff results from a CSV file in the working directory; text
# columns are kept as character vectors rather than converted to factors.
data_path <- "nhlplayoffs.csv"
df <- read.csv(file = data_path, stringsAsFactors = FALSE)

# Compact structural overview: one line per column with its type and the
# first few values.
df %>% glimpse()
## Rows: 1,009
## Columns: 13
## $ rank                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ team                <chr> "Colorado Avalanche", "Tampa Bay Lightning", "New …
## $ year                <int> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 20…
## $ games               <int> 20, 23, 20, 16, 14, 12, 12, 10, 7, 7, 7, 7, 7, 6, …
## $ wins                <int> 16, 14, 10, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 2, 2, 0,…
## $ losses              <int> 4, 9, 10, 8, 7, 6, 7, 6, 4, 4, 4, 4, 4, 4, 4, 4, 7…
## $ ties                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shootout_wins       <int> 5, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0,…
## $ shootout_losses     <int> 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 4,…
## $ win_loss_percentage <dbl> 0.800, 0.609, 0.500, 0.500, 0.500, 0.500, 0.417, 0…
## $ goals_scored        <int> 85, 67, 62, 65, 37, 40, 35, 23, 20, 17, 24, 14, 29…
## $ goals_against       <int> 55, 61, 58, 59, 40, 38, 39, 32, 24, 27, 23, 15, 28…
## $ goal_differential   <int> 30, 6, 4, 6, -3, 2, -4, -9, -4, -10, 1, -1, 1, -6,…
# Render the first ten rows as a captioned table.
df %>%
  head(10) %>%
  kable(caption = "Ukážka prvých 10 riadkov datasetu nhlplayoffs.csv")
Ukážka prvých 10 riadkov datasetu nhlplayoffs.csv
rank team year games wins losses ties shootout_wins shootout_losses win_loss_percentage goals_scored goals_against goal_differential
1 Colorado Avalanche 2022 20 16 4 0 5 1 0.800 85 55 30
2 Tampa Bay Lightning 2022 23 14 9 0 1 2 0.609 67 61 6
3 New York Rangers 2022 20 10 10 0 1 2 0.500 62 58 4
4 Edmonton Oilers 2022 16 8 8 0 1 2 0.500 65 59 6
5 Carolina Hurricanes 2022 14 7 7 0 1 0 0.500 37 40 -3
6 St. Louis Blues 2022 12 6 6 0 1 1 0.500 40 38 2
7 Calgary Flames 2022 12 5 7 0 1 1 0.417 35 39 -4
8 Florida Panthers 2022 10 4 6 0 2 0 0.400 23 32 -9
9 Boston Bruins 2022 7 3 4 0 0 0 0.429 20 24 -4
10 Los Angeles Kings 2022 7 3 4 0 1 0 0.429 17 27 -10
# 📌 **1.1 Kontrola chýbajúcich hodnôt a typov**

# Per-column summary (quartiles and mean for numeric columns, length and
# class for character columns).
df %>% summary()
##       rank            team                year          games       
##  Min.   : 1.000   Length:1009        Min.   :1918   Min.   : 2.000  
##  1st Qu.: 3.000   Class :character   1st Qu.:1972   1st Qu.: 5.000  
##  Median : 6.000   Mode  :character   Median :1990   Median : 7.000  
##  Mean   : 7.067                      Mean   :1986   Mean   : 9.364  
##  3rd Qu.:11.000                      3rd Qu.:2007   3rd Qu.:12.000  
##  Max.   :24.000                      Max.   :2022   Max.   :27.000  
##       wins            losses            ties         shootout_wins    
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.: 1.000   1st Qu.: 4.000   1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median : 3.000   Median : 4.000   Median :0.00000   Median : 1.0000  
##  Mean   : 4.657   Mean   : 4.657   Mean   :0.04955   Mean   : 0.9326  
##  3rd Qu.: 7.000   3rd Qu.: 6.000   3rd Qu.:0.00000   3rd Qu.: 1.0000  
##  Max.   :18.000   Max.   :12.000   Max.   :4.00000   Max.   :10.0000  
##  shootout_losses  win_loss_percentage  goals_scored   goals_against  
##  Min.   :0.0000   Min.   :0.0000      Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:0.3330      1st Qu.:11.00   1st Qu.:16.00  
##  Median :1.0000   Median :0.4290      Median :20.00   Median :22.00  
##  Mean   :0.9326   Mean   :0.4112      Mean   :26.63   Mean   :26.63  
##  3rd Qu.:1.0000   3rd Qu.:0.5450      3rd Qu.:37.00   3rd Qu.:35.00  
##  Max.   :4.0000   Max.   :1.0000      Max.   :98.00   Max.   :91.00  
##  goal_differential
##  Min.   :-27      
##  1st Qu.: -6      
##  Median : -2      
##  Mean   :  0      
##  3rd Qu.:  3      
##  Max.   : 49
# Count missing values per column. vapply() pins the result to exactly one
# integer per column, so the return type cannot silently change with the
# input the way sapply()'s can (e.g. for a data frame with no columns).
vapply(df, function(x) sum(is.na(x)), integer(1))
##                rank                team                year               games 
##                   0                   0                   0                   0 
##                wins              losses                ties       shootout_wins 
##                   0                   0                   0                   0 
##     shootout_losses win_loss_percentage        goals_scored       goals_against 
##                   0                   0                   0                   0 
##   goal_differential 
##                   0

# 📌 **2. Deskriptívna štatistika**

# Keep only the numeric columns; they feed the descriptive statistics here
# and the correlation matrix and histograms further down.
# select(where(...)) replaces the superseded select_if().
numeric_cols <- df %>% select(where(is.numeric))

# Per-variable descriptive statistics, rendered as a captioned table.
kable(psych::describe(numeric_cols), caption = "Deskriptívne štatistiky")
Deskriptívne štatistiky
vars n mean sd median trimmed mad min max range skew kurtosis se
rank 1 1009 7.0673935 4.7529666 6.000 6.7404203 5.9304000 1 24 23 0.5399687 -0.7164820 0.1496302
year 2 1009 1985.7869177 26.5147680 1990.000 1988.3053152 25.2042000 1918 2022 104 -0.7321970 -0.3569555 0.8347227
games 3 1009 9.3637265 5.7909611 7.000 8.6699629 4.4478000 2 27 25 0.9728487 0.1216465 0.1823077
wins 4 1009 4.6570862 4.2964723 3.000 4.0506799 2.9652000 0 18 18 1.0837430 0.3816458 0.1352591
losses 5 1009 4.6570862 2.0367048 4.000 4.5463535 1.4826000 0 12 12 0.6721954 0.7580494 0.0641184
ties 6 1009 0.0495540 0.2737205 0.000 0.0000000 0.0000000 0 4 4 7.3770616 70.9099879 0.0086171
shootout_wins 7 1009 0.9326065 1.2627985 1.000 0.6922126 1.4826000 0 10 10 1.8895501 5.1467496 0.0397547
shootout_losses 8 1009 0.9326065 0.9694834 1.000 0.8034611 1.4826000 0 4 4 0.9504485 0.4438761 0.0305207
win_loss_percentage 9 1009 0.4111764 0.2104940 0.429 0.4182818 0.1616034 0 1 1 -0.2187167 -0.0225603 0.0066267
goals_scored 10 1009 26.6313181 20.5821730 20.000 23.8813350 17.7912000 0 98 98 1.1227258 0.6950234 0.6479562
goals_against 11 1009 26.6313181 15.2993558 22.000 25.1075402 11.8608000 0 91 91 0.9331024 0.5009427 0.4816456
goal_differential 12 1009 0.0000000 9.2370673 -2.000 -1.1742892 5.9304000 -27 49 76 1.4556594 3.0574734 0.2907961
# 📌 **3. Korelačná matica**

# Pairwise Pearson correlations between the numeric columns, rounded to two
# decimals and rendered as a captioned table.
corr <- numeric_cols %>%
  cor() %>%
  round(2)
kable(corr, caption = "Korelačná matica (Pearson)")
Korelačná matica (Pearson)
rank year games wins losses ties shootout_wins shootout_losses win_loss_percentage goals_scored goals_against goal_differential
rank 1.00 0.46 -0.59 -0.65 -0.28 -0.16 -0.42 -0.10 -0.70 -0.57 -0.44 -0.55
year 0.46 1.00 0.32 0.23 0.48 -0.36 0.21 0.27 -0.01 0.27 0.36 0.00
games -0.59 0.32 1.00 0.97 0.82 -0.13 0.67 0.37 0.68 0.94 0.89 0.61
wins -0.65 0.23 0.97 1.00 0.65 -0.10 0.67 0.30 0.78 0.94 0.82 0.75
losses -0.28 0.48 0.82 0.65 1.00 -0.29 0.51 0.43 0.28 0.70 0.85 0.16
ties -0.16 -0.36 -0.13 -0.10 -0.29 1.00 -0.06 -0.13 0.07 -0.13 -0.20 0.05
shootout_wins -0.42 0.21 0.67 0.67 0.51 -0.06 1.00 0.24 0.50 0.58 0.57 0.34
shootout_losses -0.10 0.27 0.37 0.30 0.43 -0.13 0.24 1.00 0.15 0.31 0.28 0.24
win_loss_percentage -0.70 -0.01 0.68 0.78 0.28 0.07 0.50 0.15 1.00 0.70 0.51 0.71
goals_scored -0.57 0.27 0.94 0.94 0.70 -0.13 0.58 0.31 0.70 1.00 0.91 0.72
goals_against -0.44 0.36 0.89 0.82 0.85 -0.20 0.57 0.28 0.51 0.91 1.00 0.37
goal_differential -0.55 0.00 0.61 0.75 0.16 0.05 0.34 0.24 0.71 0.72 0.37 1.00
# 📌 **4. Grafická analýza**

### Histogramy

# Histograms of every numeric column, one facet per variable with free axes.
# pivot_longer() replaces the superseded gather() for the wide-to-long
# reshape; the long column names ("variable", "value") are unchanged.
numeric_cols %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 20, fill = "grey70") +
  facet_wrap(~variable, scales = "free") +
  theme_minimal()

### Scatter ploty pre výkon vs góly
# Scatter of goals scored (x) against win percentage (y), overlaid with a
# fitted straight line from geom_smooth(method = "lm").
ggplot(data = df, mapping = aes(x = goals_scored, y = win_loss_percentage)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

# OLS model: win percentage explained by goals scored, goals against,
# shootout wins and games played.
model <- lm(
  win_loss_percentage ~ goals_scored + goals_against + shootout_wins + games,
  data = df
)

# Coefficient tests with heteroskedasticity-robust (HC3) standard errors.
# coeftest()'s formal argument is `vcov.` (with a trailing dot); writing
# `vcov =` only works through partial argument matching, so the exact name
# is used here.
reg_results <- lmtest::coeftest(
  model,
  vcov. = sandwich::vcovHC(model, type = "HC3")
)

reg_results
## 
## t test of coefficients:
## 
##                  Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept)    0.28864600  0.01258781  22.9306 < 2.2e-16 ***
## goals_scored   0.01068951  0.00073867  14.4712 < 2.2e-16 ***
## goals_against -0.01135014  0.00083993 -13.5132 < 2.2e-16 ***
## shootout_wins  0.02070860  0.00389600   5.3153 1.312e-07 ***
## games          0.01290199  0.00224039   5.7588 1.125e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Robust (HC3) coefficient tests for `model`. The model was already fitted
# above with this same formula and data, so it is reused rather than
# refitted. The calls are namespace-qualified because lmtest and sandwich
# are not attached by any library() call before this point, and `vcov.` is
# coeftest()'s exact argument name (avoids partial argument matching).
lmtest::coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC3"))
## 
## t test of coefficients:
## 
##                  Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept)    0.28864600  0.01258781  22.9306 < 2.2e-16 ***
## goals_scored   0.01068951  0.00073867  14.4712 < 2.2e-16 ***
## goals_against -0.01135014  0.00083993 -13.5132 < 2.2e-16 ***
## shootout_wins  0.02070860  0.00389600   5.3153 1.312e-07 ***
## games          0.01290199  0.00224039   5.7588 1.125e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 📌 **6. Diagnostika modelu**

### 6.1 Shapiro-Wilk (normalita rezíduí)

# Shapiro-Wilk normality test applied to the residuals of `model`.
shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.95308, p-value < 2.2e-16
### 6.2 Durbin–Watson (autokorelácia)

# Durbin-Watson test for autocorrelation in the residuals of `model`
# (alternative as reported in the output: autocorrelation greater than 0).
# NOTE(review): the statistic compares residuals of neighbouring rows, so it
# depends on the row order of df — confirm that order is meaningful here.
lmtest::dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.4729, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
### 6.3 Breusch–Pagan (heteroskedasticita)

# Studentized Breusch-Pagan test for heteroskedasticity in `model`.
lmtest::bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 71.251, df = 4, p-value = 1.236e-14
### 6.4 VIF – multikolinearita
# Variance inflation factor for each predictor in `model`
# (multicollinearity check).
car::vif(model)
##  goals_scored goals_against shootout_wins         games 
##     11.155745      6.249837      1.912803     11.745606
### 6.5 Reziduálne grafy

# Draw the lm diagnostic plots in a 2x2 grid. par() returns the previous
# settings, which are restored afterwards instead of assuming that the
# layout in effect before this block was the 1x1 default.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
# 📌 **7. Alternatívny model (s goal_differential)**

# Alternative specification: goals_scored and goals_against are replaced by
# the single goal_differential column; shootout wins and games are kept.
model2 <- lm(
  win_loss_percentage ~ goal_differential + shootout_wins + games,
  data = df
)

# Robust (HC3) coefficient tests. Namespace-qualified because lmtest and
# sandwich are not attached by any library() call before this point, and
# `vcov.` is coeftest()'s exact argument name (avoids partial matching).
lmtest::coeftest(model2, vcov. = sandwich::vcovHC(model2, type = "HC3"))
## 
## t test of coefficients:
## 
##                     Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)       0.28899873 0.01265249 22.8413 < 2.2e-16 ***
## goal_differential 0.01101578 0.00071873 15.3267 < 2.2e-16 ***
## shootout_wins     0.02147247 0.00384484  5.5847 3.012e-08 ***
## games             0.01090937 0.00126276  8.6393 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 📌 **8. Dvojvzorkový t-test**

# Compare teams with many vs. few shootout wins: more than one shootout win
# is labelled "High SO", everything else "Low SO".
df[["so_group"]] <- ifelse(
  test = df[["shootout_wins"]] > 1,
  yes = "High SO",
  no = "Low SO"
)

# Welch two-sample t-test (unequal variances) of win percentage by group.
t.test(win_loss_percentage ~ so_group, data = df, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  win_loss_percentage by so_group
## t = 19.037, df = 700.86, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group High SO and group Low SO is not equal to 0
## 95 percent confidence interval:
##  0.1850012 0.2275501
## sample estimates:
## mean in group High SO  mean in group Low SO 
##             0.5683875             0.3621118
# 📌 **9. Export výsledkov**

# Export the coefficient table with the HC3-robust standard errors that the
# report presents (`reg_results`), rather than the classical OLS standard
# errors that tidy(model) would write out.
results <- tidy(reg_results)
write.csv(results, "regression_table.csv", row.names = FALSE)

# Save the first 50 rows of df (including the so_group column added for the
# t-test) as a small sample file.
write.csv(head(df, 50), "nhlplayoffs_cleaned_sample.csv", row.names = FALSE)
# 📌 **Časový rad – Points (výkonnosť tímov)**


library(tidyverse)
library(lmtest)
library(sandwich)


# zoradenie podľa roku
df_ts <- df %>% arrange(year)

# playoff body (wins = 1 point, ties ignored)
# Alebo si zvoľ inú metriku – napr. win_loss_percentage
points_ts <- ts(df_ts$win_loss_percentage,
                start = min(df_ts$year),
                end = max(df_ts$year),
                frequency = 1)

2 📌 Jednoduchá lineárna regresia – trend v čase

\[ \text{win\_loss\_percentage}_t = \beta_0 + \beta_1 \cdot \text{year}_t + \epsilon_t \]

# Linear time trend: win percentage regressed on year, pooled over all team
# rows of df_ts (several rows share the same year).
model_simple <- lm(
  formula = win_loss_percentage ~ year,
  data = df_ts
)
summary(model_simple)
## 
## Call:
## lm(formula = win_loss_percentage ~ year, data = df_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41604 -0.08064  0.01867  0.13369  0.58689 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.5603088  0.4968085   1.128    0.260
## year        -0.0000751  0.0002502  -0.300    0.764
## 
## Residual standard error: 0.2106 on 1007 degrees of freedom
## Multiple R-squared:  8.949e-05,  Adjusted R-squared:  -0.0009035 
## F-statistic: 0.09012 on 1 and 1007 DF,  p-value: 0.7641

3 📌 Diagnostika rezíduí — autokorelácia

3.1 🔹 1. Ljung–Box test

# Ljung-Box test for autocorrelation in the residuals of the trend model,
# taken jointly over 10 lags.
# NOTE(review): the residuals follow the row order of df_ts, which has
# several team rows per year, so a "lag" here is a row offset rather than a
# year — confirm this is the intended notion of serial correlation.
Box.test(model_simple$residuals,
         lag = 10,
         type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  model_simple$residuals
## X-squared = 322.57, df = 10, p-value < 2.2e-16

3.2 🔹 2. Breusch–Godfrey test (viacnásobná autokorelácia)

# Breusch-Godfrey test for serial correlation up to order 5 in the trend
# model. The call is namespace-qualified, so it does not depend on lmtest
# being attached and no repeated library(lmtest) call is needed here.
lmtest::bgtest(model_simple, order = 5)
## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  model_simple
## LM test = 142.92, df = 5, p-value < 2.2e-16

4 📌 Záver

V tomto RMarkdown dokumente sú zahrnuté všetky kroky (načítanie dát, čistenie, deskriptíva, korelácie, grafy, OLS, diagnostika, alternatívne modely a testy).