# 📌 **1. Načítanie dát**
# Location of the raw playoff table (relative to the working directory).
data_path <- "nhlplayoffs.csv"

# Read the CSV; text columns stay character instead of becoming factors.
df <- read.csv(file = data_path, stringsAsFactors = FALSE)

# Compact overview: one line per column with its type and leading values.
df %>% glimpse()
## Rows: 1,009
## Columns: 13
## $ rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ team <chr> "Colorado Avalanche", "Tampa Bay Lightning", "New …
## $ year <int> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 20…
## $ games <int> 20, 23, 20, 16, 14, 12, 12, 10, 7, 7, 7, 7, 7, 6, …
## $ wins <int> 16, 14, 10, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 2, 2, 0,…
## $ losses <int> 4, 9, 10, 8, 7, 6, 7, 6, 4, 4, 4, 4, 4, 4, 4, 4, 7…
## $ ties <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shootout_wins <int> 5, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0,…
## $ shootout_losses <int> 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 4,…
## $ win_loss_percentage <dbl> 0.800, 0.609, 0.500, 0.500, 0.500, 0.500, 0.417, 0…
## $ goals_scored <int> 85, 67, 62, 65, 37, 40, 35, 23, 20, 17, 24, 14, 29…
## $ goals_against <int> 55, 61, 58, 59, 40, 38, 39, 32, 24, 27, 23, 15, 28…
## $ goal_differential <int> 30, 6, 4, 6, -3, 2, -4, -9, -4, -10, 1, -1, 1, -6,…
# Render the first ten rows of the data as a captioned table.
df %>%
  head(n = 10) %>%
  kable(caption = "Ukážka prvých 10 riadkov datasetu nhlplayoffs.csv")
Ukážka prvých 10 riadkov datasetu nhlplayoffs.csv
| 1 |
Colorado Avalanche |
2022 |
20 |
16 |
4 |
0 |
5 |
1 |
0.800 |
85 |
55 |
30 |
| 2 |
Tampa Bay Lightning |
2022 |
23 |
14 |
9 |
0 |
1 |
2 |
0.609 |
67 |
61 |
6 |
| 3 |
New York Rangers |
2022 |
20 |
10 |
10 |
0 |
1 |
2 |
0.500 |
62 |
58 |
4 |
| 4 |
Edmonton Oilers |
2022 |
16 |
8 |
8 |
0 |
1 |
2 |
0.500 |
65 |
59 |
6 |
| 5 |
Carolina Hurricanes |
2022 |
14 |
7 |
7 |
0 |
1 |
0 |
0.500 |
37 |
40 |
-3 |
| 6 |
St. Louis Blues |
2022 |
12 |
6 |
6 |
0 |
1 |
1 |
0.500 |
40 |
38 |
2 |
| 7 |
Calgary Flames |
2022 |
12 |
5 |
7 |
0 |
1 |
1 |
0.417 |
35 |
39 |
-4 |
| 8 |
Florida Panthers |
2022 |
10 |
4 |
6 |
0 |
2 |
0 |
0.400 |
23 |
32 |
-9 |
| 9 |
Boston Bruins |
2022 |
7 |
3 |
4 |
0 |
0 |
0 |
0.429 |
20 |
24 |
-4 |
| 10 |
Los Angeles Kings |
2022 |
7 |
3 |
4 |
0 |
1 |
0 |
0.429 |
17 |
27 |
-10 |
# 📌 **1.1 Kontrola chýbajúcich hodnôt a typov**
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns.
summary(df)
## rank team year games
## Min. : 1.000 Length:1009 Min. :1918 Min. : 2.000
## 1st Qu.: 3.000 Class :character 1st Qu.:1972 1st Qu.: 5.000
## Median : 6.000 Mode :character Median :1990 Median : 7.000
## Mean : 7.067 Mean :1986 Mean : 9.364
## 3rd Qu.:11.000 3rd Qu.:2007 3rd Qu.:12.000
## Max. :24.000 Max. :2022 Max. :27.000
## wins losses ties shootout_wins
## Min. : 0.000 Min. : 0.000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: 4.000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 3.000 Median : 4.000 Median :0.00000 Median : 1.0000
## Mean : 4.657 Mean : 4.657 Mean :0.04955 Mean : 0.9326
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:0.00000 3rd Qu.: 1.0000
## Max. :18.000 Max. :12.000 Max. :4.00000 Max. :10.0000
## shootout_losses win_loss_percentage goals_scored goals_against
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.3330 1st Qu.:11.00 1st Qu.:16.00
## Median :1.0000 Median :0.4290 Median :20.00 Median :22.00
## Mean :0.9326 Mean :0.4112 Mean :26.63 Mean :26.63
## 3rd Qu.:1.0000 3rd Qu.:0.5450 3rd Qu.:37.00 3rd Qu.:35.00
## Max. :4.0000 Max. :1.0000 Max. :98.00 Max. :91.00
## goal_differential
## Min. :-27
## 1st Qu.: -6
## Median : -2
## Mean : 0
## 3rd Qu.: 3
## Max. : 49
# Count missing values per column. vapply() pins the result to one integer per
# column, so the return type cannot silently change the way sapply()'s can
# (e.g. for a data frame with zero columns).
vapply(df, function(x) sum(is.na(x)), integer(1))
## rank team year games
## 0 0 0 0
## wins losses ties shootout_wins
## 0 0 0 0
## shootout_losses win_loss_percentage goals_scored goals_against
## 0 0 0 0
## goal_differential
## 0
📌 2. Deskriptívna štatistika
# Keep only the numeric columns; select(where(...)) replaces the superseded
# select_if() and returns the same columns in the same order.
numeric_cols <- df %>% select(where(is.numeric))

# Descriptive statistics per numeric column, rendered as a captioned table.
kable(psych::describe(numeric_cols), caption = "Deskriptívne štatistiky")
Deskriptívne štatistiky
| rank |
1 |
1009 |
7.0673935 |
4.7529666 |
6.000 |
6.7404203 |
5.9304000 |
1 |
24 |
23 |
0.5399687 |
-0.7164820 |
0.1496302 |
| year |
2 |
1009 |
1985.7869177 |
26.5147680 |
1990.000 |
1988.3053152 |
25.2042000 |
1918 |
2022 |
104 |
-0.7321970 |
-0.3569555 |
0.8347227 |
| games |
3 |
1009 |
9.3637265 |
5.7909611 |
7.000 |
8.6699629 |
4.4478000 |
2 |
27 |
25 |
0.9728487 |
0.1216465 |
0.1823077 |
| wins |
4 |
1009 |
4.6570862 |
4.2964723 |
3.000 |
4.0506799 |
2.9652000 |
0 |
18 |
18 |
1.0837430 |
0.3816458 |
0.1352591 |
| losses |
5 |
1009 |
4.6570862 |
2.0367048 |
4.000 |
4.5463535 |
1.4826000 |
0 |
12 |
12 |
0.6721954 |
0.7580494 |
0.0641184 |
| ties |
6 |
1009 |
0.0495540 |
0.2737205 |
0.000 |
0.0000000 |
0.0000000 |
0 |
4 |
4 |
7.3770616 |
70.9099879 |
0.0086171 |
| shootout_wins |
7 |
1009 |
0.9326065 |
1.2627985 |
1.000 |
0.6922126 |
1.4826000 |
0 |
10 |
10 |
1.8895501 |
5.1467496 |
0.0397547 |
| shootout_losses |
8 |
1009 |
0.9326065 |
0.9694834 |
1.000 |
0.8034611 |
1.4826000 |
0 |
4 |
4 |
0.9504485 |
0.4438761 |
0.0305207 |
| win_loss_percentage |
9 |
1009 |
0.4111764 |
0.2104940 |
0.429 |
0.4182818 |
0.1616034 |
0 |
1 |
1 |
-0.2187167 |
-0.0225603 |
0.0066267 |
| goals_scored |
10 |
1009 |
26.6313181 |
20.5821730 |
20.000 |
23.8813350 |
17.7912000 |
0 |
98 |
98 |
1.1227258 |
0.6950234 |
0.6479562 |
| goals_against |
11 |
1009 |
26.6313181 |
15.2993558 |
22.000 |
25.1075402 |
11.8608000 |
0 |
91 |
91 |
0.9331024 |
0.5009427 |
0.4816456 |
| goal_differential |
12 |
1009 |
0.0000000 |
9.2370673 |
-2.000 |
-1.1742892 |
5.9304000 |
-27 |
49 |
76 |
1.4556594 |
3.0574734 |
0.2907961 |
# 📌 **3. Korelačná matica**
# Pairwise Pearson correlations between all numeric columns, rounded to two
# decimals for display.
corr <- numeric_cols %>%
  cor(method = "pearson") %>%
  round(digits = 2)

corr %>% kable(caption = "Korelačná matica (Pearson)")
Korelačná matica (Pearson)
| rank |
1.00 |
0.46 |
-0.59 |
-0.65 |
-0.28 |
-0.16 |
-0.42 |
-0.10 |
-0.70 |
-0.57 |
-0.44 |
-0.55 |
| year |
0.46 |
1.00 |
0.32 |
0.23 |
0.48 |
-0.36 |
0.21 |
0.27 |
-0.01 |
0.27 |
0.36 |
0.00 |
| games |
-0.59 |
0.32 |
1.00 |
0.97 |
0.82 |
-0.13 |
0.67 |
0.37 |
0.68 |
0.94 |
0.89 |
0.61 |
| wins |
-0.65 |
0.23 |
0.97 |
1.00 |
0.65 |
-0.10 |
0.67 |
0.30 |
0.78 |
0.94 |
0.82 |
0.75 |
| losses |
-0.28 |
0.48 |
0.82 |
0.65 |
1.00 |
-0.29 |
0.51 |
0.43 |
0.28 |
0.70 |
0.85 |
0.16 |
| ties |
-0.16 |
-0.36 |
-0.13 |
-0.10 |
-0.29 |
1.00 |
-0.06 |
-0.13 |
0.07 |
-0.13 |
-0.20 |
0.05 |
| shootout_wins |
-0.42 |
0.21 |
0.67 |
0.67 |
0.51 |
-0.06 |
1.00 |
0.24 |
0.50 |
0.58 |
0.57 |
0.34 |
| shootout_losses |
-0.10 |
0.27 |
0.37 |
0.30 |
0.43 |
-0.13 |
0.24 |
1.00 |
0.15 |
0.31 |
0.28 |
0.24 |
| win_loss_percentage |
-0.70 |
-0.01 |
0.68 |
0.78 |
0.28 |
0.07 |
0.50 |
0.15 |
1.00 |
0.70 |
0.51 |
0.71 |
| goals_scored |
-0.57 |
0.27 |
0.94 |
0.94 |
0.70 |
-0.13 |
0.58 |
0.31 |
0.70 |
1.00 |
0.91 |
0.72 |
| goals_against |
-0.44 |
0.36 |
0.89 |
0.82 |
0.85 |
-0.20 |
0.57 |
0.28 |
0.51 |
0.91 |
1.00 |
0.37 |
| goal_differential |
-0.55 |
0.00 |
0.61 |
0.75 |
0.16 |
0.05 |
0.34 |
0.24 |
0.71 |
0.72 |
0.37 |
1.00 |
# 📌 **4. Grafická analýza**
### Histogramy
# Reshape to long format (one row per variable/value pair) and draw one
# histogram per variable. pivot_longer() replaces the superseded gather();
# the set of (variable, value) pairs, and therefore every panel, is unchanged.
numeric_cols %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 20, fill = "grey70") +
  # Free scales: the variables live on very different ranges.
  facet_wrap(~variable, scales = "free") +
  theme_minimal()

### Scatter ploty pre výkon vs góly
# Scatter of win/loss percentage against goals scored, with a fitted
# straight line (linear-model smoother) on top of the points.
goals_vs_winpct <- aes(x = goals_scored, y = win_loss_percentage)

ggplot(data = df, mapping = goals_vs_winpct) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

# 📌 **5. OLS model**
# OLS: win/loss percentage explained by goals scored, goals against,
# shootout wins and games played.
model <- lm(
  win_loss_percentage ~ goals_scored + goals_against + shootout_wins + games,
  data = df
)

# Coefficient tests with heteroskedasticity-robust (HC3) standard errors.
# The argument is spelled `vcov.` (its real name in coeftest()) instead of
# relying on partial matching of `vcov`.
reg_results <- lmtest::coeftest(
  model,
  vcov. = sandwich::vcovHC(model, type = "HC3")
)
reg_results
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28864600 0.01258781 22.9306 < 2.2e-16 ***
## goals_scored 0.01068951 0.00073867 14.4712 < 2.2e-16 ***
## goals_against -0.01135014 0.00083993 -13.5132 < 2.2e-16 ***
## shootout_wins 0.02070860 0.00389600 5.3153 1.312e-07 ***
## games 0.01290199 0.00224039 5.7588 1.125e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same specification as the OLS model fitted earlier; refitted here so this
# chunk can run on its own.
model <- lm(
  win_loss_percentage ~ goals_scored + goals_against + shootout_wins + games,
  data = df
)

# HC3-robust coefficient tests. Calls are namespace-qualified so they do not
# depend on lmtest/sandwich being attached at this point, and `vcov.` is
# spelled out rather than partially matched.
lmtest::coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28864600 0.01258781 22.9306 < 2.2e-16 ***
## goals_scored 0.01068951 0.00073867 14.4712 < 2.2e-16 ***
## goals_against -0.01135014 0.00083993 -13.5132 < 2.2e-16 ***
## shootout_wins 0.02070860 0.00389600 5.3153 1.312e-07 ***
## games 0.01290199 0.00224039 5.7588 1.125e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 📌 **6. Diagnostika modelu**
### 6.1 Shapiro-Wilk (normalita reziduí)
# Shapiro-Wilk test of normality applied to the residuals of `model`.
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.95308, p-value < 2.2e-16
### 6.2 Durbin–Watson (autokorelácia)
# Durbin-Watson test for first-order autocorrelation in the residuals.
# NOTE(review): the statistic depends on row order, and `model` is fitted on
# `df` in file order — confirm that order is meaningful before interpreting.
lmtest::dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.4729, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
### 6.3 Breusch–Pagan (heteroskedasticita)
# Studentized Breusch-Pagan test for heteroskedasticity of the residuals.
lmtest::bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 71.251, df = 4, p-value = 1.236e-14
### 6.4 VIF – multikolinearita
# Variance inflation factor for each predictor of `model`
# (multicollinearity check).
car::vif(model)
## goals_scored goals_against shootout_wins games
## 11.155745 6.249837 1.912803 11.745606
### 6.5 Reziduálne grafy
# Draw the standard lm diagnostic plots in a 2 x 2 grid. par() returns the
# previous settings when it changes them, so the saved value restores whatever
# layout was active before, rather than forcing a hard-coded 1 x 1 layout.
old_par <- par(mfrow = c(2, 2))
plot(model)

par(old_par)
# 📌 **7. Alternatívny model (s goal_differential)**
# Alternative specification: goals scored and goals against are replaced by
# their difference (goal_differential).
model2 <- lm(
  win_loss_percentage ~ goal_differential + shootout_wins + games,
  data = df
)

# HC3-robust coefficient tests; namespace-qualified and with the `vcov.`
# argument spelled out, matching the first robust-table call in this report.
lmtest::coeftest(model2, vcov. = sandwich::vcovHC(model2, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28899873 0.01265249 22.8413 < 2.2e-16 ***
## goal_differential 0.01101578 0.00071873 15.3267 < 2.2e-16 ***
## shootout_wins 0.02147247 0.00384484 5.5847 3.012e-08 ***
## games 0.01090937 0.00126276 8.6393 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 📌 **8. Dvojvýberový t-test**
# Compare teams with many vs. few shootout wins.
# Teams with more than one shootout win form the "High SO" group; all others
# are "Low SO".
df[["so_group"]] <- ifelse(
  test = df[["shootout_wins"]] > 1,
  yes = "High SO",
  no = "Low SO"
)

# Welch two-sample t-test (unequal variances) of win/loss percentage by group.
t.test(
  win_loss_percentage ~ so_group,
  data = df,
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: win_loss_percentage by so_group
## t = 19.037, df = 700.86, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group High SO and group Low SO is not equal to 0
## 95 percent confidence interval:
## 0.1850012 0.2275501
## sample estimates:
## mean in group High SO mean in group Low SO
## 0.5683875 0.3621118
# 📌 **9. Export výsledkov**
# Export the coefficient table with the same HC3-robust standard errors that
# the report presents. tidy(model) would write the classical OLS standard
# errors, which disagree with every table shown above. The columns are the
# same as before: term, estimate, std.error, statistic, p.value.
results <- tidy(
  lmtest::coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC3"))
)
write.csv(results, "regression_table.csv", row.names = FALSE)

# Save the first 50 rows of the working data frame as a sample file.
write.csv(head(df, 50), "nhlplayoffs_cleaned_sample.csv", row.names = FALSE)
# 📌 **Časový rad – win_loss_percentage (výkonnosť tímov)**
library(tidyverse)
library(lmtest)
library(sandwich)
# Sort the team-season rows chronologically.
df_ts <- df %>% arrange(year)

# Build an annual series of win_loss_percentage: one value per year, the mean
# across that year's teams. Passing the raw team-season rows to ts() together
# with start/end would keep only the first (end - start + 1) rows and label
# them as consecutive years, which misaligns values and years.
season_years <- seq(min(df_ts$year), max(df_ts$year))
season_means <- tapply(df_ts$win_loss_percentage, df_ts$year, mean)

# Align the yearly means to the full calendar range; a year with no rows
# becomes NA, so the series stays regularly spaced.
year_index <- match(season_years, as.integer(names(season_means)))
points_ts <- ts(
  as.numeric(season_means)[year_index],
  start = min(season_years),
  frequency = 1
)
📌 Jednoduchá lineárna regresia – trend v čase
\[
\text{win\_loss\_percentage}_t = \beta_0 + \beta_1 \cdot \text{year}_t + \varepsilon_t
\]
# Linear time trend: regress win/loss percentage on the calendar year, using
# the chronologically sorted team-season rows.
model_simple <- lm(
  formula = win_loss_percentage ~ year,
  data = df_ts
)

# Coefficients, residual quantiles and fit statistics of the trend model.
summary(model_simple)
##
## Call:
## lm(formula = win_loss_percentage ~ year, data = df_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41604 -0.08064 0.01867 0.13369 0.58689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5603088 0.4968085 1.128 0.260
## year -0.0000751 0.0002502 -0.300 0.764
##
## Residual standard error: 0.2106 on 1007 degrees of freedom
## Multiple R-squared: 8.949e-05, Adjusted R-squared: -0.0009035
## F-statistic: 0.09012 on 1 and 1007 DF, p-value: 0.7641
📌 Diagnostika rezíduí — autokorelácia
🔹 1. Ljung–Box test
# Ljung-Box test for autocorrelation in the trend-model residuals, jointly
# over the first 10 lags.
Box.test(model_simple$residuals, type = "Ljung-Box", lag = 10)
##
## Box-Ljung test
##
## data: model_simple$residuals
## X-squared = 322.57, df = 10, p-value < 2.2e-16
🔹 2. Breusch–Godfrey test (viacnásobná autokorelácia)
library(lmtest)
# Breusch-Godfrey test for serial correlation of order up to 5 in the
# residuals of the trend model.
lmtest::bgtest(model_simple, order = 5)
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: model_simple
## LM test = 142.92, df = 5, p-value < 2.2e-16
📌 Záver
Tento RMarkdown dokument zahŕňa všetky kroky analýzy: načítanie dát a kontrolu
chýbajúcich hodnôt, deskriptívnu štatistiku, korelácie, grafy, OLS model,
diagnostiku, alternatívny model, t-test, export výsledkov a analýzu časového
trendu.