Tento R Markdown súbor aplikuje štruktúru cvičenia 5 (lineárna
regresia + diagnostika) na dataset nhlplayoffs.csv. Cieľ:
modelovať win_loss_percentage pomocou troch vysvetľujúcich
premenných: goals_scored, goals_against, a
games.
# Nainštaluj balíčky, ak ešte nie sú nainštalované
needed <- c("tidyverse", "broom", "lmtest", "car", "tseries", "sandwich", "ggplot2")
new <- needed[!(needed %in% installed.packages()[, "Package"]) ]
if(length(new)) install.packages(new)
library(tidyverse)
library(broom)
library(lmtest)
library(car)
library(tseries)
library(sandwich)
library(ggplot2)
# Uprav cestu podľa potreby. Tu sa predpokladá "/mnt/data/nhlplayoffs.csv".
nhl <- read_csv("nhlplayoffs.csv")
# Zobrazenie základných informácií
glimpse(nhl)
## Rows: 1,009
## Columns: 13
## $ rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ team <chr> "Colorado Avalanche", "Tampa Bay Lightning", "New …
## $ year <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 20…
## $ games <dbl> 20, 23, 20, 16, 14, 12, 12, 10, 7, 7, 7, 7, 7, 6, …
## $ wins <dbl> 16, 14, 10, 8, 7, 6, 5, 4, 3, 3, 3, 3, 3, 2, 2, 0,…
## $ losses <dbl> 4, 9, 10, 8, 7, 6, 7, 6, 4, 4, 4, 4, 4, 4, 4, 4, 7…
## $ ties <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shootout_wins <dbl> 5, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0,…
## $ shootout_losses <dbl> 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 4,…
## $ win_loss_percentage <dbl> 0.800, 0.609, 0.500, 0.500, 0.500, 0.500, 0.417, 0…
## $ goals_scored <dbl> 85, 67, 62, 65, 37, 40, 35, 23, 20, 17, 24, 14, 29…
## $ goals_against <dbl> 55, 61, 58, 59, 40, 38, 39, 32, 24, 27, 23, 15, 28…
## $ goal_differential <dbl> 30, 6, 4, 6, -3, 2, -4, -9, -4, -10, 1, -1, 1, -6,…
summary(nhl)
## rank team year games
## Min. : 1.000 Length:1009 Min. :1918 Min. : 2.000
## 1st Qu.: 3.000 Class :character 1st Qu.:1972 1st Qu.: 5.000
## Median : 6.000 Mode :character Median :1990 Median : 7.000
## Mean : 7.067 Mean :1986 Mean : 9.364
## 3rd Qu.:11.000 3rd Qu.:2007 3rd Qu.:12.000
## Max. :24.000 Max. :2022 Max. :27.000
## wins losses ties shootout_wins
## Min. : 0.000 Min. : 0.000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: 4.000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 3.000 Median : 4.000 Median :0.00000 Median : 1.0000
## Mean : 4.657 Mean : 4.657 Mean :0.04955 Mean : 0.9326
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:0.00000 3rd Qu.: 1.0000
## Max. :18.000 Max. :12.000 Max. :4.00000 Max. :10.0000
## shootout_losses win_loss_percentage goals_scored goals_against
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.3330 1st Qu.:11.00 1st Qu.:16.00
## Median :1.0000 Median :0.4290 Median :20.00 Median :22.00
## Mean :0.9326 Mean :0.4112 Mean :26.63 Mean :26.63
## 3rd Qu.:1.0000 3rd Qu.:0.5450 3rd Qu.:37.00 3rd Qu.:35.00
## Max. :4.0000 Max. :1.0000 Max. :98.00 Max. :91.00
## goal_differential
## Min. :-27
## 1st Qu.: -6
## Median : -2
## Mean : 0
## 3rd Qu.: 3
## Max. : 49
# Prvých 10 riadkov
head(nhl, 10)
## # A tibble: 10 × 13
## rank team year games wins losses ties shootout_wins shootout_losses
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Colorado … 2022 20 16 4 0 5 1
## 2 2 Tampa Bay… 2022 23 14 9 0 1 2
## 3 3 New York … 2022 20 10 10 0 1 2
## 4 4 Edmonton … 2022 16 8 8 0 1 2
## 5 5 Carolina … 2022 14 7 7 0 1 0
## 6 6 St. Louis… 2022 12 6 6 0 1 1
## 7 7 Calgary F… 2022 12 5 7 0 1 1
## 8 8 Florida P… 2022 10 4 6 0 2 0
## 9 9 Boston Br… 2022 7 3 4 0 0 0
## 10 10 Los Angel… 2022 7 3 4 0 1 0
## # ℹ 4 more variables: win_loss_percentage <dbl>, goals_scored <dbl>,
## # goals_against <dbl>, goal_differential <dbl>
# Vyberieme len premenné relevantné pre model
nhl_model <- nhl %>%
select(win_loss_percentage, goals_scored, goals_against, games, team, year) %>%
mutate(across(c(win_loss_percentage, goals_scored, goals_against, games), as.numeric))
# Kontrola NA
nhl_model %>% summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 6
## win_loss_percentage goals_scored goals_against games team year
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0
# Ak by boli NA, jednoduchá imputácia priemerom (tu len ako postup)
if(any(is.na(nhl_model))) {
nhl_model <- nhl_model %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
}
# Základné štatistiky
nhl_model %>% select(win_loss_percentage, goals_scored, goals_against, games) %>% summary()
## win_loss_percentage goals_scored goals_against games
## Min. :0.0000 Min. : 0.00 Min. : 0.00 Min. : 2.000
## 1st Qu.:0.3330 1st Qu.:11.00 1st Qu.:16.00 1st Qu.: 5.000
## Median :0.4290 Median :20.00 Median :22.00 Median : 7.000
## Mean :0.4112 Mean :26.63 Mean :26.63 Mean : 9.364
## 3rd Qu.:0.5450 3rd Qu.:37.00 3rd Qu.:35.00 3rd Qu.:12.000
## Max. :1.0000 Max. :98.00 Max. :91.00 Max. :27.000
# Korelačná matica
cor_mat <- nhl_model %>% select(win_loss_percentage, goals_scored, goals_against, games) %>% cor()
cor_mat
## win_loss_percentage goals_scored goals_against games
## win_loss_percentage 1.0000000 0.7009174 0.5133309 0.6829670
## goals_scored 0.7009174 1.0000000 0.9088340 0.9401903
## goals_against 0.5133309 0.9088340 1.0000000 0.8948069
## games 0.6829670 0.9401903 0.8948069 1.0000000
# Heatmap korelácií
library(ggplot2)
cor_df <- as.data.frame(as.table(cor_mat))
ggplot(cor_df, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
geom_text(aes(label = round(Freq, 2))) +
labs(title = "Korelačná matica") +
theme_minimal()
model <- lm(win_loss_percentage ~ goals_scored + goals_against + games, data = nhl_model)
summary(model)
##
## Call:
## lm(formula = win_loss_percentage ~ goals_scored + goals_against +
## games, data = nhl_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30101 -0.06681 0.01477 0.07457 0.65960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2767744 0.0095021 29.13 < 2e-16 ***
## goals_scored 0.0100937 0.0006642 15.20 < 2e-16 ***
## goals_against -0.0113519 0.0006818 -16.65 < 2e-16 ***
## games 0.0179319 0.0022057 8.13 1.26e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1325 on 1005 degrees of freedom
## Multiple R-squared: 0.6052, Adjusted R-squared: 0.604
## F-statistic: 513.5 on 3 and 1005 DF, p-value: < 2.2e-16
# Harmonizovaný tidy výstup
tidy(model)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.277 0.00950 29.1 1.00e-135
## 2 goals_scored 0.0101 0.000664 15.2 4.15e- 47
## 3 goals_against -0.0114 0.000682 -16.7 3.65e- 55
## 4 games 0.0179 0.00221 8.13 1.26e- 15
glance(model)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.605 0.604 0.132 513. 3.12e-202 3 610. -1210. -1185.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Rezidua a fitted hodnoty
nhl_model <- nhl_model %>%
mutate(fitted = fitted(model), residuals = residuals(model), std_resid = rstandard(model))
# Graf: rezidua vs fitted
ggplot(nhl_model, aes(x = fitted, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Rezidua vs Fitted hodnoty", x = "Fitted", y = "Rezidua") +
theme_minimal()
# Histogram rezíduí + normálna krivka
ggplot(nhl_model, aes(x = residuals)) +
geom_histogram(aes(y = ..density..), bins = 30) +
stat_function(fun = dnorm, args = list(mean = mean(nhl_model$residuals), sd = sd(nhl_model$residuals))) +
labs(title = "Histogram rezíduí s normálnou krivkou") +
theme_minimal()
# QQ-plot
qqPlot(model, main = "QQ-plot rezíduí")
## [1] 996 998
# Jarque-Bera test (normálnosť)
jarque.test <- tseries::jarque.bera.test(nhl_model$residuals)
jarque.test
##
## Jarque Bera Test
##
## data: nhl_model$residuals
## X-squared = 162.83, df = 2, p-value < 2.2e-16
# Breusch-Pagan test
bptest_res <- bptest(model)
bptest_res
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 72.531, df = 3, p-value = 1.225e-15
# White test (approx pomocou bptest na kvadratické termíny)
# Pridáme kvadratické a interakčné termíny pre aproximáciu White testu
model_white <- lm(residuals(model)^2 ~ fitted(model) + I(fitted(model)^2), data = nhl_model)
summary(model_white)
##
## Call:
## lm(formula = residuals(model)^2 ~ fitted(model) + I(fitted(model)^2),
## data = nhl_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03452 -0.01482 -0.00935 0.00232 0.41837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.064579 0.007487 8.625 < 2e-16 ***
## fitted(model) -0.206015 0.032159 -6.406 2.29e-10 ***
## I(fitted(model)^2) 0.192016 0.030854 6.223 7.13e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03406 on 1006 degrees of freedom
## Multiple R-squared: 0.03925, Adjusted R-squared: 0.03734
## F-statistic: 20.55 on 2 and 1006 DF, p-value: 1.795e-09
# Durbin-Watson test
dw_res <- lmtest::dwtest(model)
dw_res
##
## Durbin-Watson test
##
## data: model
## DW = 1.4977, p-value = 5.168e-16
## alternative hypothesis: true autocorrelation is greater than 0
vifs <- car::vif(model)
vifs
## goals_scored goals_against games
## 10.735066 6.249835 9.372463
coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.27677444 0.01235335 22.405 < 2.2e-16 ***
## goals_scored 0.01009367 0.00071421 14.133 < 2.2e-16 ***
## goals_against -0.01135187 0.00085493 -13.278 < 2.2e-16 ***
## games 0.01793191 0.00200022 8.965 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Ramsey RESET test
reset_res <- lmtest::resettest(model)
reset_res
##
## RESET test
##
## data: model
## RESET = 195.4, df1 = 2, df2 = 1003, p-value < 2.2e-16
# Cook's distance
nhl_model$cooks <- cooks.distance(model)
# Najviac vplyvné pozorovania
top_influential <- nhl_model %>%
mutate(row = row_number()) %>%
arrange(desc(cooks)) %>%
slice(1:10) %>%
select(row, team, year, cooks, residuals, fitted)
top_influential
## # A tibble: 10 × 6
## row team year cooks residuals fitted
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1004 Ottawa Senators 1921 0.0358 0.617 0.383
## 2 996 Montreal Canadiens 1925 0.0294 0.660 0.340
## 3 998 Montreal Canadiens 1924 0.0294 0.660 0.340
## 4 649 New York Islanders 1981 0.0288 -0.201 1.03
## 5 975 Boston Bruins 1929 0.0266 0.577 0.423
## 6 857 Detroit Red Wings 1952 0.0247 0.394 0.606
## 7 617 New York Islanders 1983 0.0237 -0.233 0.983
## 8 825 Montreal Canadiens 1960 0.0204 0.412 0.588
## 9 431 San Jose Sharks 1995 0.0189 0.237 0.127
## 10 587 Chicago Black Hawks 1985 0.0171 0.156 0.444
# Graf Cook's distance
plot(model, which = 4)
# Koeficienty modelu:
summary(model)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.27677444 0.0095021309 29.127618 1.003985e-135
## goals_scored 0.01009367 0.0006641658 15.197515 4.154912e-47
## goals_against -0.01135187 0.0006817521 -16.651024 3.654125e-55
## games 0.01793191 0.0022056752 8.129897 1.256273e-15
cat("\nInterpretácia: \n")
##
## Interpretácia:
cat(" - Koeficient pri goals_scored ukazuje, ako sa mení win_loss_percentage pri náraste počtu nastrieľaných gólov o 1, pri zachovaní ostatných premenných.\n")
## - Koeficient pri goals_scored ukazuje, ako sa mení win_loss_percentage pri náraste počtu nastrieľaných gólov o 1, pri zachovaní ostatných premenných.
cat(" - Koeficient pri goals_against udáva efekt inkasovaných gólov.\n")
## - Koeficient pri goals_against udáva efekt inkasovaných gólov.
cat(" - Premenná games zachytáva rozsah alebo počet odohraných zápasov v play-off (ak má zmysel v dátach).\n")
## - Premenná games zachytáva rozsah alebo počet odohraných zápasov v play-off (ak má zmysel v dátach).
cat("\nDiagnostika: \n")
##
## Diagnostika:
cat(" - Ak Jarque-Bera test zamieta normalitu, rezidua nie sú normálne rozdelené.\n")
## - Ak Jarque-Bera test zamieta normalitu, rezidua nie sú normálne rozdelené.
cat(" - Ak Breusch-Pagan test indikuje heteroskedasticitu, použitie robustných SE (HC1) je odporúčané.\n")
## - Ak Breusch-Pagan test indikuje heteroskedasticitu, použitie robustných SE (HC1) je odporúčané.
cat(" - Vysoké VIF (>5 alebo >10) naznačujú multikolinearitu.\n")
## - Vysoké VIF (>5 alebo >10) naznačujú multikolinearitu.
cat(" - Ak RESET test je signifikantný, môže ísť o nesprávnu špecifikáciu (chýbajúce nelineárne termíny alebo interakcie).\n")
## - Ak RESET test je signifikantný, môže ísť o nesprávnu špecifikáciu (chýbajúce nelineárne termíny alebo interakcie).
model2 <- lm(win_loss_percentage ~ goals_scored + goals_against + games + I(goals_scored^2) + I(goals_against^2) + goals_scored:goals_against, data = nhl_model)
summary(model2)
##
## Call:
## lm(formula = win_loss_percentage ~ goals_scored + goals_against +
## games + I(goals_scored^2) + I(goals_against^2) + goals_scored:goals_against,
## data = nhl_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25371 -0.06076 0.00143 0.06160 0.61379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.651e-01 1.222e-02 21.686 < 2e-16 ***
## goals_scored 3.214e-02 1.099e-03 29.256 < 2e-16 ***
## goals_against -2.733e-02 1.357e-03 -20.142 < 2e-16 ***
## games 9.425e-03 1.851e-03 5.093 4.21e-07 ***
## I(goals_scored^2) -3.649e-05 2.613e-05 -1.397 0.163
## I(goals_against^2) 4.626e-04 4.815e-05 9.607 < 2e-16 ***
## goals_scored:goals_against -4.675e-04 6.775e-05 -6.900 9.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1063 on 1002 degrees of freedom
## Multiple R-squared: 0.7467, Adjusted R-squared: 0.7452
## F-statistic: 492.3 on 6 and 1002 DF, p-value: < 2.2e-16
# Porovnanie AIC/BIC
AIC(model, model2)
## df AIC
## model 5 -1209.875
## model2 8 -1651.846
BIC(model, model2)
## df BIC
## model 5 -1185.292
## model2 8 -1612.512
# Uložiť súhrn modelu ako CSV
write_csv(broom::tidy(model), "model_coefficients.csv")
write_csv(broom::glance(model), "model_glance.csv")