Sú zahrnuté EDA, grafy, viacero špecifikácií regresných modelov, diagnostické testy, robustné štandardné chyby, transformácie premenných, sub-samplovanie a ďalšie postupy z pôvodného zadania.
nhl <- read.csv("nhlplayoffs.csv", stringsAsFactors = FALSE)
cat("Rozmery datasetu:", dim(nhl), "\n")
## Rozmery datasetu: 1009 13
str(nhl)
## 'data.frame': 1009 obs. of 13 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ team : chr "Colorado Avalanche" "Tampa Bay Lightning" "New York Rangers" "Edmonton Oilers" ...
## $ year : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ games : int 20 23 20 16 14 12 12 10 7 7 ...
## $ wins : int 16 14 10 8 7 6 5 4 3 3 ...
## $ losses : int 4 9 10 8 7 6 7 6 4 4 ...
## $ ties : int 0 0 0 0 0 0 0 0 0 0 ...
## $ shootout_wins : int 5 1 1 1 1 1 1 2 0 1 ...
## $ shootout_losses : int 1 2 2 2 0 1 1 0 0 0 ...
## $ win_loss_percentage: num 0.8 0.609 0.5 0.5 0.5 0.5 0.417 0.4 0.429 0.429 ...
## $ goals_scored : int 85 67 62 65 37 40 35 23 20 17 ...
## $ goals_against : int 55 61 58 59 40 38 39 32 24 27 ...
## $ goal_differential : int 30 6 4 6 -3 2 -4 -9 -4 -10 ...
summary(nhl)
## rank team year games
## Min. : 1.000 Length:1009 Min. :1918 Min. : 2.000
## 1st Qu.: 3.000 Class :character 1st Qu.:1972 1st Qu.: 5.000
## Median : 6.000 Mode :character Median :1990 Median : 7.000
## Mean : 7.067 Mean :1986 Mean : 9.364
## 3rd Qu.:11.000 3rd Qu.:2007 3rd Qu.:12.000
## Max. :24.000 Max. :2022 Max. :27.000
## wins losses ties shootout_wins
## Min. : 0.000 Min. : 0.000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: 4.000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 3.000 Median : 4.000 Median :0.00000 Median : 1.0000
## Mean : 4.657 Mean : 4.657 Mean :0.04955 Mean : 0.9326
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:0.00000 3rd Qu.: 1.0000
## Max. :18.000 Max. :12.000 Max. :4.00000 Max. :10.0000
## shootout_losses win_loss_percentage goals_scored goals_against
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.3330 1st Qu.:11.00 1st Qu.:16.00
## Median :1.0000 Median :0.4290 Median :20.00 Median :22.00
## Mean :0.9326 Mean :0.4112 Mean :26.63 Mean :26.63
## 3rd Qu.:1.0000 3rd Qu.:0.5450 3rd Qu.:37.00 3rd Qu.:35.00
## Max. :4.0000 Max. :1.0000 Max. :98.00 Max. :91.00
## goal_differential
## Min. :-27
## 1st Qu.: -6
## Median : -2
## Mean : 0
## 3rd Qu.: 3
## Max. : 49
# Počet chýbajúcich hodnôt
sapply(nhl, function(x) sum(is.na(x)))
## rank team year games
## 0 0 0 0
## wins losses ties shootout_wins
## 0 0 0 0
## shootout_losses win_loss_percentage goals_scored goals_against
## 0 0 0 0
## goal_differential
## 0
# Základné štatistiky pre číselné premenné
numvars <- nhl[sapply(nhl, is.numeric)]
summary(numvars) # oprava: round() odstránené kvôli chybe pri knitovaní
## rank year games wins
## Min. : 1.000 Min. :1918 Min. : 2.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.:1972 1st Qu.: 5.000 1st Qu.: 1.000
## Median : 6.000 Median :1990 Median : 7.000 Median : 3.000
## Mean : 7.067 Mean :1986 Mean : 9.364 Mean : 4.657
## 3rd Qu.:11.000 3rd Qu.:2007 3rd Qu.:12.000 3rd Qu.: 7.000
## Max. :24.000 Max. :2022 Max. :27.000 Max. :18.000
## losses ties shootout_wins shootout_losses
## Min. : 0.000 Min. :0.00000 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 4.000 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 4.000 Median :0.00000 Median : 1.0000 Median :1.0000
## Mean : 4.657 Mean :0.04955 Mean : 0.9326 Mean :0.9326
## 3rd Qu.: 6.000 3rd Qu.:0.00000 3rd Qu.: 1.0000 3rd Qu.:1.0000
## Max. :12.000 Max. :4.00000 Max. :10.0000 Max. :4.0000
## win_loss_percentage goals_scored goals_against goal_differential
## Min. :0.0000 Min. : 0.00 Min. : 0.00 Min. :-27
## 1st Qu.:0.3330 1st Qu.:11.00 1st Qu.:16.00 1st Qu.: -6
## Median :0.4290 Median :20.00 Median :22.00 Median : -2
## Mean :0.4112 Mean :26.63 Mean :26.63 Mean : 0
## 3rd Qu.:0.5450 3rd Qu.:37.00 3rd Qu.:35.00 3rd Qu.: 3
## Max. :1.0000 Max. :98.00 Max. :91.00 Max. : 49
p1 <- ggplot(nhl, aes(x=goals_scored)) + geom_histogram(bins=30) + ggtitle("Goals scored - histogram")
p2 <- ggplot(nhl, aes(x=goals_against)) + geom_histogram(bins=30) + ggtitle("Goals against - histogram")
p3 <- ggplot(nhl, aes(x=goal_differential)) + geom_histogram(bins=30) + ggtitle("Goal differential - histogram")
p4 <- ggplot(nhl, aes(x=wins)) + geom_histogram(bins=20) + ggtitle("Wins - histogram")
grid.arrange(p1,p2,p3,p4, ncol=2)
ggplot(nhl, aes(y=goal_differential, x=factor(year))) + geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=6)) + ggtitle("Goal differential by year (boxplots)")
cor_mat <- cor(numvars, use="pairwise.complete.obs")
print(round(cor_mat,3))
## rank year games wins losses ties shootout_wins
## rank 1.000 0.458 -0.589 -0.652 -0.278 -0.158 -0.420
## year 0.458 1.000 0.318 0.226 0.477 -0.362 0.211
## games -0.589 0.318 1.000 0.967 0.822 -0.128 0.672
## wins -0.652 0.226 0.967 1.000 0.652 -0.098 0.670
## losses -0.278 0.477 0.822 0.652 1.000 -0.293 0.506
## ties -0.158 -0.362 -0.128 -0.098 -0.293 1.000 -0.065
## shootout_wins -0.420 0.211 0.672 0.670 0.506 -0.065 1.000
## shootout_losses -0.095 0.275 0.366 0.300 0.426 -0.129 0.237
## win_loss_percentage -0.697 -0.009 0.683 0.785 0.277 0.070 0.496
## goals_scored -0.572 0.268 0.940 0.944 0.700 -0.131 0.577
## goals_against -0.438 0.361 0.895 0.818 0.845 -0.204 0.569
## goal_differential -0.548 0.000 0.613 0.747 0.160 0.046 0.343
## shootout_losses win_loss_percentage goals_scored
## rank -0.095 -0.697 -0.572
## year 0.275 -0.009 0.268
## games 0.366 0.683 0.940
## wins 0.300 0.785 0.944
## losses 0.426 0.277 0.700
## ties -0.129 0.070 -0.131
## shootout_wins 0.237 0.496 0.577
## shootout_losses 1.000 0.154 0.313
## win_loss_percentage 0.154 1.000 0.701
## goals_scored 0.313 0.701 1.000
## goals_against 0.279 0.513 0.909
## goal_differential 0.236 0.712 0.723
## goals_against goal_differential
## rank -0.438 -0.548
## year 0.361 0.000
## games 0.895 0.613
## wins 0.818 0.747
## losses 0.845 0.160
## ties -0.204 0.046
## shootout_wins 0.569 0.343
## shootout_losses 0.279 0.236
## win_loss_percentage 0.513 0.712
## goals_scored 0.909 0.723
## goals_against 1.000 0.369
## goal_differential 0.369 1.000
# Heatmap korelácií
melted <- melt(cor_mat)
ggplot(melted, aes(Var1, Var2, fill=value)) + geom_tile() + scale_fill_gradient2(low="blue", mid="white", high="red", midpoint=0) + theme(axis.text.x = element_text(angle=90)) + ggtitle("Correlation matrix heatmap")
pairs(nhl[, c("goal_differential","goals_scored","goals_against","wins","games")], gap=0.4, main="Pairs plot")
V pôvodnom súbore sa uvažoval lineárny model pre Life.expectancy. Tu ako cieľ používam goal_differential. Preukážem postupy, ktoré sú v pôvodnom cvičení: základný OLS, diagnostika, riešenie multikolinearity, robustné SE, transformácie a pod.
model1 <- lm(goal_differential ~ goals_scored + goals_against + wins + win_loss_percentage + games, data=nhl)
summary(model1)
##
## Call:
## lm(formula = goal_differential ~ goals_scored + goals_against +
## wins + win_loss_percentage + games, data = nhl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.754e-14 -8.140e-16 -5.400e-17 6.470e-16 1.879e-13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.579e-15 8.503e-16 -4.209e+00 2.79e-05 ***
## goals_scored 1.000e+00 4.581e-17 2.183e+16 < 2e-16 ***
## goals_against -1.000e+00 4.714e-17 -2.121e+16 < 2e-16 ***
## wins 5.151e-16 3.297e-16 1.563e+00 0.1185
## win_loss_percentage -3.249e-15 1.768e-15 -1.837e+00 0.0665 .
## games -2.302e-16 2.171e-16 -1.060e+00 0.2894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439e-15 on 1003 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 4.148e+32 on 5 and 1003 DF, p-value: < 2.2e-16
# Skontrolujeme, či platí identity goal_differential = goals_scored - goals_against
table(abs(nhl$goals_scored - nhl$goals_against - nhl$goal_differential) < 1e-8)
##
## TRUE
## 1009
Komentár: v tomto datasete existuje takmer presná
algebraická väzba medzi goal_differential,
goals_scored a goals_against. To vedie k
perfektnému R^2 a multikolinearite.
# Breusch–Pagan test na heteroskedasticitu
bp1 <- bptest(model1)
bp1
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 9.0401, df = 5, p-value = 0.1075
# Durbin-Watson test
dw1 <- dwtest(model1)
dw1
##
## Durbin-Watson test
##
## data: model1
## DW = 1.5693, p-value = 2.658e-12
## alternative hypothesis: true autocorrelation is greater than 0
# VIF (multikolinearita)
vif1 <- vif(model1)
vif1
## goals_scored goals_against wins win_loss_percentage
## 21.615570 12.646316 48.778562 3.368474
## games
## 38.435238
# Reziduálne grafy
par(mfrow=c(2,2))
plot(model1)
par(mfrow=c(1,1))
# Odstránime goals_scored a goals_against, ktoré sú príčinou kolinearity
model2 <- lm(goal_differential ~ wins + win_loss_percentage + games, data=nhl)
summary(model2)
##
## Call:
## lm(formula = goal_differential ~ wins + win_loss_percentage +
## games, data = nhl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.8840 -2.4688 0.0154 2.6990 19.5153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.30438 0.62038 2.103 0.0358 *
## wins 5.07489 0.18043 28.127 <2e-16 ***
## win_loss_percentage -0.07867 1.29233 -0.061 0.9515
## games -2.65986 0.11360 -23.413 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.711 on 1005 degrees of freedom
## Multiple R-squared: 0.7407, Adjusted R-squared: 0.7399
## F-statistic: 956.7 on 3 and 1005 DF, p-value: < 2.2e-16
# Robustné (HC3) štandardné chyby — originálne odporúčané
coeftest(model2, vcov = vcovHC(model2, type="HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.304377 0.666204 1.9579 0.05052 .
## wins 5.074891 0.255608 19.8542 < 2e-16 ***
## win_loss_percentage -0.078674 1.289918 -0.0610 0.95138
## games -2.659864 0.155062 -17.1535 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model2) # heteroskedasticita
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 109.82, df = 3, p-value < 2.2e-16
dwtest(model2) # autokorelácia rezíduí
##
## Durbin-Watson test
##
## data: model2
## DW = 2.198, p-value = 0.9991
## alternative hypothesis: true autocorrelation is greater than 0
vif(model2) # VIF pre nový model
## wins win_loss_percentage games
## 27.293099 3.360903 19.657305
# Rezíduá
par(mfrow=c(2,2))
plot(model2)
par(mfrow=c(1,1))
summary(nhl$goals_scored)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 11.00 20.00 26.63 37.00 98.00
nhl$log_goals_scored <- ifelse(nhl$goals_scored > 0, log(nhl$goals_scored), NA)
summary(nhl$log_goals_scored)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.398 2.996 2.954 3.611 4.585 4
# Odhad modelu s log transformáciou (ak je zmysluplné)
model_log <- lm(goal_differential ~ log_goals_scored + wins + win_loss_percentage + games, data=nhl)
summary(model_log)
##
## Call:
## lm(formula = goal_differential ~ log_goals_scored + wins + win_loss_percentage +
## games, data = nhl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.5773 -2.4956 0.0439 2.6295 18.1644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6615 0.7303 -0.906 0.36526
## log_goals_scored 1.9332 0.3747 5.160 2.98e-07 ***
## wins 5.3768 0.1881 28.588 < 2e-16 ***
## win_loss_percentage -4.1132 1.4958 -2.750 0.00607 **
## games -3.0318 0.1336 -22.697 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.658 on 1000 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.7475, Adjusted R-squared: 0.7465
## F-statistic: 740 on 4 and 1000 DF, p-value: < 2.2e-16
coeftest(model_log, vcov = vcovHC(model_log, type="HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.66153 0.75833 -0.8723 0.383231
## log_goals_scored 1.93324 0.39324 4.9162 1.031e-06 ***
## wins 5.37677 0.26540 20.2591 < 2.2e-16 ***
## win_loss_percentage -4.11318 1.47705 -2.7847 0.005458 **
## games -3.03178 0.18037 -16.8087 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rozdelíme na obdobia (napr. pred rokom 2000 a po 2000)
model_pre2000 <- lm(goal_differential ~ wins + win_loss_percentage + games, data=subset(nhl, year < 2000))
model_post2000 <- lm(goal_differential ~ wins + win_loss_percentage + games, data=subset(nhl, year >= 2000))
cat("Pre-2000: n =", length(na.omit(subset(nhl, year<2000)$goal_differential)), "\n")
## Pre-2000: n = 649
cat("Post-2000: n =", length(na.omit(subset(nhl, year>=2000)$goal_differential)), "\n")
## Post-2000: n = 360
summary(model_pre2000)
##
## Call:
## lm(formula = goal_differential ~ wins + win_loss_percentage +
## games, data = subset(nhl, year < 2000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0987 -2.4061 0.1935 2.6059 18.1916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3200 0.7116 1.855 0.0641 .
## wins 5.3023 0.2154 24.619 <2e-16 ***
## win_loss_percentage -1.0555 1.4930 -0.707 0.4798
## games -2.7315 0.1353 -20.188 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.706 on 645 degrees of freedom
## Multiple R-squared: 0.7604, Adjusted R-squared: 0.7593
## F-statistic: 682.5 on 3 and 645 DF, p-value: < 2.2e-16
summary(model_post2000)
##
## Call:
## lm(formula = goal_differential ~ wins + win_loss_percentage +
## games, data = subset(nhl, year >= 2000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0319 -2.7010 0.0716 2.8594 15.5106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3733 1.2764 1.076 0.283
## wins 4.6925 0.3447 13.615 <2e-16 ***
## win_loss_percentage -0.2377 2.7488 -0.086 0.931
## games -2.4634 0.2252 -10.939 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.672 on 356 degrees of freedom
## Multiple R-squared: 0.7054, Adjusted R-squared: 0.703
## F-statistic: 284.2 on 3 and 356 DF, p-value: < 2.2e-16
coeftest(model_pre2000, vcov = vcovHC(model_pre2000, type="HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.31998 0.77609 1.7008 0.08946 .
## wins 5.30227 0.31642 16.7572 < 2e-16 ***
## win_loss_percentage -1.05551 1.53005 -0.6899 0.49053
## games -2.73146 0.18773 -14.5501 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model_post2000, vcov = vcovHC(model_post2000, type="HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.37326 1.30423 1.0529 0.2931
## wins 4.69248 0.44351 10.5804 <2e-16 ***
## win_loss_percentage -0.23772 2.52240 -0.0942 0.9250
## games -2.46344 0.28477 -8.6506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Ak by sme očakávali časovú závislosť medzi pozorovaniami (napr. tím-rok sekvencie),
# môžeme použiť Newey-West (HAC) covariance.
library(sandwich)
nw_cov <- NeweyWest(model2, lag=1, prewhite=FALSE)
coeftest(model2, vcov = nw_cov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.304377 0.667691 1.9536 0.05103 .
## wins 5.074891 0.254687 19.9260 < 2e-16 ***
## win_loss_percentage -0.078674 1.267961 -0.0620 0.95054
## games -2.659864 0.154044 -17.2670 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PCA pre numerické premenné ako alternatíva pri kolinearite
num_complete <- na.omit(nhl[, c("goals_scored","goals_against","wins","win_loss_percentage","games","goal_differential")])
pca <- prcomp(num_complete[, c("goals_scored","goals_against","wins","win_loss_percentage","games")], center=TRUE, scale.=TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 2.0707 0.7283 0.34223 0.23230 0.10225
## Proportion of Variance 0.8576 0.1061 0.02342 0.01079 0.00209
## Cumulative Proportion 0.8576 0.9637 0.98712 0.99791 1.00000
# Použitie prvého komponentu v regresii
num_complete$PC1 <- pca$x[,1]
model_pca <- lm(goal_differential ~ PC1, data=num_complete)
summary(model_pca)
##
## Call:
## lm(formula = goal_differential ~ PC1, data = num_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.102 -3.639 -0.183 3.762 34.340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.678e-15 2.128e-01 0.0 1
## PC1 -3.043e+00 1.028e-01 -29.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.758 on 1007 degrees of freedom
## Multiple R-squared: 0.4653, Adjusted R-squared: 0.4647
## F-statistic: 876.2 on 1 and 1007 DF, p-value: < 2.2e-16
coeftest(model_pca, vcov = vcovHC(model_pca, type="HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6777e-15 2.1316e-01 0.000 1
## PC1 -3.0427e+00 1.4264e-01 -21.332 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Počet pozorovaní: 1009
## R-squared: 0.7407 Adjusted R2: 0.7399
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.304377 0.666204 1.9579 0.05052 .
## wins 5.074891 0.255608 19.8542 < 2e-16 ***
## win_loss_percentage -0.078674 1.289918 -0.0610 0.95138
## games -2.659864 0.155062 -17.1535 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "Breusch-Pagan test p-value:"
## BP
## 1.198961e-23
## [1] "Durbin-Watson stat:"
## DW
## 2.198043
## [1] "VIF:"
## wins win_loss_percentage games
## 27.293099 3.360903 19.657305
# Trend goals_scored over years (agregované priemerne)
agg_year <- aggregate(cbind(goals_scored, goals_against, goal_differential) ~ year, data=nhl, FUN=mean, na.rm=TRUE)
ggplot(agg_year, aes(x=year)) + geom_line(aes(y=goals_scored, linetype="scored")) + geom_line(aes(y=goals_against, linetype="against")) + geom_line(aes(y=goal_differential, linetype="differential")) + ggtitle("Average goals and differential by year") + ylab("Average per team per season") + scale_linetype_manual(values=c("solid","dashed","dotdash"), labels=c("scored","against","differential"))
ggplot(data.frame(fitted=fitted(model2), resid=residuals(model2)), aes(x=fitted, y=resid)) + geom_point() + geom_hline(yintercept=0, col="red") + ggtitle("Residuals vs Fitted (model2)")
goal_differential,
goals_scored a goals_against. Preto nie je
vhodné používať obe tieto premenné súčasne.wins a games sú významné v
mnohých špecifikáciách, ale vysoký VIF naznačuje, že medzi nimi panuje
kolinearita.