# Cvičenie 6 – Diagnostika regresného modelu (Pit Stops F1)

# Odhad modelu
# * lap (-25.63): Štatisticky významný. S každým ďalším kolom sa pit stop v priemere skráti o 25,6 ms.
# * stop (1.58): Štatisticky nevýznamný ($p = 0.959$). Poradie zastávky dĺžku pit stopu významne neovplyvňuje.
# * $R^2$ (0.0065): Model vysvetľuje len 0,65 % zmien v čase. Je takmer nepoužiteľný na predpovede, lebo dĺžku pit stopu ovplyvňujú iné faktory (chyby, počasie).
# * Rezíduá: Obrovský rozptyl (chyby až do +35 sekúnd) potvrdzuje prítomnosť extrémnych zlyhaní v boxoch, ktoré model nevie zachytiť.
# Diagnostické grafy
# Fit an ordinary least squares model of pit-stop duration (`milliseconds`)
# on the lap number (`lap`) and the stop's sequence number (`stop`) from the
# `pit_stops` data, then print the coefficient table and fit statistics.
# The formula is written inline on purpose: summary() echoes the call as-is.
model <- lm(milliseconds ~ lap + stop, data = pit_stops)
summary(model)

## 
## Call:
## lm(formula = milliseconds ~ lap + stop, data = pit_stops)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11804  -2564   -993   1222  35777 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25102.854     93.112 269.598  < 2e-16 ***
## lap           -25.630      3.197  -8.017  1.2e-15 ***
## stop            1.576     30.431   0.052    0.959    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4591 on 10851 degrees of freedom
## Multiple R-squared:  0.006565,   Adjusted R-squared:  0.006382 
## F-statistic: 35.85 on 2 and 10851 DF,  p-value: 3.022e-16

# Draw four standard lm diagnostic plots on a 2 x 2 grid:
#   which = 1  residuals vs fitted
#   which = 2  normal Q-Q
#   which = 3  scale-location
#   which = 5  residuals vs leverage
# par() returns the previous values of the settings it changes, so keeping
# them lets us restore both `mfrow` and `mar` once the plots are drawn.
old_par <- par(mfrow = c(2, 2), mar = c(4.5, 4.5, 3, 1))

for (panel in c(1, 2, 3, 5)) {
  plot(model, which = panel,
       col = "steelblue", pch = 19, cex = 0.7)
}

# Restore the graphics settings that were active before this block.
par(old_par)
# Normality of residuals: shapiro.test() accepts at most 5000 observations,
# so a reproducible random subsample is tested when there are more residuals.
set.seed(123)
model_resid <- residuals(model)
n_sample <- min(5000L, length(model_resid))
resid_sample <- sample(model_resid, n_sample)

# Report the number of residuals actually drawn instead of a fixed "5000".
cat(sprintf("Shapiro-Wilk test (vzorka %d rezíduí):\n", n_sample))

## Shapiro-Wilk test (vzorka 5000 rezíduí):

print(shapiro.test(resid_sample))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid_sample
## W = 0.85022, p-value < 2.2e-16

# Jarque-Bera normality test on the full set of residuals.
# The call is namespace-qualified so it works whether or not tseries has been
# attached with library(); the printed `data:` label is unaffected.
cat("\nJarque-Bera test:\n")

## 
## Jarque-Bera test:

print(tseries::jarque.bera.test(residuals(model)))

## 
##  Jarque Bera Test
## 
## data:  residuals(model)
## X-squared = 34872, df = 2, p-value < 2.2e-16

# Studentized Breusch-Pagan test for heteroskedasticity, using the model's
# own regressors as the variance formula (bptest's default).
# Namespace-qualified so it does not depend on lmtest being attached.
cat("\nBreusch-Pagan test:\n")

## 
## Breusch-Pagan test:

print(lmtest::bptest(model))

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 49.874, df = 2, p-value = 1.479e-11

# Durbin-Watson test for first-order autocorrelation of the residuals.
# Namespace-qualified so it does not depend on lmtest being attached.
# NOTE(review): the statistic depends on the row order of the data; confirm
# that `pit_stops` is sorted in a meaningful (e.g. chronological) order.
cat("\nDurbin-Watson test:\n")

## 
## Durbin-Watson test:

print(lmtest::dwtest(model))

## 
##  Durbin-Watson test
## 
## data:  model
## DW = 0.9287, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

# Bonferroni-adjusted outlier test on the studentized residuals.
# Namespace-qualified so it does not depend on car being attached.
cat("\nOutlier Test:\n")

## 
## Outlier Test:

print(car::outlierTest(model))

##       rstudent unadjusted p-value Bonferroni p
## 4183  7.816374         5.9396e-15   6.4468e-11
## 10724 7.549229         4.7297e-14   5.1336e-10
## 7041  7.471668         8.5276e-14   9.2558e-10
## 4613  7.311887         2.8198e-13   3.0606e-09
## 7904  7.279347         3.5865e-13   3.8928e-09
## 4703  7.214757         5.7636e-13   6.2558e-09
## 7905  7.089481         1.4299e-12   1.5520e-08
## 188   6.966937         3.4269e-12   3.7195e-08
## 7906  6.946501         3.9590e-12   4.2971e-08
## 6205  6.905420         5.2854e-12   5.7368e-08

# List the ten observations with the largest Cook's distance, i.e. those
# whose removal would change the fitted coefficients the most.
cat("\n=== Top 10 Cookovej vzdialenosti ===\n")

## 
## === Top 10 Cookovej vzdialenosti ===

cook_d <- cooks.distance(model)
cook_ranked <- sort(cook_d, decreasing = TRUE)
print(head(cook_ranked, 10))

##       10354        4183        7807        7905        9443        9754 
## 0.018274838 0.012751728 0.009697134 0.008411823 0.007555129 0.007541992 
##        7041        4613        7907        5926 
## 0.006576089 0.006299202 0.006157171 0.005838120

# Heteroskedasticity tests.
# Results are printed explicitly: top-level auto-printing is suppressed when
# the script is run through source(), which would hide these results.
# Calls are namespace-qualified so they do not depend on lmtest being attached.

# 1) Breusch-Pagan with the model's own regressors as the variance formula.
print(lmtest::bptest(model))

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 49.874, df = 2, p-value = 1.479e-11

# 2) Variance formula in the fitted values and their squares.
print(lmtest::bptest(model, ~ fitted(model) + I(fitted(model)^2)))

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 160.25, df = 2, p-value < 2.2e-16

# Autocorrelation tests.
# Results are printed explicitly: top-level auto-printing is suppressed when
# the script is run through source(), which would hide these results.
# Calls are namespace-qualified so they do not depend on lmtest being attached.

# 1) Durbin-Watson test for first-order autocorrelation.
print(lmtest::dwtest(model))

## 
##  Durbin-Watson test
## 
## data:  model
## DW = 0.9287, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

# 2) Breusch-Godfrey test for serial correlation up to order 1.
print(lmtest::bgtest(model, order = 1))

## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  model
## LM test = 3114.3, df = 1, p-value < 2.2e-16

# Outlying and influential observations: Bonferroni-adjusted outlier test on
# the studentized residuals. Printed explicitly so the result is also shown
# when the script is run through source(); namespace-qualified so it does not
# depend on car being attached.
print(car::outlierTest(model))

##       rstudent unadjusted p-value Bonferroni p
## 4183  7.816374         5.9396e-15   6.4468e-11
## 10724 7.549229         4.7297e-14   5.1336e-10
## 7041  7.471668         8.5276e-14   9.2558e-10
## 4613  7.311887         2.8198e-13   3.0606e-09
## 7904  7.279347         3.5865e-13   3.8928e-09
## 4703  7.214757         5.7636e-13   6.2558e-09
## 7905  7.089481         1.4299e-12   1.5520e-08
## 188   6.966937         3.4269e-12   3.7195e-08
## 7906  6.946501         3.9590e-12   4.2971e-08
## 6205  6.905420         5.2854e-12   5.7368e-08

# List the ten observations with the largest Cook's distance.
# The result is printed explicitly so it is also shown when the script is run
# through source(), where top-level auto-printing is suppressed.
cat("\n=== Top 10 najvplyvnejších pozorovaní (Cookova vzdialenosť) ===\n")

## 
## === Top 10 najvplyvnejších pozorovaní (Cookova vzdialenosť) ===

print(head(sort(cooks.distance(model), decreasing = TRUE), 10))

##       10354        4183        7807        7905        9443        9754 
## 0.018274838 0.012751728 0.009697134 0.008411823 0.007555129 0.007541992 
##        7041        4613        7907        5926 
## 0.006576089 0.006299202 0.006157171 0.005838120