# Balíky pre regresiu, testy a diagnostiku
library(lmtest)
library(sandwich)
library(car)
# Pomocné balíky na prácu s dátumom a úpravy dát
library(dplyr)
library(lubridate)
rm(list = ls())
V tomto zošite skúmame, ako je spotreba elektriny (zvolená zóna) ovplyvnená meteorologickými premennými (teplota, vlhkosť, rýchlosť vetra) a premennými súvisiacimi so slnečným žiarením (GeneralDiffuseFlows, DiffuseFlows).
Budeme postupovať nasledovne: 1. odhad základného lineárneho modelu, 2. otestovanie funkčnej formy (RESET), 3. diagnostika (reziduá, C+R grafy), 4. návrhy nelineárnych úprav (kvadrát, prípadne zlom cez dummy premennú).
# Načítanie dát
# (ak máš súbor v inom priečinku, uprav si cestu)
udaje <- read.csv("powerconsumption.csv", sep = ",", dec = ".", header = TRUE)
# Prevod dátumu/času
udaje$Datetime <- mdy_hm(udaje$Datetime)
# Pomocné časové premenné (môžu byť užitočné v rozšíreniach modelu)
udaje <- udaje %>%
mutate(
Hour = hour(Datetime),
Weekday = wday(Datetime, label = TRUE, week_start = 1)
)
# Rýchla kontrola štruktúry
str(udaje)
'data.frame': 52416 obs. of 11 variables:
$ Datetime : POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 00:10:00" "2017-01-01 00:20:00" "2017-01-01 00:30:00" ...
$ Temperature : num 6.56 6.41 6.31 6.12 5.92 ...
$ Humidity : num 73.8 74.5 74.5 75 75.7 76.9 77.7 78.2 78.1 77.3 ...
$ WindSpeed : num 0.083 0.083 0.08 0.083 0.081 0.081 0.08 0.085 0.081 0.082 ...
$ GeneralDiffuseFlows : num 0.051 0.07 0.062 0.091 0.048 0.059 0.048 0.055 0.066 0.062 ...
$ DiffuseFlows : num 0.119 0.085 0.1 0.096 0.085 0.108 0.096 0.093 0.141 0.111 ...
$ PowerConsumption_Zone1: num 34056 29815 29128 28229 27336 ...
$ PowerConsumption_Zone2: num 16129 19375 19007 18361 17872 ...
$ PowerConsumption_Zone3: num 20241 20131 19668 18899 18442 ...
$ Hour : int 0 0 0 0 0 0 1 1 1 1 ...
$ Weekday : Ord.factor w/ 7 levels "Mon"<"Tue"<"Wed"<..: 7 7 7 7 7 7 7 7 7 7 ...
summary(udaje)
Datetime Temperature Humidity WindSpeed GeneralDiffuseFlows
Min. :2017-01-01 00:00:00 Min. : 3.247 Min. :11.34 Min. :0.050 Min. : 0.004
1st Qu.:2017-04-01 23:57:30 1st Qu.:14.410 1st Qu.:58.31 1st Qu.:0.078 1st Qu.: 0.062
Median :2017-07-01 23:55:00 Median :18.780 Median :69.86 Median :0.086 Median : 5.035
Mean :2017-07-01 23:55:00 Mean :18.810 Mean :68.26 Mean :1.959 Mean : 182.697
3rd Qu.:2017-09-30 23:52:30 3rd Qu.:22.890 3rd Qu.:81.40 3rd Qu.:4.915 3rd Qu.: 319.600
Max. :2017-12-30 23:50:00 Max. :40.010 Max. :94.80 Max. :6.483 Max. :1163.000
DiffuseFlows PowerConsumption_Zone1 PowerConsumption_Zone2 PowerConsumption_Zone3 Hour
Min. : 0.011 Min. :13896 Min. : 8560 Min. : 5935 Min. : 0.00
1st Qu.: 0.122 1st Qu.:26311 1st Qu.:16981 1st Qu.:13129 1st Qu.: 5.75
Median : 4.456 Median :32266 Median :20823 Median :16415 Median :11.50
Mean : 75.028 Mean :32345 Mean :21043 Mean :17835 Mean :11.50
3rd Qu.:101.000 3rd Qu.:37309 3rd Qu.:24714 3rd Qu.:21624 3rd Qu.:17.25
Max. :936.000 Max. :52204 Max. :37409 Max. :47598 Max. :23.00
Weekday
Mon:7488
Tue:7488
Wed:7488
Thu:7488
Fri:7488
Sat:7488
Sun:7488
Poznámka: V týchto dátach typicky nebývajú chýbajúce hodnoty; ak by sa objavili, riešenie doplnenia (imputácie) by bolo vhodné doplniť až podľa zadania.
Ako základ použijeme lineárny model, kde spotrebu (Zone1) vysvetľujeme počasím a difúznym žiarením.
model0 <- lm(
PowerConsumption_Zone1 ~ Temperature + Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje
)
summary(model0)
Call:
lm(formula = PowerConsumption_Zone1 ~ Temperature + Humidity +
WindSpeed + GeneralDiffuseFlows + DiffuseFlows, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-15613.5 -4933.0 -653.3 4113.8 17982.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.675e+04 2.138e+02 125.132 <2e-16 ***
Temperature 5.349e+02 6.418e+00 83.331 <2e-16 ***
Humidity -5.651e+01 2.130e+00 -26.525 <2e-16 ***
WindSpeed -1.487e+02 1.358e+01 -10.951 <2e-16 ***
GeneralDiffuseFlows -1.702e+00 1.464e-01 -11.628 <2e-16 ***
DiffuseFlows -8.731e-02 2.721e-01 -0.321 0.748
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6349 on 52410 degrees of freedom
Multiple R-squared: 0.2073, Adjusted R-squared: 0.2072
F-statistic: 2741 on 5 and 52410 DF, p-value: < 2.2e-16
Ak chceš rovnaký model pre inú zónu, stačí zmeniť ľavú stranu napr.
na PowerConsumption_Zone2 alebo
PowerConsumption_Zone3.
RESET test kontroluje, či lineárna špecifikácia nie je „príliš jednoduchá“ (napr. chýba nelinearita alebo dôležité premenné).
Hypotézy: - H0: model je funkčne správne špecifikovaný, - H1: model je nesprávne špecifikovaný (typicky nelinearita / vynechané premenné).
resettest(model0)
RESET test
data: model0
RESET = 92.704, df1 = 2, df2 = 52408, p-value < 2.2e-16
Interpretácia: - ak p-hodnota < 0.05, model je pravdepodobne funkčne zle špecifikovaný, - ak p-hodnota ≥ 0.05, nevidíme silný dôkaz proti lineárnej špecifikácii.
plot(model0, which = 1)
Ak vidíš zakrivenie alebo systematický vzor, často to signalizuje nelinearitu (napr. potrebu log/kvadrátu/interakcie).
C+R grafy pomôžu identifikovať, ktorá konkrétna premenná má nelineárny vzťah k vysvetľovanej premennej.
crPlots(model0)
Pri spotrebe elektriny býva nelinearita často pri teplote (napr. kúrenie/chladenie). Skúsime teda pridať kvadratický člen teploty.
model_quad <- lm(
PowerConsumption_Zone1 ~ Temperature + I(Temperature^2) +
Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje
)
summary(model_quad)
Call:
lm(formula = PowerConsumption_Zone1 ~ Temperature + I(Temperature^2) +
Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje)
Residuals:
Min 1Q Median 3Q Max
-15556.9 -4897.7 -658.7 4118.7 18073.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25687.2407 278.2829 92.306 < 2e-16 ***
Temperature 683.1004 25.6667 26.614 < 2e-16 ***
I(Temperature^2) -4.0173 0.6735 -5.965 2.46e-09 ***
Humidity -59.3702 2.1831 -27.196 < 2e-16 ***
WindSpeed -138.6930 13.6779 -10.140 < 2e-16 ***
GeneralDiffuseFlows -1.5517 0.1485 -10.451 < 2e-16 ***
DiffuseFlows -0.3505 0.2755 -1.272 0.203
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6347 on 52409 degrees of freedom
Multiple R-squared: 0.2078, Adjusted R-squared: 0.2077
F-statistic: 2291 on 6 and 52409 DF, p-value: < 2.2e-16
# Porovnanie so základným modelom
anova(model0, model_quad)
Analysis of Variance Table
Model 1: PowerConsumption_Zone1 ~ Temperature + Humidity + WindSpeed +
GeneralDiffuseFlows + DiffuseFlows
Model 2: PowerConsumption_Zone1 ~ Temperature + I(Temperature^2) + Humidity +
WindSpeed + GeneralDiffuseFlows + DiffuseFlows
Res.Df RSS Df Sum of Sq F Pr(>F)
1 52410 2.1127e+12
2 52409 2.1112e+12 1 1433326660 35.581 2.462e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# RESET po rozšírení modelu
resettest(model_quad)
RESET test
data: model_quad
RESET = 262.5, df1 = 2, df2 = 52407, p-value < 2.2e-16
Vzťahy v spotrebe sa často líšia cez deň a v noci. Ako jednoduchý
indikátor dňa/noci použijeme GeneralDiffuseFlows: -
DAY = 1, ak GeneralDiffuseFlows > 0 -
DAY = 0, inak
Najprv otestujeme posun (zlom v intercept-e), potom zmenu sklonu pri teplote.
udaje$DAY <- ifelse(udaje$GeneralDiffuseFlows > 0, 1, 0)
# (A) zlom v autonómnom člene (posun úrovne spotreby cez deň)
model_day_shift <- lm(
PowerConsumption_Zone1 ~ DAY + Temperature + Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje
)
summary(model_day_shift)
Call:
lm(formula = PowerConsumption_Zone1 ~ DAY + Temperature + Humidity +
WindSpeed + GeneralDiffuseFlows + DiffuseFlows, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-15613.5 -4933.0 -653.3 4113.8 17982.7
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.675e+04 2.138e+02 125.132 <2e-16 ***
DAY NA NA NA NA
Temperature 5.349e+02 6.418e+00 83.331 <2e-16 ***
Humidity -5.651e+01 2.130e+00 -26.525 <2e-16 ***
WindSpeed -1.487e+02 1.358e+01 -10.951 <2e-16 ***
GeneralDiffuseFlows -1.702e+00 1.464e-01 -11.628 <2e-16 ***
DiffuseFlows -8.731e-02 2.721e-01 -0.321 0.748
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6349 on 52410 degrees of freedom
Multiple R-squared: 0.2073, Adjusted R-squared: 0.2072
F-statistic: 2741 on 5 and 52410 DF, p-value: < 2.2e-16
# (B) zlom v sklone: teplota môže pôsobiť inak cez deň a inak v noci
model_day_slope <- lm(
PowerConsumption_Zone1 ~ Temperature + I(DAY * Temperature) +
Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje
)
summary(model_day_slope)
Call:
lm(formula = PowerConsumption_Zone1 ~ Temperature + I(DAY * Temperature) +
Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje)
Residuals:
Min 1Q Median 3Q Max
-15613.5 -4933.0 -653.3 4113.8 17982.7
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.675e+04 2.138e+02 125.132 <2e-16 ***
Temperature 5.349e+02 6.418e+00 83.331 <2e-16 ***
I(DAY * Temperature) NA NA NA NA
Humidity -5.651e+01 2.130e+00 -26.525 <2e-16 ***
WindSpeed -1.487e+02 1.358e+01 -10.951 <2e-16 ***
GeneralDiffuseFlows -1.702e+00 1.464e-01 -11.628 <2e-16 ***
DiffuseFlows -8.731e-02 2.721e-01 -0.321 0.748
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6349 on 52410 degrees of freedom
Multiple R-squared: 0.2073, Adjusted R-squared: 0.2072
F-statistic: 2741 on 5 and 52410 DF, p-value: < 2.2e-16
# Porovnanie so základom
anova(model0, model_day_slope)
Analysis of Variance Table
Model 1: PowerConsumption_Zone1 ~ Temperature + Humidity + WindSpeed +
GeneralDiffuseFlows + DiffuseFlows
Model 2: PowerConsumption_Zone1 ~ Temperature + I(DAY * Temperature) +
Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows
Res.Df RSS Df Sum of Sq F Pr(>F)
1 52410 2.1127e+12
2 52410 2.1127e+12 0 0
# RESET test pre model so zlomom v sklone
resettest(model_day_slope)
RESET test
data: model_day_slope
RESET = 92.703, df1 = 2, df2 = 52407, p-value < 2.2e-16
Interpretácia interakcie I(DAY * Temperature): -
koeficient pri Temperature je efekt teploty v noci (DAY =
0), - koeficient pri I(DAY * Temperature) je rozdiel v
efekte teploty cez deň oproti noci.
Ak by model trpel nelinearitou v závislej premennej, niekedy pomôže transformácia spotreby. Toto je voliteľné, keďže môže zhoršiť interpretáciu.
library(MASS)
boxcox(model0)
Ak by vhodná transformácia vyšla blízko logaritmu, dá sa skúsiť:
model_log <- lm(
log(PowerConsumption_Zone1) ~ Temperature + Humidity + WindSpeed + GeneralDiffuseFlows + DiffuseFlows,
data = udaje
)
summary(model_log)
Call:
lm(formula = log(PowerConsumption_Zone1) ~ Temperature + Humidity +
WindSpeed + GeneralDiffuseFlows + DiffuseFlows, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-0.65223 -0.15051 -0.00796 0.13554 0.51379
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.018e+01 6.669e-03 1525.693 < 2e-16 ***
Temperature 1.661e-02 2.002e-04 82.955 < 2e-16 ***
Humidity -1.723e-03 6.646e-05 -25.927 < 2e-16 ***
WindSpeed -4.780e-03 4.236e-04 -11.284 < 2e-16 ***
GeneralDiffuseFlows -2.224e-05 4.566e-06 -4.871 1.11e-06 ***
DiffuseFlows 3.983e-05 8.487e-06 4.693 2.70e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1981 on 52410 degrees of freedom
Multiple R-squared: 0.2201, Adjusted R-squared: 0.22
F-statistic: 2958 on 5 and 52410 DF, p-value: < 2.2e-16
resettest(model_log)
RESET test
data: model_log
RESET = 269.93, df1 = 2, df2 = 52408, p-value < 2.2e-16
Pri dátach s časovou štruktúrou býva užitočné pozrieť aj robustné (HC) štandardné chyby:
coeftest(model0, vcov = vcovHC(model0, type = "HC1"))
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.6750e+04 2.2087e+02 121.1130 <2e-16 ***
Temperature 5.3486e+02 6.3420e+00 84.3361 <2e-16 ***
Humidity -5.6510e+01 2.1607e+00 -26.1539 <2e-16 ***
WindSpeed -1.4870e+02 1.3402e+01 -11.0953 <2e-16 ***
GeneralDiffuseFlows -1.7020e+00 1.1080e-01 -15.3611 <2e-16 ***
DiffuseFlows -8.7307e-02 1.7969e-01 -0.4859 0.6271
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1