# Raw-file URL of the SolarPrediction CSV hosted on GitHub.
url <- "https://raw.githubusercontent.com/underckit/machine_learning/main/SolarPrediction.csv"
# Download the CSV and parse it into a data frame (read.csv defaults).
SolarPrediction <- read.csv(file = url)

Разделение датасета на обучающую и тестовую выборки

# Randomly split the rows into a training part (~90%) and a test part (~10%).
# Every row receives an independent Bernoulli(0.9) flag, so the exact sizes
# fluctuate around the 90/10 target instead of being fixed.
#
# The RNG seed is fixed so that the split, and every model fit and test
# result derived from it, can be reproduced. Outputs recorded in this
# document predate the seed and will differ after a re-run.
set.seed(42)
dummy_sep <- rbinom(nrow(SolarPrediction), size = 1, prob = 0.9)
# Flag 1 -> training rows, flag 0 -> held-out test rows.
SolarPrediction.train <- SolarPrediction[dummy_sep == 1, ]
SolarPrediction.test <- SolarPrediction[dummy_sep == 0, ]

Линейная регрессия с исходными предикторами

Построим модель

# Baseline ordinary-least-squares fit on the training rows: Radiation
# explained by the untransformed Temperature, Pressure and Humidity.
model_ols <- lm(
  formula = Radiation ~ Temperature + Pressure + Humidity,
  data = SolarPrediction.train
)
# Print coefficient estimates, residual quantiles and overall fit statistics.
summary(model_ols)
## 
## Call:
## lm(formula = Radiation ~ Temperature + Pressure + Humidity, data = SolarPrediction.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -753.01 -137.83  -21.19  103.84 1188.32 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.056e+04  7.260e+02  28.314   <2e-16 ***
## Temperature  3.888e+01  2.144e-01 181.311   <2e-16 ***
## Pressure    -7.331e+02  2.394e+01 -30.625   <2e-16 ***
## Humidity    -4.588e-01  4.992e-02  -9.191   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 211 on 29501 degrees of freedom
## Multiple R-squared:  0.5534, Adjusted R-squared:  0.5534 
## F-statistic: 1.219e+04 on 3 and 29501 DF,  p-value: < 2.2e-16

Вычислим остатки на тестовой выборке: разницу между фактическими и предсказанными значениями Radiation

# Hold-out residuals of the baseline model: observed Radiation in the test
# set minus the value model_ols predicts for the same rows.
ost.lr <- SolarPrediction.test[["Radiation"]] -
  predict(model_ols, newdata = SolarPrediction.test)

Проверим модель с помощью T-test

# One-sample, two-sided t-test of H0: the mean hold-out residual of the
# baseline model equals 0 (i.e. no systematic over- or under-prediction).
t.test(x = ost.lr, mu = 0)
## 
##  One Sample t-test
## 
## data:  ost.lr
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -9.053920  5.506204
## sample estimates:
## mean of x 
## -1.773858

p-value больше 5%, поэтому нулевую гипотезу о том, что среднее остатков на тестовой выборке равно 0, отвергнуть нельзя: систематического смещения прогнозов не обнаружено.

Преобразование Бокса-Кокса

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Estimate Box-Cox parameters from the training rows only, so no information
# from the test rows leaks into the transformation. The whole data frame is
# passed in; caret decides which columns are eligible and ignores the rest.
trans <- preProcess(SolarPrediction.train, method = "BoxCox")
# Show the fitted pre-processing summary (what the wrapping parentheses
# printed before).
print(trans)
## Created from 29505 samples and 10 variables
## 
## Pre-processing:
##   - Box-Cox transformation (6)
##   - ignored (4)
## 
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, -0.9, 2, 1.9, 0.6
# Apply the Box-Cox transformation learned on the training rows to both
# partitions.
trSetTrans <- predict(trans, newdata = SolarPrediction.train)
testSetTrans <- predict(trans, newdata = SolarPrediction.test)

# Refit the same three-predictor linear model on the transformed training
# data and print its summary.
model_trls <- lm(
  formula = Radiation ~ Temperature + Pressure + Humidity,
  data = trSetTrans
)
summary(model_trls)
## 
## Call:
## lm(formula = Radiation ~ Temperature + Pressure + Humidity, data = trSetTrans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9204 -1.4760 -0.0956  1.3243  6.7889 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -5.102e+02  4.172e+00 -122.277  < 2e-16 ***
## Temperature  6.229e+02  3.448e+00  180.653  < 2e-16 ***
## Pressure    -3.440e-01  7.122e-03  -48.296  < 2e-16 ***
## Humidity     6.544e-05  1.026e-05    6.376 1.84e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.892 on 29501 degrees of freedom
## Multiple R-squared:  0.5323, Adjusted R-squared:  0.5323 
## F-statistic: 1.119e+04 on 3 and 29501 DF,  p-value: < 2.2e-16

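Вычислим остатки на тестовой выборке по преобразованным данным: разницу между фактическими и предсказанными значениями Radiation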

# Hold-out residuals of the Box-Cox model, computed entirely on the
# transformed test set: transformed-frame Radiation minus the model_trls
# prediction for the same rows.
# NOTE(review): the residual scale recorded in this document suggests
# Radiation itself was Box-Cox transformed, so these residuals are presumably
# not on the same scale as ost.lr — confirm before comparing the two.
ost.tlr <- testSetTrans[["Radiation"]] -
  predict(model_trls, newdata = testSetTrans)

Проверим модель с помощью T-test

# One-sample, two-sided t-test of H0: the mean hold-out residual of the
# Box-Cox model equals 0.
# The vector under test is ost.tlr (Box-Cox model residuals). The call used
# to pass ost.lr, which only repeated the baseline model's test.
# NOTE: the output recorded directly below still belongs to that old ost.lr
# call ("data:  ost.lr"); re-run the document to refresh it.
t.test(ost.tlr, mu = 0)
## 
##  One Sample t-test
## 
## data:  ost.lr
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -9.053920  5.506204
## sample estimates:
## mean of x 
## -1.773858

Преобразование методом главных компонент

library(caret)
# Fit centring, scaling and PCA on the three weather predictors of the
# training rows. The columns are selected by name instead of by position
# (previously [5:7]) so that a change in the CSV column order cannot silently
# feed other variables into the PCA.
# NOTE(review): positions 5:7 are taken to be Temperature, Pressure and
# Humidity because the PCA-based fit recorded in this document matches the
# baseline fit on those three predictors exactly — confirm against the CSV
# header.
trans1 <- preProcess(
  SolarPrediction.train[c("Temperature", "Pressure", "Humidity")],
  method = "pca"
)
# Show the fitted pre-processing summary (what the wrapping parentheses
# printed before).
print(trans1)
## Created from 29505 samples and 3 variables
## 
## Pre-processing:
##   - centered (3)
##   - ignored (0)
##   - principal component signal extraction (3)
##   - scaled (3)
## 
## PCA needed 3 components to capture 95 percent of the variance
# Replace the predictors with their principal-component scores in both
# partitions, using the centring/scaling/rotation learned on the training
# rows. Radiation must remain available in the result, since the model below
# uses it as the response.
trSetTrans1 <- predict(trans1, newdata = SolarPrediction.train)
testSetTrans1 <- predict(trans1, newdata = SolarPrediction.test)

# Regress Radiation on the three component scores and print the summary.
# Per the recorded output all three components are retained, so this is a
# re-parameterisation of the baseline model rather than a reduction.
model_trls1 <- lm(
  formula = Radiation ~ PC1 + PC2 + PC3,
  data = trSetTrans1
)
summary(model_trls1)
## 
## Call:
## lm(formula = Radiation ~ PC1 + PC2 + PC3, data = trSetTrans1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -753.01 -137.83  -21.19  103.84 1188.32 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  207.208      1.228  168.700   <2e-16 ***
## PC1          130.380      0.988  131.966   <2e-16 ***
## PC2           11.575      1.393    8.311   <2e-16 ***
## PC3         -206.221      1.493 -138.105   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 211 on 29501 degrees of freedom
## Multiple R-squared:  0.5534, Adjusted R-squared:  0.5534 
## F-statistic: 1.219e+04 on 3 and 29501 DF,  p-value: < 2.2e-16

Вычислим остатки на тестовой выборке: разницу между фактическими и предсказанными значениями Radiation

# Hold-out residuals of the PCA-based model: observed Radiation in the
# component-score test frame minus the model_trls1 prediction for those rows.
ost.tlr1 <- testSetTrans1[["Radiation"]] -
  predict(model_trls1, newdata = testSetTrans1)

Проверим модель с помощью T-test

# One-sample, two-sided t-test of H0: the mean hold-out residual of the
# PCA-based model equals 0.
t.test(x = ost.tlr1, mu = 0)
## 
##  One Sample t-test
## 
## data:  ost.tlr1
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -9.053920  5.506204
## sample estimates:
## mean of x 
## -1.773858