# Download the solar-radiation data set and split it at random into a
# training subset (about 90% of the rows) and a test subset (the rest).
url <- "https://raw.githubusercontent.com/underckit/machine_learning/main/SolarPrediction.csv"
SolarPrediction <- read.csv(url)
# Fix the RNG seed so that the random split, and every result derived from
# it, is reproducible between runs. NOTE: figures echoed elsewhere in this
# document were produced by an unseeded run and will differ after re-running.
set.seed(42)
# One Bernoulli(0.9) draw per row: 1 sends the row to training, 0 to test.
dummy_sep <- rbinom(nrow(SolarPrediction), 1, 0.9)
SolarPrediction.train <- SolarPrediction[dummy_sep == 1, ]
SolarPrediction.test <- SolarPrediction[dummy_sep == 0, ]
Построим модель
# Baseline model: ordinary least squares regression of Radiation on
# Temperature, Pressure and Humidity, fitted on the training subset only.
model_ols <- lm(
  formula = Radiation ~ Temperature + Pressure + Humidity,
  data = SolarPrediction.train
)
# Print coefficient estimates and goodness-of-fit statistics.
summary(model_ols)
##
## Call:
## lm(formula = Radiation ~ Temperature + Pressure + Humidity, data = SolarPrediction.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -753.01 -137.83 -21.19 103.84 1188.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.056e+04 7.260e+02 28.314 <2e-16 ***
## Temperature 3.888e+01 2.144e-01 181.311 <2e-16 ***
## Pressure -7.331e+02 2.394e+01 -30.625 <2e-16 ***
## Humidity -4.588e-01 4.992e-02 -9.191 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 211 on 29501 degrees of freedom
## Multiple R-squared: 0.5534, Adjusted R-squared: 0.5534
## F-statistic: 1.219e+04 on 3 and 29501 DF, p-value: < 2.2e-16
Вычислим остатки на тестовой выборке — разницу между фактическими и предсказанными значениями Radiation
# Hold-out residuals of the baseline model: observed Radiation in the test
# subset minus the value predicted by model_ols for the same rows.
ost.lr <- SolarPrediction.test[["Radiation"]] -
  predict(model_ols, newdata = SolarPrediction.test)
Проверим модель с помощью T-test
# One-sample t-test of H0: the mean hold-out residual of model_ols is zero.
t.test(x = ost.lr, mu = 0)
##
## One Sample t-test
##
## data: ost.lr
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -9.053920 5.506204
## sample estimates:
## mean of x
## -1.773858
p-value (0.6329) больше 5%, поэтому нулевая гипотеза о равенстве среднего нулю не отвергается: среднее значение остатков на тестовой выборке статистически незначимо отличается от 0.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Estimate Box-Cox transformation parameters from the training subset only,
# then display a summary of the fitted pre-processing object.
trans <- preProcess(SolarPrediction.train, method = "BoxCox")
print(trans)
## Created from 29505 samples and 10 variables
##
## Pre-processing:
## - Box-Cox transformation (6)
## - ignored (4)
##
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, -0.9, 2, 1.9, 0.6
# Apply the Box-Cox pre-processing (estimated on the training subset) to both
# subsets, so the test data are transformed with the training parameters.
trSetTrans <- predict(trans, newdata = SolarPrediction.train)
testSetTrans <- predict(trans, newdata = SolarPrediction.test)
# Refit the same linear specification on the transformed training data.
model_trls <- lm(
  formula = Radiation ~ Temperature + Pressure + Humidity,
  data = trSetTrans
)
summary(model_trls)
##
## Call:
## lm(formula = Radiation ~ Temperature + Pressure + Humidity, data = trSetTrans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9204 -1.4760 -0.0956 1.3243 6.7889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.102e+02 4.172e+00 -122.277 < 2e-16 ***
## Temperature 6.229e+02 3.448e+00 180.653 < 2e-16 ***
## Pressure -3.440e-01 7.122e-03 -48.296 < 2e-16 ***
## Humidity 6.544e-05 1.026e-05 6.376 1.84e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.892 on 29501 degrees of freedom
## Multiple R-squared: 0.5323, Adjusted R-squared: 0.5323
## F-statistic: 1.119e+04 on 3 and 29501 DF, p-value: < 2.2e-16
Вычислим остатки модели на преобразованных данных (Box-Cox) для тестовой выборки
# Hold-out residuals of the Box-Cox model: Radiation from the transformed
# test set minus the prediction of model_trls for the same rows.
ost.tlr <- testSetTrans[["Radiation"]] -
  predict(model_trls, newdata = testSetTrans)
Проверим модель с помощью T-test
# One-sample t-test of H0: the mean hold-out residual of the Box-Cox model
# (model_trls) is zero. The test must use ost.tlr; the previous call passed
# ost.lr and therefore only repeated the test of the baseline OLS model.
# NOTE: the echoed output that follows was produced from ost.lr and has to be
# regenerated after this fix.
t.test(ost.tlr, mu = 0)
##
## One Sample t-test
##
## data: ost.lr
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -9.053920 5.506204
## sample estimates:
## mean of x
## -1.773858
library(caret)
# Fit a PCA pre-processing step on columns 5 to 7 of the training subset and
# display its summary.
# NOTE(review): columns 5:7 are presumably Temperature, Pressure and Humidity
# (the PCA regression reproduces the baseline fit statistics) - confirm, and
# consider selecting the columns by name instead of by position.
trans1 <- preProcess(SolarPrediction.train[5:7], method = "pca")
print(trans1)
## Created from 29505 samples and 3 variables
##
## Pre-processing:
## - centered (3)
## - ignored (0)
## - principal component signal extraction (3)
## - scaled (3)
##
## PCA needed 3 components to capture 95 percent of the variance
# Project both subsets with the PCA fitted on the training data.
trSetTrans1 <- predict(trans1, newdata = SolarPrediction.train)
testSetTrans1 <- predict(trans1, newdata = SolarPrediction.test)
# Regress Radiation on the three principal components.
model_trls1 <- lm(
  formula = Radiation ~ PC1 + PC2 + PC3,
  data = trSetTrans1
)
summary(model_trls1)
##
## Call:
## lm(formula = Radiation ~ PC1 + PC2 + PC3, data = trSetTrans1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -753.01 -137.83 -21.19 103.84 1188.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.208 1.228 168.700 <2e-16 ***
## PC1 130.380 0.988 131.966 <2e-16 ***
## PC2 11.575 1.393 8.311 <2e-16 ***
## PC3 -206.221 1.493 -138.105 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 211 on 29501 degrees of freedom
## Multiple R-squared: 0.5534, Adjusted R-squared: 0.5534
## F-statistic: 1.219e+04 on 3 and 29501 DF, p-value: < 2.2e-16
Вычислим остатки модели на главных компонентах для тестовой выборки
# Hold-out residuals of the PCA model: observed Radiation in the projected
# test set minus the prediction of model_trls1 for the same rows.
ost.tlr1 <- testSetTrans1[["Radiation"]] -
  predict(model_trls1, newdata = testSetTrans1)
Проверим модель с помощью T-test
# One-sample t-test of H0: the mean hold-out residual of model_trls1 is zero.
t.test(x = ost.tlr1, mu = 0)
##
## One Sample t-test
##
## data: ost.tlr1
## t = -0.47775, df = 3180, p-value = 0.6329
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -9.053920 5.506204
## sample estimates:
## mean of x
## -1.773858