Creat new database mnNew which Add two new independent variables on mn ( including the daily Max o3, NOx,NO2,RH,TMP,WSP,WDR,SO2,CO all of which are from the same hour.)
library(knitr)
library(DBI)
library(RODBC)
library(RMySQL)
if (file.exists("mnNew.RData")) {
load("mnNew.RData")
} else {
# Get data CO,SO2 from SQL
m = dbDriver("MySQL")
con <- dbConnect(m, user = "upm", password = "password", dbname = "dbname",
host = "host")
CO <- dbGetQuery(con, "select FECHA,YEAR(FECHA) AS YEAR,MONTH(FECHA) AS MONTH,DAY(FECHA) AS DAY,WEEKDAY(FECHA) AS WEEKDAY,HORA,PED as CO FROM CO WHERE PED>0 GROUP BY FECHA ASC")
SO2 <- dbGetQuery(con, "select FECHA,YEAR(FECHA) AS YEAR,MONTH(FECHA) AS MONTH,DAY(FECHA) AS DAY,WEEKDAY(FECHA) AS WEEKDAY,HORA,PED as SO2 FROM SO2 WHERE PED>0 GROUP BY FECHA ASC")
# Merge the CO,SO2 and mn to creat a database mnNew,which includes the
# daily Max o3, NOx,NO2,RH,TMP,WSP,WDR,SO2,CO all of which are from the
# same hour
mnCO <- merge(mn, CO, by = c("FECHA", "YEAR", "MONTH", "DAY", "WEEKDAY",
"HORA"))
mnNew <- merge(mnCO, SO2, by = c("FECHA", "YEAR", "MONTH", "DAY", "WEEKDAY",
"HORA"))
save(mnNew, file = "mnNew.RData")
}
For database mnNew,O3 variable as dependent values,“MONTH”,“DAY”, “WEEKDAY”,“HORA”,“SEASON”,“NOx”,“NO2”,“RH”,“TMP”,“WDR”,“WSP”,“CO”,“SO2” as independent values Build 4 models,lm,svm,nnet,and randforest
# for mnNew,O3 variable as dependent values,'MONTH','DAY',
# 'WEEKDAY','HORA','SEASON','NOx','NO2','RH','TMP','WDR','WSP','CO','SO2'
# as independent value
if (file.exists("mnNewTrainingAndTesting.RData")) {
load("mnNewTrainingAndTesting.RData")
} else {
mnNewTarget <- mnNew[, "O3"]
mnNewInputs <- mnNew[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2",
"RH", "TMP", "WDR", "WSP", "CO", "SO2")]
# normalize datasu
library(RSNNS)
mnNewTargetNorm <- normalizeData(mnNewTarget, type = "0_1")
mnNewInputsNorm <- normalizeData(mnNewInputs, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNewInputsNorm, mnNewTargetNorm, ratio = 0.15)
mnNewInputsTrain <- mnSplit$inputsTrain
mnNewTargetTrain <- mnSplit$targetsTrain
mnNewInputsTest <- mnSplit$inputsTest
mnNewTargetTest <- mnSplit$targetsTest
save(mnNewInputsTrain, mnNewTargetTrain, mnNewInputsTest, mnNewTargetTest,
file = "mnNewTrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew.lmFit.RData")) {
load("mnNew.lmFit.RData")
} else {
mnNew.lmFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew.lmFit, file = "mnNew.lmFit.RData")
}
# svm
if (file.exists("mnNew.svmFit.RData")) {
load("mnNew.svmFit.RData")
} else {
mnNew.svmFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 4)
save(mnNew.svmFit, file = "mnNew.svmFit.RData")
}
# randomForest
if (file.exists("mnNew.rfFit.RData")) {
load("mnNew.rfFit.RData")
} else {
mnNew.rfFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 4)
save(mnNew.rfFit, file = "mnNew.rfFit.RData")
}
# neural networks
if (file.exists("mnNew.nnetFit.RData")) {
load("mnNew.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew.nnetFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew.nnetFit, file = "mnNew.nnetFit.RData")
}
# measure errors
modelErrors <- function(predicted, actual) {
sal <- vector(mode = "numeric", length = 3)
names(sal) <- c("MAE", "RMSE", "RELE")
meanPredicted <- mean(predicted)
meanActual <- mean(actual)
sumPred <- sum((predicted - meanPredicted)^2)
sumActual <- sum((actual - meanActual)^2)
n <- length(actual)
p3 <- vector(mode = "numeric", length = n)
for (i in c(1:n)) {
if (actual[i] == 0) {
p3[i] <- abs(predicted[i])
} else {
p3[i] <- ((abs(predicted[i] - actual[i]))/actual[i])
}
}
sal[1] <- mean(abs(predicted - actual))
sal[2] <- sqrt(sum((predicted - actual)^2)/n)
sal[3] <- mean(p3)
sal
}
models <- list(lm = mnNew.lmFit, svm = mnNew.svmFit, rf = mnNew.rfFit, nnet = mnNew.nnetFit)
preValues <- extractPrediction(models, testX = mnNewInputsTest, testY = mnNewTargetTest)
plotObsVsPred(preValues)
error <- function(model) {
pd <- predict(model, newdata = mnNewInputsTest)
modelErrors(pd, mnNewTargetTest)
}
mnNew.svmE <- error(mnNew.svmFit)
mnNew.lmE <- error(mnNew.lmFit)
mnNew.rfE <- error(mnNew.rfFit)
mnNew.nnetE <- error(mnNew.nnetFit)
summary(mnNew.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5090 -0.0796 -0.0096 0.0748 0.9126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.12412 0.03465 -3.58 0.00035 ***
## V1 -0.01017 0.00964 -1.06 0.29132
## V2 0.01820 0.00848 2.15 0.03206 *
## V3 0.37862 0.04271 8.87 < 2e-16 ***
## V4 0.02532 0.00788 3.21 0.00134 **
## V5 -0.83913 0.11155 -7.52 8.1e-14 ***
## V6 0.91131 0.07339 12.42 < 2e-16 ***
## V7 -0.23189 0.02078 -11.16 < 2e-16 ***
## V8 0.31839 0.02496 12.75 < 2e-16 ***
## V9 0.01987 0.00944 2.11 0.03531 *
## V10 -0.11305 0.03110 -3.64 0.00028 ***
## V11 0.63800 0.03376 18.90 < 2e-16 ***
## V12 -0.11349 0.03166 -3.58 0.00035 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.124 on 1987 degrees of freedom
## Multiple R-squared: 0.493, Adjusted R-squared: 0.49
## F-statistic: 161 on 12 and 1987 DF, p-value: <2e-16
Try to find out the relationship between different varialbes, and delete the ones having close correlation with the other input varialbes.
if (file.exists("mnNew2TrainingAndTesting.RData")) {
load("mnNew2TrainingAndTesting.RData")
} else {
# find if there are some varialbes are close correlation
library(PerformanceAnalytics)
# the correlations among different variables
cor.prob <- function(X, dfr = nrow(X) - 2) {
R <- cor(X, use = "pairwise.complete.obs")
above <- row(R) < col(R)
r2 <- R[above]^2
Fstat <- r2 * dfr/(1 - r2)
R[above] <- 1 - pf(Fstat, 1, dfr)
R[row(R) == col(R)] <- NA
R
}
flattenSquareMatrix <- function(m) {
if ((class(m) != "matrix") | (nrow(m) != ncol(m)))
stop("Must be a square matrix.")
if (!identical(rownames(m), colnames(m)))
stop("Row and column names must be equal.")
ut <- upper.tri(m)
data.frame(i = rownames(m)[row(m)[ut]], j = rownames(m)[col(m)[ut]],
cor = t(m)[ut], p = m[ut])
}
mnCorr <- cor(mnNewInputs)
mnCor.prob <- cor.prob(mnNewInputs)
flattenSquareMatrix(mnCor.prob)
chart.Correlation(mnNewInputs)
# It can be seen that,there are strong correlation between month and
# season,NOx and NO2,greater than 0.9. Remove SEASON and NOx repectively
# from each pairs.Creat a new database mnNew2 without the two variables
# above. remove SEASON and NOx
mnNew.high.corr <- findCorrelation(mnCorr)
mnNewInputs2 <- mnNewInputs[, -mnNew.high.corr]
mnNewTarget2 <- mnNewTarget
mnNew2 <- cbind(mnNewTarget2, mnNewInputs2)
save(mnNew2, file = "mnNew2.RData")
library(RSNNS)
mnNew2TargetNorm <- normalizeData(mnNewTarget2, type = "0_1")
mnNew2InputsNorm <- normalizeData(mnNewInputs2, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNew2InputsNorm, mnNew2TargetNorm, ratio = 0.15)
mnNew2InputsTrain <- mnSplit$inputsTrain
mnNew2TargetTrain <- mnSplit$targetsTrain
mnNew2InputsTest <- mnSplit$inputsTest
mnNew2TargetTest <- mnSplit$targetsTest
save(mnNew2InputsTrain, mnNew2TargetTrain, mnNew2InputsTest, mnNew2TargetTest,
file = "mnNew2TrainingAndTesting.RData")
}
# Linear Least Squares
if (file.exists("mnNew2.lmFit.RData")) {
load("mnNew2.lmFit.RData")
} else {
mnNew2.lmFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew2.lmFit, file = "mnNew2.lmFit.RData")
}
# svm
if (file.exists("mnNew2.svmFit.RData")) {
load("mnNew2.svmFit.RData")
} else {
mnNew2.svmFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 3)
save(mnNew2.svmFit, file = "mnNew2.svmFit.RData")
}
# randomForest
if (file.exists("mnNew2.rfFit.RData")) {
load("mnNew2.rfFit.RData")
} else {
mnNew2.rfFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 4)
save(mnNew2.rfFit, file = "mnNew2.rfFit.RData")
}
# neural networks
if (file.exists("mnNew2.nnetFit.RData")) {
load("mnNew2.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew2.nnetFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew2.nnetFit, file = "mnNew2.nnetFit.RData")
}
error2 <- function(model) {
pd <- predict(model, newdata = mnNew2InputsTest)
modelErrors(pd, mnNew2TargetTest)
}
mnNew2.lmE <- error2(mnNew2.lmFit)
mnNew2.rfE <- error2(mnNew2.rfFit)
mnNew2.svmE <- error2(mnNew2.svmFit)
mnNew2.nnetE <- error2(mnNew2.nnetFit)
summary(mnNew2.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4904 -0.0819 -0.0081 0.0780 0.5497
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15658 0.03486 -4.49 7.5e-06 ***
## V1 -0.01093 0.00977 -1.12 0.26357
## V2 0.02144 0.00859 2.50 0.01265 *
## V3 0.40270 0.04318 9.33 < 2e-16 ***
## V4 0.02702 0.00799 3.38 0.00073 ***
## V5 0.40086 0.02834 14.14 < 2e-16 ***
## V6 -0.23817 0.02105 -11.32 < 2e-16 ***
## V7 0.33893 0.02516 13.47 < 2e-16 ***
## V8 0.01790 0.00956 1.87 0.06140 .
## V9 -0.12241 0.03150 -3.89 0.00011 ***
## V10 0.64644 0.03420 18.90 < 2e-16 ***
## V11 -0.11532 0.03210 -3.59 0.00034 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.125 on 1988 degrees of freedom
## Multiple R-squared: 0.478, Adjusted R-squared: 0.475
## F-statistic: 166 on 11 and 1988 DF, p-value: <2e-16
Add a new input variable “yesterday” Max O3 level into mnNew,creat a database mnNew3,inputs varialbes include:“DAY”, “WEEKDAY”,“HORA”,“SEASON”,“NOx”,“NO2”,“RH”,“TMP”,“WDR”,“WSP”,“CO”,“SO2”,“O3Y1”
if (file.exists("mnNew3TrainingAndTesting.RData")) {
load("mnNew3TrainingAndTesting.RData")
} else {
O3MaxY1 <- dbGetQuery(con, "select FECHA+1,FECHA,MAX(PED) as O3Y1 FROM O3 WHERE PED >0 GROUP BY fecha ASC")
colnames(O3MaxY1)[1] <- c("FECHAM")
O3MaxY1$FECHAM <- as.Date(as.factor(O3MaxY1$FECHAM), format = "%Y%m%d")
mnNew$FECHA <- as.Date(as.factor(mnNew$FECHA))
mnNew3 <- merge(O3MaxY1, mnNew, by.x = "FECHAM", by.y = "FECHA")
mnNew3Target <- mnNew3[, "O3"]
mnNew3Inputs <- mnNew3[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2",
"RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3Y1")]
# normalize data
mnNew3TargetNorm <- normalizeData(mnNew3Target, type = "0_1")
mnNew3InputsNorm <- normalizeData(mnNew3Inputs, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNew3InputsNorm, mnNew3TargetNorm, ratio = 0.15)
mnNew3InputsTrain <- mnSplit$inputsTrain
mnNew3TargetTrain <- mnSplit$targetsTrain
mnNew3InputsTest <- mnSplit$inputsTest
mnNew3TargetTest <- mnSplit$targetsTest
save(mnNew3InputsTrain, mnNew3TargetTrain, mnNew3InputsTest, mnNew3TargetTest,
file = "mnNew3TrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew3.lmFit.RData")) {
load("mnNew3.lmFit.RData")
} else {
mnNew3.lmFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew3.lmFit, file = "mnNew3.lmFit.RData")
}
# svm
if (file.exists("mnNew3.svmFit.RData")) {
load("mnNew3.svmFit.RData")
} else {
mnNew3.svmFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 4)
save(mnNew3.svmFit, file = "mnNew3.svmFit.RData")
}
# neural networks
if (file.exists("mnNew3.nnetFit.RData")) {
load("mnNew3.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew3.nnetFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew3.nnetFit, file = "mnNew3.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew3.rfFit.RData")) {
load("mnNew3.rfFit.RData")
} else {
mnNew3.rfFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 3)
save(mnNew3.rfFit, file = "mnNew3.rfFit.RData")
}
error3 <- function(model) {
pd <- predict(model, newdata = mnNew3InputsTest)
modelErrors(pd, mnNew3TargetTest)
}
summary(mnNew3.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3906 -0.0714 -0.0093 0.0605 0.8162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.19257 0.03191 -6.04 1.9e-09 ***
## V1 -0.00383 0.00893 -0.43 0.668
## V2 0.00136 0.00783 0.17 0.862
## V3 0.38581 0.03923 9.83 < 2e-16 ***
## V4 0.01439 0.00729 1.97 0.048 *
## V5 -0.90132 0.10139 -8.89 < 2e-16 ***
## V6 0.86541 0.06682 12.95 < 2e-16 ***
## V7 -0.15659 0.01939 -8.08 1.2e-15 ***
## V8 0.26788 0.02317 11.56 < 2e-16 ***
## V9 0.01215 0.00867 1.40 0.161
## V10 -0.15118 0.02855 -5.30 1.3e-07 ***
## V11 0.56202 0.03157 17.80 < 2e-16 ***
## V12 -0.11951 0.02769 -4.32 1.7e-05 ***
## V13 0.37442 0.01804 20.76 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.112 on 1909 degrees of freedom
## Multiple R-squared: 0.591, Adjusted R-squared: 0.589
## F-statistic: 213 on 13 and 1909 DF, p-value: <2e-16
mnNew3.lmE <- error3(mnNew3.lmFit)
mnNew3.svmE <- error3(mnNew3.svmFit)
mnNew3.nnetE <- error3(mnNew3.nnetFit)
mnNew3.rfE <- error3(mnNew3.rfFit)
models <- list(svm = mnNew3.svmFit, lm = mnNew3.lmFit, nnet = mnNew3.nnetFit,
rf = mnNew3.rfFit)
preValues <- extractPrediction(models, testX = mnNew3InputsTest, testY = mnNew3TargetTest)
plotObsVsPred(preValues)
Next, I want to see if the day of week has an effect to the O3 level,get “weekday” out of input varibles, and compare the models based on mnNew3 with ANOVA
if (file.exists("mnNew4TrainingAndTesting.RData")) {
load("mnNew4TrainingAndTesting.RData")
} else {
mnNew4Target <- mnNew3[, "O3"]
mnNew4Inputs <- mnNew3[, c("DAY", "HORA", "SEASON", "NOx", "NO2", "RH",
"TMP", "WDR", "WSP", "CO", "SO2", "O3Y1")]
# normalize data
mnNew4TargetNorm <- normalizeData(mnNew4Target, type = "0_1")
mnNew4InputsNorm <- normalizeData(mnNew4Inputs, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNew4InputsNorm, mnNew4TargetNorm, ratio = 0.15)
mnNew4InputsTrain <- mnSplit$inputsTrain
mnNew4TargetTrain <- mnSplit$targetsTrain
mnNew4InputsTest <- mnSplit$inputsTest
mnNew4TargetTest <- mnSplit$targetsTest
save(mnNew4InputsTrain, mnNew4TargetTrain, mnNew4InputsTest, mnNew4TargetTest,
file = "mnNew4TrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew4.lmFit.RData")) {
load("mnNew4.lmFit.RData")
} else {
mnNew4.lmFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew4.lmFit, file = "mnNew4.lmFit.RData")
}
# svm
if (file.exists("mnNew4.svmFit.RData")) {
load("mnNew4.svmFit.RData")
} else {
mnNew4.svmFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 3)
save(mnNew4.svmFit, file = "mnNew4.svmFit.RData")
}
# neural networks
if (file.exists("mnNew4.nnetFit.RData")) {
load("mnNew4.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew4.nnetFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew4.nnetFit, file = "mnNew4.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew4.rfFit.RData")) {
load("mnNew4.rfFit.RData")
} else {
mnNew4.rfFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 3)
save(mnNew4.rfFit, file = "mnNew4.rfFit.RData")
}
error4 <- function(model) {
pd <- predict(model, newdata = mnNew4InputsTest)
modelErrors(pd, mnNew4TargetTest)
}
mnNew4.lmE <- error4(mnNew4.lmFit)
mnNew4.svmE <- error4(mnNew4.svmFit)
mnNew4.nnetE <- error4(mnNew4.nnetFit)
mnNew4.rfE <- error4(mnNew4.rfFit)
summary(mnNew4.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3900 -0.0714 -0.0093 0.0606 0.8173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.19180 0.03158 -6.07 1.5e-09 ***
## V1 -0.00382 0.00892 -0.43 0.669
## V2 0.38590 0.03922 9.84 < 2e-16 ***
## V3 0.01435 0.00728 1.97 0.049 *
## V4 -0.90227 0.10121 -8.91 < 2e-16 ***
## V5 0.86528 0.06680 12.95 < 2e-16 ***
## V6 -0.15644 0.01936 -8.08 1.1e-15 ***
## V7 0.26774 0.02316 11.56 < 2e-16 ***
## V8 0.01217 0.00867 1.40 0.160
## V9 -0.15138 0.02851 -5.31 1.2e-07 ***
## V10 0.56172 0.03151 17.82 < 2e-16 ***
## V11 -0.11910 0.02758 -4.32 1.7e-05 ***
## V12 0.37475 0.01793 20.90 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.112 on 1910 degrees of freedom
## Multiple R-squared: 0.591, Adjusted R-squared: 0.589
## F-statistic: 230 on 12 and 1910 DF, p-value: <2e-16
models <- list(svm = mnNew4.svmFit, lm = mnNew4.lmFit, nnet = mnNew4.nnetFit,
rf = mnNew4.rfFit)
preValues <- extractPrediction(models, testX = mnNew4InputsTest, testY = mnNew4TargetTest)
plotObsVsPred(preValues)
There is no significant dfferent between mnNew3.lmFit and mnNew4.lmFit
What about add the average of Ozone level one day ahead on mnNew3 database
if (file.exists("mnNew5TrainingAndTesting.RData")) {
load("mnNew5TrainingAndTesting.RData")
} else {
O3MAY1 <- dbGetQuery(con, "select FECHA+1,FECHA,AVG(PED) as O3AVGY1, MAX(PED) as O3MAXY1 FROM O3 WHERE PED>0 GROUP BY fecha ASC")
colnames(O3MAY1)[1] <- c("FECHAM")
O3MAY1$FECHAM <- as.Date(as.factor(O3MAY1$FECHAM), format = "%Y%m%d")
mnNew$FECHA <- as.Date(as.factor(mnNew$FECHA))
mnNew5 <- merge(O3MAY1, mnNew, by.x = "FECHAM", by.y = "FECHA")
mnNew5Target <- mnNew5[, "O3"]
mnNew5Inputs <- mnNew5[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2",
"RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3AVGY1", "O3MAXY1")]
# normalize data
library(RSNNS)
mnNew5TargetNorm <- normalizeData(mnNew5Target, type = "0_1")
mnNew5InputsNorm <- normalizeData(mnNew5Inputs, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNew5InputsNorm, mnNew5TargetNorm, ratio = 0.15)
mnNew5InputsTrain <- mnSplit$inputsTrain
mnNew5TargetTrain <- mnSplit$targetsTrain
mnNew5InputsTest <- mnSplit$inputsTest
mnNew5TargetTest <- mnSplit$targetsTest
colnames(mnNew5InputsTrain) <- paste("V", 1:14, sep = "")
colnames(mnNew5InputsTest) <- paste("V", 1:14, sep = "")
save(mnNew5InputsTrain, mnNew5TargetTrain, mnNew5InputsTest, mnNew5TargetTest,
file = "mnNew5TrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew5.lmFit.RData")) {
load("mnNew5.lmFit.RData")
} else {
mnNew5.lmFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew5.lmFit, file = "mnNew5.lmFit.RData")
}
# svm
if (file.exists("mnNew5.svmFit.RData")) {
load("mnNew5.svmFit.RData")
} else {
mnNew5.svmFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 3)
save(mnNew5.svmFit, file = "mnNew5.svmFit.RData")
}
# neural networks
if (file.exists("mnNew5.nnetFit.RData")) {
load("mnNew5.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew5.nnetFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew5.nnetFit, file = "mnNew5.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew5.rfFit.RData")) {
load("mnNew5.rfFit.RData")
} else {
mnNew5.rfFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 3)
save(mnNew5.rfFit, file = "mnNew5.rfFit.RData")
}
error5 <- function(model) {
pd <- predict(model, newdata = mnNew5InputsTest)
modelErrors(pd, mnNew5TargetTest)
}
summary(mnNew5.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3879 -0.0686 -0.0116 0.0626 0.8016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.21021 0.03192 -6.59 5.8e-11 ***
## V1 -0.00349 0.00887 -0.39 0.6940
## V2 0.00432 0.00781 0.55 0.5797
## V3 0.40694 0.03923 10.37 < 2e-16 ***
## V4 0.01981 0.00733 2.70 0.0069 **
## V5 -0.91497 0.10082 -9.08 < 2e-16 ***
## V6 0.87842 0.06648 13.21 < 2e-16 ***
## V7 -0.13703 0.01968 -6.96 4.5e-12 ***
## V8 0.24483 0.02351 10.41 < 2e-16 ***
## V9 0.01145 0.00862 1.33 0.1843
## V10 -0.15223 0.02837 -5.36 9.1e-08 ***
## V11 0.55111 0.03146 17.52 < 2e-16 ***
## V12 -0.13205 0.02764 -4.78 1.9e-06 ***
## V13 0.18341 0.03736 4.91 9.9e-07 ***
## V14 0.22882 0.03466 6.60 5.2e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.111 on 1908 degrees of freedom
## Multiple R-squared: 0.597, Adjusted R-squared: 0.594
## F-statistic: 202 on 14 and 1908 DF, p-value: <2e-16
mnNew5.lmE <- error5(mnNew5.lmFit)
mnNew5.svmE <- error5(mnNew5.svmFit)
mnNew5.nnetE <- error5(mnNew5.nnetFit)
mnNew5.rfE <- error5(mnNew5.rfFit)
models <- list(svm = mnNew5.svmFit, lm = mnNew5.lmFit, nnet = mnNew5.nnetFit,
rf = mnNew5.rfFit)
preValues <- extractPrediction(models, testX = mnNew5InputsTest, testY = mnNew5TargetTest)
plotObsVsPred(preValues)
What about use the average of Ozone level one day ahead on mnNew3 database instead of max daily ozone level
if (file.exists("mnNew6TrainingAndTesting.RData")) {
load("mnNew6TrainingAndTesting.RData")
} else {
mnNew6Target <- mnNew5[, "O3"]
mnNew6Inputs <- mnNew5[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2",
"RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3AVGY1")]
# normalize data
library(RSNNS)
mnNew6TargetNorm <- normalizeData(mnNew6Target, type = "0_1")
mnNew6InputsNorm <- normalizeData(mnNew6Inputs, type = "0_1")
# split the data into Training set and Testing set
mnSplit <- splitForTrainingAndTest(mnNew6InputsNorm, mnNew6TargetNorm, ratio = 0.15)
mnNew6InputsTrain <- mnSplit$inputsTrain
mnNew6TargetTrain <- mnSplit$targetsTrain
mnNew6InputsTest <- mnSplit$inputsTest
mnNew6TargetTest <- mnSplit$targetsTest
colnames(mnNew6InputsTrain) <- paste("V", 1:13, sep = "")
colnames(mnNew6InputsTest) <- paste("V", 1:13, sep = "")
save(mnNew6InputsTrain, mnNew6TargetTrain, mnNew6InputsTest, mnNew6TargetTest,
file = "mnNew6TrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew6.lmFit.RData")) {
load("mnNew6.lmFit.RData")
} else {
mnNew6.lmFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "lm",
trControl = cvcontrol, tuneLength = 4)
save(mnNew6.lmFit, file = "mnNew6.lmFit.RData")
}
# svm
if (file.exists("mnNew6.svmFit.RData")) {
load("mnNew6.svmFit.RData")
} else {
mnNew6.svmFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "svmRadial",
trControl = cvcontrol, tuneLength = 4)
save(mnNew6.svmFit, file = "mnNew6.svmFit.RData")
}
# neural networks
if (file.exists("mnNew6.nnetFit.RData")) {
load("mnNew6.nnetFit.RData")
} else {
nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005,
0.01))
mnNew6.nnetFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "nnet",
trControl = cvcontrol, tuneGrid = nnet.grid)
save(mnNew6.nnetFit, file = "mnNew6.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew6.rfFit.RData")) {
load("mnNew6.rfFit.RData")
} else {
mnNew6.rfFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "rf",
trControl = cvcontrol, tuneLength = 3)
save(mnNew6.rfFit, file = "mnNew6.rfFit.RData")
}
error6 <- function(model) {
pd <- predict(model, newdata = mnNew6InputsTest)
modelErrors(pd, mnNew6TargetTest)
}
mnNew6.lmE <- error6(mnNew6.lmFit)
mnNew6.svmE <- error6(mnNew6.svmFit)
mnNew6.nnetE <- error5(mnNew5.nnetFit)
mnNew6.rfE <- error6(mnNew6.rfFit)
summary(mnNew6.lmFit)
##
## Call:
## lm(formula = modFormula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4047 -0.0706 -0.0110 0.0625 0.7987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22091 0.03223 -6.85 9.6e-12 ***
## V1 -0.00353 0.00897 -0.39 0.69434
## V2 0.01055 0.00784 1.35 0.17852
## V3 0.43089 0.03950 10.91 < 2e-16 ***
## V4 0.02789 0.00730 3.82 0.00014 ***
## V5 -0.91815 0.10194 -9.01 < 2e-16 ***
## V6 0.89848 0.06714 13.38 < 2e-16 ***
## V7 -0.12609 0.01983 -6.36 2.5e-10 ***
## V8 0.22731 0.02362 9.62 < 2e-16 ***
## V9 0.01140 0.00872 1.31 0.19127
## V10 -0.14717 0.02868 -5.13 3.2e-07 ***
## V11 0.55343 0.03181 17.40 < 2e-16 ***
## V12 -0.14412 0.02789 -5.17 2.6e-07 ***
## V13 0.39451 0.01954 20.19 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.112 on 1909 degrees of freedom
## Multiple R-squared: 0.587, Adjusted R-squared: 0.585
## F-statistic: 209 on 13 and 1909 DF, p-value: <2e-16
models <- list(svm = mnNew6.svmFit, lm = mnNew6.lmFit, nnet = mnNew6.nnetFit,
rf = mnNew6.rfFit)
preValues <- extractPrediction(models, testX = mnNew6InputsTest, testY = mnNew6TargetTest)
plotObsVsPred(preValues)
Compare errors of 4 models among different databases
lmError <- rbind(mnNew.lmE, mnNew2.lmE, mnNew3.lmE, mnNew4.lmE, mnNew5.lmE,
mnNew6.lmE)
lmError
## MAE RMSE RELE
## mnNew.lmE 0.07191 0.08754 0.4215
## mnNew2.lmE 0.06943 0.08615 0.4056
## mnNew3.lmE 0.04842 0.06192 0.2982
## mnNew4.lmE 0.04847 0.06199 0.2985
## mnNew5.lmE 0.05019 0.06323 0.3103
## mnNew6.lmE 0.05626 0.06957 0.3431
svmError <- rbind(mnNew.svmE, mnNew2.svmE, mnNew3.svmE, mnNew4.svmE, mnNew5.svmE,
mnNew6.svmE)
svmError
## MAE RMSE RELE
## mnNew.svmE 0.04548 0.05815 0.3002
## mnNew2.svmE 0.04860 0.06129 0.3095
## mnNew3.svmE 0.04706 0.05916 0.3029
## mnNew4.svmE 0.04408 0.05711 0.2872
## mnNew5.svmE 0.04494 0.05700 0.2904
## mnNew6.svmE 0.04820 0.06048 0.3081
nnetError <- rbind(mnNew.nnetE, mnNew2.nnetE, mnNew3.nnetE, mnNew4.nnetE, mnNew5.nnetE,
mnNew6.nnetE)
nnetError
## MAE RMSE RELE
## mnNew.nnetE 0.04398 0.05865 0.2282
## mnNew2.nnetE 0.04976 0.06489 0.2609
## mnNew3.nnetE 0.04596 0.05945 0.2055
## mnNew4.nnetE 0.03979 0.05260 0.2104
## mnNew5.nnetE 0.04000 0.05198 0.1958
## mnNew6.nnetE 0.04000 0.05198 0.1958
rfError <- rbind(mnNew.rfE, mnNew2.rfE, mnNew3.rfE, mnNew4.rfE, mnNew5.rfE,
mnNew6.rfE)
rfError
## MAE RMSE RELE
## mnNew.rfE 0.04855 0.06175 0.2803
## mnNew2.rfE 0.04889 0.06214 0.2740
## mnNew3.rfE 0.04342 0.05462 0.2502
## mnNew4.rfE 0.04466 0.05595 0.2549
## mnNew5.rfE 0.04463 0.05533 0.2586
## mnNew6.rfE 0.04564 0.05655 0.2694
# anova
anova(mnNew2.lmFit$finalModel, mnNew.lmFit$finalModel)
## Analysis of Variance Table
##
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1988 31.3
## 2 1987 30.4 1 0.866 56.6 8.1e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mnNew4.lmFit$finalModel, mnNew3.lmFit$finalModel)
## Analysis of Variance Table
##
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12 + V13
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1910 23.9
## 2 1909 23.9 1 0.000376 0.03 0.86
anova(mnNew3.lmFit$finalModel, mnNew6.lmFit$finalModel)
## Analysis of Variance Table
##
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12 + V13
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12 + V13
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1909 23.9
## 2 1909 24.1 0 -0.241
anova(mnNew3.lmFit$finalModel, mnNew5.lmFit$finalModel)
## Analysis of Variance Table
##
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12 + V13
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 +
## V11 + V12 + V13 + V14
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1909 23.9
## 2 1908 23.6 1 0.297 24.1 9.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1