Create a new database mnNew by adding two new independent variables (SO2 and CO) to mn, so that it includes the daily max O3 together with NOx, NO2, RH, TMP, WSP, WDR, SO2 and CO, all taken from the same hour.

library(knitr)
library(DBI)
library(RODBC)
library(RMySQL)
if (file.exists("mnNew.RData")) {
    load("mnNew.RData")
} else {
    # Get the CO and SO2 data from the SQL database
    m = dbDriver("MySQL")
    con <- dbConnect(m, user = "upm", password = "password", dbname = "dbname", 
        host = "host")
    CO <- dbGetQuery(con, "select FECHA,YEAR(FECHA) AS YEAR,MONTH(FECHA) AS MONTH,DAY(FECHA) AS DAY,WEEKDAY(FECHA) AS WEEKDAY,HORA,PED as CO FROM CO WHERE PED>0 GROUP BY FECHA ASC")
    SO2 <- dbGetQuery(con, "select FECHA,YEAR(FECHA) AS YEAR,MONTH(FECHA) AS MONTH,DAY(FECHA) AS DAY,WEEKDAY(FECHA) AS WEEKDAY,HORA,PED as SO2 FROM SO2 WHERE PED>0 GROUP BY FECHA ASC")
    # Merge CO, SO2 and mn to create the database mnNew, which includes the
    # daily max O3, NOx, NO2, RH, TMP, WSP, WDR, SO2 and CO, all from the
    # same hour
    mnCO <- merge(mn, CO, by = c("FECHA", "YEAR", "MONTH", "DAY", "WEEKDAY", 
        "HORA"))
    mnNew <- merge(mnCO, SO2, by = c("FECHA", "YEAR", "MONTH", "DAY", "WEEKDAY", 
        "HORA"))
    save(mnNew, file = "mnNew.RData")
}

For the database mnNew, use O3 as the dependent variable and “MONTH”, “DAY”, “WEEKDAY”, “HORA”, “SEASON”, “NOx”, “NO2”, “RH”, “TMP”, “WDR”, “WSP”, “CO”, “SO2” as independent variables, then build four models: lm, svm, nnet and randomForest.

# For mnNew: O3 as the dependent variable; 'MONTH','DAY','WEEKDAY','HORA',
# 'SEASON','NOx','NO2','RH','TMP','WDR','WSP','CO','SO2' as independent
# variables
if (file.exists("mnNewTrainingAndTesting.RData")) {
    load("mnNewTrainingAndTesting.RData")
} else {
    mnNewTarget <- mnNew[, "O3"]
    mnNewInputs <- mnNew[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2", 
        "RH", "TMP", "WDR", "WSP", "CO", "SO2")]
    # normalize data
    library(RSNNS)
    mnNewTargetNorm <- normalizeData(mnNewTarget, type = "0_1")
    mnNewInputsNorm <- normalizeData(mnNewInputs, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNewInputsNorm, mnNewTargetNorm, ratio = 0.15)
    mnNewInputsTrain <- mnSplit$inputsTrain
    mnNewTargetTrain <- mnSplit$targetsTrain
    mnNewInputsTest <- mnSplit$inputsTest
    mnNewTargetTest <- mnSplit$targetsTest
    save(mnNewInputsTrain, mnNewTargetTrain, mnNewInputsTest, mnNewTargetTest, 
        file = "mnNewTrainingAndTesting.RData")
}

library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew.lmFit.RData")) {
    load("mnNew.lmFit.RData")
} else {
    mnNew.lmFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew.lmFit, file = "mnNew.lmFit.RData")
}
# svm
if (file.exists("mnNew.svmFit.RData")) {
    load("mnNew.svmFit.RData")
} else {
    mnNew.svmFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew.svmFit, file = "mnNew.svmFit.RData")
}

# randomForest
if (file.exists("mnNew.rfFit.RData")) {
    load("mnNew.rfFit.RData")
} else {
    mnNew.rfFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew.rfFit, file = "mnNew.rfFit.RData")
}
# neural networks
if (file.exists("mnNew.nnetFit.RData")) {
    load("mnNew.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew.nnetFit <- train(mnNewInputsTrain, mnNewTargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew.nnetFit, file = "mnNew.nnetFit.RData")
}
# Measure prediction errors: mean absolute error (MAE), root mean squared
# error (RMSE) and mean relative error (RELE)
modelErrors <- function(predicted, actual) {
    sal <- vector(mode = "numeric", length = 3)
    names(sal) <- c("MAE", "RMSE", "RELE")
    n <- length(actual)
    # relative error per observation; when the actual value is 0, fall back
    # to the absolute predicted value to avoid division by zero
    p3 <- ifelse(actual == 0, abs(predicted), abs(predicted - actual)/actual)
    sal[1] <- mean(abs(predicted - actual))
    sal[2] <- sqrt(sum((predicted - actual)^2)/n)
    sal[3] <- mean(p3)
    sal
}
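# Quick sanity check of modelErrors on toy vectors (illustrative values only,
# not part of the analysis above)
modelErrors(c(1, 2, 3), c(1, 2, 3))  # perfect prediction: all three errors are 0
modelErrors(c(1.1, 2.1, 3.1), c(1, 2, 3))  # constant offset of 0.1: MAE = RMSE = 0.1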
models <- list(lm = mnNew.lmFit, svm = mnNew.svmFit, rf = mnNew.rfFit, nnet = mnNew.nnetFit)
preValues <- extractPrediction(models, testX = mnNewInputsTest, testY = mnNewTargetTest)
plotObsVsPred(preValues)

[Plot of chunk extendMn.2: observed vs. predicted O3 for the four models on the mnNew test set]

error <- function(model) {
    pd <- predict(model, newdata = mnNewInputsTest)
    modelErrors(pd, mnNewTargetTest)
}
mnNew.svmE <- error(mnNew.svmFit)
mnNew.lmE <- error(mnNew.lmFit)
mnNew.rfE <- error(mnNew.rfFit)
mnNew.nnetE <- error(mnNew.nnetFit)
summary(mnNew.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5090 -0.0796 -0.0096  0.0748  0.9126 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.12412    0.03465   -3.58  0.00035 ***
## V1          -0.01017    0.00964   -1.06  0.29132    
## V2           0.01820    0.00848    2.15  0.03206 *  
## V3           0.37862    0.04271    8.87  < 2e-16 ***
## V4           0.02532    0.00788    3.21  0.00134 ** 
## V5          -0.83913    0.11155   -7.52  8.1e-14 ***
## V6           0.91131    0.07339   12.42  < 2e-16 ***
## V7          -0.23189    0.02078  -11.16  < 2e-16 ***
## V8           0.31839    0.02496   12.75  < 2e-16 ***
## V9           0.01987    0.00944    2.11  0.03531 *  
## V10         -0.11305    0.03110   -3.64  0.00028 ***
## V11          0.63800    0.03376   18.90  < 2e-16 ***
## V12         -0.11349    0.03166   -3.58  0.00035 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.124 on 1987 degrees of freedom
## Multiple R-squared:  0.493,  Adjusted R-squared:  0.49 
## F-statistic:  161 on 12 and 1987 DF,  p-value: <2e-16
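
The coefficients V1 through V12 in the summary correspond, in order, to the columns of mnNewInputs above, since the normalized input matrix carries no informative column names. The snippet below is only a reading aid based on that column order (an assumption, not part of the model output):

# Reading aid (assumption: Vn follows the column order of mnNewInputs)
data.frame(coef = paste0("V", 1:12),
    variable = c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2",
        "RH", "TMP", "WDR", "WSP", "CO", "SO2"))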

Try to find the relationships between the different variables, and delete those that are closely correlated with other input variables.

if (file.exists("mnNew2TrainingAndTesting.RData")) {
    load("mnNew2TrainingAndTesting.RData")
} else {
    # check whether some variables are closely correlated
    library(PerformanceAnalytics)
    # the correlations among different variables
    cor.prob <- function(X, dfr = nrow(X) - 2) {
        R <- cor(X, use = "pairwise.complete.obs")
        above <- row(R) < col(R)
        r2 <- R[above]^2
        Fstat <- r2 * dfr/(1 - r2)
        R[above] <- 1 - pf(Fstat, 1, dfr)
        R[row(R) == col(R)] <- NA
        R
    }
    flattenSquareMatrix <- function(m) {
        if (!is.matrix(m) || (nrow(m) != ncol(m))) 
            stop("Must be a square matrix.")
        if (!identical(rownames(m), colnames(m))) 
            stop("Row and column names must be equal.")
        ut <- upper.tri(m)
        data.frame(i = rownames(m)[row(m)[ut]], j = rownames(m)[col(m)[ut]], 
            cor = t(m)[ut], p = m[ut])
    }
    mnCorr <- cor(mnNewInputs)
    mnCor.prob <- cor.prob(mnNewInputs)
    flattenSquareMatrix(mnCor.prob)
    chart.Correlation(mnNewInputs)
    # It can be seen that there are strong correlations (greater than 0.9)
    # between MONTH and SEASON and between NOx and NO2. Remove SEASON and
    # NOx respectively from each pair, and create a new database mnNew2
    # without these two variables.
    mnNew.high.corr <- findCorrelation(mnCorr)
    mnNewInputs2 <- mnNewInputs[, -mnNew.high.corr]
    mnNewTarget2 <- mnNewTarget
    mnNew2 <- cbind(mnNewTarget2, mnNewInputs2)
    save(mnNew2, file = "mnNew2.RData")
    library(RSNNS)
    mnNew2TargetNorm <- normalizeData(mnNewTarget2, type = "0_1")
    mnNew2InputsNorm <- normalizeData(mnNewInputs2, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNew2InputsNorm, mnNew2TargetNorm, ratio = 0.15)
    mnNew2InputsTrain <- mnSplit$inputsTrain
    mnNew2TargetTrain <- mnSplit$targetsTrain
    mnNew2InputsTest <- mnSplit$inputsTest
    mnNew2TargetTest <- mnSplit$targetsTest
    save(mnNew2InputsTrain, mnNew2TargetTrain, mnNew2InputsTest, mnNew2TargetTest, 
        file = "mnNew2TrainingAndTesting.RData")
}
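
To double-check which inputs were dropped, the indices returned by findCorrelation() can be translated back into column names (a sketch; it assumes mnNewInputs and mnNew.high.corr from the block above are still in the current session rather than only stored in the .RData file):

# Column names flagged by findCorrelation() for removal
# (assumes mnNewInputs and mnNew.high.corr exist in the workspace)
colnames(mnNewInputs)[mnNew.high.corr]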

# Linear Least Squares
if (file.exists("mnNew2.lmFit.RData")) {
    load("mnNew2.lmFit.RData")
} else {
    mnNew2.lmFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew2.lmFit, file = "mnNew2.lmFit.RData")
}
# svm
if (file.exists("mnNew2.svmFit.RData")) {
    load("mnNew2.svmFit.RData")
} else {
    mnNew2.svmFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew2.svmFit, file = "mnNew2.svmFit.RData")
}

# randomForest
if (file.exists("mnNew2.rfFit.RData")) {
    load("mnNew2.rfFit.RData")
} else {
    mnNew2.rfFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew2.rfFit, file = "mnNew2.rfFit.RData")
}
# neural networks
if (file.exists("mnNew2.nnetFit.RData")) {
    load("mnNew2.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew2.nnetFit <- train(mnNew2InputsTrain, mnNew2TargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew2.nnetFit, file = "mnNew2.nnetFit.RData")
}
error2 <- function(model) {
    pd <- predict(model, newdata = mnNew2InputsTest)
    modelErrors(pd, mnNew2TargetTest)
}
mnNew2.lmE <- error2(mnNew2.lmFit)
mnNew2.rfE <- error2(mnNew2.rfFit)
mnNew2.svmE <- error2(mnNew2.svmFit)
mnNew2.nnetE <- error2(mnNew2.nnetFit)
summary(mnNew2.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4904 -0.0819 -0.0081  0.0780  0.5497 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15658    0.03486   -4.49  7.5e-06 ***
## V1          -0.01093    0.00977   -1.12  0.26357    
## V2           0.02144    0.00859    2.50  0.01265 *  
## V3           0.40270    0.04318    9.33  < 2e-16 ***
## V4           0.02702    0.00799    3.38  0.00073 ***
## V5           0.40086    0.02834   14.14  < 2e-16 ***
## V6          -0.23817    0.02105  -11.32  < 2e-16 ***
## V7           0.33893    0.02516   13.47  < 2e-16 ***
## V8           0.01790    0.00956    1.87  0.06140 .  
## V9          -0.12241    0.03150   -3.89  0.00011 ***
## V10          0.64644    0.03420   18.90  < 2e-16 ***
## V11         -0.11532    0.03210   -3.59  0.00034 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.125 on 1988 degrees of freedom
## Multiple R-squared:  0.478,  Adjusted R-squared:  0.475 
## F-statistic:  166 on 11 and 1988 DF,  p-value: <2e-16

Add a new input variable, the previous day’s (“yesterday’s”) max O3 level, to mnNew and create a database mnNew3. The input variables are: “DAY”, “WEEKDAY”, “HORA”, “SEASON”, “NOx”, “NO2”, “RH”, “TMP”, “WDR”, “WSP”, “CO”, “SO2”, “O3Y1”.

if (file.exists("mnNew3TrainingAndTesting.RData")) {
    load("mnNew3TrainingAndTesting.RData")
} else {
    O3MaxY1 <- dbGetQuery(con, "select FECHA+1,FECHA,MAX(PED) as O3Y1 FROM O3 WHERE PED  >0 GROUP BY fecha ASC")
    colnames(O3MaxY1)[1] <- c("FECHAM")
    O3MaxY1$FECHAM <- as.Date(as.factor(O3MaxY1$FECHAM), format = "%Y%m%d")
    mnNew$FECHA <- as.Date(as.factor(mnNew$FECHA))
    mnNew3 <- merge(O3MaxY1, mnNew, by.x = "FECHAM", by.y = "FECHA")
    mnNew3Target <- mnNew3[, "O3"]
    mnNew3Inputs <- mnNew3[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2", 
        "RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3Y1")]
    # normalize data
    mnNew3TargetNorm <- normalizeData(mnNew3Target, type = "0_1")
    mnNew3InputsNorm <- normalizeData(mnNew3Inputs, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNew3InputsNorm, mnNew3TargetNorm, ratio = 0.15)
    mnNew3InputsTrain <- mnSplit$inputsTrain
    mnNew3TargetTrain <- mnSplit$targetsTrain
    mnNew3InputsTest <- mnSplit$inputsTest
    mnNew3TargetTest <- mnSplit$targetsTest
    save(mnNew3InputsTrain, mnNew3TargetTrain, mnNew3InputsTest, mnNew3TargetTest, 
        file = "mnNew3TrainingAndTesting.RData")
}
library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew3.lmFit.RData")) {
    load("mnNew3.lmFit.RData")
} else {
    mnNew3.lmFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew3.lmFit, file = "mnNew3.lmFit.RData")
}
# svm
if (file.exists("mnNew3.svmFit.RData")) {
    load("mnNew3.svmFit.RData")
} else {
    mnNew3.svmFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew3.svmFit, file = "mnNew3.svmFit.RData")
}
# neural networks
if (file.exists("mnNew3.nnetFit.RData")) {
    load("mnNew3.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew3.nnetFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew3.nnetFit, file = "mnNew3.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew3.rfFit.RData")) {
    load("mnNew3.rfFit.RData")
} else {
    mnNew3.rfFit <- train(mnNew3InputsTrain, mnNew3TargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew3.rfFit, file = "mnNew3.rfFit.RData")
}
error3 <- function(model) {
    pd <- predict(model, newdata = mnNew3InputsTest)
    modelErrors(pd, mnNew3TargetTest)
}
summary(mnNew3.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3906 -0.0714 -0.0093  0.0605  0.8162 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.19257    0.03191   -6.04  1.9e-09 ***
## V1          -0.00383    0.00893   -0.43    0.668    
## V2           0.00136    0.00783    0.17    0.862    
## V3           0.38581    0.03923    9.83  < 2e-16 ***
## V4           0.01439    0.00729    1.97    0.048 *  
## V5          -0.90132    0.10139   -8.89  < 2e-16 ***
## V6           0.86541    0.06682   12.95  < 2e-16 ***
## V7          -0.15659    0.01939   -8.08  1.2e-15 ***
## V8           0.26788    0.02317   11.56  < 2e-16 ***
## V9           0.01215    0.00867    1.40    0.161    
## V10         -0.15118    0.02855   -5.30  1.3e-07 ***
## V11          0.56202    0.03157   17.80  < 2e-16 ***
## V12         -0.11951    0.02769   -4.32  1.7e-05 ***
## V13          0.37442    0.01804   20.76  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.112 on 1909 degrees of freedom
## Multiple R-squared:  0.591,  Adjusted R-squared:  0.589 
## F-statistic:  213 on 13 and 1909 DF,  p-value: <2e-16
mnNew3.lmE <- error3(mnNew3.lmFit)
mnNew3.svmE <- error3(mnNew3.svmFit)
mnNew3.nnetE <- error3(mnNew3.nnetFit)
mnNew3.rfE <- error3(mnNew3.rfFit)
models <- list(svm = mnNew3.svmFit, lm = mnNew3.lmFit, nnet = mnNew3.nnetFit, 
    rf = mnNew3.rfFit)
preValues <- extractPrediction(models, testX = mnNew3InputsTest, testY = mnNew3TargetTest)
plotObsVsPred(preValues)

[Plot of chunk extendMn.5: observed vs. predicted O3 for the four models on the mnNew3 test set]

Next, I want to see whether the day of the week has an effect on the O3 level: take “WEEKDAY” out of the input variables and compare the resulting models with those based on mnNew3 using ANOVA.

if (file.exists("mnNew4TrainingAndTesting.RData")) {
    load("mnNew4TrainingAndTesting.RData")
} else {
    mnNew4Target <- mnNew3[, "O3"]
    mnNew4Inputs <- mnNew3[, c("DAY", "HORA", "SEASON", "NOx", "NO2", "RH", 
        "TMP", "WDR", "WSP", "CO", "SO2", "O3Y1")]
    # normalize data
    mnNew4TargetNorm <- normalizeData(mnNew4Target, type = "0_1")
    mnNew4InputsNorm <- normalizeData(mnNew4Inputs, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNew4InputsNorm, mnNew4TargetNorm, ratio = 0.15)
    mnNew4InputsTrain <- mnSplit$inputsTrain
    mnNew4TargetTrain <- mnSplit$targetsTrain
    mnNew4InputsTest <- mnSplit$inputsTest
    mnNew4TargetTest <- mnSplit$targetsTest
    save(mnNew4InputsTrain, mnNew4TargetTrain, mnNew4InputsTest, mnNew4TargetTest, 
        file = "mnNew4TrainingAndTesting.RData")
}

library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew4.lmFit.RData")) {
    load("mnNew4.lmFit.RData")
} else {
    mnNew4.lmFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew4.lmFit, file = "mnNew4.lmFit.RData")
}
# svm
if (file.exists("mnNew4.svmFit.RData")) {
    load("mnNew4.svmFit.RData")
} else {
    mnNew4.svmFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew4.svmFit, file = "mnNew4.svmFit.RData")
}
# neural networks
if (file.exists("mnNew4.nnetFit.RData")) {
    load("mnNew4.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew4.nnetFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew4.nnetFit, file = "mnNew4.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew4.rfFit.RData")) {
    load("mnNew4.rfFit.RData")
} else {
    mnNew4.rfFit <- train(mnNew4InputsTrain, mnNew4TargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew4.rfFit, file = "mnNew4.rfFit.RData")
}
error4 <- function(model) {
    pd <- predict(model, newdata = mnNew4InputsTest)
    modelErrors(pd, mnNew4TargetTest)
}
mnNew4.lmE <- error4(mnNew4.lmFit)
mnNew4.svmE <- error4(mnNew4.svmFit)
mnNew4.nnetE <- error4(mnNew4.nnetFit)
mnNew4.rfE <- error4(mnNew4.rfFit)
summary(mnNew4.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3900 -0.0714 -0.0093  0.0606  0.8173 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.19180    0.03158   -6.07  1.5e-09 ***
## V1          -0.00382    0.00892   -0.43    0.669    
## V2           0.38590    0.03922    9.84  < 2e-16 ***
## V3           0.01435    0.00728    1.97    0.049 *  
## V4          -0.90227    0.10121   -8.91  < 2e-16 ***
## V5           0.86528    0.06680   12.95  < 2e-16 ***
## V6          -0.15644    0.01936   -8.08  1.1e-15 ***
## V7           0.26774    0.02316   11.56  < 2e-16 ***
## V8           0.01217    0.00867    1.40    0.160    
## V9          -0.15138    0.02851   -5.31  1.2e-07 ***
## V10          0.56172    0.03151   17.82  < 2e-16 ***
## V11         -0.11910    0.02758   -4.32  1.7e-05 ***
## V12          0.37475    0.01793   20.90  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.112 on 1910 degrees of freedom
## Multiple R-squared:  0.591,  Adjusted R-squared:  0.589 
## F-statistic:  230 on 12 and 1910 DF,  p-value: <2e-16
models <- list(svm = mnNew4.svmFit, lm = mnNew4.lmFit, nnet = mnNew4.nnetFit, 
    rf = mnNew4.rfFit)
preValues <- extractPrediction(models, testX = mnNew4InputsTest, testY = mnNew4TargetTest)
plotObsVsPred(preValues)

[Plot of chunk extendMn.6: observed vs. predicted O3 for the four models on the mnNew4 test set]

There is no significant difference between mnNew3.lmFit and mnNew4.lmFit.

What about adding the previous day’s average ozone level to the mnNew3 database?

if (file.exists("mnNew5TrainingAndTesting.RData")) {
    load("mnNew5TrainingAndTesting.RData")
} else {
    O3MAY1 <- dbGetQuery(con, "select FECHA+1,FECHA,AVG(PED) as O3AVGY1, MAX(PED) as O3MAXY1 FROM O3 WHERE PED>0 GROUP BY fecha ASC")
    colnames(O3MAY1)[1] <- c("FECHAM")
    O3MAY1$FECHAM <- as.Date(as.factor(O3MAY1$FECHAM), format = "%Y%m%d")
    mnNew$FECHA <- as.Date(as.factor(mnNew$FECHA))
    mnNew5 <- merge(O3MAY1, mnNew, by.x = "FECHAM", by.y = "FECHA")
    mnNew5Target <- mnNew5[, "O3"]
    mnNew5Inputs <- mnNew5[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2", 
        "RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3AVGY1", "O3MAXY1")]
    # normalize data
    library(RSNNS)
    mnNew5TargetNorm <- normalizeData(mnNew5Target, type = "0_1")
    mnNew5InputsNorm <- normalizeData(mnNew5Inputs, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNew5InputsNorm, mnNew5TargetNorm, ratio = 0.15)
    mnNew5InputsTrain <- mnSplit$inputsTrain
    mnNew5TargetTrain <- mnSplit$targetsTrain
    mnNew5InputsTest <- mnSplit$inputsTest
    mnNew5TargetTest <- mnSplit$targetsTest
    colnames(mnNew5InputsTrain) <- paste("V", 1:14, sep = "")
    colnames(mnNew5InputsTest) <- paste("V", 1:14, sep = "")
    save(mnNew5InputsTrain, mnNew5TargetTrain, mnNew5InputsTest, mnNew5TargetTest, 
        file = "mnNew5TrainingAndTesting.RData")
}

library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew5.lmFit.RData")) {
    load("mnNew5.lmFit.RData")
} else {
    mnNew5.lmFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew5.lmFit, file = "mnNew5.lmFit.RData")
}
# svm
if (file.exists("mnNew5.svmFit.RData")) {
    load("mnNew5.svmFit.RData")
} else {
    mnNew5.svmFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew5.svmFit, file = "mnNew5.svmFit.RData")
}
# neural networks
if (file.exists("mnNew5.nnetFit.RData")) {
    load("mnNew5.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew5.nnetFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew5.nnetFit, file = "mnNew5.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew5.rfFit.RData")) {
    load("mnNew5.rfFit.RData")
} else {
    mnNew5.rfFit <- train(mnNew5InputsTrain, mnNew5TargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew5.rfFit, file = "mnNew5.rfFit.RData")
}
error5 <- function(model) {
    pd <- predict(model, newdata = mnNew5InputsTest)
    modelErrors(pd, mnNew5TargetTest)
}
summary(mnNew5.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3879 -0.0686 -0.0116  0.0626  0.8016 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.21021    0.03192   -6.59  5.8e-11 ***
## V1          -0.00349    0.00887   -0.39   0.6940    
## V2           0.00432    0.00781    0.55   0.5797    
## V3           0.40694    0.03923   10.37  < 2e-16 ***
## V4           0.01981    0.00733    2.70   0.0069 ** 
## V5          -0.91497    0.10082   -9.08  < 2e-16 ***
## V6           0.87842    0.06648   13.21  < 2e-16 ***
## V7          -0.13703    0.01968   -6.96  4.5e-12 ***
## V8           0.24483    0.02351   10.41  < 2e-16 ***
## V9           0.01145    0.00862    1.33   0.1843    
## V10         -0.15223    0.02837   -5.36  9.1e-08 ***
## V11          0.55111    0.03146   17.52  < 2e-16 ***
## V12         -0.13205    0.02764   -4.78  1.9e-06 ***
## V13          0.18341    0.03736    4.91  9.9e-07 ***
## V14          0.22882    0.03466    6.60  5.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.111 on 1908 degrees of freedom
## Multiple R-squared:  0.597,  Adjusted R-squared:  0.594 
## F-statistic:  202 on 14 and 1908 DF,  p-value: <2e-16
mnNew5.lmE <- error5(mnNew5.lmFit)
mnNew5.svmE <- error5(mnNew5.svmFit)
mnNew5.nnetE <- error5(mnNew5.nnetFit)
mnNew5.rfE <- error5(mnNew5.rfFit)

models <- list(svm = mnNew5.svmFit, lm = mnNew5.lmFit, nnet = mnNew5.nnetFit, 
    rf = mnNew5.rfFit)
preValues <- extractPrediction(models, testX = mnNew5InputsTest, testY = mnNew5TargetTest)
plotObsVsPred(preValues)

[Plot of chunk extendMn.7: observed vs. predicted O3 for the four models on the mnNew5 test set]

What about using the previous day’s average ozone level in the mnNew3 database instead of the previous day’s max ozone level?

if (file.exists("mnNew6TrainingAndTesting.RData")) {
    load("mnNew6TrainingAndTesting.RData")
} else {
    mnNew6Target <- mnNew5[, "O3"]
    mnNew6Inputs <- mnNew5[, c("DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2", 
        "RH", "TMP", "WDR", "WSP", "CO", "SO2", "O3AVGY1")]
    # normalize data
    library(RSNNS)
    mnNew6TargetNorm <- normalizeData(mnNew6Target, type = "0_1")
    mnNew6InputsNorm <- normalizeData(mnNew6Inputs, type = "0_1")
    # split the data into Training set and Testing set
    mnSplit <- splitForTrainingAndTest(mnNew6InputsNorm, mnNew6TargetNorm, ratio = 0.15)
    mnNew6InputsTrain <- mnSplit$inputsTrain
    mnNew6TargetTrain <- mnSplit$targetsTrain
    mnNew6InputsTest <- mnSplit$inputsTest
    mnNew6TargetTest <- mnSplit$targetsTest
    colnames(mnNew6InputsTrain) <- paste("V", 1:13, sep = "")
    colnames(mnNew6InputsTest) <- paste("V", 1:13, sep = "")
    save(mnNew6InputsTrain, mnNew6TargetTrain, mnNew6InputsTest, mnNew6TargetTest, 
        file = "mnNew6TrainingAndTesting.RData")
}

library(caret)
library(doMC)
registerDoMC(cores = 3)
cvcontrol <- trainControl(method = "cv", number = 5, repeats = 1)
# Linear Least Squares
if (file.exists("mnNew6.lmFit.RData")) {
    load("mnNew6.lmFit.RData")
} else {
    mnNew6.lmFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "lm", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew6.lmFit, file = "mnNew6.lmFit.RData")
}
# svm
if (file.exists("mnNew6.svmFit.RData")) {
    load("mnNew6.svmFit.RData")
} else {
    mnNew6.svmFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "svmRadial", 
        trControl = cvcontrol, tuneLength = 4)
    save(mnNew6.svmFit, file = "mnNew6.svmFit.RData")
}
# neural networks
if (file.exists("mnNew6.nnetFit.RData")) {
    load("mnNew6.nnetFit.RData")
} else {
    nnet.grid <- expand.grid(.size = c(7:15), .decay = c(1e-04, 2e-04, 0.005, 
        0.01))
    mnNew6.nnetFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "nnet", 
        trControl = cvcontrol, tuneGrid = nnet.grid)
    save(mnNew6.nnetFit, file = "mnNew6.nnetFit.RData")
}
# random Forests
if (file.exists("mnNew6.rfFit.RData")) {
    load("mnNew6.rfFit.RData")
} else {
    mnNew6.rfFit <- train(mnNew6InputsTrain, mnNew6TargetTrain, method = "rf", 
        trControl = cvcontrol, tuneLength = 3)
    save(mnNew6.rfFit, file = "mnNew6.rfFit.RData")
}
error6 <- function(model) {
    pd <- predict(model, newdata = mnNew6InputsTest)
    modelErrors(pd, mnNew6TargetTest)
}
mnNew6.lmE <- error6(mnNew6.lmFit)
mnNew6.svmE <- error6(mnNew6.svmFit)
mnNew6.nnetE <- error6(mnNew6.nnetFit)
mnNew6.rfE <- error6(mnNew6.rfFit)
summary(mnNew6.lmFit)
## 
## Call:
## lm(formula = modFormula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4047 -0.0706 -0.0110  0.0625  0.7987 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.22091    0.03223   -6.85  9.6e-12 ***
## V1          -0.00353    0.00897   -0.39  0.69434    
## V2           0.01055    0.00784    1.35  0.17852    
## V3           0.43089    0.03950   10.91  < 2e-16 ***
## V4           0.02789    0.00730    3.82  0.00014 ***
## V5          -0.91815    0.10194   -9.01  < 2e-16 ***
## V6           0.89848    0.06714   13.38  < 2e-16 ***
## V7          -0.12609    0.01983   -6.36  2.5e-10 ***
## V8           0.22731    0.02362    9.62  < 2e-16 ***
## V9           0.01140    0.00872    1.31  0.19127    
## V10         -0.14717    0.02868   -5.13  3.2e-07 ***
## V11          0.55343    0.03181   17.40  < 2e-16 ***
## V12         -0.14412    0.02789   -5.17  2.6e-07 ***
## V13          0.39451    0.01954   20.19  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.112 on 1909 degrees of freedom
## Multiple R-squared:  0.587,  Adjusted R-squared:  0.585 
## F-statistic:  209 on 13 and 1909 DF,  p-value: <2e-16
models <- list(svm = mnNew6.svmFit, lm = mnNew6.lmFit, nnet = mnNew6.nnetFit, 
    rf = mnNew6.rfFit)
preValues <- extractPrediction(models, testX = mnNew6InputsTest, testY = mnNew6TargetTest)
plotObsVsPred(preValues)

[Plot of chunk extendMn.8: observed vs. predicted O3 for the four models on the mnNew6 test set]

Compare the errors of the four models across the different databases.

lmError <- rbind(mnNew.lmE, mnNew2.lmE, mnNew3.lmE, mnNew4.lmE, mnNew5.lmE, 
    mnNew6.lmE)
lmError
##                MAE    RMSE   RELE
## mnNew.lmE  0.07191 0.08754 0.4215
## mnNew2.lmE 0.06943 0.08615 0.4056
## mnNew3.lmE 0.04842 0.06192 0.2982
## mnNew4.lmE 0.04847 0.06199 0.2985
## mnNew5.lmE 0.05019 0.06323 0.3103
## mnNew6.lmE 0.05626 0.06957 0.3431
svmError <- rbind(mnNew.svmE, mnNew2.svmE, mnNew3.svmE, mnNew4.svmE, mnNew5.svmE, 
    mnNew6.svmE)
svmError
##                 MAE    RMSE   RELE
## mnNew.svmE  0.04548 0.05815 0.3002
## mnNew2.svmE 0.04860 0.06129 0.3095
## mnNew3.svmE 0.04706 0.05916 0.3029
## mnNew4.svmE 0.04408 0.05711 0.2872
## mnNew5.svmE 0.04494 0.05700 0.2904
## mnNew6.svmE 0.04820 0.06048 0.3081
nnetError <- rbind(mnNew.nnetE, mnNew2.nnetE, mnNew3.nnetE, mnNew4.nnetE, mnNew5.nnetE, 
    mnNew6.nnetE)
nnetError
##                  MAE    RMSE   RELE
## mnNew.nnetE  0.04398 0.05865 0.2282
## mnNew2.nnetE 0.04976 0.06489 0.2609
## mnNew3.nnetE 0.04596 0.05945 0.2055
## mnNew4.nnetE 0.03979 0.05260 0.2104
## mnNew5.nnetE 0.04000 0.05198 0.1958
## mnNew6.nnetE 0.04000 0.05198 0.1958
rfError <- rbind(mnNew.rfE, mnNew2.rfE, mnNew3.rfE, mnNew4.rfE, mnNew5.rfE, 
    mnNew6.rfE)
rfError
##                MAE    RMSE   RELE
## mnNew.rfE  0.04855 0.06175 0.2803
## mnNew2.rfE 0.04889 0.06214 0.2740
## mnNew3.rfE 0.04342 0.05462 0.2502
## mnNew4.rfE 0.04466 0.05595 0.2549
## mnNew5.rfE 0.04463 0.05533 0.2586
## mnNew6.rfE 0.04564 0.05655 0.2694
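
For a single overview, the four error matrices can be stacked and scanned for the lowest test-set RMSE (a sketch; it assumes the lmError, svmError, nnetError and rfError objects computed above are in memory):

# Stack all test-set errors and pick the row with the lowest RMSE
allErrors <- rbind(lmError, svmError, nnetError, rfError)
allErrors[which.min(allErrors[, "RMSE"]), , drop = FALSE]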

# anova
anova(mnNew2.lmFit$finalModel, mnNew.lmFit$finalModel)
## Analysis of Variance Table
## 
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12
##   Res.Df  RSS Df Sum of Sq    F  Pr(>F)    
## 1   1988 31.3                              
## 2   1987 30.4  1     0.866 56.6 8.1e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mnNew4.lmFit$finalModel, mnNew3.lmFit$finalModel)
## Analysis of Variance Table
## 
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12 + V13
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1   1910 23.9                         
## 2   1909 23.9  1  0.000376 0.03   0.86
anova(mnNew3.lmFit$finalModel, mnNew6.lmFit$finalModel)
## Analysis of Variance Table
## 
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12 + V13
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12 + V13
##   Res.Df  RSS Df Sum of Sq F Pr(>F)
## 1   1909 23.9                      
## 2   1909 24.1  0    -0.241
anova(mnNew3.lmFit$finalModel, mnNew5.lmFit$finalModel)
## Analysis of Variance Table
## 
## Model 1: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12 + V13
## Model 2: .outcome ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + 
##     V11 + V12 + V13 + V14
##   Res.Df  RSS Df Sum of Sq    F  Pr(>F)    
## 1   1909 23.9                              
## 2   1908 23.6  1     0.297 24.1 9.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
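
As a complementary check (a sketch, not part of the original analysis), caret's resamples() can summarise the cross-validation performance of the four mnNew3 models side by side. It assumes mnNew3.lmFit, mnNew3.svmFit, mnNew3.nnetFit and mnNew3.rfFit are loaded in the workspace; for a strictly paired comparison the four fits should share the same resampling indices (e.g. via the index argument of trainControl).

# Cross-validation summary of the four mnNew3 models (sketch; assumes the
# fitted caret objects above are available in the workspace)
mnNew3.resamps <- resamples(list(lm = mnNew3.lmFit, svm = mnNew3.svmFit, 
    nnet = mnNew3.nnetFit, rf = mnNew3.rfFit))
summary(mnNew3.resamps)
# box-and-whisker plot of RMSE across the CV folds
bwplot(mnNew3.resamps, metric = "RMSE")
# pairwise differences between models, analogous to the ANOVA comparisons above
summary(diff(mnNew3.resamps))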