PhD Report 2 – Preparing the databases for training and testing

1. Database “mn” first. The function “createDataPartition” is used to create a random split of the dataset: ¾ of the data is used for training and ¼ for testing.

# Load the "mn" data set and split its rows into a training part (3/4) and a
# testing part (1/4).
# NOTE(review): no seed is set before the random split in this section, so the
# partition and everything derived from it differ between runs -- confirm that
# this is intended.
load("mn.RData")
library(caret)
# Predictor columns shared by the training and the testing matrix.
mnPredictors <- c("MONTH", "DAY", "WEEKDAY", "HORA", "SEASON", "NOx", "NO2", 
    "RH", "TMP", "WDR", "WSP")
inTrain <- createDataPartition(mn$O3, p = 3/4, list = FALSE)
mnTrain <- as.matrix(mn[inTrain, mnPredictors])
mnTest <- as.matrix(mn[-inTrain, mnPredictors])
# Response (O3) for the same row split.
mnTrainRegr <- as.matrix(mn[inTrain, "O3"])
mnTestRegr <- as.matrix(mn[-inTrain, "O3"])

Multicollinearity leads to poor performance in neural networks, linear models and other models. The functions “cor”, “cor.prob” and “flattenSquareMatrix” are used to check whether there is multicollinearity among the different variables.

# Correlation matrix of the training predictors (the correlations among the
# different variables).
mnCorr <- cor(mnTrain)
# Correlation matrix with p-values.
#
# Args:
#   X:   numeric matrix or data frame; correlations are computed between its
#        columns using pairwise complete observations.
#   dfr: degrees of freedom used for the test; defaults to nrow(X) - 2.
#        NOTE(review): with missing values the pairwise sample sizes are
#        smaller than nrow(X), so the default looks optimistic in that case --
#        confirm whether the inputs can contain NA.
#
# Returns:
#   A square matrix holding the correlation coefficients below the diagonal,
#   the p-values of the F test (1 and dfr degrees of freedom) above the
#   diagonal, and NA on the diagonal.
cor.prob <- function(X, dfr = nrow(X) - 2) {
    R <- cor(X, use = "pairwise.complete.obs")
    above <- row(R) < col(R)
    r2 <- R[above]^2
    Fstat <- r2 * dfr/(1 - r2)
    # Request the upper tail directly: 1 - pf(...) rounds to exactly 0 (or to a
    # multiple of machine epsilon) for strong correlations.
    R[above] <- pf(Fstat, 1, dfr, lower.tail = FALSE)
    R[row(R) == col(R)] <- NA
    R
}
# Flatten a square matrix laid out like the result of cor.prob() into a long
# table with one row per variable pair.
#
# Args:
#   m: square matrix with identical row and column names; the values below
#      the diagonal end up in column `cor`, the values above it in column `p`.
#
# Returns:
#   A data frame with the columns i, j (names of the two variables), cor and p,
#   one row per element of the upper triangle.
#
# Stops with an error when m is not a square matrix or when its row and column
# names differ.
flattenSquareMatrix <- function(m) {
    # is.matrix() instead of class(m) != "matrix": since R 4.0 a matrix has
    # the class c("matrix", "array"), which made the old condition length 2
    # and therefore an error inside if() on current R versions.
    if (!is.matrix(m) || nrow(m) != ncol(m)) {
        stop("Must be a square matrix.")
    }
    if (!identical(rownames(m), colnames(m))) {
        stop("Row and column names must be equal.")
    }
    ut <- upper.tri(m)
    # t(m)[ut] reads the lower triangle in the same pair order as m[ut].
    data.frame(i = rownames(m)[row(m)[ut]], j = rownames(m)[col(m)[ut]], cor = t(m)[ut], 
        p = m[ut])
}
# Correlations (below the diagonal) and p-values (above the diagonal) of the
# training predictors, printed as one row per variable pair.
mnCor.prob <- cor.prob(mnTrain)
flattenSquareMatrix(mnCor.prob)
##          i       j       cor         p
## 1    MONTH     DAY  0.023833 2.780e-01
## 2    MONTH WEEKDAY -0.030817 1.606e-01
## 3      DAY WEEKDAY  0.007809 7.223e-01
## 4    MONTH    HORA  0.012950 5.556e-01
## 5      DAY    HORA  0.006883 7.541e-01
## 6  WEEKDAY    HORA  0.026923 2.203e-01
## 7    MONTH  SEASON  0.971538 0.000e+00
## 8      DAY  SEASON  0.031714 1.488e-01
## 9  WEEKDAY  SEASON -0.028993 1.869e-01
## 10    HORA  SEASON  0.014981 4.953e-01
## 11   MONTH     NOx  0.141447 9.779e-11
## 12     DAY     NOx  0.018236 4.065e-01
## 13 WEEKDAY     NOx -0.177306 4.441e-16
## 14    HORA     NOx -0.104700 1.767e-06
## 15  SEASON     NOx  0.141520 9.561e-11
## 16   MONTH     NO2  0.141460 9.741e-11
## 17     DAY     NO2  0.015981 4.670e-01
## 18 WEEKDAY     NO2 -0.174006 1.443e-15
## 19    HORA     NO2 -0.083198 1.487e-04
## 20  SEASON     NO2  0.144264 4.096e-11
## 21     NOx     NO2  0.951266 0.000e+00
## 22   MONTH      RH  0.318182 0.000e+00
## 23     DAY      RH -0.022108 3.143e-01
## 24 WEEKDAY      RH -0.020136 3.594e-01
## 25    HORA      RH -0.208898 0.000e+00
## 26  SEASON      RH  0.329845 0.000e+00
## 27     NOx      RH  0.145523 2.762e-11
## 28     NO2      RH  0.119168 5.234e-08
## 29   MONTH     TMP -0.150202 6.187e-12
## 30     DAY     TMP  0.036297 9.842e-02
## 31 WEEKDAY     TMP  0.017348 4.297e-01
## 32    HORA     TMP  0.187584 0.000e+00
## 33  SEASON     TMP -0.158585 3.764e-13
## 34     NOx     TMP -0.276833 0.000e+00
## 35     NO2     TMP -0.250699 0.000e+00
## 36      RH     TMP -0.437970 0.000e+00
## 37   MONTH     WDR  0.107875 8.471e-07
## 38     DAY     WDR  0.016582 4.504e-01
## 39 WEEKDAY     WDR -0.008759 6.901e-01
## 40    HORA     WDR  0.042908 5.072e-02
## 41  SEASON     WDR  0.102031 3.224e-06
## 42     NOx     WDR  0.023782 2.790e-01
## 43     NO2     WDR  0.015974 4.672e-01
## 44      RH     WDR  0.119232 5.147e-08
## 45     TMP     WDR -0.158870 3.413e-13
## 46   MONTH     WSP -0.142148 7.887e-11
## 47     DAY     WSP  0.006765 7.581e-01
## 48 WEEKDAY     WSP -0.010859 6.211e-01
## 49    HORA     WSP -0.138324 2.515e-10
## 50  SEASON     WSP -0.151798 3.674e-12
## 51     NOx     WSP -0.055348 1.170e-02
## 52     NO2     WSP -0.070845 1.244e-03
## 53      RH     WSP -0.179380 2.220e-16
## 54     TMP     WSP  0.047146 3.180e-02
## 55     WDR     WSP -0.035649 1.046e-01
# Plot the correlation chart of the training predictors with
# PerformanceAnalytics::chart.Correlation().
library(zoo)
library(PerformanceAnalytics)
chart.Correlation(mnTrain)

plot of chunk Prepare Training And Testing Database.2

# Remove the variables that result in correlations greater than 0.9 (the
# cutoff is spelled out; 0.9 is also the findCorrelation() default).
mn.high.corr <- findCorrelation(mnCorr, cutoff = 0.9)
# Keep columns by positive index: negative indexing with an empty vector (no
# highly correlated variable found) would drop every column. drop = FALSE
# keeps the result a matrix even if a single column remains.
mn.keep <- setdiff(seq_len(ncol(mnTrain)), mn.high.corr)
mnTrain2 <- mnTrain[, mn.keep, drop = FALSE]
mnTest2 <- mnTest[, mn.keep, drop = FALSE]

Models such as neural networks and support vector machines need the predictor variables to be centered and scaled. The data are therefore normalized for the neural network.

# Min-max normalisation: linearly rescale x so that min(x) maps to 0.01 and
# max(x) maps to 0.09. x is a numeric vector or matrix; the result has the same
# shape as x.
NOR <- function(x) {
    lowest <- min(x)
    span <- max(x) - lowest
    0.01 + (0.09 - 0.01) * (x - lowest)/span
}
# The test data are scaled with the minimum and range of the TRAINING data so
# that both sets share one mapping; scaling the test set with its own
# minimum/maximum would put the same raw value on different scales in the two
# sets. The constants 0.01 and 0.09 are the ones used by NOR().
mnTrainMin <- apply(mnTrain2, 2, min)
mnTrainRange <- apply(mnTrain2, 2, max) - mnTrainMin
# mnTrain3 holds the independent variables (predictors) for training
mnTrain3 <- apply(mnTrain2, 2, NOR)
# mnTest3 holds the independent variables (predictors) for testing
mnTest3 <- (0.09 - 0.01) * sweep(sweep(mnTest2, 2, mnTrainMin, "-"), 2, mnTrainRange, 
    "/") + 0.01
# mnTrainRegr3 is the dependent variable (O3) for training
mnTrainRegr3 <- NOR(mnTrainRegr)
# mnTestRegr3 is the dependent variable (O3) for testing, on the training scale
mnTestRegr3 <- (0.09 - 0.01) * (mnTestRegr - min(mnTrainRegr))/(max(mnTrainRegr) - 
    min(mnTrainRegr)) + 0.01
save(mnTrain3, mnTest3, mnTrainRegr3, mnTestRegr3, file = "mnTrainAndTest.RData")

2. Database “davg” second. The process of preparing the database is the same as for “mn”.

# Load the "davg" data set and split its rows into a training part (3/4) and a
# testing part (1/4).
load("davg.RData")
# Predictor columns shared by the training and the testing matrix.
davgPredictors <- c("MONTH", "DAY", "WEEKDAY", "HORA.x", "SEASON", "AVGO3", 
    "AVGNOx", "AVGNO2", "AVGRH", "AVGTMP", "AVGWDR", "AVGWSP")
inTrainDavg <- createDataPartition(davg$O3, p = 3/4, list = FALSE)
davgTrain <- as.matrix(davg[inTrainDavg, davgPredictors])
davgTest <- as.matrix(davg[-inTrainDavg, davgPredictors])
# Response (O3) for the same row split.
davgTrainRegr <- as.matrix(davg[inTrainDavg, "O3"])
davgTestRegr <- as.matrix(davg[-inTrainDavg, "O3"])
# Correlations among the training predictors, then the same correlations with
# their p-values printed as one row per variable pair.
davgCorr <- cor(davgTrain)
davgCor.prob <- cor.prob(davgTrain)
flattenSquareMatrix(davgCor.prob)
##          i       j        cor         p
## 1    MONTH     DAY  0.0423223 1.504e-01
## 2    MONTH WEEKDAY -0.0221661 4.515e-01
## 3      DAY WEEKDAY -0.0332404 2.588e-01
## 4    MONTH  HORA.x -0.0287787 3.283e-01
## 5      DAY  HORA.x  0.0016603 9.550e-01
## 6  WEEKDAY  HORA.x  0.0331082 2.607e-01
## 7    MONTH  SEASON  0.9715325 0.000e+00
## 8      DAY  SEASON  0.0540152 6.638e-02
## 9  WEEKDAY  SEASON -0.0365039 2.149e-01
## 10  HORA.x  SEASON -0.0247870 3.998e-01
## 11   MONTH   AVGO3 -0.1722696 3.748e-09
## 12     DAY   AVGO3 -0.0676229 2.149e-02
## 13 WEEKDAY   AVGO3 -0.0411764 1.618e-01
## 14  HORA.x   AVGO3 -0.0708496 1.598e-02
## 15  SEASON   AVGO3 -0.1815962 4.992e-10
## 16   MONTH  AVGNOx  0.1478681 4.421e-07
## 17     DAY  AVGNOx  0.0462605 1.160e-01
## 18 WEEKDAY  AVGNOx  0.1195174 4.625e-05
## 19  HORA.x  AVGNOx  0.0068365 8.164e-01
## 20  SEASON  AVGNOx  0.1436458 9.384e-07
## 21   AVGO3  AVGNOx  0.1793263 8.235e-10
## 22   MONTH  AVGNO2  0.0959977 1.083e-03
## 23     DAY  AVGNO2  0.0065607 8.237e-01
## 24 WEEKDAY  AVGNO2  0.1130973 1.164e-04
## 25  HORA.x  AVGNO2  0.0200478 4.959e-01
## 26  SEASON  AVGNO2  0.0876318 2.864e-03
## 27   AVGO3  AVGNO2  0.3770915 0.000e+00
## 28  AVGNOx  AVGNO2  0.8863167 0.000e+00
## 29   MONTH   AVGRH  0.3910493 0.000e+00
## 30     DAY   AVGRH  0.0297259 3.126e-01
## 31 WEEKDAY   AVGRH  0.0192549 5.131e-01
## 32  HORA.x   AVGRH -0.0444207 1.312e-01
## 33  SEASON   AVGRH  0.4002597 0.000e+00
## 34   AVGO3   AVGRH -0.3924504 0.000e+00
## 35  AVGNOx   AVGRH  0.0259213 3.786e-01
## 36  AVGNO2   AVGRH -0.0229205 4.362e-01
## 37   MONTH  AVGTMP -0.0594555 4.327e-02
## 38     DAY  AVGTMP -0.0102674 7.273e-01
## 39 WEEKDAY  AVGTMP -0.0146851 6.179e-01
## 40  HORA.x  AVGTMP -0.0946899 1.268e-03
## 41  SEASON  AVGTMP -0.0755743 1.016e-02
## 42   AVGO3  AVGTMP  0.2724694 0.000e+00
## 43  AVGNOx  AVGTMP -0.3454267 0.000e+00
## 44  AVGNO2  AVGTMP -0.2142227 1.821e-13
## 45   AVGRH  AVGTMP -0.2052330 1.848e-12
## 46   MONTH  AVGWDR  0.1685669 8.099e-09
## 47     DAY  AVGWDR  0.0292633 3.202e-01
## 48 WEEKDAY  AVGWDR  0.0297079 3.129e-01
## 49  HORA.x  AVGWDR -0.0188176 5.227e-01
## 50  SEASON  AVGWDR  0.1738472 2.685e-09
## 51   AVGO3  AVGWDR -0.2061410 1.469e-12
## 52  AVGNOx  AVGWDR  0.0002665 9.928e-01
## 53  AVGNO2  AVGWDR -0.0401292 1.727e-01
## 54   AVGRH  AVGWDR  0.2025451 3.621e-12
## 55  AVGTMP  AVGWDR -0.1284987 1.174e-05
## 56   MONTH  AVGWSP -0.1331395 5.570e-06
## 57     DAY  AVGWSP  0.0095637 7.453e-01
## 58 WEEKDAY  AVGWSP -0.0505257 8.596e-02
## 59  HORA.x  AVGWSP -0.1021571 5.039e-04
## 60  SEASON  AVGWSP -0.1427013 1.107e-06
## 61   AVGO3  AVGWSP -0.0663609 2.405e-02
## 62  AVGNOx  AVGWSP -0.3110591 0.000e+00
## 63  AVGNO2  AVGWSP -0.2690659 0.000e+00
## 64   AVGRH  AVGWSP -0.2118465 3.394e-13
## 65  AVGTMP  AVGWSP  0.1596308 4.853e-08
## 66  AVGWDR  AVGWSP  0.1878841 1.206e-10
# Plot the correlation chart of the "davg" training predictors with
# PerformanceAnalytics::chart.Correlation().
library(zoo)
library(PerformanceAnalytics)
chart.Correlation(davgTrain)

plot of chunk Prepare Training And Testing Database.5

# Remove the variables that result in correlations greater than 0.9 (the
# cutoff is spelled out; 0.9 is also the findCorrelation() default).
davg.high.corr <- findCorrelation(davgCorr, cutoff = 0.9)
# Keep columns by positive index: negative indexing with an empty vector (no
# highly correlated variable found) would drop every column. drop = FALSE
# keeps the result a matrix even if a single column remains.
davg.keep <- setdiff(seq_len(ncol(davgTrain)), davg.high.corr)
davgTrain2 <- davgTrain[, davg.keep, drop = FALSE]
davgTest2 <- davgTest[, davg.keep, drop = FALSE]
# Normalized data.
# The test data are scaled with the minimum and range of the TRAINING data so
# that both sets share one mapping; scaling the test set with its own
# minimum/maximum would put the same raw value on different scales in the two
# sets. The constants 0.01 and 0.09 are the ones used by NOR().
davgTrainMin <- apply(davgTrain2, 2, min)
davgTrainRange <- apply(davgTrain2, 2, max) - davgTrainMin
# davgTrain3 / davgTest3 hold the independent variables (predictors)
davgTrain3 <- apply(davgTrain2, 2, NOR)
davgTest3 <- (0.09 - 0.01) * sweep(sweep(davgTest2, 2, davgTrainMin, "-"), 2, 
    davgTrainRange, "/") + 0.01
# davgTrainRegr3 / davgTestRegr3 hold the dependent variable (O3)
davgTrainRegr3 <- NOR(davgTrainRegr)
davgTestRegr3 <- (0.09 - 0.01) * (davgTestRegr - min(davgTrainRegr))/(max(davgTrainRegr) - 
    min(davgTrainRegr)) + 0.01
save(davgTrain3, davgTest3, davgTrainRegr3, davgTestRegr3, file = "davgTrainAndTest.RData")