1. Database “mn” first. The function “createDataPartition” is used to create a random split of the dataset: ¾ of the data is the training set and ¼ is the testing set.
# Load the "mn" data and split it into training and testing partitions.
library(caret)
# NOTE(review): mn.RData is expected to provide the data frame `mn` used below.
load("mn.RData")

# Fix the RNG seed so the random split below is the same on every run.
set.seed(1234)

# Row indices of the training partition: 3/4 of the rows, drawn by
# createDataPartition() based on the outcome O3. All other rows form the
# testing partition.
inTrain <- createDataPartition(mn$O3, p = 3/4, list = FALSE)

# Predictor columns shared by the training and testing matrices.
mnPredictors <- c("MONTH", "DAY", "WEEKDAY", "HORA", "SEASON",
                  "NOx", "NO2", "RH", "TMP", "WDR", "WSP")
mnTrain <- as.matrix(mn[inTrain, mnPredictors])
mnTest <- as.matrix(mn[-inTrain, mnPredictors])

# Outcome (O3) for the same two partitions, as one-column matrices.
mnTrainRegr <- as.matrix(mn[inTrain, c("O3")])
mnTestRegr <- as.matrix(mn[-inTrain, c("O3")])
Multicollinearity leads to poor performance in neural networks, linear models, and other models. The functions “cor”, “cor.prob”, and “flattenSquareMatrix” are used to check whether there is multicollinearity among the different variables.
# Pairwise Pearson correlation matrix of the training predictors.
mnCorr <- stats::cor(mnTrain, use = "everything", method = "pearson")
# Correlation matrix combined with significance levels.
#
# Args:
#   X:   numeric matrix or data frame; correlations are computed between
#        its columns using pairwise complete observations.
#   dfr: degrees of freedom used for the test (defaults to nrow(X) - 2).
#
# Returns:
#   A square matrix holding the correlation coefficients below the
#   diagonal, the matching p-values above the diagonal, and NA on the
#   diagonal itself.
cor.prob <- function(X, dfr = nrow(X) - 2) {
  corr_mat <- cor(X, use = "pairwise.complete.obs")
  upper <- upper.tri(corr_mat)
  r_squared <- corr_mat[upper]^2
  # F statistic (1 and dfr degrees of freedom) for each squared correlation.
  f_stat <- r_squared * dfr / (1 - r_squared)
  # Overwrite the upper triangle with the upper-tail probabilities.
  corr_mat[upper] <- 1 - pf(f_stat, 1, dfr)
  diag(corr_mat) <- NA
  corr_mat
}
# Flatten a square matrix of correlations and p-values into long format.
#
# Args:
#   m: square matrix with identical row and column names, holding one
#      statistic below the diagonal and another above it (the layout
#      produced by cor.prob()).
#
# Returns:
#   A data frame with one row per pair of variables: `i` and `j` are the
#   row and column names of the pair, `cor` is the value stored below the
#   diagonal and `p` is the value stored above the diagonal.
#
# Raises:
#   An error if `m` is not a square matrix or if its row and column names
#   differ.
flattenSquareMatrix <- function(m) {
  # Use is.matrix() instead of comparing class(m) with "matrix": since
  # R 4.0.0 a matrix has class c("matrix", "array"), so the comparison is
  # length 2 and `if` rejects it on R >= 4.2. `||` keeps the condition
  # scalar and skips the dimension check for non-matrix input.
  if (!is.matrix(m) || nrow(m) != ncol(m)) {
    stop("Must be a square matrix.")
  }
  if (!identical(rownames(m), colnames(m))) {
    stop("Row and column names must be equal.")
  }
  ut <- upper.tri(m)
  data.frame(
    i = rownames(m)[row(m)[ut]],
    j = rownames(m)[col(m)[ut]],
    # t(m)[ut] reads the lower triangle in the same pair order as m[ut].
    cor = t(m)[ut],
    p = m[ut]
  )
}
# Correlations (below the diagonal) and p-values (above the diagonal) of the
# training predictors, then printed in long format with one row per pair.
mnCor.prob <- cor.prob(mnTrain)
flattenSquareMatrix(mnCor.prob)
## i j cor p
## 1 MONTH DAY 0.023833 2.780e-01
## 2 MONTH WEEKDAY -0.030817 1.606e-01
## 3 DAY WEEKDAY 0.007809 7.223e-01
## 4 MONTH HORA 0.012950 5.556e-01
## 5 DAY HORA 0.006883 7.541e-01
## 6 WEEKDAY HORA 0.026923 2.203e-01
## 7 MONTH SEASON 0.971538 0.000e+00
## 8 DAY SEASON 0.031714 1.488e-01
## 9 WEEKDAY SEASON -0.028993 1.869e-01
## 10 HORA SEASON 0.014981 4.953e-01
## 11 MONTH NOx 0.141447 9.779e-11
## 12 DAY NOx 0.018236 4.065e-01
## 13 WEEKDAY NOx -0.177306 4.441e-16
## 14 HORA NOx -0.104700 1.767e-06
## 15 SEASON NOx 0.141520 9.561e-11
## 16 MONTH NO2 0.141460 9.741e-11
## 17 DAY NO2 0.015981 4.670e-01
## 18 WEEKDAY NO2 -0.174006 1.443e-15
## 19 HORA NO2 -0.083198 1.487e-04
## 20 SEASON NO2 0.144264 4.096e-11
## 21 NOx NO2 0.951266 0.000e+00
## 22 MONTH RH 0.318182 0.000e+00
## 23 DAY RH -0.022108 3.143e-01
## 24 WEEKDAY RH -0.020136 3.594e-01
## 25 HORA RH -0.208898 0.000e+00
## 26 SEASON RH 0.329845 0.000e+00
## 27 NOx RH 0.145523 2.762e-11
## 28 NO2 RH 0.119168 5.234e-08
## 29 MONTH TMP -0.150202 6.187e-12
## 30 DAY TMP 0.036297 9.842e-02
## 31 WEEKDAY TMP 0.017348 4.297e-01
## 32 HORA TMP 0.187584 0.000e+00
## 33 SEASON TMP -0.158585 3.764e-13
## 34 NOx TMP -0.276833 0.000e+00
## 35 NO2 TMP -0.250699 0.000e+00
## 36 RH TMP -0.437970 0.000e+00
## 37 MONTH WDR 0.107875 8.471e-07
## 38 DAY WDR 0.016582 4.504e-01
## 39 WEEKDAY WDR -0.008759 6.901e-01
## 40 HORA WDR 0.042908 5.072e-02
## 41 SEASON WDR 0.102031 3.224e-06
## 42 NOx WDR 0.023782 2.790e-01
## 43 NO2 WDR 0.015974 4.672e-01
## 44 RH WDR 0.119232 5.147e-08
## 45 TMP WDR -0.158870 3.413e-13
## 46 MONTH WSP -0.142148 7.887e-11
## 47 DAY WSP 0.006765 7.581e-01
## 48 WEEKDAY WSP -0.010859 6.211e-01
## 49 HORA WSP -0.138324 2.515e-10
## 50 SEASON WSP -0.151798 3.674e-12
## 51 NOx WSP -0.055348 1.170e-02
## 52 NO2 WSP -0.070845 1.244e-03
## 53 RH WSP -0.179380 2.220e-16
## 54 TMP WSP 0.047146 3.180e-02
## 55 WDR WSP -0.035649 1.046e-01
library(zoo)
library(PerformanceAnalytics)
# Visual overview of the pairwise correlations of the training predictors.
chart.Correlation(mnTrain)
# Remove the variables that result in correlations greater than 0.9
# (0.9 is also the default cutoff of findCorrelation()).
mn.high.corr <- findCorrelation(mnCorr, cutoff = 0.9)
# Only drop columns when something was flagged: negative indexing with an
# empty vector (x[, -integer(0)]) would select zero columns. drop = FALSE
# keeps the result a matrix even if a single column remains.
if (length(mn.high.corr) > 0) {
  mnTrain2 <- mnTrain[, -mn.high.corr, drop = FALSE]
  mnTest2 <- mnTest[, -mn.high.corr, drop = FALSE]
} else {
  mnTrain2 <- mnTrain
  mnTest2 <- mnTest
}
Models such as neural networks and support vector machines need predictor variables to be centered and scaled. The data are therefore normalized for the neural network.
# Min-max normalisation.
#
# Linearly rescales x so that from_min maps to to_min and from_max maps to
# to_max. With the defaults, x is mapped onto [0.01, 0.09] using its own
# range, exactly as the one-argument form NOR(x) always did.
#
# Args:
#   x:        numeric vector or matrix to rescale.
#   from_min: value of x that maps to to_min (defaults to min(x)).
#   from_max: value of x that maps to to_max (defaults to max(x)).
#   to_min:   lower bound of the target range (default 0.01).
#   to_max:   upper bound of the target range (default 0.09).
#
# Returns:
#   The rescaled values, shaped like x. Passing from_min/from_max lets a
#   second data set be rescaled with the range of a reference data set.
#   If from_min equals from_max the division by zero yields NaN/Inf, and
#   with the defaults any NA in x makes the whole result NA.
NOR <- function(x, from_min = min(x), from_max = max(x),
                to_min = 0.01, to_max = 0.09) {
  (to_max - to_min) * (x - from_min) / (from_max - from_min) + to_min
}
# mnTrain3 holds the normalized predictors (independent variables) for
# training; NOR is applied to each column separately.
mnTrain3 <- apply(mnTrain2, 2, NOR)
# mnTest3 holds the normalized predictors (independent variables) for testing.
# NOTE(review): each test column is rescaled with its own min/max rather than
# the training column's range, so train and test are not on the same scale.
# Confirm this is intended.
mnTest3 <- apply(mnTest2, 2, NOR)
# mnTrainRegr3 is the normalized response (dependent variable, O3) for training.
mnTrainRegr3 <- NOR(mnTrainRegr)
# mnTestRegr3 is the normalized response (dependent variable, O3) for testing;
# it is also rescaled with its own min/max (see the note above).
mnTestRegr3 <- NOR(mnTestRegr)
# Write the four normalized objects to mnTrainAndTest.RData.
save(mnTrain3, mnTest3, mnTrainRegr3, mnTestRegr3, file = "mnTrainAndTest.RData")
2. Database “davg” second. The process of preparing the database is the same as for “mn”.
# Load the "davg" data and split it into training and testing partitions.
# NOTE(review): davg.RData is expected to provide the data frame `davg`.
load("davg.RData")

# Fix the RNG seed so the random split below is the same on every run.
set.seed(1234)

# Row indices of the training partition: 3/4 of the rows, drawn by
# createDataPartition() based on the outcome O3. All other rows form the
# testing partition.
inTrainDavg <- createDataPartition(davg$O3, p = 3/4, list = FALSE)

# Predictor columns shared by the training and testing matrices.
davgPredictors <- c("MONTH", "DAY", "WEEKDAY", "HORA.x", "SEASON", "AVGO3",
                    "AVGNOx", "AVGNO2", "AVGRH", "AVGTMP", "AVGWDR", "AVGWSP")
davgTrain <- as.matrix(davg[inTrainDavg, davgPredictors])
davgTest <- as.matrix(davg[-inTrainDavg, davgPredictors])

# Outcome (O3) for the same two partitions, as one-column matrices.
davgTrainRegr <- as.matrix(davg[inTrainDavg, c("O3")])
davgTestRegr <- as.matrix(davg[-inTrainDavg, c("O3")])

# Correlation matrix of the training predictors, then the correlations and
# their p-values printed in long format with one row per pair.
davgCorr <- cor(davgTrain)
davgCor.prob <- cor.prob(davgTrain)
flattenSquareMatrix(davgCor.prob)
## i j cor p
## 1 MONTH DAY 0.0423223 1.504e-01
## 2 MONTH WEEKDAY -0.0221661 4.515e-01
## 3 DAY WEEKDAY -0.0332404 2.588e-01
## 4 MONTH HORA.x -0.0287787 3.283e-01
## 5 DAY HORA.x 0.0016603 9.550e-01
## 6 WEEKDAY HORA.x 0.0331082 2.607e-01
## 7 MONTH SEASON 0.9715325 0.000e+00
## 8 DAY SEASON 0.0540152 6.638e-02
## 9 WEEKDAY SEASON -0.0365039 2.149e-01
## 10 HORA.x SEASON -0.0247870 3.998e-01
## 11 MONTH AVGO3 -0.1722696 3.748e-09
## 12 DAY AVGO3 -0.0676229 2.149e-02
## 13 WEEKDAY AVGO3 -0.0411764 1.618e-01
## 14 HORA.x AVGO3 -0.0708496 1.598e-02
## 15 SEASON AVGO3 -0.1815962 4.992e-10
## 16 MONTH AVGNOx 0.1478681 4.421e-07
## 17 DAY AVGNOx 0.0462605 1.160e-01
## 18 WEEKDAY AVGNOx 0.1195174 4.625e-05
## 19 HORA.x AVGNOx 0.0068365 8.164e-01
## 20 SEASON AVGNOx 0.1436458 9.384e-07
## 21 AVGO3 AVGNOx 0.1793263 8.235e-10
## 22 MONTH AVGNO2 0.0959977 1.083e-03
## 23 DAY AVGNO2 0.0065607 8.237e-01
## 24 WEEKDAY AVGNO2 0.1130973 1.164e-04
## 25 HORA.x AVGNO2 0.0200478 4.959e-01
## 26 SEASON AVGNO2 0.0876318 2.864e-03
## 27 AVGO3 AVGNO2 0.3770915 0.000e+00
## 28 AVGNOx AVGNO2 0.8863167 0.000e+00
## 29 MONTH AVGRH 0.3910493 0.000e+00
## 30 DAY AVGRH 0.0297259 3.126e-01
## 31 WEEKDAY AVGRH 0.0192549 5.131e-01
## 32 HORA.x AVGRH -0.0444207 1.312e-01
## 33 SEASON AVGRH 0.4002597 0.000e+00
## 34 AVGO3 AVGRH -0.3924504 0.000e+00
## 35 AVGNOx AVGRH 0.0259213 3.786e-01
## 36 AVGNO2 AVGRH -0.0229205 4.362e-01
## 37 MONTH AVGTMP -0.0594555 4.327e-02
## 38 DAY AVGTMP -0.0102674 7.273e-01
## 39 WEEKDAY AVGTMP -0.0146851 6.179e-01
## 40 HORA.x AVGTMP -0.0946899 1.268e-03
## 41 SEASON AVGTMP -0.0755743 1.016e-02
## 42 AVGO3 AVGTMP 0.2724694 0.000e+00
## 43 AVGNOx AVGTMP -0.3454267 0.000e+00
## 44 AVGNO2 AVGTMP -0.2142227 1.821e-13
## 45 AVGRH AVGTMP -0.2052330 1.848e-12
## 46 MONTH AVGWDR 0.1685669 8.099e-09
## 47 DAY AVGWDR 0.0292633 3.202e-01
## 48 WEEKDAY AVGWDR 0.0297079 3.129e-01
## 49 HORA.x AVGWDR -0.0188176 5.227e-01
## 50 SEASON AVGWDR 0.1738472 2.685e-09
## 51 AVGO3 AVGWDR -0.2061410 1.469e-12
## 52 AVGNOx AVGWDR 0.0002665 9.928e-01
## 53 AVGNO2 AVGWDR -0.0401292 1.727e-01
## 54 AVGRH AVGWDR 0.2025451 3.621e-12
## 55 AVGTMP AVGWDR -0.1284987 1.174e-05
## 56 MONTH AVGWSP -0.1331395 5.570e-06
## 57 DAY AVGWSP 0.0095637 7.453e-01
## 58 WEEKDAY AVGWSP -0.0505257 8.596e-02
## 59 HORA.x AVGWSP -0.1021571 5.039e-04
## 60 SEASON AVGWSP -0.1427013 1.107e-06
## 61 AVGO3 AVGWSP -0.0663609 2.405e-02
## 62 AVGNOx AVGWSP -0.3110591 0.000e+00
## 63 AVGNO2 AVGWSP -0.2690659 0.000e+00
## 64 AVGRH AVGWSP -0.2118465 3.394e-13
## 65 AVGTMP AVGWSP 0.1596308 4.853e-08
## 66 AVGWDR AVGWSP 0.1878841 1.206e-10
library(zoo)
library(PerformanceAnalytics)
# Visual overview of the pairwise correlations of the training predictors.
chart.Correlation(davgTrain)
# Remove the variables that result in correlations greater than 0.9
# (0.9 is also the default cutoff of findCorrelation()).
davg.high.corr <- findCorrelation(davgCorr, cutoff = 0.9)
# Only drop columns when something was flagged: negative indexing with an
# empty vector (x[, -integer(0)]) would select zero columns. drop = FALSE
# keeps the result a matrix even if a single column remains.
if (length(davg.high.corr) > 0) {
  davgTrain2 <- davgTrain[, -davg.high.corr, drop = FALSE]
  davgTest2 <- davgTest[, -davg.high.corr, drop = FALSE]
} else {
  davgTrain2 <- davgTrain
  davgTest2 <- davgTest
}
# Normalized data: predictors are rescaled column by column, the response
# (O3) as a whole.
# NOTE(review): the test objects are rescaled with their own min/max rather
# than the training range, so train and test are not on the same scale.
# Confirm this is intended.
davgTrain3 <- apply(davgTrain2, 2, NOR)
davgTest3 <- apply(davgTest2, 2, NOR)
davgTrainRegr3 <- NOR(davgTrainRegr)
davgTestRegr3 <- NOR(davgTestRegr)
# Write the four normalized objects to davgTrainAndTest.RData.
save(davgTrain3, davgTest3, davgTrainRegr3, davgTestRegr3, file = "davgTrainAndTest.RData")