Data sets

source('~/PED/classification/functoinForEnsembleClassification.R', echo=TRUE)

## 
## > normalized <- function(x) {
## +     (x - min(x)) * 0.8/(max(x) - min(x)) + 0.1
## + }
## 
## > library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

## 
## > library(devtools)
## 
## > install_github("caretEnsemble", "zachmayer")

## Installing github repo(s) caretEnsemble/master from zachmayer
## Installing caretEnsemble.zip from https://github.com/zachmayer/caretEnsemble/archive/master.zip
## Installing caretEnsemble
## '/usr/lib/R/bin/R' --vanilla CMD INSTALL  \
##   '/tmp/RtmpvfvmHE/caretEnsemble-master'  \
##   --library='/opt/home/gong/R/x86_64-pc-linux-gnu-library/3.1'  \
##   --with-keep.source

## 
## > library(caretEnsemble)

## Loading required package: caTools

## 
## > library(doMC)

## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel

## 
## > library(foreach)
## 
## > registerDoMC(cores = 5)
## 
## > class_ensemble_function <- function(inputsTrain, targetsTrain, 
## +     inputsTest, targetsTest, dataset) {
## +     folds = 5
## +     repeats = 1
## +     my .... [TRUNCATED]

source('~/PED/classification-regression/regressionModelfunction.R', echo=TRUE)

## 
## > library(nnet)
## 
## > library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

## 
## > modelErrors <- function(predicted, actual) {
## +     sal <- vector(mode = "numeric", length = 3)
## +     names(sal) <- c("MAE", "RMSE", "RELE")
## +     me .... [TRUNCATED] 
## 
## > train_testErrors <- function(model, inputsTrain, targetsTrain, 
## +     inputsTest, targetsTest) {
## +     trainPredict <- predict(model, newdata = as.d .... [TRUNCATED] 
## 
## > error_distri <- function(model, inputsTrain, targetsTrain, 
## +     inputsTest, targetsTest) {
## +     trainPredict <- predict(model, newdata = as.data. .... [TRUNCATED] 
## 
## > size <- seq(11, 25, 2)
## 
## > decay <- c(1e-04, 0.001, 0.01, 0.1, 1)
## 
## > exp <- expand.grid(decay = decay, size = size)
## 
## > mtry <- c(7:14)
## 
## > ntree <- c(500, 2000)
## 
## > exp.rf <- expand.grid(mtry = mtry, ntree = ntree)
## 
## > lm_nnet_rf_train <- function(inputsTrain, targetsTrain, 
## +     dataset, level) {
## +     if (file.exists(paste("lmFit_dataset_", dataset, "_level_", 
##  .... [TRUNCATED] 
## 
## > lm_nnet_rf_predict <- function(inputsTest, targetsTest, 
## +     dataset, level) {
## +     load(paste("lmFit_dataset_", dataset, "_level_", level, ".RDa ..." ... [TRUNCATED] 
## 
## > denorm <- function(norm, orig) {
## +     ((norm - 0.1) * (max(orig) - min(orig))/0.8) + min(orig)
## + }

# Build the train/test split for data set 14, or reload it when it has
# already been prepared.
# paste() uses its default separator " ", so the cache file is literally named
# "dataset_ 14 _prepared.RData"; the name is built exactly as before so that
# existing cache files are still found.
preparedFile <- paste("dataset_", 14, "_prepared.RData")
if (file.exists(preparedFile)) {
  # Restores the objects written by the save() call at the end of the else
  # branch.
  load(preparedFile)
} else {
  # Expected to provide feature_new14, which must contain the daily maximum
  # ozone column "MAXO3C".
  load("~/PED/prepareDataDay/feature_new14.RData")

  # Class label: "l" when the daily maximum ozone is less than or equal to
  # 0.11 ppm, "h" when it is higher.
  isLow <- feature_new14[, "MAXO3C"] <= 0.11
  # A missing MAXO3C value cannot be labelled; fail early rather than create
  # NA labels.
  stopifnot(!any(is.na(isLow)))
  # Report the class sizes (a bare nrow() inside braces is never printed).
  message("MAXO3C <= 0.11: ", sum(isLow), " rows; ",
          "MAXO3C > 0.11: ", sum(!isLow), " rows")

  # Append the label directly instead of filling the column with random values
  # first; the explicit levels keep both classes even if one is absent.
  feature_new14 <- data.frame(
    feature_new14,
    level = factor(ifelse(isLow, "l", "h"), levels = c("h", "l")),
    stringsAsFactors = FALSE
  )

  # Rescale every predictor column with normalized() (defined in the sourced
  # helper file). The last two columns are excluded: "level" was just appended;
  # the one before it is assumed to be the target MAXO3C -- TODO confirm.
  predictorCols <- seq_len(ncol(feature_new14) - 2)
  feature_new_norm14 <- apply(
    feature_new14[, predictorCols, drop = FALSE],
    MARGIN = 2,
    FUN = normalized
  )

  # Random 66% / 34% split into training(+validation) and test rows.
  # NOTE(review): no seed is set in this chunk, so the split is reproducible
  # only through the cache file saved below.
  nTrain <- floor(nrow(feature_new_norm14) * 0.66)
  indexTrainAndValidation14 <- sample(seq_len(nrow(feature_new_norm14)), nTrain)

  inputsTrain <- feature_new_norm14[indexTrainAndValidation14, ]
  inputsTest <- feature_new_norm14[-indexTrainAndValidation14, ]
  # Classification targets ("h"/"l") and regression targets (raw MAXO3C).
  targetsTrain <- as.factor(feature_new14[indexTrainAndValidation14, "level"])
  targetsTest <- as.factor(feature_new14[-indexTrainAndValidation14, "level"])
  targetsTrainMAX <- feature_new14[indexTrainAndValidation14, "MAXO3C"]
  targetsTestMAX <- feature_new14[-indexTrainAndValidation14, "MAXO3C"]

  save(inputsTrain, inputsTest, targetsTrain, targetsTest,
       targetsTrainMAX, targetsTestMAX, indexTrainAndValidation14,
       file = preparedFile)
}

Train classification models

Classification model results

### Train set
# The file name contains spaces because paste() uses its default separator.
# The file is expected to provide predsTrainClass (one column of predictions
# per model for the training rows).
load(paste("dataset_", 14, "predsTrainClass.RData"))
# colAUC() is evaluated once: plotROC = TRUE draws the ROC curves and the
# returned AUC values are reused, instead of computing everything twice.
aucTrain <- data.frame(colAUC(predsTrainClass, targetsTrain, plotROC = TRUE))
# Order the model columns by increasing AUC. sort() on a data frame is not a
# documented use, so the column order is computed explicitly from the AUC row.
resultsTrainClass <- aucTrain[, order(unlist(aucTrain[1, ])), drop = FALSE]
resultsTrainClass

plot of chunk unnamed-chunk-3

##          rpart svmRadial   nnet    ada ENS_greedy ENS_linear treebag rf
## h vs. l 0.7716    0.8408 0.8574 0.8711     0.9184     0.9597       1  1

### Test set
# The file name contains spaces because paste() uses its default separator.
# The file is expected to provide predsTestClass (one column of predictions
# per model for the test rows).
load(paste("dataset_", 14, "predsTestClass.RData"))
# colAUC() is evaluated once: plotROC = TRUE draws the ROC curves and the
# returned AUC values are reused, instead of computing everything twice.
aucTest <- data.frame(colAUC(predsTestClass, targetsTest, plotROC = TRUE))
# Order the model columns by increasing AUC. sort() on a data frame is not a
# documented use, so the column order is computed explicitly from the AUC row.
resultsTestClass <- aucTest[, order(unlist(aucTest[1, ])), drop = FALSE]
resultsTestClass

plot of chunk unnamed-chunk-3

##          rpart treebag     rf    ada svmRadial ENS_linear  nnet ENS_greedy
## h vs. l 0.7644  0.8033 0.8227 0.8264    0.8282     0.8337 0.834     0.8347

Now I want to separate the dataset into two parts: one with a high ozone level, where targetsTrain is “h”, and another with a low ozone level, where targetsTrain is “l”. Then I build two regression models, one on each of these two data sets.

# Saved model files for data set 14. The names are kept exactly as they exist
# on disk, including the embedded spaces and the missing underscore before
# "rfFit". Together they are expected to provide svmFit, nnetFit, rpartFit,
# rfFit, adaFit, treebagFit, greedy and linear, which are used below.
modelFiles <- c(
  "~/PED/classification-regression/dataset_ 14 _svmFit.RData",
  "~/PED/classification-regression/dataset_ 14 _nnetFit.RData",
  "~/PED/classification-regression/dataset_ 14 _rpartFit.RData",
  "~/PED/classification-regression/dataset_ 14 rfFit.RData",
  "~/PED/classification-regression/dataset_ 14 _adaFit.RData",
  "~/PED/classification-regression/dataset_ 14 _treebagFit.RData",
  "~/PED/classification-regression/dataset_ 14 _greedy.RData",
  "~/PED/classification-regression/dataset_ 14 _linear.RData"
)
for (modelFile in modelFiles) {
  load(modelFile)
}

# Predictions of every individual model on the training inputs, one column per
# model (the list is unnamed, so data.frame() assigns default column names).
all.models <- list(svmFit, nnetFit, rpartFit, rfFit, adaFit, treebagFit)
predictTrain <- function(fit) {
  predict(fit, inputsTrain)
}
predsTrainClass <- data.frame(sapply(all.models, predictTrain))
# Prediction of the "greedy" ensemble, stored next to the single models.
predsTrainClass$ENS_greedy <- predict(greedy, newdata = as.data.frame(inputsTrain))

## Predictions being made only for cases with complete data

# Prediction of the "linear" ensemble on the training inputs.
linearTrainPred <- predict(linear, newdata = inputsTrain)
predsTrainClass$ENS_linear <- linearTrainPred
# Predictors, raw ozone target, true class and the linear-ensemble class in a
# single data frame.
trainLinear <- data.frame(
  inputsTrain,
  targetsTrainMAX,
  targetsTrain,
  preLinearClass = predsTrainClass$ENS_linear
)
# Confusion table: rows are the true class, columns the predicted class.
table(trainLinear[["targetsTrain"]], trainLinear[["preLinearClass"]])

##    
##       h   l
##   h  96 992
##   l 594 116

#### Separate the training data into two sets according to the true level of
#### daily maximum ozone, and fit one regression model family per set.
# trainLinear was built as data.frame(inputsTrain, ...), so its first
# ncol(inputsTrain) columns are the predictors. Deriving the range from the
# data (instead of a hard-coded 1:21) keeps the target columns out of the
# inputs if the number of predictors ever changes.
inputCols <- seq_len(ncol(inputsTrain))

lclassTrain <- trainLinear[trainLinear$targetsTrain == "l", ]
hclassTrain <- trainLinear[trainLinear$targetsTrain == "h", ]

# Suffixes: A = all rows, L = low-ozone rows, H = high-ozone rows.
inputsTrainA <- trainLinear[, inputCols]
inputsTrainL <- lclassTrain[, inputCols]
targetsTrainL <- lclassTrain[, "targetsTrainMAX"]
inputsTrainH <- hclassTrain[, inputCols]
targetsTrainH <- hclassTrain[, "targetsTrainMAX"]
targetsTrainA <- trainLinear[, "targetsTrainMAX"]
save(inputsTrainL, targetsTrainL, inputsTrainH, targetsTrainH,
     file = "dataset14classTrain.RData")

## The model for the lower level of ozone
lm_nnet_rf_train(inputsTrainL, targetsTrainL, 14, "L")
## The model for the higher level of ozone
lm_nnet_rf_train(inputsTrainH, targetsTrainH, 14, "H")
## The model trained with all the data
lm_nnet_rf_train(inputsTrainA, targetsTrainA, 14, "A")

Predict which level the maximum daily ozone falls in

## First, predict which class ("h" or "l") each row of the test set belongs to.
# One column per individual model (the list is unnamed, so data.frame()
# assigns default column names).
predictTest <- function(fit) {
  predict(fit, inputsTest)
}
predsTestClass <- data.frame(sapply(all.models, predictTest))
# Prediction of the "greedy" ensemble, stored next to the single models.
predsTestClass$ENS_greedy <- predict(greedy, newdata = as.data.frame(inputsTest))

## Predictions being made only for cases with complete data

# Prediction of the "linear" ensemble on the test inputs.
predsTestClass$ENS_linear <- predict(linear, newdata = inputsTest)
# Predictors, raw ozone target, true class and the linear-ensemble class in a
# single data frame.
testLinear <- data.frame(
  inputsTest,
  targetsTestMAX,
  targetsTest,
  preLinearClass = predsTestClass$ENS_linear
)
# Confusion table: rows are the true class, columns the predicted class.
# The prediction column is called "preLinearClass"; looking it up as
# "predsTestClass" failed with "undefined columns selected".
table(testLinear[, "targetsTest"], testLinear[, "preLinearClass"])

## Error: undefined columns selected

### Swap the labels "h" and "l" of preLinearClass
# NOTE(review): the training confusion table suggests the linear ensemble's
# labels are inverted relative to the true class; the cause is not visible
# here -- confirm before relying on this swap.
# A deterministic lookup replaces the previous random placeholder column that
# was overwritten afterwards: any prediction other than "h" or "l" now becomes
# NA instead of keeping a random label, and the RNG state is left untouched.
swapLabel <- c(h = "l", l = "h")
testLinear$preLinearClassReverse <-
  unname(swapLabel[as.character(testLinear$preLinearClass)])
# Confusion table: rows are the swapped prediction, columns the true class.
table(testLinear[, "preLinearClassReverse"], testLinear[, "targetsTest"])

##    
##       h   l
##   h 501 118
##   l  92 216

###### Separate the test set according to the predicted class ("h" or "l")
# testLinear was built as data.frame(inputsTest, ...), so its first
# ncol(inputsTest) columns are the predictors. Deriving the range from the
# data (instead of a hard-coded 1:21) keeps the target columns out of the
# inputs if the number of predictors ever changes.
testInputCols <- seq_len(ncol(inputsTest))

lclassTest <- testLinear[testLinear$preLinearClassReverse == "l", ]
hclassTest <- testLinear[testLinear$preLinearClassReverse == "h", ]

# Suffixes: L = rows predicted as low ozone, H = rows predicted as high ozone.
inputsTestL <- lclassTest[, testInputCols]
targetsTestL <- lclassTest[, "targetsTestMAX"]
inputsTestH <- hclassTest[, testInputCols]
targetsTestH <- hclassTest[, "targetsTestMAX"]
save(inputsTestL, targetsTestL, inputsTestH, targetsTestH,
     file = "dataset14classTest.RData")

###### Regression errors for the test rows predicted to be in the "l" class
lm_nnet_rf_predict(
  inputsTest = inputsTestL,
  targetsTest = targetsTestL,
  dataset = 14,
  level = "L"
)

## [[1]]
##     MAE    RMSE    RELE 
## 0.02902 0.04173 0.32499 
## 
## [[2]]
##    decay size nnetMAE nnetRMSE
## 1  1e-04   11 0.02893  0.04179
## 2  1e-03   11 0.02918  0.04131
## 3  1e-02   11 0.03055  0.04149
## 4  1e-01   11 0.03229  0.04235
## 5  1e+00   11 0.03134  0.04073
## 6  1e-04   13 0.02855  0.04157
## 7  1e-03   13 0.02918  0.04131
## 8  1e-02   13 0.03057  0.04150
## 9  1e-01   13 0.03233  0.04239
## 10 1e+00   13 0.03137  0.04083
## 11 1e-04   15 0.02857  0.04157
## 12 1e-03   15 0.02918  0.04131
## 13 1e-02   15 0.03057  0.04150
## 14 1e-01   15 0.03236  0.04243
## 15 1e+00   15 0.03142  0.04093
## 16 1e-04   17 0.02853  0.04150
## 17 1e-03   17 0.02918  0.04130
## 18 1e-02   17 0.03057  0.04150
## 19 1e-01   17 0.03239  0.04246
## 20 1e+00   17 0.03147  0.04103
## 21 1e-04   19 0.02856  0.04150
## 22 1e-03   19 0.02918  0.04130
## 23 1e-02   19 0.03057  0.04150
## 24 1e-01   19 0.03241  0.04249
## 25 1e+00   19 0.03152  0.04112
## 26 1e-04   21 0.02865  0.04135
## 27 1e-03   21 0.02918  0.04130
## 28 1e-02   21 0.03057  0.04150
## 29 1e-01   21 0.03244  0.04252
## 30 1e+00   21 0.03158  0.04120
## 31 1e-04   23 0.02905  0.04178
## 32 1e-03   23 0.02918  0.04130
## 33 1e-02   23 0.03058  0.04151
## 34 1e-01   23 0.03246  0.04254
## 35 1e+00   23 0.03164  0.04128
## 36 1e-04   25 0.02834  0.04112
## 37 1e-03   25 0.02918  0.04130
## 38 1e-02   25 0.03057  0.04150
## 39 1e-01   25 0.03248  0.04256
## 40 1e+00   25 0.03169  0.04136
## 
## [[3]]
##    mtry ntree   rfMAE  rfRMSE
## 1     7   500 0.02839 0.03984
## 2     8   500 0.02837 0.03978
## 3     9   500 0.02836 0.03978
## 4    10   500 0.02833 0.03975
## 5    11   500 0.02824 0.03968
## 6    12   500 0.02828 0.03974
## 7    13   500 0.02822 0.03975
## 8    14   500 0.02826 0.03974
## 9     7  2000 0.02845 0.03981
## 10    8  2000 0.02837 0.03976
## 11    9  2000 0.02835 0.03980
## 12   10  2000 0.02830 0.03978
## 13   11  2000 0.02832 0.03974
## 14   12  2000 0.02826 0.03976
## 15   13  2000 0.02822 0.03975
## 16   14  2000 0.02824 0.03981

###### Regression errors for the test rows predicted to be in the "h" class
lm_nnet_rf_predict(
  inputsTest = inputsTestH,
  targetsTest = targetsTestH,
  dataset = 14,
  level = "H"
)

## [[1]]
##     MAE    RMSE    RELE 
## 0.04262 0.05390 0.39255 
## 
## [[2]]
##    decay size nnetMAE nnetRMSE
## 1  1e-04   11 0.04682  0.06125
## 2  1e-03   11 0.04319  0.05494
## 3  1e-02   11 0.04295  0.05418
## 4  1e-01   11 0.04472  0.05596
## 5  1e+00   11 0.04685  0.05827
## 6  1e-04   13 0.04819  0.06203
## 7  1e-03   13 0.04313  0.05477
## 8  1e-02   13 0.04295  0.05418
## 9  1e-01   13 0.04474  0.05598
## 10 1e+00   13 0.04677  0.05819
## 11 1e-04   15 0.04748  0.06370
## 12 1e-03   15 0.04317  0.05485
## 13 1e-02   15 0.04295  0.05418
## 14 1e-01   15 0.04473  0.05597
## 15 1e+00   15 0.04671  0.05814
## 16 1e-04   17 0.04901  0.06564
## 17 1e-03   17 0.04320  0.05493
## 18 1e-02   17 0.04295  0.05418
## 19 1e-01   17 0.04472  0.05597
## 20 1e+00   17 0.04666  0.05809
## 21 1e-04   19 0.04996  0.06495
## 22 1e-03   19 0.04274  0.05418
## 23 1e-02   19 0.04295  0.05418
## 24 1e-01   19 0.04472  0.05596
## 25 1e+00   19 0.04662  0.05805
## 26 1e-04   21 0.05133  0.06972
## 27 1e-03   21 0.04319  0.05493
## 28 1e-02   21 0.04295  0.05418
## 29 1e-01   21 0.04472  0.05596
## 30 1e+00   21 0.04659  0.05802
## 31 1e-04   23 0.05115  0.06878
## 32 1e-03   23 0.04278  0.05406
## 33 1e-02   23 0.04294  0.05417
## 34 1e-01   23 0.04472  0.05596
## 35 1e+00   23 0.04656  0.05800
## 36 1e-04   25 0.05279  0.07151
## 37 1e-03   25 0.04318  0.05478
## 38 1e-02   25 0.04294  0.05417
## 39 1e-01   25 0.04471  0.05596
## 40 1e+00   25 0.04654  0.05797
## 
## [[3]]
##    mtry ntree   rfMAE  rfRMSE
## 1     7   500 0.04229 0.05426
## 2     8   500 0.04229 0.05419
## 3     9   500 0.04231 0.05426
## 4    10   500 0.04228 0.05424
## 5    11   500 0.04218 0.05415
## 6    12   500 0.04223 0.05423
## 7    13   500 0.04212 0.05414
## 8    14   500 0.04235 0.05433
## 9     7  2000 0.04214 0.05406
## 10    8  2000 0.04211 0.05404
## 11    9  2000 0.04206 0.05407
## 12   10  2000 0.04217 0.05416
## 13   11  2000 0.04213 0.05413
## 14   12  2000 0.04213 0.05415
## 15   13  2000 0.04220 0.05422
## 16   14  2000 0.04220 0.05424

########### Predict with all the test data ###############
# testLinear was built as data.frame(inputsTest, ...), so its first
# ncol(inputsTest) columns are the predictors; this replaces the hard-coded
# 1:21 range, which would pick up target columns if the number of predictors
# differed.
inputsTestA <- testLinear[, seq_len(ncol(inputsTest))]
targetsTestA <- testLinear[, "targetsTestMAX"]
lm_nnet_rf_predict(inputsTestA, targetsTestA, 14, "A")

## [[1]]
##     MAE    RMSE    RELE 
## 0.03626 0.04733 0.36260 
## 
## [[2]]
##    decay size nnetMAE nnetRMSE
## 1  1e-04   11 0.03909  0.05165
## 2  1e-03   11 0.03602  0.04728
## 3  1e-02   11 0.03647  0.04749
## 4  1e-01   11 0.03816  0.04933
## 5  1e+00   11 0.04686  0.05890
## 6  1e-04   13 0.03880  0.05080
## 7  1e-03   13 0.03612  0.04732
## 8  1e-02   13 0.03647  0.04749
## 9  1e-01   13 0.03817  0.04934
## 10 1e+00   13 0.04690  0.05897
## 11 1e-04   15 0.04003  0.05454
## 12 1e-03   15 0.03539  0.04647
## 13 1e-02   15 0.03647  0.04749
## 14 1e-01   15 0.03816  0.04934
## 15 1e+00   15 0.04694  0.05903
## 16 1e-04   17 0.04211  0.06260
## 17 1e-03   17 0.03616  0.04740
## 18 1e-02   17 0.03647  0.04749
## 19 1e-01   17 0.03817  0.04934
## 20 1e+00   17 0.04697  0.05909
## 21 1e-04   19 0.04351  0.06078
## 22 1e-03   19 0.03517  0.04605
## 23 1e-02   19 0.03647  0.04749
## 24 1e-01   19 0.03817  0.04934
## 25 1e+00   19 0.04701  0.05914
## 26 1e-04   21 0.04301  0.06032
## 27 1e-03   21 0.03554  0.04660
## 28 1e-02   21 0.03647  0.04749
## 29 1e-01   21 0.03817  0.04934
## 30 1e+00   21 0.04704  0.05919
## 31 1e-04   23 0.04273  0.05904
## 32 1e-03   23 0.03522  0.04613
## 33 1e-02   23 0.03647  0.04749
## 34 1e-01   23 0.03817  0.04934
## 35 1e+00   23 0.04707  0.05924
## 36 1e-04   25 0.04396  0.06271
## 37 1e-03   25 0.03580  0.04691
## 38 1e-02   25 0.03647  0.04749
## 39 1e-01   25 0.03817  0.04934
## 40 1e+00   25 0.04710  0.05928
## 
## [[3]]
##    mtry ntree   rfMAE  rfRMSE
## 1     7   500 0.03588 0.04713
## 2     8   500 0.03610 0.04733
## 3     9   500 0.03602 0.04729
## 4    10   500 0.03582 0.04705
## 5    11   500 0.03603 0.04729
## 6    12   500 0.03603 0.04722
## 7    13   500 0.03593 0.04721
## 8    14   500 0.03582 0.04700
## 9     7  2000 0.03594 0.04723
## 10    8  2000 0.03595 0.04718
## 11    9  2000 0.03593 0.04724
## 12   10  2000 0.03603 0.04727
## 13   11  2000 0.03599 0.04722
## 14   12  2000 0.03602 0.04726
## 15   13  2000 0.03595 0.04715
## 16   14  2000 0.03599 0.04721