classification models

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(xtable)
setwd("~/Documentos/HongKong/trainingModel/TW_previous2")
source('~/Documentos/HongKong/functions/classification.R', echo=TRUE)
## 
## > normalized <- function(x) {
## +     (x - min(x)) * 0.8/(max(x) - min(x)) + 0.1
## + }
## 
## > library(caTools)
## 
## > library(caret)
## 
## > library(devtools)
## 
## Attaching package: 'devtools'
## 
## The following objects are masked from 'package:utils':
## 
##     ?, help
## 
## The following object is masked from 'package:base':
## 
##     system.file
## 
## > library(caretEnsemble)
## 
## > library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## 
## > library(foreach)
## 
## > library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
## 
## > library(gbm)
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loaded gbm 2.1
## 
## > registerDoMC(cores = 3)
## 
## > class_ensemble_function <- function(inputsTrain, targetsTrain, 
## +     dataset) {
## +     folds = 3
## +     repeats = 1
## +     seeds = set.seed(1)
## +     m .... [TRUNCATED] 
## 
## > classPrediction <- function(all.models, inputs, dataset) {
## +     predsClass <- data.frame(sapply(all.models, function(x) {
## +         predict(x, as.d .... [TRUNCATED]
source('~/Documentos/HongKong/functions/rocFunction.R', echo=TRUE)
## 
## > library(caTools)
## 
## > rocFun <- function(all.models, inputs, targets) {
## +     preds <- data.frame(sapply(all.models, function(x) {
## +         predict(x, inputs, type = "pr ..." ... [TRUNCATED]
source('~/Documentos/HongKong/functions/classificationModelErrors.R', echo=TRUE)
## 
## > library(caTools)
## 
## > classModelErrors <- function(predict, targets, beta) {
## +     t <- table(targets, predict)
## +     TP <- t[1]
## +     FP <- t[2]
## +     FN <- t[3]
## +     T .... [TRUNCATED]
########################### original data set ###################################
# Restore the saved partition objects; this script later reads inputsTrain_p2,
# targetsTrainClass_p2, inputsTest_p2 and targetsTestClass_p2 (presumably all
# come from this file — confirm).
load("~/Documentos/HongKong/trainingModel/data/partitionData_TW_previoud2Day.RData")
# Fit the ensemble on the original training set. The helper's body is truncated
# in the echoed output; presumably it writes the per-model
# "dataset_ _TW_previous_ _*.RData" files loaded below — TODO confirm.
class_ensemble_function(inputsTrain_p2,as.factor(targetsTrainClass_p2),"_TW_previous_")
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
# Build the named list of models fitted on the original data, or reuse the
# bundle cached in the working directory when it already exists.
if (!file.exists("all.models_previous.RData")) {
  # No bundle yet: bring each saved fit into the workspace (order preserved).
  load("dataset_ _TW_previous_ _svmFit.RData")
  load("dataset_ _TW_previous_ _nnetFit.RData")
  load("dataset_ _TW_previous_ _rpartFit.RData")
  load("dataset_ _TW_previous_ _gbmFit.RData")
  load("dataset_ _TW_previous_ _treebagFit.RData")
  # NOTE(review): the _greedy file is loaded but the list below does not
  # reference anything named after it — confirm whether it is still needed.
  load("dataset_ _TW_previous_ _greedy.RData")
  load("dataset_ _TW_previous_ _linear.RData")
  load("dataset_ _TW_previous_ _adaFit.RData")
  load("dataset_ _TW_previous_ _rfFit.RData")
  # Label every fit with the short name used as a key for this model list.
  all.models_previous <- setNames(
    list(svmFit, nnetFit, rpartFit, rfFit, gbmFit, adaFit, treebagFit, linear),
    c("SVM", "NN", "rpart", "RF", "GBM", "ada", "bagging", "LE")
  )
  save(all.models_previous, file = "all.models_previous.RData")
} else {
  load("all.models_previous.RData")
}

### test sets ###
# Class predictions of the original-data models on the test inputs; computed
# once and cached in modelsTest_previous.RData.
if (!file.exists("modelsTest_previous.RData")) {
  modelsTest_previous <- classPrediction(
    all.models_previous,
    inputsTest_p2,
    "_TW_previous_test_"
  )
  save(modelsTest_previous, file = "modelsTest_previous.RData")
} else {
  load("modelsTest_previous.RData")
}

######################## ubUnder10 #########################
# Restore the ubUnder10 training objects used on the next line.
# NOTE(review): the file name says "obUnder10" while the objects are named
# "ubUnder10" — confirm the spelling matches the file on disk.
load("data_obUnder10.RData")
# Fit the ensemble on this training set, tagged "_TW_ubUnder10_" (presumably
# the tag ends up in the saved model file names loaded below — TODO confirm).
class_ensemble_function(inputsTrain_ubUnder10,as.factor(targetsTrain_ubUnder10),"_TW_ubUnder10_")
# Build the named list of ubUnder10 models, or reuse the cached bundle.
if (!file.exists("all.models_ubUnder10.RData")) {
  # paste() joins with a single space by default, so the resulting file names
  # contain spaces ("dataset_ _TW_ubUnder10_ _svmFit.RData", ...).
  load(paste("dataset_", "_TW_ubUnder10_", "_svmFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_nnetFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_rpartFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_rfFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_gbmFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_adaFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_treebagFit.RData"))
  load(paste("dataset_", "_TW_ubUnder10_", "_linear.RData"))
  all.models_ubUnder10 <- setNames(
    list(svmFit, nnetFit, rpartFit, rfFit, gbmFit, adaFit, treebagFit, linear),
    c("SVM", "NN", "rpart", "RF", "GBM", "ada", "bagging", "LE")
  )
  save(all.models_ubUnder10, file = "all.models_ubUnder10.RData")
} else {
  load("all.models_ubUnder10.RData")
}


### test sets ###
# Class predictions of the ubUnder10 models on the shared test inputs; cached
# in modelsTest_ubUnder10.RData after the first run.
if (!file.exists("modelsTest_ubUnder10.RData")) {
  modelsTest_ubUnder10 <- classPrediction(
    all.models_ubUnder10,
    inputsTest_p2,
    "_TW_ubUnder10_test_"
  )
  save(modelsTest_ubUnder10, file = "modelsTest_ubUnder10.RData")
} else {
  load("modelsTest_ubUnder10.RData")
}

######################## ubUnder30 #########################
# Restore the ubUnder30 training objects used on the next line.
# NOTE(review): the file name says "obUnder30" while the objects are named
# "ubUnder30" — confirm the spelling matches the file on disk.
 load("data_obUnder30.RData")
# Fit the ensemble on this training set, tagged "_TW_ubUnder30_" (presumably
# the tag ends up in the saved model file names loaded below — TODO confirm).
 class_ensemble_function(inputsTrain_ubUnder30,as.factor(targetsTrain_ubUnder30),"_TW_ubUnder30_")
# Build the named list of ubUnder30 models, or reuse the cached bundle.
if (!file.exists("all.models_ubUnder30.RData")) {
  # paste() joins with a single space by default, so the resulting file names
  # contain spaces ("dataset_ _TW_ubUnder30_ _svmFit.RData", ...).
  load(paste("dataset_", "_TW_ubUnder30_", "_svmFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_nnetFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_rpartFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_gbmFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_adaFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_treebagFit.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_linear.RData"))
  load(paste("dataset_", "_TW_ubUnder30_", "_rfFit.RData"))
  all.models_ubUnder30 <- setNames(
    list(svmFit, nnetFit, rpartFit, rfFit, gbmFit, adaFit, treebagFit, linear),
    c("SVM", "NN", "rpart", "RF", "GBM", "ada", "bagging", "LE")
  )
  save(all.models_ubUnder30, file = "all.models_ubUnder30.RData")
} else {
  load("all.models_ubUnder30.RData")
}


### test sets ###
# Class predictions of the ubUnder30 models on the shared test inputs; cached
# in modelsTest_ubUnder30.RData after the first run.
if (!file.exists("modelsTest_ubUnder30.RData")) {
  modelsTest_ubUnder30 <- classPrediction(
    all.models_ubUnder30,
    inputsTest_p2,
    "_TW_ubUnder30_test_"
  )
  save(modelsTest_ubUnder30, file = "modelsTest_ubUnder30.RData")
} else {
  load("modelsTest_ubUnder30.RData")
}

####################### over-sample #########################
# Restore the over-sampled training objects used below.
load("data_ubOver.RData")
# Renumber the rows 1..n before training. seq_len() is used instead of
# 1:nrow(): for a zero-row input 1:0 evaluates to c(1, 0), which is not a
# valid set of row names, whereas seq_len(0) is simply empty.
rownames(inputsTrain_ubOver) <- seq_len(nrow(inputsTrain_ubOver))
# Fit the ensemble on the over-sampled training set, tagged "_TW_ubOver_"
# (presumably the tag ends up in the saved model file names — TODO confirm).
class_ensemble_function(inputsTrain_ubOver, as.factor(targetsTrain_ubOver), "_TW_ubOver_")
# Build the named list of over-sampling models, or reuse the cached bundle.
if (!file.exists("all.models_ubOver.RData")) {
  # paste() joins with a single space by default, so the resulting file names
  # contain spaces ("dataset_ _TW_ubOver_ _svmFit.RData", ...).
  load(paste("dataset_", "_TW_ubOver_", "_svmFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_nnetFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_rpartFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_rfFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_gbmFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_adaFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_treebagFit.RData"))
  load(paste("dataset_", "_TW_ubOver_", "_linear.RData"))
  all.models_ubOver <- setNames(
    list(svmFit, nnetFit, rpartFit, rfFit, gbmFit, adaFit, treebagFit, linear),
    c("SVM", "NN", "rpart", "RF", "GBM", "ada", "bagging", "LE")
  )
  save(all.models_ubOver, file = "all.models_ubOver.RData")
} else {
  load("all.models_ubOver.RData")
}

### test sets ###
# Class predictions of the over-sampling models on the shared test inputs;
# cached in modelsTest_ubOver.RData after the first run.
# NOTE(review): the tag passed here is "_TW_ubOver_", without the "_test_"
# suffix used for the previous/ubUnder10/ubUnder30 predictions — confirm this
# is intended.
if (!file.exists("modelsTest_ubOver.RData")) {
  modelsTest_ubOver <- classPrediction(
    all.models_ubOver,
    inputsTest_p2,
    "_TW_ubOver_"
  )
  save(modelsTest_ubOver, file = "modelsTest_ubOver.RData")
} else {
  load("modelsTest_ubOver.RData")
}



############################### SMOTE ###################################
# Restore the SMOTE training objects used on the next line.
load("data_SMOTE.RData")
# Fit the ensemble on the SMOTE training set, tagged "_TW_SMOTE_" (presumably
# the tag ends up in the saved model file names loaded below — TODO confirm).
class_ensemble_function(inputsTrain_SMOTE,as.factor(targetsTrain_SMOTE),"_TW_SMOTE_")
# Build the named list of SMOTE models, or reuse the cached bundle.
if (!file.exists("all.models_SMOTE.RData")) {
  # paste() joins with a single space by default, so the resulting file names
  # contain spaces ("dataset_ _TW_SMOTE_ _svmFit.RData", ...).
  load(paste("dataset_", "_TW_SMOTE_", "_svmFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_nnetFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_rpartFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_rfFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_gbmFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_adaFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_treebagFit.RData"))
  load(paste("dataset_", "_TW_SMOTE_", "_linear.RData"))
  all.models_SMOTE <- setNames(
    list(svmFit, nnetFit, rpartFit, rfFit, gbmFit, adaFit, treebagFit, linear),
    c("SVM", "NN", "rpart", "RF", "GBM", "ada", "bagging", "LE")
  )
  save(all.models_SMOTE, file = "all.models_SMOTE.RData")
} else {
  load("all.models_SMOTE.RData")
}

# Class predictions of the SMOTE models on the shared test inputs, cached on
# disk after the first run.
# The cache is checked and loaded under the same name it is saved with
# ("modelsTest_SMOTE.RData"); previously the check looked for a file without
# the ".RData" extension, so the cache was never hit and predictions were
# recomputed on every run.
# NOTE(review): the tag passed here is "_TW_SMOTE_", without the "_test_"
# suffix used for the previous/ubUnder10/ubUnder30 predictions — confirm this
# is intended.
if (file.exists("modelsTest_SMOTE.RData")) {
        load("modelsTest_SMOTE.RData")
} else {
        modelsTest_SMOTE <- classPrediction(all.models_SMOTE, inputsTest_p2, "_TW_SMOTE_")
        save(modelsTest_SMOTE, file = "modelsTest_SMOTE.RData")
}
## Loading required package: kernlab
#################### show all the errors ############################
# Error metrics for the original-data models: one column per model, one row
# per metric. The extra arguments are forwarded by sapply() to
# classModelErrors(predict, targets, beta).
sapply(modelsTest_previous, classModelErrors,
       targets = targetsTestClass_p2, beta = 0.4)
##                   svmRadial    nnet   rpart      rf     gbm     ada
## TNR                 1.00000      NA      NA 1.00000 0.99724      NA
## TPR                 0.02941      NA      NA 0.02941 0.02941      NA
## Precision           1.00000 0.02291 0.02291 1.00000 0.20000 0.02291
## Recall              0.02941      NA      NA 0.02941 0.02941      NA
## G-mean              0.17150      NA      NA 0.17150 0.17126      NA
## Weighted Accuracy   0.61176      NA      NA 0.61176 0.61011      NA
## F-measure           0.05714      NA      NA 0.05714 0.05128      NA
## AUC                 0.51471 0.50000 0.50000 0.51471 0.51333 0.50000
##                   treebag    NULL
## TNR               1.00000      NA
## TPR               0.02941      NA
## Precision         1.00000 0.02291
## Recall            0.02941      NA
## G-mean            0.17150      NA
## Weighted Accuracy 0.61176      NA
## F-measure         0.05714      NA
## AUC               0.51471 0.50000
# Error metrics for the over-sampling models (columns = models, rows = metrics).
sapply(modelsTest_ubOver, classModelErrors,
       targets = targetsTestClass_p2, beta = 0.4)
##                   svmRadial   nnet  rpart     rf    gbm    ada treebag
## TNR                 0.99862 0.9510 0.9538 0.9986 0.9910 0.8924  0.9821
## TPR                 0.02941 0.2941 0.2941 0.1176 0.1471 0.6471  0.1765
## Precision           0.33333 0.1235 0.1299 0.6667 0.2778 0.1236  0.1875
## Recall              0.02941 0.2941 0.2941 0.1176 0.1471 0.6471  0.1765
## G-mean              0.17138 0.5289 0.5296 0.3428 0.3818 0.7599  0.4163
## Weighted Accuracy   0.61094 0.6883 0.6899 0.6462 0.6534 0.7943  0.6598
## F-measure           0.05405 0.1739 0.1802 0.2000 0.1923 0.2075  0.1818
## AUC                 0.51402 0.6226 0.6240 0.5581 0.5690 0.7697  0.5793
##                      NULL
## TNR               1.00000
## TPR               0.02941
## Precision         1.00000
## Recall            0.02941
## G-mean            0.17150
## Weighted Accuracy 0.61176
## F-measure         0.05714
## AUC               0.51471
# Error metrics for the ubUnder10 models (columns = models, rows = metrics).
sapply(modelsTest_ubUnder10, classModelErrors,
       targets = targetsTestClass_p2, beta = 0.4)
##                   svmRadial   nnet   rpart      rf    gbm    ada treebag
## TNR                 0.99931 0.9945 0.97241 0.99379 0.9814 0.9903  0.9814
## TPR                 0.02941 0.1176 0.08824 0.05882 0.1765 0.1471  0.1471
## Precision           0.50000 0.3333 0.06977 0.18182 0.1818 0.2632  0.1562
## Recall              0.02941 0.1176 0.08824 0.05882 0.1765 0.1471  0.1471
## G-mean              0.17144 0.3420 0.29292 0.24178 0.4162 0.3816  0.3799
## Weighted Accuracy   0.61135 0.6437 0.61874 0.61981 0.6594 0.6530  0.6477
## F-measure           0.05556 0.1739 0.07792 0.08889 0.1791 0.1887  0.1515
## AUC                 0.51436 0.5561 0.53032 0.52631 0.5789 0.5687  0.5642
##                      NULL
## TNR               0.01034
## TPR               0.85294
## Precision         0.01981
## Recall            0.85294
## G-mean            0.09393
## Weighted Accuracy 0.34738
## F-measure         0.03872
## AUC               0.56836
# Error metrics for the ubUnder30 models (columns = models, rows = metrics).
sapply(modelsTest_ubUnder30, classModelErrors,
       targets = targetsTestClass_p2, beta = 0.4)
##                   svmRadial   nnet  rpart     rf    gbm    ada treebag
## TNR                 0.83241 0.7800 0.8248 0.8917 0.8483 0.8545  0.8717
## TPR                 0.67647 0.5882 0.8529 0.7353 0.7647 0.8235  0.7941
## Precision           0.08647 0.0590 0.1025 0.1374 0.1057 0.1172  0.1268
## Recall              0.67647 0.5882 0.8529 0.7353 0.7647 0.8235  0.7941
## G-mean              0.75040 0.6774 0.8388 0.8097 0.8054 0.8389  0.8320
## Weighted Accuracy   0.77004 0.7033 0.8361 0.8292 0.8148 0.8421  0.8407
## F-measure           0.15333 0.1072 0.1830 0.2315 0.1857 0.2051  0.2186
## AUC                 0.75444 0.6841 0.8389 0.8135 0.8065 0.8390  0.8329
##                       NULL
## TNR               0.196552
## TPR               0.323529
## Precision         0.009354
## Recall            0.323529
## G-mean            0.252171
## Weighted Accuracy 0.247343
## F-measure         0.018182
## AUC               0.739959
# Error metrics for the SMOTE models (columns = models, rows = metrics).
sapply(modelsTest_SMOTE, classModelErrors,
       targets = targetsTestClass_p2, beta = 0.4)
##                   svmRadial   nnet  rpart     rf    gbm    ada treebag
## TNR                 0.98138 0.8993 0.9255 0.9317 0.9372 0.9166  0.9179
## TPR                 0.08824 0.5294 0.4118 0.5294 0.3529 0.7353  0.5882
## Precision           0.10000 0.1098 0.1148 0.1538 0.1165 0.1712  0.1439
## Recall              0.08824 0.5294 0.4118 0.5294 0.3529 0.7353  0.5882
## G-mean              0.29427 0.6900 0.6173 0.7023 0.5751 0.8209  0.7348
## Weighted Accuracy   0.62412 0.7514 0.7200 0.7708 0.7035 0.8440  0.7861
## F-measure           0.09375 0.1818 0.1795 0.2384 0.1752 0.2778  0.2312
## AUC                 0.53481 0.7144 0.6686 0.7306 0.6451 0.8259  0.7531
##                      NULL
## TNR               0.02207
## TPR               0.91176
## Precision         0.02139
## Recall            0.91176
## G-mean            0.14185
## Weighted Accuracy 0.37795
## F-measure         0.04181
## AUC               0.53308
# xtable(sapply(modelsTest_previous,function(x) classModelErrors(x,targetsTestClass_p2,0.4)),digits=c(0,3,3,3,3,3,3,3,3))   
# xtable(sapply(modelsTest_ubOver,function(x) classModelErrors(x,targetsTestClass_p2,0.4)),digits=c(0,3,3,3,3,3,3,3,3),caption="over sampling")
# xtable(sapply(modelsTest_ubUnder10,function(x) classModelErrors(x,targetsTestClass_p2,0.4)),digits=c(0,3,3,3,3,3,3,3,3))
# xtable(sapply(modelsTest_ubUnder30,function(x) classModelErrors(x,targetsTestClass_p2,0.4)),digits=c(0,3,3,3,3,3,3,3,3))
#  xtable(sapply(modelsTest_SMOTE,function(x) classModelErrors(x,targetsTestClass_p2,0.4)),digits=c(0,3,3,3,3,3,3,3,3),caption="SMOTE")

Variable importance

# For each tree-based SMOTE model: reload the saved fit, take the unscaled
# importance table from caret::varImp() and keep it as a matrix whose row
# names are the variables.
load(paste("dataset_", "_TW_SMOTE_", "_rpartFit.RData"))
rpartImp <- as.matrix(varImp(rpartFit, scale = FALSE)$importance)
## Loading required package: rpart

load(paste("dataset_", "_TW_SMOTE_", "_rfFit.RData"))
rfImp <- as.matrix(varImp(rfFit, scale = FALSE)$importance)

load(paste("dataset_", "_TW_SMOTE_", "_gbmFit.RData"))
gbmImp <- as.matrix(varImp(gbmFit, scale = FALSE)$importance)
## Loading required package: plyr

load(paste("dataset_", "_TW_SMOTE_", "_treebagFit.RData"))
treebagImp <- as.matrix(varImp(treebagFit, scale = FALSE)$importance)
## Loading required package: ipred
############# combining ##################
# Outer-join the four importance tables on variable name. The first merge keys
# on the row names of both inputs; merge() then stores the key in a
# "Row.names" column, which the later merges match against the row names of
# the next table.
rpart_rf_imp <- merge(rpartImp, rfImp, by = "row.names", all = TRUE)
rpart_rf_gmb_imp <- merge(rpart_rf_imp, gbmImp,
                          by.x = "Row.names", by.y = "row.names", all = TRUE)
rpart_rf_gmb_treebag_imp <- merge(rpart_rf_gmb_imp, treebagImp,
                                  by.x = "Row.names", by.y = "row.names",
                                  all = TRUE)
## Warning: column names 'Overall.x', 'Overall.y' are duplicated in the
## result
# The duplicated names from the merges are replaced with one label per model.
colnames(rpart_rf_gmb_treebag_imp) <- c("variables", "CART", "RF", "GBM", "bagging")
# Numeric matrix of importances with the variable names as row names.
imp <- as.matrix(rpart_rf_gmb_treebag_imp[, c("CART", "RF", "GBM", "bagging")])
rownames(imp) <- rpart_rf_gmb_treebag_imp[["variables"]]
# Sort rows by CART importance, breaking ties by RF, GBM, then bagging.
imp.order <- imp[order(imp[, "CART"], imp[, "RF"], imp[, "GBM"], imp[, "bagging"]), ]
################ draw the plot of variable importance using lattice package ###########
# pdf(file="imp_SMOTE_TW.pdf",width=14 ,height=10)
# groups = FALSE with a 2 x 2 layout gives each model column its own panel.
dotplot(
  imp.order,
  groups = FALSE,
  mar = rep(0, 4),
  transparent = TRUE,
  layout = c(2, 2),
  aspect = 0.7,
  origin = 0,
  type = c("p", "h"),
  xlab = "Variable importance"
)

plot of chunk .2

# dev.off()