This report aims to build a reliable model for the risk monitoring system. We employ several state-of-the-art data mining techniques to predict the risk and compare them in order to select the best one for this system.

Loading packages

library(Hmisc)
library(gdata)
library(PerformanceAnalytics)
library(survival)
library(splines)
library(lattice)
library(gbm)
library(methods)
library(kernlab)
library(MASS)
library(caret)
library(ggplot2)
library(corrplot)
library(pbapply)
library(devtools)
library(caretEnsemble)
library(xlsx)
pathMain="/home/gong/FakharCompetition"

Data processing

if(file.exists("inputs.RData")){
        load("inputs.RData")
}else{library(xlsx)
read.xlsx("NHA-indicators-2.xlsx",sheetName = "Table")->output
output[-1,]->output
colnames(output)<-gsub("X","",colnames(output))
levels(as.factor(output$Indicators))[3]->TotalExpenditure
## the years with data
years<-c("2008","2009","2010","2011","2012","2013","2014")
read.xlsx("NHA-indicators-2-2.xlsx",sheetName = "Table")->targets
colnames(targets)<-gsub("X","",colnames(targets))
inputs<-c()
####inputs by country
for(i in 3:7){
       read.xlsx("Factors.xlsx",sheetName = sheetNames("Factors.xlsx")[i],header=TRUE,startRow=14,endRow = 21)->inputs_temp
       inputs_temp[,c(1:16)]->inputs_temp
       col_names<-paste(rep("col",ncol(inputs_temp)),1:ncol(inputs_temp),sep="")
       inputs_temp$country<- sheetNames("Factors.xlsx")[i]
       #the last column is the country number, 1 is saudi, 2 is Russia, 3 is Turkey, 4 is UAE, and 5 is Iran
        inputs_temp$country_factor<- (i-2)
        inputs_temp$output<-as.numeric(targets[targets$Countries==sheetNames("Factors.xlsx")[i],c(2:ncol(targets))])
       colnames(inputs_temp)<-col_names
       inputs<-rbind(inputs,inputs_temp)
      
}
inputs[,-17]->inputs
colnames(inputs)<-c(paste(rep("col",(ncol(inputs)-1)),1:(ncol(inputs)-1),sep=""),c("output"))
save(inputs,file="inputs.RData")
        }
## preview the processed data; col17 is the country factor
head(inputs)
##   col1 col2 col3 col4 col5 col6 col7 col8 col9 col10     col11    col12
## 1 2008 0.48 0.57 0.34 0.14 0.47 0.25 0.44 0.46  0.36  1.456326 2.496307
## 2 2009 0.48 0.47 0.34 0.14 0.30 0.26 0.43 0.47  0.31 -2.062203 2.462249
## 3 2010 0.44 0.34 0.34 0.14 0.28 0.26 0.42 0.45  0.26  4.078066 2.454733
## 4 2011 0.55 0.26 0.34 0.13 0.15 0.25 0.55 0.47  0.25  2.847302 2.453717
## 5 2012 0.44 0.32 0.34 0.13 0.19 0.26 0.44 0.47  0.20  2.260134 2.428241
## 6 2013 0.44 0.38 0.32 0.13 0.15 0.29 0.43 0.48  0.21  2.368004 2.362047
##       col13 col14    col15    col16 col17   output
## 1  557.8670 99.67 19436.86 59.22617     1 14918.95
## 2  639.7370 61.95 15655.08 38.55709     1 17534.87
## 3  655.0706 79.48 18753.98 43.99712     1 18401.36
## 4  829.2460 94.88 23256.10 48.83015     1 23872.70
## 5  961.3441 94.05 24883.19 46.15887     1 28355.85
## 6 1052.0965 97.98 24646.02 43.66758     1 31774.42
read.xlsx("Factors.xlsx",sheetName = sheetNames("Factors.xlsx")[3],header=FALSE,startRow=1,endRow = 1)->inputsNames
colnames(inputsNames)<-gsub("X","col",colnames(inputsNames))
inputsNames
##   col1                     col2                         col3
## 1 Year Government Effectiveness Financial Market Development
##                  col4                col5      col6                   col7
## 1 Overall Health Risk Access to Utilities Inflation Trading Across Borders
##         col8               col9                           col10
## 1 Corruption Demographic Shifts Capacity of Technology Adoption
##                  col11                 col12
## 1 World GDP Growth (%) Population Growth (%)
##                                     col13           col14
## 1 Healthcare expenditure per capita (USD) Oil Price (USD)
##                  col15                    col16
## 1 GDP per capita (USD) Oil Rent as % of GDP (%)

Variable correlation analysis

## scale every column linearly into the interval [0.1, 0.9]
normalized<-function(x){
  (x-min(x))*0.8/(max(x)-min(x))+0.1
}
apply(inputs,2,normalized)->inputs_normalized
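
As a quick illustration of the scaling (toy numbers, not from the data set), the column minimum maps to 0.1, the midpoint to 0.5, and the maximum to 0.9:

normalized(c(0,5,10))
## [1] 0.1 0.5 0.9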
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res<-rcorr(as.matrix(inputs_normalized))
### the correlation and p-values between each pair of variables
flattenCorrMatrix(res$r, res$P)
##       row column           cor            p
## 1    col1   col2 -9.721576e-02 5.785065e-01
## 2    col1   col3 -2.483232e-01 1.503239e-01
## 3    col2   col3  7.708723e-01 6.034834e-08
## 4    col1   col4 -1.785824e-01 3.046942e-01
## 5    col2   col4  7.841756e-01 2.505476e-08
## 6    col3   col4  6.414164e-01 3.294614e-05
## 7    col1   col5 -1.466408e-01 4.005778e-01
## 8    col2   col5  7.009293e-01 2.743173e-06
## 9    col3   col5  8.194833e-01 1.744417e-09
## 10   col4   col5  4.095532e-01 1.455785e-02
## 11   col1   col6 -1.526817e-01 3.812353e-01
## 12   col2   col6  6.902025e-01 4.481654e-06
## 13   col3   col6  7.283959e-01 7.042007e-07
## 14   col4   col6  6.036950e-01 1.233181e-04
## 15   col5   col6  4.088657e-01 1.474045e-02
## 16   col1   col7  9.697633e-02 5.794469e-01
## 17   col2   col7  7.827527e-01 2.760474e-08
## 18   col3   col7  7.902080e-01 1.647752e-08
## 19   col4   col7  6.231860e-01 6.370019e-05
## 20   col5   col7  8.224785e-01 1.355455e-09
## 21   col6   col7  6.060541e-01 1.141039e-04
## 22   col1   col8 -4.897505e-02 7.799491e-01
## 23   col2   col8  9.136091e-01 1.909584e-14
## 24   col3   col8  8.440129e-01 1.902087e-10
## 25   col4   col8  7.002189e-01 2.835685e-06
## 26   col5   col8  8.813428e-01 2.788214e-12
## 27   col6   col8  6.205304e-01 6.988083e-05
## 28   col7   col8  8.885868e-01 1.042721e-12
## 29   col1   col9  3.808941e-02 8.280253e-01
## 30   col2   col9  6.964502e-01 3.375830e-06
## 31   col3   col9  7.336829e-01 5.319585e-07
## 32   col4   col9  8.147954e-01 2.565503e-09
## 33   col5   col9  6.769918e-01 7.978922e-06
## 34   col6   col9  4.537618e-01 6.184205e-03
## 35   col7   col9  7.473509e-01 2.496596e-07
## 36   col8   col9  8.084869e-01 4.239762e-09
## 37   col1  col10 -5.483494e-02 7.543857e-01
## 38   col2  col10  8.626080e-01 2.708767e-11
## 39   col3  col10  9.275892e-01 1.110223e-15
## 40   col4  col10  7.256513e-01 8.125270e-07
## 41   col5  col10  8.430778e-01 2.083862e-10
## 42   col6  col10  6.991398e-01 2.981644e-06
## 43   col7  col10  8.807890e-01 2.998046e-12
## 44   col8  col10  9.393311e-01 0.000000e+00
## 45   col9  col10  8.382482e-01 3.308189e-10
## 46   col1  col11  4.069552e-01 1.525805e-02
## 47   col2  col11 -2.479410e-02 8.875711e-01
## 48   col3  col11 -1.717993e-01 3.237268e-01
## 49   col4  col11 -7.689691e-02 6.606198e-01
## 50   col5  col11 -7.456084e-02 6.703400e-01
## 51   col6  col11  4.400206e-02 8.018241e-01
## 52   col7  col11  1.848080e-02 9.160806e-01
## 53   col8  col11 -3.779724e-03 9.828076e-01
## 54   col9  col11  4.004904e-03 9.817836e-01
## 55  col10  col11  2.173747e-02 9.013591e-01
## 56   col1  col12 -3.151351e-01 6.518960e-02
## 57   col2  col12 -4.927701e-01 2.634999e-03
## 58   col3  col12 -4.064475e-01 1.539811e-02
## 59   col4  col12 -5.949976e-01 1.633438e-04
## 60   col5  col12 -4.157737e-01 1.299034e-02
## 61   col6  col12 -2.226745e-01 1.985370e-01
## 62   col7  col12 -5.728642e-01 3.224704e-04
## 63   col8  col12 -5.613829e-01 4.504173e-04
## 64   col9  col12 -7.398198e-01 3.809479e-07
## 65  col10  col12 -5.390585e-01 8.340429e-04
## 66  col11  col12 -1.471844e-01 3.988144e-01
## 67   col1  col13  2.297201e-01 1.843366e-01
## 68   col2  col13 -7.393718e-01 3.904700e-07
## 69   col3  col13 -5.896009e-01 1.936882e-04
## 70   col4  col13 -9.286445e-01 8.881784e-16
## 71   col5  col13 -2.832962e-01 9.912372e-02
## 72   col6  col13 -6.637759e-01 1.381080e-05
## 73   col7  col13 -4.957046e-01 2.461151e-03
## 74   col8  col13 -6.029582e-01 1.263300e-04
## 75   col9  col13 -7.158107e-01 1.338850e-06
## 76  col10  col13 -6.401739e-01 3.450708e-05
## 77  col11  col13  1.148794e-01 5.110946e-01
## 78  col12  col13  4.554756e-01 5.968749e-03
## 79   col1  col14  5.212715e-02 7.661682e-01
## 80   col2  col14  1.293749e-01 4.588652e-01
## 81   col3  col14 -5.455932e-02 7.555826e-01
## 82   col4  col14  3.066097e-01 7.321648e-02
## 83   col5  col14  1.739680e-01 3.175629e-01
## 84   col6  col14 -3.561424e-01 3.574191e-02
## 85   col7  col14  1.117210e-01 5.228543e-01
## 86   col8  col14  1.807460e-01 2.987757e-01
## 87   col9  col14  4.381230e-01 8.478302e-03
## 88  col10  col14  8.819839e-02 6.143862e-01
## 89  col11  col14  8.687952e-02 6.197112e-01
## 90  col12  col14 -3.416995e-01 4.453372e-02
## 91  col13  col14 -2.042105e-01 2.393154e-01
## 92   col1  col15  1.023102e-01 5.586611e-01
## 93   col2  col15 -7.971300e-01 1.001507e-08
## 94   col3  col15 -7.064160e-01 2.116444e-06
## 95   col4  col15 -9.517662e-01 0.000000e+00
## 96   col5  col15 -4.969074e-01 2.392839e-03
## 97   col6  col15 -6.369493e-01 3.887494e-05
## 98   col7  col15 -6.709974e-01 1.026807e-05
## 99   col8  col15 -7.611395e-01 1.107390e-07
## 100  col9  col15 -8.758311e-01 5.651923e-12
## 101 col10  col15 -7.895739e-01 1.723056e-08
## 102 col11  col15  8.850645e-02 6.131451e-01
## 103 col12  col15  5.928878e-01 1.746585e-04
## 104 col13  col15  9.346099e-01 2.220446e-16
## 105 col14  col15 -2.999569e-01 7.999976e-02
## 106  col1  col16 -1.028728e-01 5.564890e-01
## 107  col2  col16  1.517719e-01 3.841128e-01
## 108  col3  col16 -2.492945e-01 1.486830e-01
## 109  col4  col16 -1.897929e-01 2.748265e-01
## 110  col5  col16 -1.170588e-01 5.030576e-01
## 111  col6  col16 -3.831857e-02 8.270062e-01
## 112  col7  col16 -1.333378e-01 4.451008e-01
## 113  col8  col16 -6.975224e-02 6.905132e-01
## 114  col9  col16 -4.790610e-01 3.596519e-03
## 115 col10  col16 -2.360027e-01 1.722924e-01
## 116 col11  col16  4.196725e-02 8.108187e-01
## 117 col12  col16  1.886671e-01 2.777368e-01
## 118 col13  col16  1.431465e-01 4.120180e-01
## 119 col14  col16 -7.433725e-02 6.712731e-01
## 120 col15  col16  2.976164e-01 8.249865e-02
## 121  col1  col17 -6.728968e-16 1.000000e+00
## 122  col2  col17 -7.471971e-03 9.660205e-01
## 123  col3  col17  2.630475e-01 1.268299e-01
## 124  col4  col17  5.001058e-03 9.772536e-01
## 125  col5  col17 -1.008882e-01 5.641685e-01
## 126  col6  col17  5.447561e-01 7.155846e-04
## 127  col7  col17  5.159921e-02 7.684716e-01
## 128  col8  col17 -1.806812e-02 9.179482e-01
## 129  col9  col17 -7.406648e-02 6.724038e-01
## 130 col10  col17  1.477111e-01 3.971103e-01
## 131 col11  col17 -7.727526e-16 1.000000e+00
## 132 col12  col17  1.656448e-01 3.416197e-01
## 133 col13  col17 -8.132941e-02 6.423269e-01
## 134 col14  col17 -8.274669e-01 8.810634e-10
## 135 col15  col17 -2.679259e-02 8.785735e-01
## 136 col16  col17 -3.246318e-01 5.708090e-02
## 137  col1 output  1.404063e-01 4.211172e-01
## 138  col2 output  4.551045e-01 6.014854e-03
## 139  col3 output  5.774303e-01 2.813690e-04
## 140  col4 output  3.029581e-01 7.688188e-02
## 141  col5 output  8.485439e-01 1.211811e-10
## 142  col6 output  7.263263e-02 6.784031e-01
## 143  col7 output  6.613523e-01 1.522830e-05
## 144  col8 output  7.045158e-01 2.316891e-06
## 145  col9 output  7.195876e-01 1.108041e-06
## 146 col10 output  6.849352e-01 5.660550e-06
## 147 col11 output  8.472608e-02 6.284469e-01
## 148 col12 output -4.780807e-01 3.675648e-03
## 149 col13 output -1.234923e-01 4.797097e-01
## 150 col14 output  5.054650e-01 1.952771e-03
## 151 col15 output -4.083072e-01 1.489021e-02
## 152 col16 output -3.469056e-01 4.118342e-02
## 153 col17 output -3.318084e-01 5.150120e-02
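
Many input pairs are strongly correlated (e.g. col8 with col2 at 0.91, col10 with col3 at 0.93), which points to multicollinearity among the indicators. To list only the strong pairs, the flattened matrix can be filtered; a minimal sketch, using 0.8 as our own (arbitrary) cutoff:

### pairs with |correlation| above 0.8
corTable<-flattenCorrMatrix(res$r, res$P)
corTable[abs(corTable$cor)>0.8,]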
##chart of the correlations
chart.Correlation(inputs_normalized, histogram=TRUE, pch=19)
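
The corrplot package is loaded at the top but never used; as an alternative view of the same matrix, here is a sketch (the styling options are our assumptions) that blanks out pairs whose correlation is not significant at the 5% level:

corrplot(res$r, type="upper", p.mat=res$P, sig.level=0.05, insig="blank")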

Separate the data set into training and testing sets

if(file.exists("train_test.RData")){
        load("train_test.RData")
}else{
        set.seed(1234)
        sample(nrow(inputs_normalized),size=25,replace=FALSE)->trainIndex
        inputs_normalized[trainIndex,]->train
        inputs_normalized[-trainIndex,]->test
        save(trainIndex,train,test,file="train_test.RData")
        
}
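
A quick check of the split sizes (assuming the full data set has 35 rows, i.e. 5 countries times 7 years):

nrow(train)  # 25 rows sampled for training
nrow(test)   # the remaining 10 rows held out for testing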

Training models

We train seven models: linear least squares, support vector machine (SVM), neural network, classification and regression tree (CART), random forest, bagged CART, and stochastic gradient boosting (GBM).

##Train models
Training<-function(inputsTrain,targetsTrain,inputsTest,targetsTest,dataset){
        path=getwd()
        subdir<-"models"
        dir.create(paste(path,subdir,sep="/"), showWarnings = FALSE)
        setwd(paste(path,subdir,sep="/"))
  ## 5-fold cross-validation, repeated twice via createMultiFolds
  cvcontrol<-trainControl(method="cv",number=5,returnData=FALSE, verboseIter=TRUE,allowParallel=TRUE,savePredictions=TRUE,index=createMultiFolds(targetsTrain, k=5, times=2),seeds = NULL)
  ## NB: paste() with its default separator inserts spaces, so the saved file
  ## names contain spaces, e.g. "dataset_ allFactors _lmFit.RData"
  #Linear Least Squares
  if(file.exists(paste("dataset_",dataset,"_lmFit.RData")))
  {
    load(paste("dataset_",dataset,"_lmFit.RData"))
  }else{
    lmFit<-caret::train(inputsTrain,targetsTrain,method="lm",trControl=cvcontrol)
    save(lmFit,file=paste("dataset_",dataset,"_lmFit.RData"))
  }
  
  #svm
  if(file.exists(paste("dataset_",dataset,"_svmFit.RData")))
  { 
    load(paste("dataset_",dataset,"_svmFit.RData"))
  }else{
    svm.grid<-expand.grid(.C=c(0.1,1,10),.sigma=c(10,1,0.1,0.01))
    svmFit<-caret::train(inputsTrain,targetsTrain,method="svmRadial",trControl=cvcontrol,tuneGrid=svm.grid)
    save(svmFit,file=paste("dataset_",dataset,"_svmFit.RData"))   

  }
  
  #neural networks
  if(file.exists(paste("dataset_",dataset,"_nnetFit.RData")))
  {
    load(paste("dataset_",dataset,"_nnetFit.RData"))

  }else
  {
    nnet.grid<-expand.grid(.size=c(1:10),.decay=c(0.0001,0.001,0.01,1,10))
    nnetFit<-caret::train(inputsTrain,targetsTrain,method="nnet",trControl=cvcontrol,tuneGrid=nnet.grid,maxit=10000000)
    save(nnetFit,file=paste("dataset_",dataset,"_nnetFit.RData"))
  }
  
  #CART
  if(file.exists(paste("dataset_",dataset,"_rpartFit.RData")))
  {
    load(paste("dataset_",dataset,"_rpartFit.RData"))
  }else{
    rpart.grid<-expand.grid(.cp=c(0.06))
    rpartFit<-caret::train(inputsTrain,targetsTrain,method="rpart",trControl=cvcontrol,tuneGrid=rpart.grid)
    save(rpartFit,file=paste("dataset_",dataset,"_rpartFit.RData"))
  }
  
  #randomForest
  if(file.exists(paste("dataset_",dataset,"_rfFit.RData")))
  {
    load(paste("dataset_",dataset,"_rfFit.RData"))
  }else{
    rf.grid<-expand.grid(.mtry=c(1:15))
    rfFit<-caret::train(inputsTrain,targetsTrain,method="rf",trControl=cvcontrol,tuneGrid=rf.grid,importance=TRUE)
    save(rfFit,file=paste("dataset_",dataset,"_rfFit.RData"))
  }
  
  #boosting CART
  if(file.exists(paste("dataset_",dataset,"_gbmFit.RData")))
  {
    load(paste("dataset_",dataset,"_gbmFit.RData"))
  }else{
    gbm.grid<-expand.grid(.interaction.depth=c(1:20),.n.trees=c(1:10),.shrinkage=c(0.15,0.2),.n.minobsinnode=c(1:10))
   gbmFit<-caret::train(inputsTrain,targetsTrain,method="gbm",trControl=cvcontrol,tuneGrid=gbm.grid)
    save(gbmFit,file=paste("dataset_",dataset,"_gbmFit.RData"))
  }
  
  #bagged CART
  if(file.exists(paste("dataset_",dataset,"_bagTreeFit.RData")))
  {
    load(paste("dataset_",dataset,"_bagTreeFit.RData"))
  }else{
    bagTreeFit<-caret::train(inputsTrain,targetsTrain,method="treebag",trControl=cvcontrol)
    save(bagTreeFit,file=paste("dataset_",dataset,"_bagTreeFit.RData"))
  }
}
setwd(pathMain)
train[,1:(ncol(train)-1)]->inputsTrain
train[,ncol(train)]->targetsTrain
test[,1:(ncol(test)-1)]->inputsTest
test[,ncol(test)]->targetsTest
Training(inputsTrain,targetsTrain,inputsTest,targetsTest,"allFactors")

Model performances

Functions to denormalize the data and to measure errors: mean absolute error (MAE), root mean square error (RMSE), and mean relative error (RELE).

## invert the [0.1, 0.9] scaling: map a normalized value y back onto the original scale of x
unormalized<-function(x,y){
        ((y-0.1)*(max(x)-min(x))/0.8) + min(x)
}
### R-squared: regression sum of squares over total sum of squares
rsquare<-function(predict,observed){
        sum((predict-mean(observed))^2)/sum((observed-mean(observed))^2)
}

modelErrors <- function(predicted, actual) {
  sal <- vector(mode="numeric", length=3)
  names(sal) <- c( "MAE", "RMSE", "RELE")
  meanPredicted <- mean(predicted)
  meanActual <- mean(actual)
  sumPred <- sum((predicted - meanPredicted)^2)
  sumActual <- sum((actual - meanActual)^2)
  n<- length(actual)
  p3<-vector(mode="numeric", length=n)
  for (i in c(1:n)) {
    if (actual[i]==0) {p3[i]<-abs(predicted[i])
    } else { p3[i]<-((abs(predicted[i]-actual[i]))/actual[i])
    }}
  sal[1] <- mean(abs(predicted - actual))
  sal[2] <- sqrt(sum((predicted - actual)^2)/n)
  sal[3] <- mean(p3)
  sal
}
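
A toy check of these helpers (illustrative numbers, not from the report's data): unormalized() inverts the earlier normalized(), and modelErrors() returns the three metrics.

### round trip: recovers 0 5 10
unormalized(c(0,5,10), normalized(c(0,5,10)))
### MAE = 0.4/3 ~ 0.133, RMSE = sqrt(0.06/3) ~ 0.141, RELE ~ 0.072
modelErrors(predicted=c(1.1,1.9,3.2), actual=c(1,2,3))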

     
allModels<-function(all.models,inputsTest,targetsTest,outputOrig,nameOfDataset){
                ##predict function
         pd<-function(model){
                        predict(model,newdata=inputsTest)
         }
         
         ## error calculation function
         error<-function(model){
                       pd<-pd(model)
                       modelErrors(pd,targetsTest)   
         }
         
     
        unormalized(outputOrig,targetsTest)->real_unormalized

       if(length(all.models)==1){
               all.models[[1]]->all.models
               pd(all.models)->predNorm
               names(predNorm)<- names(all.models)
               save(predNorm,file=paste("dataset_", nameOfDataset,"_predictValue_normalized.RData"))
               error(all.models)->modelsErrorsTotalNorm
               save(modelsErrorsTotalNorm,file=paste("error_norm_dataset_", nameOfDataset,".RData"))
               unormalized(outputOrig, predNorm )->predUnormalized
               save(predUnormalized,file=paste("dataset_", nameOfDataset,"_predictValue_unnormalized.RData"))
               modelErrors(predUnormalized,real_unormalized)->modelsErrorsTotalUnnorm
               save(modelsErrorsTotalUnnorm,file=paste("error_unnorm_dataset_",nameOfDataset,".RData"))
                
     }else{
       #### predicted normalized values of all models
      
        data.frame(sapply(all.models,pd))->predNorm
        colnames(predNorm)<-names(all.models)
       save(predNorm,file=paste("dataset_", nameOfDataset,"_predictValue_normalized.RData"))
       
        ### calculate the errors on the normalized scale
        data.frame(sapply(all.models,error))->modelsErrorsTotalNorm
        colnames(modelsErrorsTotalNorm)<-names(all.models)
        save(modelsErrorsTotalNorm,file=paste("error_norm_dataset_", nameOfDataset,".RData"))
        ### denormalize the predicted values
        data.frame(sapply(predNorm,function(x) unormalized(outputOrig,x)))->predUnormalized
        save(predUnormalized,file=paste("dataset_", nameOfDataset,"_predictValue_unnormalized.RData"))
        
        
        ### calculate the errors on the original (unnormalized) scale
        data.frame(sapply(predUnormalized,function(x) modelErrors(x,real_unormalized)))->modelsErrorsTotalUnnorm
        save(modelsErrorsTotalUnnorm,file=paste("error_unnorm_dataset_",nameOfDataset,".RData"))
       }
}


setwd("/home/gong/FakharCompetition/models")
temp_files = list.files(pattern="*Fit.RData")
all.models<-list()
for(i in 1:length(temp_files)){
        ## load each saved fit into its own environment and collect the model object
        e1 <- new.env()
        model<-load(temp_files[i], e1)
        modelFit <- get(model, e1)
        all.models[i]<-list(modelFit)
}

names(all.models)<-lapply(all.models,function(x) x$method)

dir.create("/home/gong/FakharCompetition/predictAndErrors", showWarnings = FALSE)
setwd("/home/gong/FakharCompetition/predictAndErrors")
allModels(all.models,test[,1:(ncol(test)-1)],test[,ncol(test)],inputs[,"output"],"allFactors")
load("~/FakharCompetition/predictAndErrors/error_unnorm_dataset_ allFactors .RData")
###The models' errors###
modelsErrorsTotalUnnorm
##           treebag          gbm           lm         nnet           rf        rpart    svmRadial
## MAE  1.583060e+04 9.207730e+03 1.052644e+04 4.892686e+03 1.012616e+04 1.562287e+04 7.596232e+03
## RMSE 2.470833e+04 1.168656e+04 1.396007e+04 5.622844e+03 1.432048e+04 2.077029e+04 8.646186e+03
## RELE 3.771896e-01 2.814608e-01 3.903668e-01 1.464394e-01 2.930727e-01 4.011528e-01 2.519252e-01

According to the various error measurements, stochastic gradient boosting (gbm), the artificial neural network (nnet), and the support vector machine (svmRadial) are the strongest candidates for the risk monitoring system. Among them, the neural network is the best, with the lowest errors on all three metrics.

Plot the predicted values against the real values

load("~/FakharCompetition/train_test.RData")
inputs[-trainIndex,ncol(inputs)]->real
load("~/FakharCompetition/predictAndErrors/dataset_ allFactors _predictValue_unnormalized.RData")
par(mfrow=c(1,2))
plot(real,predUnormalized$nnet,xlim=c(1,140000),ylim=c(1,140000),ylab="Nnet")
abline(a=0,b=1,col="red")

plot(real,predUnormalized$lm,xlim=c(1,140000),ylim=c(1,140000),ylab="lm")
abline(a=0,b=1,col="red")

The two plots show the real values against the predicted values of the nnet and lm models. The closer the points lie to the 45-degree line, the more accurate the model. The nnet model clearly performs much better than lm.

Variable importance analysis

Next, we use two of the best-performing models, stochastic gradient boosting and the neural network, to analyze variable importance.

####### variable importance #########
load("~/FakharCompetition/models/dataset_ allFactors _nnetFit.RData")
nnetImp<- varImp(nnetFit, scale = FALSE)$importance
as.matrix(nnetImp)->nnetImp
load("~/FakharCompetition/models/dataset_ allFactors _gbmFit.RData")
gbmImp<- varImp(gbmFit, scale = FALSE)$importance
as.matrix(gbmImp)->gbmImp

merge(nnetImp,gbmImp,by="row.names",all=TRUE)->nnet_gbm_imp

colnames(nnet_gbm_imp)<-c("variables","nnet","GBM")
as.matrix(nnet_gbm_imp[,c("nnet","GBM")])->imp
rownames(imp)<-nnet_gbm_imp[,"variables"]
### order the variables by importance score
imp[ order(imp[,1], imp[,2]), ]->imp.order

### scale the importance scores into [0.1, 0.9], reusing normalized() from above
apply(imp.order,2,normalized)->imp.order.norm

dotplot(imp.order.norm, groups = FALSE,mar=rep(0, 4),transparent=TRUE,
layout = c(2, 1), aspect = 0.7,
origin = 0, type = c("p", "h"),
xlab = "Variable importance score")