This report builds a predictive model for the risk monitoring system. We employ several state-of-the-art data mining techniques to predict the risk and compare them in order to select the best one for this system.
Loading packages
library(Hmisc)
library(gdata)
library(PerformanceAnalytics)
library(survival)
library(splines)
library(lattice)
library(gbm)
library(methods)
library(kernlab)
library(MASS)
library(caret)
library(ggplot2)
library(corrplot)
library(pbapply)
library(devtools)
library(caretEnsemble)
library(xlsx)
pathMain="/home/gong/FakharCompetition"
if(file.exists("inputs.RData")){
load("inputs.RData")
}else{
read.xlsx("NHA-indicators-2.xlsx",sheetName = "Table")->output
output[-1,]->output
colnames(output)<-gsub("X","",colnames(output))
levels(as.factor(output$Indicators))[3]->TotalExpenditure
## the years with data
years<-c("2008","2009","2010","2011","2012","2013","2014")
read.xlsx("NHA-indicators-2-2.xlsx",sheetName = "Table")->targets
colnames(targets)<-gsub("X","",colnames(targets))
inputs<-c()
####inputs by country
for(i in 3:7){
read.xlsx("Factors.xlsx",sheetName = sheetNames("Factors.xlsx")[i],header=TRUE,startRow=14,endRow = 21)->inputs_temp
inputs_temp[,c(1:16)]->inputs_temp
inputs_temp$country<- sheetNames("Factors.xlsx")[i]
#country_factor encodes the country: 1 is Saudi Arabia, 2 is Russia, 3 is Turkey, 4 is UAE, and 5 is Iran
inputs_temp$country_factor<- (i-2)
inputs_temp$output<-as.numeric(targets[targets$Countries==sheetNames("Factors.xlsx")[i],c(2:ncol(targets))])
## rename only after the extra columns are added, so the name vector matches ncol() and rbind() aligns across sheets
col_names<-paste(rep("col",ncol(inputs_temp)),1:ncol(inputs_temp),sep="")
colnames(inputs_temp)<-col_names
inputs<-rbind(inputs,inputs_temp)
}
inputs[,-17]->inputs
colnames(inputs)<-c(paste(rep("col",(ncol(inputs)-1)),1:(ncol(inputs)-1),sep=""),c("output"))
save(inputs,file="inputs.RData")
}
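As a quick structural check (a minimal sketch; it assumes each of the five country sheets contributes seven yearly rows, matching the years vector above):
## structural check: 5 countries x 7 years = 35 rows; 17 input columns plus the output
stopifnot(nrow(inputs) == 35, ncol(inputs) == 18)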
## a glance at the assembled data (col17 is the country factor)
head(inputs)
## col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12
## 1 2008 0.48 0.57 0.34 0.14 0.47 0.25 0.44 0.46 0.36 1.456326 2.496307
## 2 2009 0.48 0.47 0.34 0.14 0.30 0.26 0.43 0.47 0.31 -2.062203 2.462249
## 3 2010 0.44 0.34 0.34 0.14 0.28 0.26 0.42 0.45 0.26 4.078066 2.454733
## 4 2011 0.55 0.26 0.34 0.13 0.15 0.25 0.55 0.47 0.25 2.847302 2.453717
## 5 2012 0.44 0.32 0.34 0.13 0.19 0.26 0.44 0.47 0.20 2.260134 2.428241
## 6 2013 0.44 0.38 0.32 0.13 0.15 0.29 0.43 0.48 0.21 2.368004 2.362047
## col13 col14 col15 col16 col17 output
## 1 557.8670 99.67 19436.86 59.22617 1 14918.95
## 2 639.7370 61.95 15655.08 38.55709 1 17534.87
## 3 655.0706 79.48 18753.98 43.99712 1 18401.36
## 4 829.2460 94.88 23256.10 48.83015 1 23872.70
## 5 961.3441 94.05 24883.19 46.15887 1 28355.85
## 6 1052.0965 97.98 24646.02 43.66758 1 31774.42
read.xlsx("Factors.xlsx",sheetName = sheetNames("Factors.xlsx")[3],header=FALSE,startRow=1,endRow = 1)->inputsNames
colnames(inputsNames)<-gsub("X","col",colnames(inputsNames))
inputsNames
## col1 col2 col3
## 1 Year Government Effectiveness Financial Market Development
## col4 col5 col6 col7
## 1 Overall Health Risk Access to Utilities Inflation Trading Across Borders
## col8 col9 col10
## 1 Corruption Demographic Shifts Capacity of Technology Adoption
## col11 col12
## 1 World GDP Growth (%) Population Growth (%)
## col13 col14
## 1 Healthcare expenditure per capita (USD) Oil Price (USD)
## col15 col16
## 1 GDP per capita (USD) Oil Rent as % of GDP (%)
##scale each input column to the interval [0.1, 0.9] (min-max normalization)
normalized<-function(x){
(x-min(x))*0.8/(max(x)-min(x))+0.1
}
apply(inputs,2,normalized)->inputs_normalized
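A quick sanity check (a minimal sketch) confirms that every normalized column now lies in [0.1, 0.9]:
## allow a tiny tolerance for floating-point rounding
stopifnot(min(inputs_normalized) >= 0.1 - 1e-12,
max(inputs_normalized) <= 0.9 + 1e-12)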
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res<-rcorr(as.matrix(inputs_normalized))
###the correlation and p-value between each pair of input variables
flattenCorrMatrix(res$r, res$P)
## row column cor p
## 1 col1 col2 -9.721576e-02 5.785065e-01
## 2 col1 col3 -2.483232e-01 1.503239e-01
## 3 col2 col3 7.708723e-01 6.034834e-08
## 4 col1 col4 -1.785824e-01 3.046942e-01
## 5 col2 col4 7.841756e-01 2.505476e-08
## 6 col3 col4 6.414164e-01 3.294614e-05
## 7 col1 col5 -1.466408e-01 4.005778e-01
## 8 col2 col5 7.009293e-01 2.743173e-06
## 9 col3 col5 8.194833e-01 1.744417e-09
## 10 col4 col5 4.095532e-01 1.455785e-02
## 11 col1 col6 -1.526817e-01 3.812353e-01
## 12 col2 col6 6.902025e-01 4.481654e-06
## 13 col3 col6 7.283959e-01 7.042007e-07
## 14 col4 col6 6.036950e-01 1.233181e-04
## 15 col5 col6 4.088657e-01 1.474045e-02
## 16 col1 col7 9.697633e-02 5.794469e-01
## 17 col2 col7 7.827527e-01 2.760474e-08
## 18 col3 col7 7.902080e-01 1.647752e-08
## 19 col4 col7 6.231860e-01 6.370019e-05
## 20 col5 col7 8.224785e-01 1.355455e-09
## 21 col6 col7 6.060541e-01 1.141039e-04
## 22 col1 col8 -4.897505e-02 7.799491e-01
## 23 col2 col8 9.136091e-01 1.909584e-14
## 24 col3 col8 8.440129e-01 1.902087e-10
## 25 col4 col8 7.002189e-01 2.835685e-06
## 26 col5 col8 8.813428e-01 2.788214e-12
## 27 col6 col8 6.205304e-01 6.988083e-05
## 28 col7 col8 8.885868e-01 1.042721e-12
## 29 col1 col9 3.808941e-02 8.280253e-01
## 30 col2 col9 6.964502e-01 3.375830e-06
## 31 col3 col9 7.336829e-01 5.319585e-07
## 32 col4 col9 8.147954e-01 2.565503e-09
## 33 col5 col9 6.769918e-01 7.978922e-06
## 34 col6 col9 4.537618e-01 6.184205e-03
## 35 col7 col9 7.473509e-01 2.496596e-07
## 36 col8 col9 8.084869e-01 4.239762e-09
## 37 col1 col10 -5.483494e-02 7.543857e-01
## 38 col2 col10 8.626080e-01 2.708767e-11
## 39 col3 col10 9.275892e-01 1.110223e-15
## 40 col4 col10 7.256513e-01 8.125270e-07
## 41 col5 col10 8.430778e-01 2.083862e-10
## 42 col6 col10 6.991398e-01 2.981644e-06
## 43 col7 col10 8.807890e-01 2.998046e-12
## 44 col8 col10 9.393311e-01 0.000000e+00
## 45 col9 col10 8.382482e-01 3.308189e-10
## 46 col1 col11 4.069552e-01 1.525805e-02
## 47 col2 col11 -2.479410e-02 8.875711e-01
## 48 col3 col11 -1.717993e-01 3.237268e-01
## 49 col4 col11 -7.689691e-02 6.606198e-01
## 50 col5 col11 -7.456084e-02 6.703400e-01
## 51 col6 col11 4.400206e-02 8.018241e-01
## 52 col7 col11 1.848080e-02 9.160806e-01
## 53 col8 col11 -3.779724e-03 9.828076e-01
## 54 col9 col11 4.004904e-03 9.817836e-01
## 55 col10 col11 2.173747e-02 9.013591e-01
## 56 col1 col12 -3.151351e-01 6.518960e-02
## 57 col2 col12 -4.927701e-01 2.634999e-03
## 58 col3 col12 -4.064475e-01 1.539811e-02
## 59 col4 col12 -5.949976e-01 1.633438e-04
## 60 col5 col12 -4.157737e-01 1.299034e-02
## 61 col6 col12 -2.226745e-01 1.985370e-01
## 62 col7 col12 -5.728642e-01 3.224704e-04
## 63 col8 col12 -5.613829e-01 4.504173e-04
## 64 col9 col12 -7.398198e-01 3.809479e-07
## 65 col10 col12 -5.390585e-01 8.340429e-04
## 66 col11 col12 -1.471844e-01 3.988144e-01
## 67 col1 col13 2.297201e-01 1.843366e-01
## 68 col2 col13 -7.393718e-01 3.904700e-07
## 69 col3 col13 -5.896009e-01 1.936882e-04
## 70 col4 col13 -9.286445e-01 8.881784e-16
## 71 col5 col13 -2.832962e-01 9.912372e-02
## 72 col6 col13 -6.637759e-01 1.381080e-05
## 73 col7 col13 -4.957046e-01 2.461151e-03
## 74 col8 col13 -6.029582e-01 1.263300e-04
## 75 col9 col13 -7.158107e-01 1.338850e-06
## 76 col10 col13 -6.401739e-01 3.450708e-05
## 77 col11 col13 1.148794e-01 5.110946e-01
## 78 col12 col13 4.554756e-01 5.968749e-03
## 79 col1 col14 5.212715e-02 7.661682e-01
## 80 col2 col14 1.293749e-01 4.588652e-01
## 81 col3 col14 -5.455932e-02 7.555826e-01
## 82 col4 col14 3.066097e-01 7.321648e-02
## 83 col5 col14 1.739680e-01 3.175629e-01
## 84 col6 col14 -3.561424e-01 3.574191e-02
## 85 col7 col14 1.117210e-01 5.228543e-01
## 86 col8 col14 1.807460e-01 2.987757e-01
## 87 col9 col14 4.381230e-01 8.478302e-03
## 88 col10 col14 8.819839e-02 6.143862e-01
## 89 col11 col14 8.687952e-02 6.197112e-01
## 90 col12 col14 -3.416995e-01 4.453372e-02
## 91 col13 col14 -2.042105e-01 2.393154e-01
## 92 col1 col15 1.023102e-01 5.586611e-01
## 93 col2 col15 -7.971300e-01 1.001507e-08
## 94 col3 col15 -7.064160e-01 2.116444e-06
## 95 col4 col15 -9.517662e-01 0.000000e+00
## 96 col5 col15 -4.969074e-01 2.392839e-03
## 97 col6 col15 -6.369493e-01 3.887494e-05
## 98 col7 col15 -6.709974e-01 1.026807e-05
## 99 col8 col15 -7.611395e-01 1.107390e-07
## 100 col9 col15 -8.758311e-01 5.651923e-12
## 101 col10 col15 -7.895739e-01 1.723056e-08
## 102 col11 col15 8.850645e-02 6.131451e-01
## 103 col12 col15 5.928878e-01 1.746585e-04
## 104 col13 col15 9.346099e-01 2.220446e-16
## 105 col14 col15 -2.999569e-01 7.999976e-02
## 106 col1 col16 -1.028728e-01 5.564890e-01
## 107 col2 col16 1.517719e-01 3.841128e-01
## 108 col3 col16 -2.492945e-01 1.486830e-01
## 109 col4 col16 -1.897929e-01 2.748265e-01
## 110 col5 col16 -1.170588e-01 5.030576e-01
## 111 col6 col16 -3.831857e-02 8.270062e-01
## 112 col7 col16 -1.333378e-01 4.451008e-01
## 113 col8 col16 -6.975224e-02 6.905132e-01
## 114 col9 col16 -4.790610e-01 3.596519e-03
## 115 col10 col16 -2.360027e-01 1.722924e-01
## 116 col11 col16 4.196725e-02 8.108187e-01
## 117 col12 col16 1.886671e-01 2.777368e-01
## 118 col13 col16 1.431465e-01 4.120180e-01
## 119 col14 col16 -7.433725e-02 6.712731e-01
## 120 col15 col16 2.976164e-01 8.249865e-02
## 121 col1 col17 -6.728968e-16 1.000000e+00
## 122 col2 col17 -7.471971e-03 9.660205e-01
## 123 col3 col17 2.630475e-01 1.268299e-01
## 124 col4 col17 5.001058e-03 9.772536e-01
## 125 col5 col17 -1.008882e-01 5.641685e-01
## 126 col6 col17 5.447561e-01 7.155846e-04
## 127 col7 col17 5.159921e-02 7.684716e-01
## 128 col8 col17 -1.806812e-02 9.179482e-01
## 129 col9 col17 -7.406648e-02 6.724038e-01
## 130 col10 col17 1.477111e-01 3.971103e-01
## 131 col11 col17 -7.727526e-16 1.000000e+00
## 132 col12 col17 1.656448e-01 3.416197e-01
## 133 col13 col17 -8.132941e-02 6.423269e-01
## 134 col14 col17 -8.274669e-01 8.810634e-10
## 135 col15 col17 -2.679259e-02 8.785735e-01
## 136 col16 col17 -3.246318e-01 5.708090e-02
## 137 col1 output 1.404063e-01 4.211172e-01
## 138 col2 output 4.551045e-01 6.014854e-03
## 139 col3 output 5.774303e-01 2.813690e-04
## 140 col4 output 3.029581e-01 7.688188e-02
## 141 col5 output 8.485439e-01 1.211811e-10
## 142 col6 output 7.263263e-02 6.784031e-01
## 143 col7 output 6.613523e-01 1.522830e-05
## 144 col8 output 7.045158e-01 2.316891e-06
## 145 col9 output 7.195876e-01 1.108041e-06
## 146 col10 output 6.849352e-01 5.660550e-06
## 147 col11 output 8.472608e-02 6.284469e-01
## 148 col12 output -4.780807e-01 3.675648e-03
## 149 col13 output -1.234923e-01 4.797097e-01
## 150 col14 output 5.054650e-01 1.952771e-03
## 151 col15 output -4.083072e-01 1.489021e-02
## 152 col16 output -3.469056e-01 4.118342e-02
## 153 col17 output -3.318084e-01 5.150120e-02
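Since many inputs are strongly collinear, a convenience filter (not part of the original analysis; the thresholds are illustrative) can isolate the pairs with high, significant correlation:
## pairs with |cor| > 0.8 and p < 0.01
subset(flattenCorrMatrix(res$r, res$P), abs(cor) > 0.8 & p < 0.01)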
##chart of the correlations
chart.Correlation(inputs_normalized, histogram=TRUE, pch=19)
if(file.exists("train_test.RData")){
load("train_test.RData")
}else{
set.seed(1234)
sample(nrow(inputs_normalized),size=25,replace=FALSE)->trainIndex
inputs_normalized[trainIndex,]->train
inputs_normalized[-trainIndex,]->test
save(trainIndex,train,test,file="train_test.RData")
}
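As an aside, caret's createDataPartition could draw a split stratified on the output instead of a purely random sample; a minimal sketch, not used below:
## stratified split keeping roughly the same 25/10 proportion
set.seed(1234)
strataIndex <- caret::createDataPartition(inputs_normalized[,"output"], p = 25/35, list = FALSE)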
Train the models: linear regression, support vector machine, neural network, classification and regression tree (CART), random forest, bagged CART, and stochastic gradient boosting (GBM).
##Train models
Training<-function(inputsTrain,targetsTrain,inputsTest,targetsTest,dataset){
path=getwd()
subdir<-"models"
dir.create(paste(path,subdir,sep="/"), showWarnings = FALSE)
setwd(paste(path,subdir,sep="/"))
## 5-fold cross-validation repeated twice; the shared index makes every model tune on identical folds
cvcontrol<-trainControl(method="cv",number=5,returnData=FALSE, verboseIter=TRUE,allowParallel=TRUE,savePredictions=TRUE,index=createMultiFolds(targetsTrain, k=5, times=2),seeds = NULL)
#Linear Least Squares
if(file.exists(paste0("dataset_",dataset,"_lmFit.RData")))
{
load(paste0("dataset_",dataset,"_lmFit.RData"))
}else{
lmFit<-caret::train(inputsTrain,targetsTrain,method="lm",trControl=cvcontrol)
save(lmFit,file=paste0("dataset_",dataset,"_lmFit.RData"))
}
#svm with a radial basis kernel
if(file.exists(paste0("dataset_",dataset,"_svmFit.RData")))
{
load(paste0("dataset_",dataset,"_svmFit.RData"))
}else{
svm.grid<-expand.grid(.C=c(0.1,1,10),.sigma=c(10,1,0.1,0.01))
svmFit<-caret::train(inputsTrain,targetsTrain,method="svmRadial",trControl=cvcontrol,tuneGrid=svm.grid)
save(svmFit,file=paste0("dataset_",dataset,"_svmFit.RData"))
}
#neural networks
if(file.exists(paste0("dataset_",dataset,"_nnetFit.RData")))
{
load(paste0("dataset_",dataset,"_nnetFit.RData"))
}else
{
nnet.grid<-expand.grid(.size=c(1:10),.decay=c(0.0001,0.001,0.01,1,10))
nnetFit<-caret::train(inputsTrain,targetsTrain,method="nnet",trControl=cvcontrol,tuneGrid=nnet.grid,maxit=10000000)
save(nnetFit,file=paste0("dataset_",dataset,"_nnetFit.RData"))
}
#CART
if(file.exists(paste0("dataset_",dataset,"_rpartFit.RData")))
{
load(paste0("dataset_",dataset,"_rpartFit.RData"))
}else{
rpart.grid<-expand.grid(.cp=c(0.06))
rpartFit<-caret::train(inputsTrain,targetsTrain,method="rpart",trControl=cvcontrol,tuneGrid=rpart.grid)
save(rpartFit,file=paste0("dataset_",dataset,"_rpartFit.RData"))
}
#random forest
if(file.exists(paste0("dataset_",dataset,"_rfFit.RData")))
{
load(paste0("dataset_",dataset,"_rfFit.RData"))
}else{
rf.grid<-expand.grid(.mtry=c(1:15))
rfFit<-caret::train(inputsTrain,targetsTrain,method="rf",trControl=cvcontrol,tuneGrid=rf.grid,importance=TRUE)
save(rfFit,file=paste0("dataset_",dataset,"_rfFit.RData"))
}
#stochastic gradient boosting (boosted CART)
if(file.exists(paste0("dataset_",dataset,"_gbmFit.RData")))
{
load(paste0("dataset_",dataset,"_gbmFit.RData"))
}else{
gbm.grid<-expand.grid(.interaction.depth=c(1:20),.n.trees=c(1:10),.shrinkage=c(0.15,0.2),.n.minobsinnode=c(1:10))
gbmFit<-caret::train(inputsTrain,targetsTrain,method="gbm",trControl=cvcontrol,tuneGrid=gbm.grid)
save(gbmFit,file=paste0("dataset_",dataset,"_gbmFit.RData"))
}
#bagged CART
if(file.exists(paste0("dataset_",dataset,"_bagTreeFit.RData")))
{
load(paste0("dataset_",dataset,"_bagTreeFit.RData"))
}else{
bagTreeFit<-caret::train(inputsTrain,targetsTrain,method="treebag",trControl=cvcontrol)
save(bagTreeFit,file=paste0("dataset_",dataset,"_bagTreeFit.RData"))
}
}
setwd(pathMain)
train[,1:(ncol(train)-1)]->inputsTrain
train[,ncol(train)]->targetsTrain
test[,1:(ncol(test)-1)]->inputsTest
test[,ncol(test)]->targetsTest
Training(inputsTrain,targetsTrain,inputsTest,targetsTest,"allFactors")
Functions to denormalize the data, and error metrics including the mean absolute error (MAE), root mean square error (RMSE), and relative error (RELE).
## invert the min-max scaling: map y in [0.1, 0.9] back to the original range of x
unormalized<-function(x,y){
((y-0.1)*(max(x)-min(x))/0.8) + min(x)
}
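A round-trip check (a sketch) shows that unormalized() exactly inverts normalized() from above:
## recover the original output column from its normalized version
x<-inputs[,"output"]
all.equal(unormalized(x,normalized(x)),x)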
###function of r square (SSR/SST form)
rsquare<-function(predict,observed){
sum((predict-mean(observed))^2)/sum((observed-mean(observed))^2)
}
modelErrors <- function(predicted, actual) {
sal <- vector(mode="numeric", length=3)
names(sal) <- c( "MAE", "RMSE", "RELE")
n <- length(actual)
## relative error per observation; fall back to |predicted| when the actual value is zero
p3 <- ifelse(actual == 0, abs(predicted), abs(predicted - actual)/actual)
sal[1] <- mean(abs(predicted - actual))
sal[2] <- sqrt(sum((predicted - actual)^2)/n)
sal[3] <- mean(p3)
sal
}
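A toy illustration of the three metrics with hypothetical numbers:
## perfect predictions give zero on all three metrics
modelErrors(c(1,2,3),c(1,2,3))
## a constant over-prediction of 1 gives MAE = RMSE = 1
modelErrors(c(2,3,4),c(1,2,3))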
allModels<-function(all.models,inputsTest,targetsTest,outputOrig,nameOfDataset){
##prediction function
pd<-function(model){
predict(model,newdata=inputsTest)
}
##error calculation function
error<-function(model){
pd<-pd(model)
modelErrors(pd,targetsTest)
}
unormalized(outputOrig,targetsTest)->real_unormalized
if(length(all.models)==1){
all.models[[1]]->all.models
pd(all.models)->predNorm
save(predNorm,file=paste0("dataset_",nameOfDataset,"_predictValue_normalized.RData"))
error(all.models)->modelsErrorsTotalNorm
save(modelsErrorsTotalNorm,file=paste0("error_norm_dataset_",nameOfDataset,".RData"))
unormalized(outputOrig, predNorm )->predUnormalized
save(predUnormalized,file=paste0("dataset_",nameOfDataset,"_predictValue_unnormalized.RData"))
modelErrors(predUnormalized,real_unormalized)->modelsErrorsTotalUnnorm
save(modelsErrorsTotalUnnorm,file=paste0("error_unnorm_dataset_",nameOfDataset,".RData"))
}else{
####predicted (normalized) values from all models
data.frame(sapply(all.models,pd))->predNorm
colnames(predNorm)<-names(all.models)
save(predNorm,file=paste0("dataset_",nameOfDataset,"_predictValue_normalized.RData"))
###errors on the normalized scale
data.frame(sapply(all.models,error))->modelsErrorsTotalNorm
colnames(modelsErrorsTotalNorm)<-names(all.models)
save(modelsErrorsTotalNorm,file=paste0("error_norm_dataset_",nameOfDataset,".RData"))
###denormalize the predicted values
data.frame(sapply(predNorm,function(x) unormalized(outputOrig,x)))->predUnormalized
save(predUnormalized,file=paste0("dataset_",nameOfDataset,"_predictValue_unnormalized.RData"))
###errors on the original (unnormalized) scale
data.frame(sapply(predUnormalized,function(x) modelErrors(x,real_unormalized)))->modelsErrorsTotalUnnorm
save(modelsErrorsTotalUnnorm,file=paste0("error_unnorm_dataset_",nameOfDataset,".RData"))
}
}
setwd("/home/gong/FakharCompetition/models")
temp_files = list.files(pattern="*Fit.RData")
all.models<-list()
for(i in 1:length(temp_files)){
## load each saved fit into its own environment so object names cannot collide
e1 <- new.env()
model<-load(temp_files[i], e1)
all.models[[i]]<-get(model, e1)
}
names(all.models)<-sapply(all.models,function(x) x$method)
dir.create("/home/gong/FakharCompetition/predictAndErrors", showWarnings = FALSE)
setwd("/home/gong/FakharCompetition/predictAndErrors")
allModels(all.models,test[,1:(ncol(test)-1)],test[,ncol(test)],inputs[,"output"],"allFactors")
load("~/FakharCompetition/predictAndErrors/error_unnorm_dataset_ allFactors .RData")
###The models' errors###
modelsErrorsTotalUnnorm
## treebag gbm lm nnet rf
## MAE 1.583060e+04 9.207730e+03 1.052644e+04 4892.6863405 1.012616e+04
## RMSE 2.470833e+04 1.168656e+04 1.396007e+04 5622.8437542 1.432048e+04
## RELE 3.771896e-01 2.814608e-01 3.903668e-01 0.1464394 2.930727e-01
## rpart svmRadial
## MAE 1.562287e+04 7596.2317144
## RMSE 2.077029e+04 8646.1863143
## RELE 4.011528e-01 0.2519252
According to the various error measurements, stochastic gradient boosting (gbm), the artificial neural network (nnet), and the support vector machine (svmRadial) are the strongest candidates for the risk monitoring system. Among them, the neural network performs best, with the lowest MAE, RMSE, and RELE.
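For a compact comparison, the models can also be ranked directly by unnormalized RMSE (a small convenience, not in the original script):
## rank models from best to worst on the test set
sort(unlist(modelsErrorsTotalUnnorm["RMSE",]))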
Plot the predicted values against the real values.
load("~/FakharCompetition/train_test.RData")
inputs[-trainIndex,ncol(inputs)]->real
load("~/FakharCompetition/predictAndErrors/dataset_ allFactors _predictValue_unnormalized.RData")
par(mfrow=c(1,2))
plot(real,predUnormalized$nnet,xlim=c(1,140000),ylim=c(1,140000),ylab="Nnet")
abline(a=0,b=1,col="red")
plot(real,predUnormalized$lm,xlim=c(1,140000),ylim=c(1,140000),ylab="lm")
abline(a=0,b=1,col="red")
The two plots show the real values plotted against the predictions of nnet and lm. The closer the points lie to the 45-degree line, the more accurate the model. The nnet model therefore performs much better than lm.
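The visual impression can be quantified with the rsquare() helper defined earlier (a sketch; note that this SSR/SST form is only a rough goodness-of-fit indicator for predictions from non-linear models):
rsquare(predUnormalized$nnet,real)
rsquare(predUnormalized$lm,real)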
Next, we use two of the best-performing models, stochastic gradient boosting and the neural network, to analyze variable importance.
#######variable importance#########
load("~/FakharCompetition/models/dataset_ allFactors _nnetFit.RData")
nnetImp<- varImp(nnetFit, scale = FALSE)$importance
as.matrix(nnetImp)->nnetImp
load("~/FakharCompetition/models/dataset_ allFactors _gbmFit.RData")
gbmImp<- varImp(gbmFit, scale = FALSE)$importance
as.matrix(gbmImp)->gbmImp
merge(nnetImp,gbmImp,by="row.names",all=TRUE)->nnet_gbm_imp
colnames(nnet_gbm_imp)<-c("variables","nnet","GBM")
as.matrix(nnet_gbm_imp[,c("nnet","GBM")])->imp
rownames(imp)<-nnet_gbm_imp[,"variables"]
###order importances
imp[ order(imp[,1], imp[,2]), ]->imp.order
## the same min-max scaler as above, re-defined for this chunk
normalized<-function(x){
(x-min(x))*0.8/(max(x)-min(x))+0.1
}
###scale the importance scores
apply(imp.order,2,normalized)->imp.order.norm
dotplot(imp.order.norm, groups = FALSE,mar=rep(0, 4),transparent=TRUE,
layout = c(2, 1), aspect = 0.7,
origin = 0, type = c("p", "h"),
xlab = "Variable importance score")