Packages

library(randomForest)
library(ggplot2)
library(caret)
library(nnet)
library(devtools)
source_gist(5086859)  # loads the plot.nnet() function used in the NN Plot section
df <- read.csv2("amex_dataset_for_RF.csv", sep = ";")

Exclude the Variables with Low Variance

### Near-zero-variance metrics for every column
zv <- nearZeroVar(df, saveMetrics = TRUE)
### Keep only the columns that are not near zero variance
df <- df[, rownames(subset(zv, nzv == FALSE))]
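
A quick optional check: the same metrics table can list exactly which near-zero-variance columns were dropped.

### Columns flagged as near zero variance (and therefore excluded above)
rownames(subset(zv, nzv == TRUE))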

Create the Training and Testing Sets

set.seed(5)
inTrain <- createDataPartition(y=df$classes, p=0.75, list=FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]
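
Since createDataPartition samples within each level of classes, the class proportions of the two sets should be almost identical; a quick optional check:

### Class proportions should match between training and testing
round(prop.table(table(training$classes)), 3)
round(prop.table(table(testing$classes)), 3)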

Exclude the ID column from the training set

training_reduced <- subset(training, select = -ID)

We could apply cross-validation to see how the model's error decreases as more variables are added, and to pick the N most important ones (a sketch follows below). However, since fitting is already fast, I do not think it is necessary to exclude more variables beyond those with low variance; moreover, the cross-validation itself takes a very long time to run.
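
For reference, a minimal sketch of that cross-validation with randomForest::rfcv, left unevaluated because of the running time; cv.fold = 5 and step = 0.8 are illustrative choices, not tuned values.

### Not run: cross-validated error as the number of predictors is reduced step by step
# cv <- rfcv(trainx = subset(training_reduced, select = -classes),
#            trainy = training_reduced$classes, cv.fold = 5, step = 0.8)
# with(cv, plot(n.var, error.cv, type = "b", log = "x"))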

fitRf <- randomForest(classes ~ ., data = training_reduced, importance = TRUE)
### To print the model's out-of-bag confusion matrix: fitRf$confusion
pred <- predict(fitRf, testing)
### table(pred, testing$classes) cross-tabulates predictions against the actual classes
err_rate <- mean(pred != testing$classes)
err_rate
## [1] 0.688878
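
Beyond the single error rate, caret's confusionMatrix (the package is already loaded) reports overall accuracy, kappa, and per-class sensitivity and specificity on the test set.

### Detailed per-class statistics on the test set
confusionMatrix(pred, testing$classes)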

Random Forest with the 15 Most Important Variables

At this point we rerun the Random Forest, keeping only the 15 most important variables.

### Rank variables by mean decrease in Gini and keep the 15 most important, plus the target
imp <- importance(fitRf, type = 2)
finalcols <- c(names(sort(imp[, 1], decreasing = TRUE))[1:15], "classes")
trainingfinalcols <- training[, finalcols]
fitRfv2 <- randomForest(classes ~ ., data=trainingfinalcols, importance=TRUE)
pred2 <- predict(fitRfv2, testing)
err_rate2 <- mean(pred2 != testing$classes)
err_rate2
## [1] 0.694221
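
As a visual sanity check on the selection above, randomForest's varImpPlot shows the importance ranking directly; type = 2 makes it use the same mean-decrease-in-Gini measure as the selection.

### Plot the ranking used to pick the 15 variables
varImpPlot(fitRf, n.var = 15, type = 2, main = "Top 15 variables by importance")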

Multinomial Logistic Regression

Next we fit a multinomial logistic regression using the 15 most important variables according to the Random Forest.

fitMn <- multinom(classes ~ ., data = trainingfinalcols)
## # weights:  792 (715 variable)
## initial  value 364200.343127 
## iter  10 value 296174.926278
## iter  20 value 292062.128324
## iter  30 value 290808.879651
## iter  40 value 287724.220528
## iter  50 value 281827.352090
## iter  60 value 273863.932389
## iter  70 value 270609.642540
## iter  80 value 269533.805365
## iter  90 value 268838.502040
## iter 100 value 268054.679106
## final  value 268054.679106 
## stopped after 100 iterations
predMn <- predict(fitMn, newdata = testing)
err_rate_mn <- mean(predMn != testing$classes)
err_rate_mn
## [1] 0.6969027
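
Note that multinom stopped at its default maxit of 100 without converging. Allowing more iterations might lower this error a little; a sketch, not run here:

### Not run: same model with a higher iteration cap (illustrative value)
# fitMn_long <- multinom(classes ~ ., data = trainingfinalcols, maxit = 500)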

Neural Network with one hidden layer of 10 nodes, using the 15 most important variables

### Build the formula explicitly: classes ~ var1 + var2 + ...
n <- names(trainingfinalcols)
f <- as.formula(paste("classes ~", paste(n[!n %in% "classes"], collapse = " + ")))
nn <- nnet(f, data = trainingfinalcols, size = 10)
## # weights:  782
## initial  value 426176.821152 
## iter  10 value 284338.028695
## iter  20 value 269721.572255
## iter  30 value 264571.399311
## iter  40 value 262947.218010
## iter  50 value 261665.415234
## iter  60 value 261149.919289
## iter  70 value 260672.173756
## iter  80 value 260456.838101
## iter  90 value 260271.668491
## iter 100 value 260014.790434
## final  value 260014.790434 
## stopped after 100 iterations
predNN <- predict(nn, testing, type = "class")
err_rate_nn <- mean(predNN != testing$classes)
err_rate_nn
## [1] 0.6906999
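
The same caveat applies here: nnet also stopped at its default maxit of 100. A sketch of a longer, lightly regularized fit; maxit = 500 and decay = 5e-4 are illustrative values, not tuned:

### Not run: more iterations plus weight decay (illustrative values)
# nn_long <- nnet(f, data = trainingfinalcols, size = 10, maxit = 500, decay = 5e-4)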

NN Plot

plot.nnet(nn)
## Loading required package: scales

To sum up, the error rates of the different models are very close:

### Full RF
err_rate
## [1] 0.688878
### Reduced RF
err_rate2
## [1] 0.694221
### Reduced MN Regression
err_rate_mn
## [1] 0.6969027
### Reduced NN
err_rate_nn
## [1] 0.6906999
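
The same comparison collected in a single table:

### Error rates of all four models side by side
data.frame(model = c("Full RF", "Reduced RF", "Reduced MN Regression", "Reduced NN"),
           error = c(err_rate, err_rate2, err_rate_mn, err_rate_nn))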