Packages

library(randomForest)
library(ggplot2)
library(caret)
library(nnet)
df<-read.csv2("amex_dataset_for_RF.csv", sep=";")

Exclude the Variables of low variance

### Near-zero-variance diagnostics for every column
zv<-nearZeroVar(df,saveMetrics=TRUE)
### Keep only the columns that are NOT flagged as near zero variance
df<-df[,rownames(subset(zv, nzv==FALSE))]
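
For reference, the flagged columns can be inspected before they are dropped; the nzv column of the metrics data frame marks them:

### Show the variables flagged as near zero variance
subset(zv, nzv==TRUE)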

Create the Training and the Testing set

set.seed(5)
inTrain <- createDataPartition(y=df$classes, p=0.75, list=FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]
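
As a quick sanity check: createDataPartition samples within each class, so the class proportions should be nearly identical in the two partitions.

### Compare class proportions between the two partitions
round(cbind(train = prop.table(table(training$classes)),
            test  = prop.table(table(testing$classes))), 3)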

Exclude the ID column from the training set

training_reduced<-subset(training, select=-ID)

We could apply Cross Validation to see how the model error drops as more variables are added, and use that to pick the N most important ones. However, since the fitting time is already low, I do not think it is necessary to exclude more variables beyond those with low variance. Moreover, the Cross Validation would be extremely time-consuming; a sketch of such a check is shown below.
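
For reference, randomForest provides rfcv(), which estimates the cross-validated error as the number of predictors is reduced step by step. A minimal sketch, left commented out because of the runtime:

### Not run (very slow on a data set of this size):
### 5-fold cross-validated error vs. number of predictors
# cvfit <- rfcv(trainx = subset(training_reduced, select = -classes),
#               trainy = training_reduced$classes, cv.fold = 5)
# with(cvfit, plot(n.var, error.cv, type = "b", log = "x"))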

fitRf <- randomForest(classes ~ ., data=training_reduced, importance=TRUE)
### To print the OOB confusion matrix: fitRf$confusion
pred <- predict(fitRf, testing)
table(pred, testing$classes)
##                
## pred            achievement anxiety celebration curiosity desirability
##   achievement           459     344           1        39          113
##   anxiety               189    1306           2       332           33
##   celebration             0       0           0         0            0
##   curiosity             164    2109         113      7834           38
##   desirability            1       0           0         0            0
##   encouragement          99     454           1       315           27
##   exclusivity          1347    1173          48      2117          284
##   gratification          19     397          12       726            5
##   gratitude               0       1           0         0            0
##   guilt                   0       0           0         0            0
##   relief                 21      18           0        12            8
##   urgency                20       9           0         3            6
##                
## pred            encouragement exclusivity gratification gratitude guilt
##   achievement             231         274            20        10    12
##   anxiety                 480         267           267       350     5
##   celebration               0           0             0         0     0
##   curiosity              1872        5646          4793       181    91
##   desirability              1           1             0         0     0
##   encouragement          1005         360           188       184     7
##   exclusivity            1796        3456          1203       104   166
##   gratification           304         632          1100        19    17
##   gratitude                 1           0             0         5     0
##   guilt                     0           0             1         0     0
##   relief                   18          23             6         2     1
##   urgency                  10           9             8         1     3
##                
## pred            relief urgency
##   achievement       86     259
##   anxiety           40      77
##   celebration        0       0
##   curiosity        488     305
##   desirability       0       0
##   encouragement     91     118
##   exclusivity     1018     824
##   gratification    106      55
##   gratitude          1       0
##   guilt              0       0
##   relief            21      14
##   urgency            5      12
### Misclassification rate on the testing set
err_rate <- mean(pred != testing$classes)
err_rate
## [1] 0.688878
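
caret, which is already loaded, can produce the same cross-tabulation together with the overall accuracy and per-class statistics; a short sketch:

### Full summary: accuracy, kappa, per-class sensitivity/specificity
cm <- confusionMatrix(pred, testing$classes)
cm$overall["Accuracy"]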

Random Forest with the 15 most Important Variables

At this point we run the Random Forest again, keeping only the 15 most important variables.
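
The ranking can also be inspected visually with randomForest's varImpPlot() before trimming:

### Dot chart of the top 15 variables by importance
varImpPlot(fitRf, n.var = 15, main = "Top 15 variables")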

### Rank the variables by Mean Decrease in Gini and keep the top 15
imp <- importance(fitRf, type=2)
finalcols <- c(rownames(imp)[order(imp, decreasing=TRUE)][1:15], "classes")
trainingfinalcols <- training[, finalcols]
fitRfv2 <- randomForest(classes ~ ., data=trainingfinalcols, importance=TRUE)
## Warning in matrix(rfout$nodepred, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
## Warning in matrix(rfout$nodepred, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
pred2 <- predict(fitRfv2, testing)
err_rate2 <- mean(pred2 != testing$classes)
err_rate2
## [1] 0.694221

Multinomial Logistic Regression

At this point we run a multinomial logistic regression using the 15 most important variables according to the Random Forest model.

fitMn<-multinom(classes~., data=trainingfinalcols)
## # weights:  792 (715 variable)
## initial  value 364200.343127 
## iter  10 value 296174.926278
## iter  20 value 292062.128324
## iter  30 value 290808.879651
## iter  40 value 287724.220528
## iter  50 value 281827.352090
## iter  60 value 273863.932389
## iter  70 value 270609.642540
## iter  80 value 269533.805365
## iter  90 value 268838.502040
## iter 100 value 268054.679106
## final  value 268054.679106 
## stopped after 100 iterations
predMn<-predict(fitMn, newdata=testing, "probs")
### For each row, pick the class with the highest predicted probability
predictedemotion <- colnames(predMn)[apply(predMn, 1, which.max)]

err_rate_mn <- mean(predictedemotion != testing$classes)
err_rate_mn
## [1] 0.6969027
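
Note that the trace above shows the optimizer stopped at nnet's default cap of 100 iterations without formally converging. multinom() accepts a maxit argument if one wants to let it run further; a sketch with an arbitrarily chosen cap, not run here:

### Raise the iteration cap; 300 is an arbitrary choice
# fitMnv2 <- multinom(classes ~ ., data = trainingfinalcols, maxit = 300)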

Neural Network with one hidden layer of 10 nodes using the 15 most important variables

### Build the formula explicitly from the column names
n <- names(trainingfinalcols)
f <- as.formula(paste("classes ~", paste(n[!n %in% "classes"], collapse = " + ")))
nn <- nnet(f, data=trainingfinalcols, size=10)
## # weights:  782
## initial  value 426176.821152 
## iter  10 value 284338.028695
## iter  20 value 269721.572255
## iter  30 value 264571.399311
## iter  40 value 262947.218010
## iter  50 value 261665.415234
## iter  60 value 261149.919289
## iter  70 value 260672.173756
## iter  80 value 260456.838101
## iter  90 value 260271.668491
## iter 100 value 260014.790434
## final  value 260014.790434 
## stopped after 100 iterations
predNN<-predict(nn, testing, type="class")
err_rate_nn <- mean(predNN != testing$classes)
err_rate_nn
## [1] 0.6906999
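
For convenience, the four test-set error rates computed above can be collected side by side:

### Misclassification rates of all fitted models on the testing set
data.frame(model = c("RF, all variables", "RF, top 15",
                     "Multinomial LR", "NNet, 10 hidden nodes"),
           error = c(err_rate, err_rate2, err_rate_mn, err_rate_nn))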