Classification

Packages

library(randomForest)
library(ggplot2)
library(caret)
library(nnet)

### Load the semicolon-separated data set.
### Text columns are converted to factors explicitly: R >= 4.0 no longer does
### this by default, and `classes` is used as a categorical response later on.
df <- read.csv2("amex_dataset_for_RF.csv", sep = ";", stringsAsFactors = TRUE)

Exclude variables with near-zero variance

### Per-column near-zero-variance metrics (one row per column of df)
zv <- nearZeroVar(df, saveMetrics = TRUE)
### Reduced data frame: keep only the columns not flagged as near-zero variance
df <- df[, rownames(zv)[which(!zv$nzv)]]

Create the Training and the Testing set

### Fix the random seed so that the split below is reproducible
set.seed(5)
### Row indices of the training sample (75% of the rows, chosen on `classes`)
inTrain <- createDataPartition(y = df$classes, p = 0.75, list = FALSE)
### Selected rows form the training set; all remaining rows form the testing set
training <- df[inTrain, ]
testing <- df[-inTrain, ]

Exclude the ID column from the training set

### Copy of the training set without the ID column
training_reduced <- subset(x = training, select = -c(ID))

We could apply cross-validation to see how the model error decreases as more variables are added, and then pick the N most important ones. However, since the quality of the fit is very low, I do not think it is necessary to exclude any variables other than those with low variance. Cross-validation would also take an extremely long time.

### Fit a random forest on all predictors of the reduced training set.
### The original call passed a misspelled `importantce=TRUE`; randomForest()
### silently ignored it, so only the default single-column importance measure
### was ever stored. The stray argument is removed rather than corrected:
### importance=TRUE changes the shape of fitRf$importance (one column per class
### plus two summary measures), which any code reading it must then handle.
fitRf <- randomForest(classes ~ ., data = training_reduced)
###In case we want to print the Confusion Matrix the command is: fitRf$confusion
### Predict the testing rows and cross-tabulate predicted vs. actual classes
pred <- predict(fitRf, newdata = testing)
table(pred, testing$classes)

##                
## pred            achievement anxiety celebration curiosity desirability
##   achievement            61      47           0        27            7
##   anxiety               295    1587           3       534           97
##   celebration             0       0           0         0            0
##   curiosity             751    2166         114      6852          151
##   desirability            1       1           0         0            0
##   encouragement         124     230           0       177           32
##   exclusivity           985    1368          50      3158          208
##   gratification          83     396          10       612           15
##   gratitude               0       0           0         0            0
##   guilt                   0       0           0         0            0
##   relief                 10      13           0        15            2
##   urgency                 9       3           0         3            2
##                
## pred            encouragement exclusivity gratification gratitude guilt
##   achievement              49          54            19         2     3
##   anxiety                 984         460           369       474     5
##   celebration               0           0             0         0     0
##   curiosity              2148        5878          4244       155   132
##   desirability              0           1             0         0     0
##   encouragement           405         217           133        85    10
##   exclusivity            1775        3468          1941       121   130
##   gratification           341         563           859        16    20
##   gratitude                 0           0             0         0     0
##   guilt                     0           0             1         0     0
##   relief                   11          24            16         3     0
##   urgency                   5           3             4         0     2
##                
## pred            relief urgency
##   achievement       19      28
##   anxiety           24     141
##   celebration        0       0
##   curiosity        759     653
##   desirability       0       0
##   encouragement     70     123
##   exclusivity      812     617
##   gratification    155      91
##   gratitude          1       0
##   guilt              0       0
##   relief            15       6
##   urgency            1       5

### Share of testing rows whose predicted class differs from the actual class.
### Rows where the comparison is NA are counted as errors.
err_rate <- local({
  wrong <- pred != testing$classes
  sum(wrong | is.na(wrong)) / nrow(testing)
})
err_rate

## [1] 0.728715

At this point we run the Random Forest again, keeping only the 20 most important variables

### Names of the 20 most important predictors (by mean decrease in Gini
### impurity) plus the response column.
### The measure is selected by column name, so the ranking keeps working if the
### first forest stores several importance columns; head() avoids NA names when
### there are fewer than 20 predictors.
finalcols <- local({
  gini <- fitRf$importance[, "MeanDecreaseGini", drop = FALSE]
  ranked <- rownames(gini)[order(gini[, 1], decreasing = TRUE)]
  c(head(ranked, 20), "classes")
})
trainingfinalcols <- training[, finalcols]
### Refit the random forest on the reduced set of predictors
fitRfv2 <- randomForest(
  classes ~ .,
  data = trainingfinalcols,
  importance = TRUE
)

## Warning in matrix(rfout$xbestsplit, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)

## Warning in matrix(rfout$xbestsplit, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)

### Predict the testing rows with the reduced forest
pred2 <- predict(fitRfv2, newdata = testing)
### Share of testing rows predicted incorrectly (NA comparisons count as errors)
err_rate2 <- local({
  wrong <- pred2 != testing$classes
  sum(wrong | is.na(wrong)) / nrow(testing)
})
err_rate2

## [1] 0.7318266

### Multinomial log-linear model (nnet) fitted on the reduced training set
### NOTE(review): the fit log reports "stopped after 100 iterations", which
### suggests the optimizer hit its iteration limit -- consider a larger `maxit`.
fitMn <- multinom(formula = classes ~ ., data = trainingfinalcols)

## # weights:  996 (902 variable)
## initial  value 364200.343127 
## iter  10 value 306379.163319
## iter  20 value 303835.307931
## iter  30 value 302712.318325
## iter  40 value 299391.660766
## iter  50 value 294303.664516
## iter  60 value 290066.865872
## iter  70 value 288430.439749
## iter  80 value 287844.434053
## iter  90 value 287160.053916
## iter 100 value 285889.894921
## final  value 285889.894921 
## stopped after 100 iterations

### Class probabilities for every testing row (columns are named by class)
predMn <- predict(fitMn, newdata = testing, type = "probs")
### Most probable class per row; which.max() keeps the first maximum on ties.
### vapply() over seq_len() replaces the 1:length() loop: it is safe for zero
### rows and guarantees exactly one class name per row.
predictedemotion <- vapply(
  seq_len(nrow(predMn)),
  function(i) names(which.max(predMn[i, ])),
  character(1)
)

### Share of testing rows predicted incorrectly (NA comparisons count as errors)
err_rate_mn <- local({
  wrong <- predictedemotion != testing$classes
  sum(wrong | is.na(wrong)) / nrow(testing)
})
err_rate_mn

## [1] 0.7353477