library(randomForest)
library(ggplot2)
library(caret)
library(nnet)
df<-read.csv2("amex_dataset_for_RF.csv", sep=";")
### Matrix of Zero Variance
zv<-nearZeroVar(df,saveMetrics=TRUE)
### Reduced Data Frame
df<-df[,rownames(subset(zv, nzv==FALSE))]
set.seed(5)
inTrain <- createDataPartition(y=df$classes, p=0.75, list=FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]
training_reduced<-subset(training, select=-ID)
We could apply Cross Vildation to see how the error of the model is reduced by adding more variables and to pick the N more important. However since the fitting is very low I do not think that it is necessary to exclude more variables apart from those with low variance. Also the Cross Validation takes extremely much time
fitRf <- randomForest(classes ~ ., data=training_reduced, importantce=TRUE)
###In case we want to print the Confusion Matrix the command is: fitRf$confusion
pred <- predict(fitRf, testing)
table(pred, testing$classes)
##
## pred achievement anxiety celebration curiosity desirability
## achievement 459 344 1 39 113
## anxiety 189 1306 2 332 33
## celebration 0 0 0 0 0
## curiosity 164 2109 113 7834 38
## desirability 1 0 0 0 0
## encouragement 99 454 1 315 27
## exclusivity 1347 1173 48 2117 284
## gratification 19 397 12 726 5
## gratitude 0 1 0 0 0
## guilt 0 0 0 0 0
## relief 21 18 0 12 8
## urgency 20 9 0 3 6
##
## pred encouragement exclusivity gratification gratitude guilt
## achievement 231 274 20 10 12
## anxiety 480 267 267 350 5
## celebration 0 0 0 0 0
## curiosity 1872 5646 4793 181 91
## desirability 1 1 0 0 0
## encouragement 1005 360 188 184 7
## exclusivity 1796 3456 1203 104 166
## gratification 304 632 1100 19 17
## gratitude 1 0 0 5 0
## guilt 0 0 1 0 0
## relief 18 23 6 2 1
## urgency 10 9 8 1 3
##
## pred relief urgency
## achievement 86 259
## anxiety 40 77
## celebration 0 0
## curiosity 488 305
## desirability 0 0
## encouragement 91 118
## exclusivity 1018 824
## gratification 106 55
## gratitude 1 0
## guilt 0 0
## relief 21 14
## urgency 5 12
err_rate <- length(pred[!pred==testing$classes])/nrow(testing)
err_rate
## [1] 0.688878
At this point we run again the Random Forest by keeping the 15 most important variables
finalcols<-c(rownames(as.data.frame((fitRf$importance[order(fitRf$importance, decreasing=TRUE),][1:15]))),"classes")
trainingfinalcols <- training[, finalcols]
fitRfv2 <- randomForest(classes ~ ., data=trainingfinalcols, importance=TRUE)
## Warning in matrix(rfout$nodepred, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
## Warning in matrix(rfout$nodepred, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
pred2 <- predict(fitRfv2, testing)
err_rate2 <- length(pred2[!pred2==testing$classes])/nrow(testing)
err_rate2
## [1] 0.694221
At this point we run a multinomial logistic regression using the 15 most important variables according to Random Forest algorithm
fitMn<-multinom(classes~., data=trainingfinalcols)
## # weights: 792 (715 variable)
## initial value 364200.343127
## iter 10 value 296174.926278
## iter 20 value 292062.128324
## iter 30 value 290808.879651
## iter 40 value 287724.220528
## iter 50 value 281827.352090
## iter 60 value 273863.932389
## iter 70 value 270609.642540
## iter 80 value 269533.805365
## iter 90 value 268838.502040
## iter 100 value 268054.679106
## final value 268054.679106
## stopped after 100 iterations
predMn<-predict(fitMn, newdata=testing, "probs")
predictedemotion<-rep(c("Emotion"), length(predMn[,1]))
for (i in 1:length(predMn[,1])) {
predictedemotion[i]<-names(which.max(predMn[i,]))
}
err_rate_mn<-length(predictedemotion[!predictedemotion==testing$classes])/nrow(testing)
err_rate_mn
## [1] 0.6969027