library(randomForest)
library(ggplot2)
library(caret)
library(nnet)
# Load the semicolon-separated dataset.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 the default is FALSE,
# which would leave `classes` (and any other text column) as character, while
# randomForest() only performs classification when the response is a factor.
df <- read.csv2("amex_dataset_for_RF.csv", sep = ";", stringsAsFactors = TRUE)

### Matrix of Zero Variance
# One row of metrics per column of df; `nzv` flags near-zero-variance columns.
zv <- nearZeroVar(df, saveMetrics = TRUE)

### Reduced Data Frame
# Keep only the columns that were not flagged. drop = FALSE guarantees the
# result is still a data frame even if a single column survives.
df <- df[, rownames(zv)[which(!zv$nzv)], drop = FALSE]

# Fixed seed so the 75/25 split (stratified on `classes`) is reproducible.
set.seed(5)
inTrain <- createDataPartition(y = df$classes, p = 0.75, list = FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]

# Drop the ID column before modelling so it is not used as a predictor.
training_reduced <- subset(training, select = -ID)
### We could apply cross-validation to see how the model error decreases as
### more variables are added, and use it to pick the N most important ones.
### However, since the fit is very poor, I do not think it is necessary to
### exclude any variables other than those with near-zero variance. Cross-
### validation also takes an extremely long time to run.
# Fit a random forest on all remaining predictors of the training set.
# NOTE: the original call passed `importantce=TRUE`. That misspelled name does
# not match any randomForest() argument and appears to have been silently
# absorbed by `...`, so permutation importance was never computed. It is
# dropped rather than corrected to `importance = TRUE` so that the fit, and
# the shape of fitRf$importance used later in the script, stay exactly as
# they were when the results below were recorded.
fitRf <- randomForest(classes ~ ., data = training_reduced)

### To print the confusion matrix stored in the fit, use: fitRf$confusion

# Predict the held-out rows and cross-tabulate predicted vs. actual classes
# (rows = predicted, columns = actual).
pred <- predict(fitRf, testing)
table(pred, testing$classes)
##
## pred achievement anxiety celebration curiosity desirability
## achievement 61 47 0 27 7
## anxiety 295 1587 3 534 97
## celebration 0 0 0 0 0
## curiosity 751 2166 114 6852 151
## desirability 1 1 0 0 0
## encouragement 124 230 0 177 32
## exclusivity 985 1368 50 3158 208
## gratification 83 396 10 612 15
## gratitude 0 0 0 0 0
## guilt 0 0 0 0 0
## relief 10 13 0 15 2
## urgency 9 3 0 3 2
##
## pred encouragement exclusivity gratification gratitude guilt
## achievement 49 54 19 2 3
## anxiety 984 460 369 474 5
## celebration 0 0 0 0 0
## curiosity 2148 5878 4244 155 132
## desirability 0 1 0 0 0
## encouragement 405 217 133 85 10
## exclusivity 1775 3468 1941 121 130
## gratification 341 563 859 16 20
## gratitude 0 0 0 0 0
## guilt 0 0 1 0 0
## relief 11 24 16 3 0
## urgency 5 3 4 0 2
##
## pred relief urgency
## achievement 19 28
## anxiety 24 141
## celebration 0 0
## curiosity 759 653
## desirability 0 0
## encouragement 70 123
## exclusivity 812 617
## gratification 155 91
## gratitude 1 0
## guilt 0 0
## relief 15 6
## urgency 1 5
# Test-set misclassification rate of the first forest: the number of
# predictions that differ from the true class, divided by the test-set size.
err_rate <- local({
  misclassified <- pred[pred != testing$classes]
  length(misclassified) / nrow(testing)
})
err_rate
## [1] 0.728715
### At this point we fit the Random Forest again, keeping only the 20 most
### important variables from the first fit.
# Select the 20 predictors with the largest mean decrease in Gini impurity in
# the first forest, plus the response column.
# The Gini column is picked by name, so the ranking stays correct even when
# fitRf$importance has several columns (as it does when the forest is fitted
# with importance = TRUE); calling order() on the whole matrix is only valid
# for a single-column matrix. head() caps the selection at the number of
# predictors available, so no NA names appear if there are fewer than 20.
finalcols <- local({
  gini <- fitRf$importance[, "MeanDecreaseGini"]
  names(gini) <- rownames(fitRf$importance)
  ranked_vars <- names(gini)[order(gini, decreasing = TRUE)]
  c(head(ranked_vars, 20), "classes")
})

# Training data restricted to the selected predictors and the response.
trainingfinalcols <- training[, finalcols]

# Second forest on the reduced predictor set.
fitRfv2 <- randomForest(classes ~ ., data = trainingfinalcols, importance = TRUE)
## Warning in matrix(rfout$xbestsplit, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
## Warning in matrix(rfout$xbestsplit, ncol = ntree): Reached total allocation
## of 8082Mb: see help(memory.size)
# Score the reduced forest on the held-out rows.
pred2 <- predict(fitRfv2, testing)

# Share of test rows whose predicted class differs from the true class.
err_rate2 <- local({
  misclassified <- pred2[pred2 != testing$classes]
  length(misclassified) / nrow(testing)
})
err_rate2
## [1] 0.7318266
# Multinomial log-linear model on the same reduced training set, used as a
# comparison against the random forests.
# NOTE(review): the recorded run below stopped at the 100-iteration limit, so
# the fit may not have converged - consider raising `maxit` and re-running.
fitMn <- multinom(classes ~ ., data = trainingfinalcols)
## # weights: 996 (902 variable)
## initial value 364200.343127
## iter 10 value 306379.163319
## iter 20 value 303835.307931
## iter 30 value 302712.318325
## iter 40 value 299391.660766
## iter 50 value 294303.664516
## iter 60 value 290066.865872
## iter 70 value 288430.439749
## iter 80 value 287844.434053
## iter 90 value 287160.053916
## iter 100 value 285889.894921
## final value 285889.894921
## stopped after 100 iterations
# Class-probability matrix for the held-out rows: one row per test
# observation, one column per class.
predMn <- predict(fitMn, newdata = testing, type = "probs")

# Predicted class = name of the column holding the largest probability in each
# row (first column wins on exact ties, as with which.max). This replaces the
# element-wise `for (i in 1:length(...))` loop, which was slow and would
# iterate over c(1, 0) on an empty prediction set.
predictedemotion <- colnames(predMn)[apply(predMn, 1, which.max)]

# Share of test rows whose predicted class differs from the true class.
err_rate_mn <- length(predictedemotion[!predictedemotion == testing$classes]) / nrow(testing)
err_rate_mn
## [1] 0.7353477