library(randomForest)
library(ggplot2)
library(caret)
library(nnet)
library(devtools);source_gist(5086859)
# Read the raw dataset (semicolon-separated fields).
df <- read.csv2(file = "amex_dataset_for_RF.csv", sep = ";")
### Near-zero-variance metrics: one row per column of df
zv <- nearZeroVar(df, saveMetrics = TRUE)
### Reduced data frame: keep only the columns not flagged as near-zero variance
df <- df[, rownames(zv)[which(!zv$nzv)]]
# Fix the seed so the train/test split below is reproducible.
set.seed(5)
# Row indices for the training partition (p = 0.75), drawn on the outcome `classes`.
inTrain <- createDataPartition(y = df$classes, p = 0.75, list = FALSE)
training <- df[inTrain, ]
testing <- df[-inTrain, ]
# Same training rows without the ID column, so ID is not used as a predictor.
training_reduced <- training[, -match("ID", names(training)), drop = FALSE]
We could apply cross-validation to see how the model's error decreases as more variables are added, and use it to pick the N most important ones. However, since the model fit is poor in any case, I do not think it is necessary to exclude any variables other than those with near-zero variance. Cross-validation also takes an extremely long time to run.
# Fit a random forest on all remaining predictors (ID already removed).
# NOTE(review): this call used to carry `importantce=TRUE`. The name is
# misspelled, so it was swallowed by randomForest()'s `...` and silently
# ignored; permutation importance was never computed. The no-op argument is
# removed so the call states what it actually does. Turning on
# `importance = TRUE` here would add columns to fitRf$importance, so any code
# that ranks variables from that matrix must then select a single column.
fitRf <- randomForest(classes ~ ., data = training_reduced)
### To print the confusion matrix stored on the fit: fitRf$confusion
pred <- predict(fitRf, testing)
## table(pred, testing$classes)
# Test error: share of test rows whose prediction differs from `classes`.
err_rate <- mean(pred != testing$classes)
err_rate
## [1] 0.688878
At this point we re-run the Random Forest, keeping only the 15 most important variables
# Rank predictors by node-impurity importance (type = 2) and keep the top 15.
# importance(type = 2) always returns a one-column matrix, so the ranking is
# well defined whether or not fitRf was grown with permutation importance.
# Ordering the raw fitRf$importance matrix is only valid while it has a single
# column. head() also copes with fewer than 15 predictors, where `[1:15]`
# would produce NA names.
imp <- randomForest::importance(fitRf, type = 2)
top_vars <- head(rownames(imp)[order(imp[, 1], decreasing = TRUE)], 15)
finalcols <- c(top_vars, "classes")
trainingfinalcols <- training[, finalcols]
# Refit the forest on the selected predictors only.
fitRfv2 <- randomForest(classes ~ ., data = trainingfinalcols, importance = TRUE)
pred2 <- predict(fitRfv2, testing)
# Test error of the reduced forest.
err_rate2 <- mean(pred2 != testing$classes)
err_rate2
## [1] 0.694221
At this point we run a multinomial logistic regression using the 15 most important variables according to the Random Forest algorithm
# Multinomial logistic regression of `classes` on the selected predictors.
# NOTE(review): the recorded trace below ends with "stopped after 100
# iterations", which suggests the optimiser hit its iteration cap rather than
# converging; a larger `maxit` may be needed. TODO confirm.
fitMn <- multinom(classes ~ ., data = trainingfinalcols)
## # weights: 792 (715 variable)
## initial value 364200.343127
## iter 10 value 296174.926278
## iter 20 value 292062.128324
## iter 30 value 290808.879651
## iter 40 value 287724.220528
## iter 50 value 281827.352090
## iter 60 value 273863.932389
## iter 70 value 270609.642540
## iter 80 value 269533.805365
## iter 90 value 268838.502040
## iter 100 value 268054.679106
## final value 268054.679106
## stopped after 100 iterations
# Predicted class for every held-out row.
predictedemotion <- predict(fitMn, newdata = testing)
# Test error: number of rows whose prediction is not the true class, over all test rows.
err_rate_mn <- length(predictedemotion[predictedemotion != testing$classes]) / nrow(testing)
err_rate_mn
## [1] 0.6969027
# Plot the fitted neural network. plot.nnet() is presumably the function
# loaded by source_gist(5086859) at the top of the file; TODO confirm.
# NOTE(review): `nn` is not defined anywhere in the visible part of this
# script. The step that fits the neural network (and computes `err_rate_nn`,
# printed below) appears to be missing; confirm before running.
plot.nnet(nn)
## Loading required package: scales
# Side-by-side summary of the test-set error rates of each model.
## Full RF
err_rate
## [1] 0.688878
## Reduced RF (15 most important variables)
err_rate2
## [1] 0.694221
## Reduced MN Regression
err_rate_mn
## [1] 0.6969027
## Reduced NN
# NOTE(review): `err_rate_nn` is never assigned in the visible script.
err_rate_nn
## [1] 0.6906999