Project Directions

You are given three datasets: Left-Right (CSV file), Pima Indians Diabetes, and Iris.

You are to run the CSV dataset, plus one of the two remaining datasets, through a random forest model.

The performance of each of the two models should be improved as well.

To do:

First dataset: Left-Right

This dataset was provided as the Left-Right CSV file (leftright.csv).

Import libraries

library(randomForest)
library(mlbench)
library(RCurl)
library(caret)
library(rpart)
library(pROC)

Access dataset

# Read the Left-Right data; the path is relative to the working directory.
kdata <- read.csv("leftright.csv")
# Inspect column types: Left is read as an integer 0/1 column at this point.
str(kdata)
## 'data.frame':    10 obs. of  4 variables:
##  $ Left : int  1 0 1 0 0 0 0 1 1 0
##  $ Right: int  45 0 92 18 26 48 41 52 64 80
##  $ Up   : int  24 26 32 41 80 76 92 39 46 50
##  $ Down : int  100 69 46 24 0 32 86 71 65 48
# Number of rows and columns.
dim(kdata)
## [1] 10  4

Random Forest

# Fix the RNG seed so the forest below is reproducible.
set.seed(1000)
# Left is still an integer here, so randomForest() fits a regression forest
# (see "Type of random forest" in the printed summary) rather than a
# classifier.
ranF <- randomForest(Left~.,data=kdata)
# Print the fitted model summary.
ranF
## 
## Call:
##  randomForest(formula = Left ~ ., data = kdata) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.2804608
##                     % Var explained: -16.86

Set Left as a factor and refit the random forest (classification instead of regression)

# Convert the 0/1 response to a factor so randomForest() treats the problem
# as classification.
kdata$Left <- as.factor(kdata$Left)
# Confirm that Left is now a two-level factor.
str(kdata)
## 'data.frame':    10 obs. of  4 variables:
##  $ Left : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 2 2 1
##  $ Right: int  45 0 92 18 26 48 41 52 64 80
##  $ Up   : int  24 26 32 41 80 76 92 39 46 50
##  $ Down : int  100 69 46 24 0 32 86 71 65 48
# Same seed as the regression fit, so the two runs are comparable.
set.seed(1000)
# Refit with default settings; ranF is overwritten with the classifier.
ranF <- randomForest(Left~.,data=kdata)
# Print the model summary, including the OOB error rate and confusion matrix.
ranF
## 
## Call:
##  randomForest(formula = Left ~ ., data = kdata) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 70%
## Confusion matrix:
##   0 1 class.error
## 0 3 3         0.5
## 1 4 0         1.0

Evaluation: cross validation

# 3-fold cross-validation. With only 10 rows, each held-out fold contains
# just 3-4 observations, so the resampled accuracy is very noisy.
# NOTE(review): no set.seed() immediately precedes train(), so the fold
# assignment depends on the RNG state left by earlier chunks.
control <- trainControl(method = "cv", number = 3)
# A single candidate value (mtry = 3, i.e. all three predictors), so caret
# evaluates this setting rather than searching over alternatives.
grid_rf <- expand.grid(mtry=3)
# Fit the random forest through caret with the resampling scheme above.
kranF <- train(Left~., data=kdata, method = "rf", importance=TRUE, 
              trControl=control, tuneGrid = grid_rf)
# Print the cross-validated accuracy and kappa.
kranF
## Random Forest 
## 
## 10 samples
##  3 predictor
##  2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 7, 6, 7 
## Resampling results:
## 
##   Accuracy   Kappa        
##   0.5277778  -3.700743e-17
## 
## Tuning parameter 'mtry' was held constant at a value of 3

Build model with mtry value

# Reset the seed so this fit is comparable with the earlier forests.
set.seed(1000)
# Refit using all three predictors at every split (mtry = 3) and keep the
# permutation-based importance measures (importance = TRUE).
new_rf <-randomForest(Left~.,data=kdata, mtry=3, importance=TRUE)
# Print the model summary, including the OOB error rate and confusion matrix.
new_rf
## 
## Call:
##  randomForest(formula = Left ~ ., data = kdata, mtry = 3, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 50%
## Confusion matrix:
##   0 1 class.error
## 0 4 2   0.3333333
## 1 3 1   0.7500000
# Evaluate variable importance. Because the model was fit with
# importance = TRUE, the table has per-class and overall mean decrease in
# accuracy as well as mean decrease in Gini.
importance(new_rf)
##               0         1 MeanDecreaseAccuracy MeanDecreaseGini
## Right  1.603902  2.312356             2.066167        1.9172254
## Up     2.170799  3.508703             3.380443        1.6896095
## Down  -4.228080 -3.364658            -4.204520        0.7247651
# Plot the same importance measures.
varImpPlot(new_rf)

The higher the mean decrease in accuracy or the mean decrease in Gini, the more important the variable is to the model. In the output and plots above, Right and Up are the important variables; Down has a negative mean decrease in accuracy, so it does not help the model.

Prediction and performance metrics

library(ROCR)
# Class probabilities from the fitted forest. With no `newdata` argument,
# predict() on a randomForest object returns the out-of-bag (OOB)
# predictions rather than resubstitution scores.
pred1 <- predict(new_rf, type = "prob")
# Pair the probability of class "1" (second column) with the true labels.
# Note: despite its name, `perf` holds a ROCR prediction object.
perf <- prediction(pred1[, 2], kdata$Left)
# 1. Area under curve
auc <- performance(perf, "auc")
auc
## A performance instance
##   'Area under the ROC curve'
# Printing the performance object only shows its label; the numeric AUC is
# stored in the y.values slot, so extract it explicitly.
auc@y.values[[1]]
# 2. True positive rate against false positive rate (the ROC coordinates)
pred3 <- performance(perf, "tpr", "fpr")
# 3. Plot the ROC curve, with the diagonal as the random-guess reference
plot(pred3, main = "ROC Curve for Random Forest", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 2, col = "black")

Conclusion

  • With mtry = 3 the out-of-bag (OOB) error rate is 50%, down from 70% with the default mtry = 1.
  • AUC provides an aggregate measure of performance across all possible classification thresholds. It can be interpreted as the probability that the model ranks a random positive example more highly than a random negative example.
  • This ROC curve has an AUC of 0.5, meaning it ranks a random positive example higher than a random negative example 50% of the time. The corresponding classification model is basically worthless, as its predictive ability is no better than random guessing.

Second dataset: Pima Indian Diabetes

Access dataset

# Load the Pima Indians Diabetes data set (presumably from the mlbench
# package attached earlier) and keep a working copy in df.
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
# Inspect the structure: eight numeric predictors and a two-level factor
# response, diabetes ("neg"/"pos").
# NOTE(review): the zeros visible in pressure, triceps, insulin and mass are
# likely missing-value placeholders - confirm before modelling.
str(df)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Preview the first rows.
head(df)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

Split the data

# Store the row indices for a single 70% partition, drawn on the diabetes
# response. list = FALSE returns the indices as a matrix rather than a list.
# NOTE(review): no set.seed() immediately precedes this call, so the split
# depends on the RNG state left by earlier chunks.
partition <- caret::createDataPartition(y = df$diabetes, times = 1, p = 0.7, list = FALSE)

# Create the training data set from the sampled rows.
train_set <- df[partition,]

# Create the testing data set: negative indexing drops the partition rows,
# leaving the remaining ~30% of the data.
test_set <- df[-partition,]

# Check the size and structure of the training set.
str(train_set)
## 'data.frame':    538 obs. of  9 variables:
##  $ pregnant: num  8 0 5 3 8 4 10 10 1 7 ...
##  $ glucose : num  183 137 116 78 125 110 168 139 189 100 ...
##  $ pressure: num  64 40 74 50 96 92 74 80 60 0 ...
##  $ triceps : num  0 35 0 32 0 0 0 0 23 0 ...
##  $ insulin : num  0 168 0 88 0 0 0 0 846 0 ...
##  $ mass    : num  23.3 43.1 25.6 31 0 37.6 38 27.1 30.1 30 ...
##  $ pedigree: num  0.672 2.288 0.201 0.248 0.232 ...
##  $ age     : num  32 33 30 26 54 30 34 57 59 32 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 1 2 1 2 2 ...

Random forest

# Tune a ranger random forest with 10-fold cross-validation, selecting the
# model by ROC AUC. Predictors are centred, scaled and transformed with PCA
# before fitting. classProbs and twoClassSummary are required so caret can
# compute ROC, sensitivity and specificity during resampling.
model_forest <- caret::train(
  diabetes ~ .,
  data = train_set,
  method = "ranger",
  metric = "ROC",
  trControl = trainControl(
    method = "cv",
    number = 10,
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  ),
  preProcess = c("center", "scale", "pca")
)
# Print the resampling results for every tuning combination.
model_forest
## Random Forest 
## 
## 538 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## Pre-processing: centered (8), scaled (8), principal component signal
##  extraction (8) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 485, 484, 484, 484, 484, 484, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec     
##   2     gini        0.8161571  0.8571429  0.5967836
##   2     extratrees  0.8232373  0.8771429  0.5330409
##   4     gini        0.8152632  0.8428571  0.6020468
##   4     extratrees  0.8153175  0.8514286  0.5593567
##   7     gini        0.8127068  0.8371429  0.5859649
##   7     extratrees  0.8170844  0.8485714  0.5862573
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 1.
# Cross-validated ROC of the selected tuning combination. caret picked the
# model with the largest ROC, so look the value up by column name instead of
# by a hard-coded [6, 4] position, which points at the last grid row
# (mtry = 7, extratrees) rather than the chosen model.
max(model_forest$results$ROC)
## [1] 0.8232373

Prediction

# Predict class labels for the held-out test set.
pred_rf <- predict(model_forest, test_set)
# Confusion matrix of predicted vs. actual labels, treating "pos" as the
# positive class for sensitivity, specificity and related statistics.
cm_rf <- confusionMatrix(pred_rf, test_set$diabetes, positive="pos")

# Predicted class probabilities for the test set (one column per class).
pred_prob_rf <- predict(model_forest, test_set, type="prob")

# ROC object built from the true labels and the predicted probability of
# the "pos" class.
roc_rf <- roc(test_set$diabetes, pred_prob_rf$pos)

# Print the confusion matrix and statistics for the random forest model.
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 131  34
##        pos  19  46
##                                           
##                Accuracy : 0.7696          
##                  95% CI : (0.7097, 0.8224)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 7.748e-05       
##                                           
##                   Kappa : 0.4688          
##                                           
##  Mcnemar's Test P-Value : 0.05447         
##                                           
##             Sensitivity : 0.5750          
##             Specificity : 0.8733          
##          Pos Pred Value : 0.7077          
##          Neg Pred Value : 0.7939          
##              Prevalence : 0.3478          
##          Detection Rate : 0.2000          
##    Detection Prevalence : 0.2826          
##       Balanced Accuracy : 0.7242          
##                                           
##        'Positive' Class : pos             
## 
# Print the ROC object for the random forest, including the test-set AUC.
roc_rf
## 
## Call:
## roc.default(response = test_set$diabetes, predictor = pred_prob_rf$pos)
## 
## Data: pred_prob_rf$pos in 150 controls (test_set$diabetes neg) < 80 cases (test_set$diabetes pos).
## Area under the curve: 0.8399
# AUC - area under the curve, recomputed with caTools as a cross-check;
# plotROC = TRUE also draws the ROC curve.
caTools::colAUC(pred_prob_rf$pos, test_set$diabetes, plotROC = TRUE)

##                  [,1]
## neg vs. pos 0.8399167

Test result

# Collect the headline test-set metrics in one named numeric vector.
# The five confusion-matrix statistics are pulled from byClass by name in a
# single lookup. The AUC is named explicitly so that every element of the
# printed vector is labelled, and as.numeric() leaves it as a plain number.
result_rf <- c(
  cm_rf$byClass[c("Sensitivity", "Specificity", "Precision", "Recall", "F1")],
  AUC = as.numeric(roc_rf$auc)
)

result_rf
## Sensitivity Specificity   Precision      Recall          F1         AUC 
##   0.5750000   0.8733333   0.7076923   0.5750000   0.6344828   0.8399167

Conclusion

  • Random forest accuracy on the test set is 77% (balanced accuracy 72%, AUC 0.84). Another model, such as logistic regression or recursive partitioning and regression trees (rpart), might perform better.

Third Dataset: Iris

# Load iris and randomly label every row as group 1 (about 70%, training)
# or group 2 (about 30%, testing). The draw uses the current RNG state.
data("iris")
dfI <- sample(x = 2, size = nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
# Keep the rows of each group as separate data frames.
trainData <- subset(iris, dfI == 1)
testData <- subset(iris, dfI == 2)
# Inspect the group labels.
str(dfI)
##  int [1:150] 1 1 2 1 1 2 2 1 1 1 ...
head(dfI)
## [1] 1 1 2 1 1 2

Generate random forest learning tree

# Fit a 100-tree classification forest on the training rows, also computing
# the proximity matrix between observations.
iris_rf <- randomForest(Species~.,data=trainData,ntree=100,proximity=TRUE)
# predict() without newdata returns the out-of-bag predictions, so this
# table matches the OOB confusion matrix printed further down.
table(predict(iris_rf),trainData$Species)
##             
##              setosa versicolor virginica
##   setosa         37          0         0
##   versicolor      0         31         2
##   virginica       0          2        32

# Print the model summary, including the OOB error rate.
print(iris_rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = trainData, ntree = 100,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 3.85%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         37          0         0  0.00000000
## versicolor      0         31         2  0.06060606
## virginica       0          2        32  0.05882353
# Plot the error rates against the number of trees.
plot(iris_rf)

Determine important variables

# Variable importance for the iris forest. Only MeanDecreaseGini is reported
# because the model was fit without importance = TRUE.
importance(iris_rf)
##              MeanDecreaseGini
## Sepal.Length         6.369843
## Sepal.Width          1.076268
## Petal.Length        31.467502
## Petal.Width         29.623758
# Plot the same importance measure.
varImpPlot(iris_rf)

Predict on the testing data with the random forest

# Predict species for the held-out test rows with the fitted forest.
irisPr<-predict(iris_rf,newdata=testData)
# Cross-tabulate predicted (rows) against actual (columns) species.
table(irisPr, testData$Species)
##             
## irisPr       setosa versicolor virginica
##   setosa         13          0         0
##   versicolor      0         16         1
##   virginica       0          1        15

Tune Random Forest

# Search for the mtry value with the lowest OOB error, using the four
# predictor columns as x and Species (column 5) as y.
# NOTE(review): this tunes on the full iris data, not trainData, so the
# test rows used above also influence the tuning - confirm this is intended.
# NOTE(review): stepFactor = 0.5 is below 1, which is why the "left" search
# in the log below moves to a larger mtry (4) and the "right" search to a
# smaller one (1).
tune.rf <- tuneRF(iris[,-5],iris[,5], stepFactor=0.5)
## mtry = 2  OOB error = 5.33% 
## Searching left ...
## mtry = 4     OOB error = 5.33% 
## 0 0.05 
## Searching right ...
## mtry = 1     OOB error = 6.67% 
## -0.25 0.05

# Print the mtry values tried and their OOB error.
print(tune.rf)
##       mtry   OOBError
## 1.OOB    1 0.06666667
## 2.OOB    2 0.05333333
## 4.OOB    4 0.05333333

Conclusion

  • mtry values of 2 and 4 both produced the lowest OOB error rate, 5.33%; mtry = 1 gave 6.67%.