You are given three datasets.
Run the CSV dataset plus one of the two remaining datasets through the random forest model.
The performance of each of the two models should also be improved.
To do:
The first dataset was provided as a CSV file of Left/Right data (leftright.csv).
library(randomForest)
library(mlbench)
library(RCurl)
library(caret)
library(rpart)
library(pROC)
kdata <- read.csv("leftright.csv")
str(kdata)
## 'data.frame': 10 obs. of 4 variables:
## $ Left : int 1 0 1 0 0 0 0 1 1 0
## $ Right: int 45 0 92 18 26 48 41 52 64 80
## $ Up : int 24 26 32 41 80 76 92 39 46 50
## $ Down : int 100 69 46 24 0 32 86 71 65 48
dim(kdata)
## [1] 10 4
set.seed(1000)
# Left is still an integer here, so randomForest fits a regression forest
ranF <- randomForest(Left ~ ., data = kdata)
ranF
##
## Call:
## randomForest(formula = Left ~ ., data = kdata)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.2804608
## % Var explained: -16.86
# convert the target to a factor so randomForest treats this as classification
kdata$Left <- as.factor(kdata$Left)
str(kdata)
## 'data.frame': 10 obs. of 4 variables:
## $ Left : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 2 2 1
## $ Right: int 45 0 92 18 26 48 41 52 64 80
## $ Up : int 24 26 32 41 80 76 92 39 46 50
## $ Down : int 100 69 46 24 0 32 86 71 65 48
set.seed(1000)
# fit again now that Left is a factor, giving a classification forest
ranF <- randomForest(Left ~ ., data = kdata)
ranF
##
## Call:
## randomForest(formula = Left ~ ., data = kdata)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 70%
## Confusion matrix:
## 0 1 class.error
## 0 3 3 0.5
## 1 4 0 1.0
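With only 10 observations the OOB error estimate is extremely noisy, so the 70% error rate here says little on its own. A quick sketch to check the class balance behind that confusion matrix:
# how the 10 rows split across the two classes
table(kdata$Left)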
# 3-fold cross-validation with caret, holding mtry at 3
control <- trainControl(method = "cv", number = 3)
grid_rf <- expand.grid(mtry = 3)
kranF <- train(Left~., data=kdata, method = "rf", importance=TRUE,
trControl=control, tuneGrid = grid_rf)
kranF
## Random Forest
##
## 10 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 7, 6, 7
## Resampling results:
##
## Accuracy Kappa
## 0.5277778 -3.700743e-17
##
## Tuning parameter 'mtry' was held constant at a value of 3
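With only three predictors, mtry can only take the values 1, 2, or 3, so rather than holding it at 3 the grid could let caret compare all of them. A minimal sketch (object names are illustrative):
# tune over every feasible mtry instead of fixing it
grid_all <- expand.grid(mtry = 1:3)
set.seed(1000)
kranF_all <- train(Left ~ ., data = kdata, method = "rf", importance = TRUE,
                   trControl = control, tuneGrid = grid_all)
kranF_all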
set.seed(1000)
# refit with mtry = 3 and permutation importance enabled
new_rf <- randomForest(Left ~ ., data = kdata, mtry = 3, importance = TRUE)
new_rf
##
## Call:
## randomForest(formula = Left ~ ., data = kdata, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 50%
## Confusion matrix:
## 0 1 class.error
## 0 4 2 0.3333333
## 1 3 1 0.7500000
# Evaluate variable importance
importance(new_rf)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## Right 1.603902 2.312356 2.066167 1.9172254
## Up 2.170799 3.508703 3.380443 1.6896095
## Down -4.228080 -3.364658 -4.204520 0.7247651
varImpPlot(new_rf)
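Because importance = TRUE was set, the permutation-based MeanDecreaseAccuracy column can also be ranked directly; note that Down comes out negative, i.e. permuting it does not hurt accuracy. A small sketch:
# rank predictors by permutation importance
sort(importance(new_rf)[, "MeanDecreaseAccuracy"], decreasing = TRUE)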
library(ROCR)
# predict() without newdata returns the out-of-bag class probabilities
pred1 <- predict(new_rf, type = "prob")
# build a ROCR prediction object from the probability of class "1"
perf <- prediction(pred1[, 2], kdata$Left)
# 1. Area under the curve
auc <- performance(perf, "auc")
auc
## A performance instance
## 'Area under the ROC curve'
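Printing the performance object only describes it; the numeric AUC is stored in its y.values slot. A minimal sketch (auc_value is an illustrative name):
# extract the numeric AUC from the ROCR performance object
auc_value <- auc@y.values[[1]]
auc_value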
# 2. True positive rate vs. false positive rate
pred3 <- performance(perf, "tpr", "fpr")
# 3. Plot the ROC curve
plot(pred3,main="ROC Curve for Random Forest",col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=2,col="black")
data(PimaIndiansDiabetes)
df <- PimaIndiansDiabetes
str(df)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
head(df)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
# store the row indices for the 70% training partition
partition <- caret::createDataPartition(y = df$diabetes, times = 1, p = 0.7, list = FALSE)
# create training data set
train_set <- df[partition,]
# create the testing data set from the remaining 30% of the rows
test_set <- df[-partition,]
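Note that createDataPartition was not seeded, so the 70/30 split (and all downstream numbers) will vary between runs. A sketch of a reproducible split, assuming an arbitrary seed value:
# seed the RNG first so the same split is produced every run (2023 is arbitrary)
set.seed(2023)
partition <- caret::createDataPartition(y = df$diabetes, times = 1, p = 0.7, list = FALSE)
# then rebuild train_set and test_set as above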
str(train_set)
## 'data.frame': 538 obs. of 9 variables:
## $ pregnant: num 8 0 5 3 8 4 10 10 1 7 ...
## $ glucose : num 183 137 116 78 125 110 168 139 189 100 ...
## $ pressure: num 64 40 74 50 96 92 74 80 60 0 ...
## $ triceps : num 0 35 0 32 0 0 0 0 23 0 ...
## $ insulin : num 0 168 0 88 0 0 0 0 846 0 ...
## $ mass : num 23.3 43.1 25.6 31 0 37.6 38 27.1 30.1 30 ...
## $ pedigree: num 0.672 2.288 0.201 0.248 0.232 ...
## $ age : num 32 33 30 26 54 30 34 57 59 32 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 1 2 1 2 2 ...
model_forest <- caret::train(diabetes ~ ., data = train_set, method = "ranger",
                             metric = "ROC",
                             trControl = trainControl(method = "cv", number = 10,
                                                      classProbs = TRUE,
                                                      summaryFunction = twoClassSummary),
                             preProcess = c("center", "scale", "pca"))
model_forest
## Random Forest
##
## 538 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## Pre-processing: centered (8), scaled (8), principal component signal
## extraction (8)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 485, 484, 484, 484, 484, 484, ...
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.8161571 0.8571429 0.5967836
## 2 extratrees 0.8232373 0.8771429 0.5330409
## 4 gini 0.8152632 0.8428571 0.6020468
## 4 extratrees 0.8153175 0.8514286 0.5593567
## 7 gini 0.8127068 0.8371429 0.5859649
## 7 extratrees 0.8170844 0.8485714 0.5862573
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 1.
model_forest$results[6,4]
## [1] 0.8170844
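Indexing $results by position is fragile: row 6, column 4 returns 0.8170844, which matches the mtry = 7 / extratrees row above rather than the selected mtry = 2 / extratrees model (ROC 0.8232373). A sketch of reading the final model's resampled performance directly:
# resampled ROC / Sens / Spec of the model caret selected
getTrainPerf(model_forest)
# the tuning values actually chosen
model_forest$bestTune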
# prediction on Test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix
cm_rf <- confusionMatrix(pred_rf, test_set$diabetes, positive="pos")
# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")
# ROC value
roc_rf <- roc(test_set$diabetes, pred_prob_rf$pos)
# Confusion Matrix for Random Forest Model
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 131 34
## pos 19 46
##
## Accuracy : 0.7696
## 95% CI : (0.7097, 0.8224)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 7.748e-05
##
## Kappa : 0.4688
##
## Mcnemar's Test P-Value : 0.05447
##
## Sensitivity : 0.5750
## Specificity : 0.8733
## Pos Pred Value : 0.7077
## Neg Pred Value : 0.7939
## Prevalence : 0.3478
## Detection Rate : 0.2000
## Detection Prevalence : 0.2826
## Balanced Accuracy : 0.7242
##
## 'Positive' Class : pos
##
# ROC Value for Random Forest
roc_rf
##
## Call:
## roc.default(response = test_set$diabetes, predictor = pred_prob_rf$pos)
##
## Data: pred_prob_rf$pos in 150 controls (test_set$diabetes neg) < 80 cases (test_set$diabetes pos).
## Area under the curve: 0.8399
# AUC - Area under the curve
caTools::colAUC(pred_prob_rf$pos, test_set$diabetes, plotROC = T)
## [,1]
## neg vs. pos 0.8399167
result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'],
               cm_rf$byClass['Precision'], cm_rf$byClass['Recall'],
               cm_rf$byClass['F1'], roc_rf$auc)
result_rf
## Sensitivity Specificity Precision Recall F1
## 0.5750000 0.8733333 0.7076923 0.5750000 0.6344828 0.8399167
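The sixth value is the AUC; it prints without a header because roc_rf$auc carries no element name. A small sketch that names every entry:
# label all six metrics, including the AUC
names(result_rf) <- c("Sensitivity", "Specificity", "Precision", "Recall", "F1", "AUC")
result_rf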
data("iris")
# randomly assign each row to group 1 (~70%, training) or group 2 (~30%, testing)
dfI <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[dfI == 1, ]
testData <- iris[dfI == 2, ]
str(dfI)
## int [1:150] 1 1 2 1 1 2 2 1 1 1 ...
head(dfI)
## [1] 1 1 2 1 1 2
iris_rf <- randomForest(Species~.,data=trainData,ntree=100,proximity=TRUE)
table(predict(iris_rf),trainData$Species)
##
## setosa versicolor virginica
## setosa 37 0 0
## versicolor 0 31 2
## virginica 0 2 32
print(iris_rf)
##
## Call:
## randomForest(formula = Species ~ ., data = trainData, ntree = 100, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 3.85%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 37 0 0 0.00000000
## versicolor 0 31 2 0.06060606
## virginica 0 2 32 0.05882353
plot(iris_rf)
importance(iris_rf)
## MeanDecreaseGini
## Sepal.Length 6.369843
## Sepal.Width 1.076268
## Petal.Length 31.467502
## Petal.Width 29.623758
varImpPlot(iris_rf)
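Because iris_rf was fit without importance = TRUE, importance() only reports MeanDecreaseGini. A sketch of refitting to also obtain permutation importance (iris_rf_imp is an illustrative name):
# refit with importance = TRUE to get MeanDecreaseAccuracy as well
set.seed(1000)
iris_rf_imp <- randomForest(Species ~ ., data = trainData, ntree = 100,
                            proximity = TRUE, importance = TRUE)
importance(iris_rf_imp)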
# predict species for the held-out test rows
irisPr <- predict(iris_rf, newdata = testData)
table(irisPr, testData$Species)
##
## irisPr setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 16 1
## virginica 0 1 15
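Overall test-set accuracy can be read off that table by summing its diagonal. A minimal sketch (conf_iris is an illustrative name):
# accuracy on the held-out iris data
conf_iris <- table(irisPr, testData$Species)
sum(diag(conf_iris)) / sum(conf_iris)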
# search for a better mtry around the default (here it tries mtry = 4 and mtry = 1)
tune.rf <- tuneRF(iris[, -5], iris[, 5], stepFactor = 0.5)
## mtry = 2 OOB error = 5.33%
## Searching left ...
## mtry = 4 OOB error = 5.33%
## 0 0.05
## Searching right ...
## mtry = 1 OOB error = 6.67%
## -0.25 0.05
print(tune.rf)
## mtry OOBError
## 1.OOB 1 0.06666667
## 2.OOB 2 0.05333333
## 4.OOB 4 0.05333333
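Note that tuneRF was run on the full iris data rather than on trainData, so its OOB numbers are not directly comparable to iris_rf. Here mtry = 2 and mtry = 4 tie for the lowest OOB error, and mtry = 2 is already the randomForest default for four predictors, so a refit mainly serves as a check. A sketch (iris_rf_tuned is an illustrative name):
# refit on the training split with the mtry suggested by tuneRF
set.seed(1000)
iris_rf_tuned <- randomForest(Species ~ ., data = trainData, ntree = 100, mtry = 2)
print(iris_rf_tuned)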