Predicting whether a tumor is benign or malignant using Random Forest
#install.packages("randomForest")
#install.packages("Mass")
#install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Read in the dataset (the set.seed function is used later so that we get the same results each time)
wbcd <- read.csv(file.choose())
View(wbcd)
# The first column in the dataset is the id, which is not required, so we remove it
wbcd <- wbcd[-1]
View(wbcd)
# Table of the diagnosis column: B = 357 and M = 212
table(wbcd$diagnosis)
##
## B M
## 357 212
# diagnosis is a factor with two levels, B and M; relabel these entries as Benign and Malignant
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B","M"), labels = c("Benign","Malignant"))
# Proportion table of the diagnosis entries: what % are Benign and what % are Malignant
round(prop.table(table(wbcd$diagnosis))*100,1)
##
## Benign Malignant
## 62.7 37.3
summary(wbcd[c("radius_mean","texture_mean","perimeter_mean")])
## radius_mean texture_mean perimeter_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
# Create a min-max normalization function that rescales each column to the range [0, 1]
norm <- function(x){
  return((x - min(x)) / (max(x) - min(x)))
}
# Test the normalization function on two vectors with different scales
norm(c(1,2,3,4,5))
## [1] 0.00 0.25 0.50 0.75 1.00
norm(c(10,20,30,40,50))
## [1] 0.00 0.25 0.50 0.75 1.00
# Apply the normalization function to the 30 numeric columns of the wbcd dataset
wbcd_n <- as.data.frame(lapply(wbcd[2:31], norm))
View(wbcd_n)
# Re-attach the diagnosis column to the normalized features
diag <- data.frame(wbcd.diagnosis = wbcd$diagnosis)
wbcd_n1 <- cbind(diag, wbcd_n)
View(wbcd_n1)
summary(wbcd_n[c("radius_mean","texture_mean","perimeter_mean")])
## radius_mean texture_mean perimeter_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168
## Median :0.3024 Median :0.3088 Median :0.2933
## Mean :0.3382 Mean :0.3240 Mean :0.3329
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168
## Max. :1.0000 Max. :1.0000 Max. :1.0000
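# As a side note, the same min-max scaling can be obtained with caret's preProcess
# (method = "range"); this is just an illustrative sketch, not part of the original run.
pp <- preProcess(wbcd[2:31], method = "range")  # learn the min/max of each column
wbcd_range <- predict(pp, wbcd[2:31])           # rescale every column to [0, 1]
summary(wbcd_range$radius_mean)                 # should match summary(wbcd_n$radius_mean)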
# Data Partition
set.seed(123)
ind <- sample(2, nrow(wbcd_n1), replace = TRUE, prob = c(0.7,0.3))
train <- wbcd_n1[ind==1,]
test <- wbcd_n1[ind==2,]
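# Alternative sketch (not used below): caret's createDataPartition gives a stratified
# 70/30 split that preserves the Benign/Malignant proportions in both sets.
set.seed(123)
idx <- createDataPartition(wbcd_n1$wbcd.diagnosis, p = 0.7, list = FALSE)
train_strat <- wbcd_n1[idx, ]
test_strat  <- wbcd_n1[-idx, ]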
set.seed(213)
rf <- randomForest(wbcd.diagnosis~., data=train)
rf # Description of the random forest: number of trees, and mtry = the number of variables tried for splitting at each tree node
##
## Call:
## randomForest(formula = wbcd.diagnosis ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 4.2%
## Confusion matrix:
## Benign Malignant class.error
## Benign 245 7 0.02777778
## Malignant 10 143 0.06535948
# The out-of-bag (OOB) estimate of the error rate is 4.2% for this Random Forest model.
attributes(rf)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data
pred1 <- predict(rf, train)
head(pred1)
## 1 3 6 7 9 10
## Benign Benign Benign Benign Benign Benign
## Levels: Benign Malignant
head(train$wbcd.diagnosis)
## [1] Benign Benign Benign Benign Benign Benign
## Levels: Benign Malignant
# The first six predicted values match the original values.
confusionMatrix(pred1, train$wbcd.diagnosis) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 252 0
## Malignant 0 153
##
## Accuracy : 1
## 95% CI : (0.9909, 1)
## No Information Rate : 0.6222
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6222
## Detection Rate : 0.6222
## Detection Prevalence : 0.6222
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Benign
##
# The 95% confidence interval for accuracy is (0.9909, 1).
# Sensitivity and specificity are both 100%.
# Prediction and Confusion Matrix - Test data
pred2 <- predict(rf, test)
confusionMatrix(pred2, test$wbcd.diagnosis) # 96.95 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 2
## Malignant 3 57
##
## Accuracy : 0.9695
## 95% CI : (0.9303, 0.99)
## No Information Rate : 0.6402
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9341
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9714
## Specificity : 0.9661
## Pos Pred Value : 0.9808
## Neg Pred Value : 0.9500
## Prevalence : 0.6402
## Detection Rate : 0.6220
## Detection Prevalence : 0.6341
## Balanced Accuracy : 0.9688
##
## 'Positive' Class : Benign
##
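# Sketch: the confusionMatrix object can also be queried programmatically, and
# predict(..., type = "prob") returns class probabilities instead of hard labels.
cm <- confusionMatrix(pred2, test$wbcd.diagnosis)
cm$overall["Accuracy"]                         # overall test accuracy
cm$byClass[c("Sensitivity", "Specificity")]    # per-class performance
prob2 <- predict(rf, test, type = "prob")      # Benign/Malignant probabilities
head(prob2)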
# Error rate of the Random Forest model across the number of trees:
plot(rf)
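# Sketch: the error curves drawn by plot(rf) are stored in rf$err.rate, so the point
# where the OOB error stabilises can also be read off numerically.
head(rf$err.rate)                   # columns: OOB, Benign and Malignant error per ntree
which.min(rf$err.rate[, "OOB"])     # number of trees with the lowest OOB error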

# Tune the Random Forest mtry parameter (the response, wbcd.diagnosis, is column 1)
tune <- tuneRF(train[,-1], train[,1], stepFactor = 1.5, plot = TRUE, ntreeTry = 350,
               trace = TRUE, improve = 0.05)
## mtry = 10 OOB error = 0.003831827
## Searching left ...
## Searching right ...
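# Alternative sketch: caret::train can cross-validate mtry directly over a small grid
# (5-fold CV here); this is illustrative only and was not part of the original run.
ctrl <- trainControl(method = "cv", number = 5)
rf_cv <- train(wbcd.diagnosis ~ ., data = train, method = "rf",
               tuneGrid = expand.grid(mtry = c(3, 5, 7, 10)),
               ntree = 350, trControl = ctrl)
rf_cv$bestTune                      # mtry value with the best cross-validated accuracy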

rf1 <- randomForest(wbcd.diagnosis ~ ., data = train, ntree = 350, mtry = 5,
                    importance = TRUE, proximity = TRUE)
rf1 # OOB error estimate is 4.44%
##
## Call:
## randomForest(formula = wbcd.diagnosis ~ ., data = train, ntree = 350, mtry = 5, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 350
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 4.44%
## Confusion matrix:
## Benign Malignant class.error
## Benign 245 7 0.02777778
## Malignant 11 142 0.07189542
pred1 <- predict(rf1, train)
confusionMatrix(pred1, train$wbcd.diagnosis) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 252 0
## Malignant 0 153
##
## Accuracy : 1
## 95% CI : (0.9909, 1)
## No Information Rate : 0.6222
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6222
## Detection Rate : 0.6222
## Detection Prevalence : 0.6222
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Benign
##
# The 95% confidence interval for accuracy is (0.9909, 1).
# Sensitivity and specificity are both 100%.
# Test data prediction using the tuned rf1 model
pred2 <- predict(rf1, test)
confusionMatrix(pred2, test$wbcd.diagnosis) # 97.56 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 103 2
## Malignant 2 57
##
## Accuracy : 0.9756
## 95% CI : (0.9387, 0.9933)
## No Information Rate : 0.6402
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9471
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9810
## Specificity : 0.9661
## Pos Pred Value : 0.9810
## Neg Pred Value : 0.9661
## Prevalence : 0.6402
## Detection Rate : 0.6280
## Detection Prevalence : 0.6402
## Balanced Accuracy : 0.9735
##
## 'Positive' Class : Benign
##
# The 95% confidence interval for accuracy is (0.9387, 0.9933).
# Number of nodes per tree:
hist(treesize(rf1), main = "No of Nodes for the trees", col = "green")

# The majority of the trees have more than 80 nodes.
# Variable importance:
varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each
# variable is excluded. radius_worst, perimeter_worst, area_worst and points_worst have
# high values and are important variables for prediction, while symmetry_se and texture_se
# have very low values and contribute little to the predictions.
# The Mean Decrease Gini plot shows how much, on average, the Gini impurity decreases when
# a variable is used for splitting. radius_worst and perimeter_worst have high values and
# drive the predictions, while dimension_mean and texture_se have low Gini values.
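# Sketch: the same ranking can be obtained numerically by sorting the importance matrix
# on MeanDecreaseAccuracy (top 10 shown).
imp <- importance(rf1)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ], 10)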
varImpPlot(rf1, sort = TRUE, n.var = 10, main = "Top 10 - Variable Importance")

# Quantitative importance values for each predictor
importance(rf1)
## Benign Malignant MeanDecreaseAccuracy
## radius_mean 7.1911739 4.32321331 7.85037321
## texture_mean 5.6743410 8.44630225 9.06432930
## perimeter_mean 7.3518779 4.85167092 8.23515163
## area_mean 7.2646554 3.89641750 7.85798918
## smoothness_mean 0.3537775 4.74537515 4.35692181
## compactness_mean 3.7461760 2.50179759 4.67533954
## concavity_mean 6.7575278 8.28054150 10.40890142
## points_mean 9.3809355 9.51932458 13.20451410
## symmetry_mean 2.1697124 3.75933914 4.15246030
## dimension_mean 2.6366595 0.30099469 2.56456868
## radius_se 6.3837370 3.16752864 7.23787170
## texture_se 0.6634674 0.04772526 0.61296295
## perimeter_se 6.2717175 5.13096374 8.45988353
## area_se 9.8105764 5.80799776 11.01556362
## smoothness_se 2.5326218 0.54105959 2.48899547
## compactness_se 2.9005218 0.25073684 2.76101034
## concavity_se 3.2971283 4.10486160 5.09374979
## points_se 3.3004658 2.64163149 4.15170443
## symmetry_se 2.5048692 -0.93171313 1.59862213
## dimension_se 1.4627556 -0.46983717 0.83294664
## radius_worst 11.3447812 9.35245740 14.30335931
## texture_worst 4.7676536 10.94502340 10.92702852
## perimeter_worst 10.0139517 8.89570976 12.59823425
## area_worst 11.5847138 9.69904036 13.81626659
## smoothness_worst 5.8904503 5.04379433 7.68190274
## compactness_worst 4.0541595 4.21560900 5.99430403
## concavity_worst 4.3710500 9.42683676 10.63652676
## points_worst 10.0173318 8.97742982 13.06461431
## symmetry_worst 5.1019940 5.40940896 7.00599791
## dimension_worst 0.2653214 -0.36763436 0.05284653
## MeanDecreaseGini
## radius_mean 8.8290339
## texture_mean 3.1080177
## perimeter_mean 12.8519340
## area_mean 8.5968019
## smoothness_mean 1.0793160
## compactness_mean 1.4645734
## concavity_mean 10.8839144
## points_mean 16.6298274
## symmetry_mean 1.0149812
## dimension_mean 0.8011728
## radius_se 2.3319541
## texture_se 0.6525691
## perimeter_se 2.3406325
## area_se 6.8225855
## smoothness_se 1.0133967
## compactness_se 1.0593310
## concavity_se 1.1633374
## points_se 1.0468614
## symmetry_se 0.7465928
## dimension_se 1.1801131
## radius_worst 23.9052917
## texture_worst 4.0776340
## perimeter_worst 25.4575206
## area_worst 22.2424773
## smoothness_worst 2.7430307
## compactness_worst 2.5082056
## concavity_worst 5.6522834
## points_worst 16.9645505
## symmetry_worst 2.1732580
## dimension_worst 0.8324878
varUsed(rf1) # how often each predictor variable is actually used across the trees of the forest
## [1] 149 248 183 151 127 120 243 300 125 103 152 105 183 227 139 124 127
## [18] 117 104 143 294 274 296 282 210 177 232 307 193 103
# Partial Dependence Plot
partialPlot(rf1, train, radius_worst, "Benign")

# The plot suggests that tumors with a normalized radius_worst above roughly 0.4 tend to be classified as malignant.
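# Sketch: the complementary partial dependence for the "Malignant" class; its curve
# rises where the Benign curve falls, around the same radius_worst value.
partialPlot(rf1, train, radius_worst, "Malignant")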
# Extract a single tree from the forest:
getTree(rf1, 1, labelVar = TRUE)
## left daughter right daughter split var split point status
## 1 2 3 points_mean 0.24286779 1
## 2 4 5 area_mean 0.23512195 1
## 3 6 7 concavity_worst 0.17955272 1
## 4 8 9 texture_worst 0.47374733 1
## 5 10 11 texture_mean 0.30334799 1
## 6 0 0 <NA> 0.00000000 -1
## 7 12 13 area_mean 0.08150583 1
## 8 14 15 points_se 0.26539117 1
## 9 16 17 dimension_mean 0.14279697 1
## 10 0 0 <NA> 0.00000000 -1
## 11 18 19 texture_se 0.32781559 1
## 12 0 0 <NA> 0.00000000 -1
## 13 20 21 dimension_se 0.41510164 1
## 14 0 0 <NA> 0.00000000 -1
## 15 22 23 radius_se 0.17168206 1
## 16 24 25 area_mean 0.16625663 1
## 17 26 27 radius_worst 0.31821416 1
## 18 0 0 <NA> 0.00000000 -1
## 19 0 0 <NA> 0.00000000 -1
## 20 28 29 area_se 0.03259818 1
## 21 0 0 <NA> 0.00000000 -1
## 22 0 0 <NA> 0.00000000 -1
## 23 0 0 <NA> 0.00000000 -1
## 24 0 0 <NA> 0.00000000 -1
## 25 0 0 <NA> 0.00000000 -1
## 26 0 0 <NA> 0.00000000 -1
## 27 0 0 <NA> 0.00000000 -1
## 28 30 31 compactness_worst 0.19114009 1
## 29 0 0 <NA> 0.00000000 -1
## 30 0 0 <NA> 0.00000000 -1
## 31 0 0 <NA> 0.00000000 -1
## prediction
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 Benign
## 7 <NA>
## 8 <NA>
## 9 <NA>
## 10 Benign
## 11 <NA>
## 12 Benign
## 13 <NA>
## 14 Benign
## 15 <NA>
## 16 <NA>
## 17 <NA>
## 18 Malignant
## 19 Benign
## 20 <NA>
## 21 Benign
## 22 Benign
## 23 Malignant
## 24 Benign
## 25 Malignant
## 26 Benign
## 27 Malignant
## 28 <NA>
## 29 Malignant
## 30 Benign
## 31 Malignant
# Multi-dimensional scaling (MDS) plot of the proximity matrix
# (labels come from train, since the proximity matrix was computed on the training rows)
MDSplot(rf1, train$wbcd.diagnosis)
## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels
