# Predict whether a tumor is benign or malignant using a random forest

#install.packages("randomForest")
#install.packages("MASS")
#install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Use the set.seed function so that we get the same results each time
# Load the dataset from a CSV file picked through the interactive file dialog.
wbcd <- read.csv(file = file.choose())
View(wbcd)
# The first column of the dataset is the id, which is not needed, so drop it.
wbcd <- wbcd[, -1, drop = FALSE]
View(wbcd)
# Class counts of the diagnosis column (B = 357 and M = 212 in the recorded run).
table(wbcd[["diagnosis"]])
## 
##   B   M 
## 357 212
# Recode diagnosis as a two-level factor: "B" is relabelled "Benign" and "M"
# is relabelled "Malignant".
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Share of each class in the dataset, as a percentage rounded to one decimal.
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
# Summary statistics of three of the raw (not yet normalized) measurements.
summary(wbcd[, c("radius_mean", "texture_mean", "perimeter_mean")])
##   radius_mean      texture_mean   perimeter_mean  
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##  Median :13.370   Median :18.84   Median : 86.24  
##  Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##  Max.   :28.110   Max.   :39.28   Max.   :188.50
#' Min-max normalize a numeric vector to the range [0, 1].
#'
#' Note: defining `norm` here masks `base::norm()` for the rest of the script.
#'
#' @param x A numeric vector.
#' @return A numeric vector the same length as `x` in which the smallest value
#'   maps to 0 and the largest to 1. When every value of `x` is identical the
#'   result is all zeros rather than the NaN that dividing by a zero range
#'   would give. Missing values are not removed, so an NA anywhere in `x`
#'   makes the whole result NA.
norm <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # A constant vector has zero range; return zeros instead of 0 / 0 = NaN.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Sanity check: two inputs with the same relative spacing normalize identically.
norm(1:5)
## [1] 0.00 0.25 0.50 0.75 1.00
norm(10 * 1:5)
## [1] 0.00 0.25 0.50 0.75 1.00
# Apply the normalization function to the 30 numeric predictors of wbcd
# (columns 2 to 31; column 1 is the diagnosis label).
wbcd_n <- as.data.frame(lapply(wbcd[2:31], norm))
View(wbcd_n)
# One-column data frame holding the class label. The former `label = diag`
# argument was dropped: as.data.frame() ignored it, and at that point `diag`
# still referred to the base function. The object is kept in case other code
# reads it; nothing else in the visible script does.
diag <- as.data.frame(wbcd$diagnosis)

# Label first, then the normalized predictors. The label column is named
# wbcd.diagnosis explicitly (the name the model formulas use) instead of
# relying on c() of two data frames plus automatic name repair.
wbcd_n1 <- data.frame(wbcd.diagnosis = wbcd$diagnosis, wbcd_n)
View(wbcd_n1)
# After normalization every column should span exactly 0 to 1.
summary(wbcd_n[c("radius_mean", "texture_mean", "perimeter_mean")])
##   radius_mean      texture_mean    perimeter_mean  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168  
##  Median :0.3024   Median :0.3088   Median :0.2933  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# Data partition: assign every row at random to group 1 (training,
# probability 0.7) or group 2 (test, probability 0.3).
set.seed(123)
ind <- sample(
  2,
  nrow(wbcd_n1),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
train <- wbcd_n1[ind == 1, ]
test <- wbcd_n1[ind == 2, ]
# Fit a random forest on the training rows, using all other columns as
# predictors of wbcd.diagnosis.
set.seed(213)
rf <- randomForest(wbcd.diagnosis ~ ., data = train)
# Description of the random forest: number of trees and mtry (the number of
# variables tried for splitting).
rf
## 
## Call:
##  randomForest(formula = wbcd.diagnosis ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 4.2%
## Confusion matrix:
##           Benign Malignant class.error
## Benign       245         7  0.02777778
## Malignant     10       143  0.06535948
                  # mtry is the number of variables tried at each tree node.
   # The out-of-bag (OOB) estimate of the error rate is 4.2% for this random forest model.
# List the attributes (component names and class) of the fitted forest object.
attributes(rf)
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
# Predictions and confusion matrix on the training data.
pred1 <- predict(rf, newdata = train)
head(pred1)
##      1      3      6      7      9     10 
## Benign Benign Benign Benign Benign Benign 
## Levels: Benign Malignant
head(train$wbcd.diagnosis)
## [1] Benign Benign Benign Benign Benign Benign
## Levels: Benign Malignant
# The first six predicted values match the first six original values.

# Accuracy on the training data is 100%.
confusionMatrix(data = pred1, reference = train$wbcd.diagnosis)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       252         0
##   Malignant      0       153
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9909, 1)
##     No Information Rate : 0.6222     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6222     
##          Detection Rate : 0.6222     
##    Detection Prevalence : 0.6222     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Benign     
## 
 # The 95% confidence interval for accuracy is (0.9909, 1).
  # Sensitivity and specificity are both 100%.

# Predictions and confusion matrix on the held-out test data.
pred2 <- predict(rf, newdata = test)
# Accuracy on the test data is 96.95%.
confusionMatrix(data = pred2, reference = test$wbcd.diagnosis)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102         2
##   Malignant      3        57
##                                         
##                Accuracy : 0.9695        
##                  95% CI : (0.9303, 0.99)
##     No Information Rate : 0.6402        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.9341        
##  Mcnemar's Test P-Value : 1             
##                                         
##             Sensitivity : 0.9714        
##             Specificity : 0.9661        
##          Pos Pred Value : 0.9808        
##          Neg Pred Value : 0.9500        
##              Prevalence : 0.6402        
##          Detection Rate : 0.6220        
##    Detection Prevalence : 0.6341        
##       Balanced Accuracy : 0.9688        
##                                         
##        'Positive' Class : Benign        
## 
# Plot the error rate of the random forest model (rf).
plot(rf)

# Tune mtry for the random forest model.
# The response (wbcd.diagnosis) is the first column of `train`, so it and the
# predictors are selected by name. The previous train[, -11] / train[, 11]
# call passed the predictor in column 11 as the response, which made tuneRF()
# tune a regression on that column instead of the classifier.
# stepFactor must differ from 1: mtry is multiplied/divided by it at each
# step, so a factor of 1 never moves away from the starting value.
tune <- tuneRF(
  x = train[, names(train) != "wbcd.diagnosis"],
  y = train$wbcd.diagnosis,
  stepFactor = 1.5, plot = TRUE, ntreeTry = 350,
  trace = TRUE, improve = 0.05
)
## mtry = 10  OOB error = 0.003831827 
## Searching left ...
## Searching right ...

# Refit the forest with 350 trees and mtry = 5, also storing the variable
# importance measures and the proximity matrix.
# Seed the RNG first so this fit (and its OOB estimate) is reproducible, as
# is already done for the first model.
set.seed(213)
rf1 <- randomForest(wbcd.diagnosis ~ ., data = train, ntree = 350, mtry = 5,
                    importance = TRUE, proximity = TRUE)
# Print the fit, including the OOB error estimate (4.44% in the run recorded
# below, which was made before the seed was fixed).
rf1
## 
## Call:
##  randomForest(formula = wbcd.diagnosis ~ ., data = train, ntree = 350,      mtry = 5, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 350
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 4.44%
## Confusion matrix:
##           Benign Malignant class.error
## Benign       245         7  0.02777778
## Malignant     11       142  0.07189542
# Predictions and confusion matrix on the training data, using rf1.
pred1 <- predict(rf1, newdata = train)
# Accuracy on the training data is 100%.
confusionMatrix(data = pred1, reference = train$wbcd.diagnosis)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       252         0
##   Malignant      0       153
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9909, 1)
##     No Information Rate : 0.6222     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6222     
##          Detection Rate : 0.6222     
##    Detection Prevalence : 0.6222     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Benign     
## 
  # The 95% confidence interval for accuracy is (0.9909, 1).
  # Sensitivity and specificity are both 100%.

# Predictions and confusion matrix on the test data, using the rf1 model.
pred2 <- predict(rf1, newdata = test)
# Accuracy on the test data is 97.56% in the recorded run.
confusionMatrix(data = pred2, reference = test$wbcd.diagnosis)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       103         2
##   Malignant      2        57
##                                           
##                Accuracy : 0.9756          
##                  95% CI : (0.9387, 0.9933)
##     No Information Rate : 0.6402          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9471          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9810          
##             Specificity : 0.9661          
##          Pos Pred Value : 0.9810          
##          Neg Pred Value : 0.9661          
##              Prevalence : 0.6402          
##          Detection Rate : 0.6280          
##    Detection Prevalence : 0.6402          
##       Balanced Accuracy : 0.9735          
##                                           
##        'Positive' Class : Benign          
## 
# The 95% confidence interval for accuracy is (0.9387, 0.9933), so its lower bound is about 93.9%.


# Histogram of the size (number of nodes) of the trees in rf1.

hist(
  treesize(rf1),
  main = "No of Nodes for the trees",
  col = "green"
)

# The majority of the trees have more than 80 nodes.

# Variable importance: plot the importance measures of rf1 for the predictors.

varImpPlot(rf1)

# The MeanDecreaseAccuracy graph shows how much worse the model performs when
# the values of each variable are randomly permuted. radius_worst,
# perimeter_worst, area_worst and points_worst have high values and are
# important variables for prediction. symmetry_se and texture_se have very low
# values and are of little use for the predictions.

# The MeanDecreaseGini graph shows the average decrease in node impurity (Gini
# index) obtained from splits on each variable. radius_worst and
# perimeter_worst have high values and are used heavily for prediction.
# dimension_mean and texture_se have low Gini values.

# Show only the 10 most important variables. The argument is `sort` (lower
# case); R argument names are case-sensitive, so the former `Sort` was not
# matched, fell through `...` to the graphics calls and triggered the
# '"Sort" is not a graphical parameter' warnings.
varImpPlot(rf1, sort = TRUE, n.var = 10, main = "Top 10 -Variable Importance")
## Warning in mtext(labs, side = 2, line = loffset, at = y, adj = 0, col =
## color, : "Sort" is not a graphical parameter
## Warning in title(main = main, xlab = xlab, ylab = ylab, ...): "Sort" is not
## a graphical parameter
## Warning in mtext(labs, side = 2, line = loffset, at = y, adj = 0, col =
## color, : "Sort" is not a graphical parameter
## Warning in title(main = main, xlab = xlab, ylab = ylab, ...): "Sort" is not
## a graphical parameter

# Quantitative values: the importance measures of rf1 as a numeric table, with
# one row per predictor.
importance(rf1)
##                       Benign   Malignant MeanDecreaseAccuracy
## radius_mean        7.1911739  4.32321331           7.85037321
## texture_mean       5.6743410  8.44630225           9.06432930
## perimeter_mean     7.3518779  4.85167092           8.23515163
## area_mean          7.2646554  3.89641750           7.85798918
## smoothness_mean    0.3537775  4.74537515           4.35692181
## compactness_mean   3.7461760  2.50179759           4.67533954
## concavity_mean     6.7575278  8.28054150          10.40890142
## points_mean        9.3809355  9.51932458          13.20451410
## symmetry_mean      2.1697124  3.75933914           4.15246030
## dimension_mean     2.6366595  0.30099469           2.56456868
## radius_se          6.3837370  3.16752864           7.23787170
## texture_se         0.6634674  0.04772526           0.61296295
## perimeter_se       6.2717175  5.13096374           8.45988353
## area_se            9.8105764  5.80799776          11.01556362
## smoothness_se      2.5326218  0.54105959           2.48899547
## compactness_se     2.9005218  0.25073684           2.76101034
## concavity_se       3.2971283  4.10486160           5.09374979
## points_se          3.3004658  2.64163149           4.15170443
## symmetry_se        2.5048692 -0.93171313           1.59862213
## dimension_se       1.4627556 -0.46983717           0.83294664
## radius_worst      11.3447812  9.35245740          14.30335931
## texture_worst      4.7676536 10.94502340          10.92702852
## perimeter_worst   10.0139517  8.89570976          12.59823425
## area_worst        11.5847138  9.69904036          13.81626659
## smoothness_worst   5.8904503  5.04379433           7.68190274
## compactness_worst  4.0541595  4.21560900           5.99430403
## concavity_worst    4.3710500  9.42683676          10.63652676
## points_worst      10.0173318  8.97742982          13.06461431
## symmetry_worst     5.1019940  5.40940896           7.00599791
## dimension_worst    0.2653214 -0.36763436           0.05284653
##                   MeanDecreaseGini
## radius_mean              8.8290339
## texture_mean             3.1080177
## perimeter_mean          12.8519340
## area_mean                8.5968019
## smoothness_mean          1.0793160
## compactness_mean         1.4645734
## concavity_mean          10.8839144
## points_mean             16.6298274
## symmetry_mean            1.0149812
## dimension_mean           0.8011728
## radius_se                2.3319541
## texture_se               0.6525691
## perimeter_se             2.3406325
## area_se                  6.8225855
## smoothness_se            1.0133967
## compactness_se           1.0593310
## concavity_se             1.1633374
## points_se                1.0468614
## symmetry_se              0.7465928
## dimension_se             1.1801131
## radius_worst            23.9052917
## texture_worst            4.0776340
## perimeter_worst         25.4575206
## area_worst              22.2424773
## smoothness_worst         2.7430307
## compactness_worst        2.5082056
## concavity_worst          5.6522834
## points_worst            16.9645505
## symmetry_worst           2.1732580
## dimension_worst          0.8324878
varUsed(rf1)   # how often each predictor variable is used in the random forest (one count per predictor).
##  [1] 149 248 183 151 127 120 243 300 125 103 152 105 183 227 139 124 127
## [18] 117 104 143 294 274 296 282 210 177 232 307 193 103
# Partial dependence plot of rf1 on radius_worst for the "Benign" class,
# evaluated over the training data.
partialPlot(rf1, train, radius_worst, "Benign")

# From that graph, I see that when the (normalized) radius_worst value is greater than about 0.4, the tumors are malignant.
# Extract a single tree from the forest: tree number 1 of rf1, with split
# variables and predictions shown by label (labelVar = TRUE).

getTree(rf1, 1, labelVar = TRUE)
##    left daughter right daughter         split var split point status
## 1              2              3       points_mean  0.24286779      1
## 2              4              5         area_mean  0.23512195      1
## 3              6              7   concavity_worst  0.17955272      1
## 4              8              9     texture_worst  0.47374733      1
## 5             10             11      texture_mean  0.30334799      1
## 6              0              0              <NA>  0.00000000     -1
## 7             12             13         area_mean  0.08150583      1
## 8             14             15         points_se  0.26539117      1
## 9             16             17    dimension_mean  0.14279697      1
## 10             0              0              <NA>  0.00000000     -1
## 11            18             19        texture_se  0.32781559      1
## 12             0              0              <NA>  0.00000000     -1
## 13            20             21      dimension_se  0.41510164      1
## 14             0              0              <NA>  0.00000000     -1
## 15            22             23         radius_se  0.17168206      1
## 16            24             25         area_mean  0.16625663      1
## 17            26             27      radius_worst  0.31821416      1
## 18             0              0              <NA>  0.00000000     -1
## 19             0              0              <NA>  0.00000000     -1
## 20            28             29           area_se  0.03259818      1
## 21             0              0              <NA>  0.00000000     -1
## 22             0              0              <NA>  0.00000000     -1
## 23             0              0              <NA>  0.00000000     -1
## 24             0              0              <NA>  0.00000000     -1
## 25             0              0              <NA>  0.00000000     -1
## 26             0              0              <NA>  0.00000000     -1
## 27             0              0              <NA>  0.00000000     -1
## 28            30             31 compactness_worst  0.19114009      1
## 29             0              0              <NA>  0.00000000     -1
## 30             0              0              <NA>  0.00000000     -1
## 31             0              0              <NA>  0.00000000     -1
##    prediction
## 1        <NA>
## 2        <NA>
## 3        <NA>
## 4        <NA>
## 5        <NA>
## 6      Benign
## 7        <NA>
## 8        <NA>
## 9        <NA>
## 10     Benign
## 11       <NA>
## 12     Benign
## 13       <NA>
## 14     Benign
## 15       <NA>
## 16       <NA>
## 17       <NA>
## 18  Malignant
## 19     Benign
## 20       <NA>
## 21     Benign
## 22     Benign
## 23  Malignant
## 24     Benign
## 25  Malignant
## 26     Benign
## 27  Malignant
## 28       <NA>
## 29  Malignant
## 30     Benign
## 31  Malignant
# Multi-dimensional scaling plot of the proximity matrix.
# rf1 was fitted on `train`, so its proximity matrix has one row per training
# observation; the colouring factor must therefore be the training labels.
# The full-data wbcd$diagnosis has a different length and does not line up
# row-for-row with the training subset, so it mislabels the points.
MDSplot(rf1, train$wbcd.diagnosis)
## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels