# clear the environment and read the data
rm(list=ls())

breast <- read.csv('breast.csv')
str(breast)
## 'data.frame':    699 obs. of  11 variables:
##  $ ID       : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ thickness: int  5 5 3 6 4 8 1 2 2 4 ...
##  $ size     : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ shape    : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ adhesion : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ eSize    : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ bNuclei  : chr  "1" "10" "2" "4" ...
##  $ chromatin: int  3 3 3 3 3 9 3 3 1 2 ...
##  $ nNucleoli: int  1 2 1 7 1 7 1 1 1 1 ...
##  $ mitosis  : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ class    : int  0 0 0 0 0 1 0 0 0 0 ...
summary(breast)
##        ID             thickness           size            shape       
##  Min.   :   61634   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:  870688   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 1171710   Median : 4.000   Median : 1.000   Median : 1.000  
##  Mean   : 1071704   Mean   : 4.418   Mean   : 3.134   Mean   : 3.207  
##  3rd Qu.: 1238298   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000  
##  Max.   :13454352   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##     adhesion          eSize          bNuclei            chromatin     
##  Min.   : 1.000   Min.   : 1.000   Length:699         Min.   : 1.000  
##  1st Qu.: 1.000   1st Qu.: 2.000   Class :character   1st Qu.: 2.000  
##  Median : 1.000   Median : 2.000   Mode  :character   Median : 3.000  
##  Mean   : 2.807   Mean   : 3.216                      Mean   : 3.438  
##  3rd Qu.: 4.000   3rd Qu.: 4.000                      3rd Qu.: 5.000  
##  Max.   :10.000   Max.   :10.000                      Max.   :10.000  
##    nNucleoli         mitosis           class       
##  Min.   : 1.000   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:0.0000  
##  Median : 1.000   Median : 1.000   Median :0.0000  
##  Mean   : 2.867   Mean   : 1.589   Mean   :0.3448  
##  3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:1.0000  
##  Max.   :10.000   Max.   :10.000   Max.   :1.0000
# remove ID column
breast$ID <- NULL

# remove the rows where bNuclei is '?' (the missing-value code, which is also why the column was read in as character)
breast <- breast[!(breast$bNuclei == '?'), ]
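# sanity check (a sketch): confirm the '?' rows are gone and see how many were dropped
stopifnot(!any(breast$bNuclei == "?"))
nrow(breast)  # 683 of the original 699 rows remain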

# convert bNuclei to integer
breast$bNuclei <- factor(breast$bNuclei, levels = 1:10, ordered = TRUE)
breast$bNuclei <- as.integer(breast$bNuclei)
class(breast$bNuclei)
## [1] "integer"
str(breast)
## 'data.frame':    683 obs. of  10 variables:
##  $ thickness: int  5 5 3 6 4 8 1 2 2 4 ...
##  $ size     : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ shape    : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ adhesion : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ eSize    : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ bNuclei  : int  1 10 2 4 1 10 10 1 1 1 ...
##  $ chromatin: int  3 3 3 3 3 9 3 3 1 2 ...
##  $ nNucleoli: int  1 2 1 7 1 7 1 1 1 1 ...
##  $ mitosis  : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ class    : int  0 0 0 0 0 1 0 0 0 0 ...
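# quick check (a sketch): the ordered-factor route maps the strings "1".."10" onto
# the integers 1..10; confirm the conversion introduced no NAs
stopifnot(!anyNA(breast$bNuclei))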
# convert class to factor
breast$class <- as.factor(breast$class)
summary(breast)
##    thickness           size            shape           adhesion    
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.00  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.00  
##  Mean   : 4.442   Mean   : 3.151   Mean   : 3.215   Mean   : 2.83  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##      eSize           bNuclei         chromatin        nNucleoli    
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.00  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.00  
##  Mean   : 3.234   Mean   : 3.545   Mean   : 3.445   Mean   : 2.87  
##  3rd Qu.: 4.000   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.00  
##     mitosis       class  
##  Min.   : 1.000   0:444  
##  1st Qu.: 1.000   1:239  
##  Median : 1.000          
##  Mean   : 1.603          
##  3rd Qu.: 1.000          
##  Max.   :10.000
library(ggplot2)

# boxplot of each predictor by class
vars <- setdiff(colnames(breast), "class")
for (var in vars) {
  plt <- ggplot(breast, aes(x = class, y = .data[[var]])) +
    geom_boxplot() +
    labs(y = var)
  print(plt)
}
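
# an alternative to the loop (a sketch, assuming tidyr is available): reshape the
# predictors to long format and facet, so all nine boxplots share one figure
library(tidyr)
breast.long <- pivot_longer(breast, cols = -class,
                            names_to = "feature", values_to = "value")
ggplot(breast.long, aes(x = class, y = value)) +
  geom_boxplot() +
  facet_wrap(~ feature, scales = "free_y")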

# split the data into training and test sets
library(caret)
## Loading required package: lattice
set.seed(2019)
partition <- createDataPartition(breast$class, p = 0.7, list = FALSE)
trains <- breast[partition, ]
test <- breast[-partition, ]

summary(trains$class)
##   0   1 
## 311 168
summary(test$class)
##   0   1 
## 133  71
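# createDataPartition stratifies on the outcome, so the class mix should be nearly
# identical in the two splits; a quick check (a sketch):
round(prop.table(table(trains$class)), 3)
round(prop.table(table(test$class)), 3)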
# build the trees
library(rpart)
library(rpart.plot)

# tree.a: a deliberately deep tree (cp = 0, small minsplit/minbucket), fitted
# without mitosis, which the later importance ranking flags as the least
# informative predictor
tree.a <- rpart(class ~ . - mitosis, data = trains, method="class", 
                control = rpart.control(
                  minsplit = 5,
                  minbucket = 1,
                  cp = 0
                ))
rpart.plot(tree.a)

# tree.b: use all predictors, with a small cp and Gini splitting
tree.b <- rpart(class ~ ., data=trains, method='class',
                control=rpart.control(
                  minbucket=2,
                  cp=0.0005,
                  maxdepth = 20
                ), parms=list(split="gini"))
tree.b
## n= 479 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 479 168 0 (0.64926931 0.35073069)  
##    2) size< 3.5 328  25 0 (0.92378049 0.07621951)  
##      4) bNuclei< 5.5 311  10 0 (0.96784566 0.03215434)  
##        8) nNucleoli< 3.5 304   5 0 (0.98355263 0.01644737)  
##         16) bNuclei< 2.5 282   0 0 (1.00000000 0.00000000) *
##         17) bNuclei>=2.5 22   5 0 (0.77272727 0.22727273)  
##           34) thickness< 3.5 15   0 0 (1.00000000 0.00000000) *
##           35) thickness>=3.5 7   2 1 (0.28571429 0.71428571)  
##             70) eSize< 2.5 3   1 0 (0.66666667 0.33333333) *
##             71) eSize>=2.5 4   0 1 (0.00000000 1.00000000) *
##        9) nNucleoli>=3.5 7   2 1 (0.28571429 0.71428571)  
##         18) thickness< 4.5 2   0 0 (1.00000000 0.00000000) *
##         19) thickness>=4.5 5   0 1 (0.00000000 1.00000000) *
##      5) bNuclei>=5.5 17   2 1 (0.11764706 0.88235294) *
##    3) size>=3.5 151   8 1 (0.05298013 0.94701987)  
##      6) size< 4.5 29   6 1 (0.20689655 0.79310345)  
##       12) bNuclei< 7.5 13   6 1 (0.46153846 0.53846154)  
##         24) adhesion< 3.5 5   1 0 (0.80000000 0.20000000) *
##         25) adhesion>=3.5 8   2 1 (0.25000000 0.75000000)  
##           50) shape< 4.5 3   1 0 (0.66666667 0.33333333) *
##           51) shape>=4.5 5   0 1 (0.00000000 1.00000000) *
##       13) bNuclei>=7.5 16   0 1 (0.00000000 1.00000000) *
##      7) size>=4.5 122   2 1 (0.01639344 0.98360656) *
rpart.plot(tree.b)

# prune tree.b
tree.b.pruned <- prune(tree.b, cp = tree.b$cptable[5, "CP"])  # cp taken from row 5 of the cp table
rpart.plot(tree.b.pruned)
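# hard-coding row 5 of the cp table works here but is brittle; a common alternative
# (a sketch; best.cp and tree.b.pruned.xerr are new names) picks the cp with the
# smallest cross-validated error
best.cp <- tree.b$cptable[which.min(tree.b$cptable[, "xerror"]), "CP"]
tree.b.pruned.xerr <- prune(tree.b, cp = best.cp)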

# compare predictive performance on the test set
b.pred <- predict(tree.b, newdata=test, type="class")
b.pruned.pred <- predict(tree.b.pruned, newdata=test, type="class")

confusionMatrix(b.pred, test$class, positive="1", dnn=c("Prediction", "Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   0   1
##          0 129   2
##          1   4  69
##                                           
##                Accuracy : 0.9706          
##                  95% CI : (0.9371, 0.9891)
##     No Information Rate : 0.652           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9356          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9718          
##             Specificity : 0.9699          
##          Pos Pred Value : 0.9452          
##          Neg Pred Value : 0.9847          
##              Prevalence : 0.3480          
##          Detection Rate : 0.3382          
##    Detection Prevalence : 0.3578          
##       Balanced Accuracy : 0.9709          
##                                           
##        'Positive' Class : 1               
## 
confusionMatrix(b.pruned.pred, test$class, positive="1", dnn=c("Prediction", "Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   0   1
##          0 129   2
##          1   4  69
##                                           
##                Accuracy : 0.9706          
##                  95% CI : (0.9371, 0.9891)
##     No Information Rate : 0.652           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9356          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.9718          
##             Specificity : 0.9699          
##          Pos Pred Value : 0.9452          
##          Neg Pred Value : 0.9847          
##              Prevalence : 0.3480          
##          Detection Rate : 0.3382          
##    Detection Prevalence : 0.3578          
##       Balanced Accuracy : 0.9709          
##                                           
##        'Positive' Class : 1               
## 
# AUC comparison (the class predictions above are identical, so compare the predicted probabilities instead)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc.b <- roc(test$class, predict(tree.b, newdata = test)[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.b <- auc(roc.b)
print(auc.b)
## Area under the curve: 0.9682
roc.pruned <- roc(test$class, predict(tree.b.pruned, newdata = test)[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.pruned <- auc(roc.pruned)
print(auc.pruned)
## Area under the curve: 0.9602
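# the two ROC curves can also be overlaid for a visual check (a sketch):
plot(roc.b)
lines(roc.pruned, col = "red")
legend("bottomright", legend = c("unpruned tree", "pruned tree"),
       col = c("black", "red"), lty = 1)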
# down-sampling within resampling via trainControl (rebalances the classes inside each fold)
set.seed(2019)
control <- trainControl(
  method = 'repeatedcv',
  number = 5,
  repeats = 3,
  sampling= 'down'
)

# tune mtry over 1..9 (the full range, since there are nine predictors)
rf.grid <- expand.grid(mtry=1:9)

rf <- train(class ~ .,
            data = trains,
            ntree = 200,
            method = 'rf',
            importance = TRUE,
            trControl = control,
            tuneGrid = rf.grid)
rf
## Random Forest 
## 
## 479 samples
##   9 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 384, 382, 383, 383, 384, 383, ... 
## Addtional sampling using down-sampling
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.9735520  0.9424285
##   2     0.9721704  0.9393218
##   3     0.9714615  0.9378171
##   4     0.9693563  0.9332199
##   5     0.9673017  0.9285667
##   6     0.9693781  0.9332722
##   7     0.9658911  0.9255486
##   8     0.9665855  0.9274086
##   9     0.9645311  0.9226915
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(rf)

# confusion matrix of random forest
rf.pred <- predict(rf, newdata = test)

confusionMatrix(rf.pred, test$class, dnn=c("Prediction", "Actual"), positive = '1')
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   0   1
##          0 129   0
##          1   4  71
##                                           
##                Accuracy : 0.9804          
##                  95% CI : (0.9506, 0.9946)
##     No Information Rate : 0.652           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9574          
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9699          
##          Pos Pred Value : 0.9467          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.3480          
##          Detection Rate : 0.3480          
##    Detection Prevalence : 0.3676          
##       Balanced Accuracy : 0.9850          
##                                           
##        'Positive' Class : 1               
## 
rf.pred.prob <- predict(rf, newdata = test, type = 'prob')
roc(test$class, rf.pred.prob[, 2], auc=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = test$class, predictor = rf.pred.prob[,     2], auc = T)
## 
## Data: rf.pred.prob[, 2] in 133 controls (test$class 0) < 71 cases (test$class 1).
## Area under the curve: 0.9955
# get the variable importance ranking
importance <- varImp(rf)
importance
## rf variable importance
## 
##           Importance
## bNuclei       100.00
## shape          91.54
## chromatin      80.51
## thickness      76.16
## size           73.42
## nNucleoli      55.22
## adhesion       45.70
## eSize          27.34
## mitosis         0.00
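# caret also supplies a plot method for varImp objects (a sketch):
plot(importance, main = "random forest variable importance")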
# data transformation: recode the class labels as B (benign) / M (malignant); factor
# levels like "0"/"1" are not valid R variable names and can trip up caret when class
# probabilities are needed
breast$class <- ifelse(breast$class == "0", "B", "M")
breast$class <- as.factor(breast$class)

trains_2 <- breast[partition, ]
test_2 <- breast[-partition, ]
# xgboost model
# tuning grid: vary max_depth, eta and colsample_bytree; hold the rest fixed
xgb.grid <- expand.grid(
  max_depth = c(1, 3, 7),
  min_child_weight = 1,
  gamma = 0,
  nrounds = 1000,
  eta = c(0.01, 0.05, 0.1),
  colsample_bytree = c(0.6, 0.9),
  subsample = 0.6
)

control <- trainControl(
  method = 'cv',
  number = 5,
  sampling = 'down'
)

set.seed(42)
xgb.tuned <- train(class ~., 
                   data = trains_2, 
                   method = 'xgbTree',
                   trControl = control, 
                   tuneGrid = xgb.grid)
xgb.tuned
## eXtreme Gradient Boosting 
## 
## 479 samples
##   9 predictor
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 383, 384, 383, 384, 382 
## Addtional sampling using down-sampling
## 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  colsample_bytree  Accuracy   Kappa    
##   0.01  1          0.6               0.9770166  0.9502360
##   0.01  1          0.9               0.9728715  0.9409068
##   0.01  3          0.6               0.9686609  0.9318137
##   0.01  3          0.9               0.9707443  0.9360771
##   0.01  7          0.6               0.9707881  0.9363840
##   0.01  7          0.9               0.9644723  0.9230479
##   0.05  1          0.6               0.9623671  0.9173734
##   0.05  1          0.9               0.9623456  0.9178595
##   0.05  3          0.6               0.9644070  0.9222164
##   0.05  3          0.9               0.9707009  0.9363512
##   0.05  7          0.6               0.9728061  0.9408311
##   0.05  7          0.9               0.9644289  0.9228733
##   0.10  1          0.6               0.9686394  0.9317702
##   0.10  1          0.9               0.9624319  0.9181507
##   0.10  3          0.6               0.9686175  0.9316346
##   0.10  3          0.9               0.9539465  0.8991945
##   0.10  7          0.6               0.9686175  0.9315137
##   0.10  7          0.9               0.9624105  0.9182839
## 
## Tuning parameter 'nrounds' was held constant at a value of 1000
##  parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 0.6
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 1000, max_depth = 1, eta
##  = 0.01, gamma = 0, colsample_bytree = 0.6, min_child_weight = 1 and
##  subsample = 0.6.
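# the chosen hyperparameters are also available programmatically (a sketch):
xgb.tuned$bestTune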
pred.xgb.pred <- predict(xgb.tuned, newdata = test_2)
confusionMatrix(pred.xgb.pred, test_2$class, positive = 'M', dnn=c('Prediction', 'Actual'))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   B   M
##          B 129   1
##          M   4  70
##                                          
##                Accuracy : 0.9755         
##                  95% CI : (0.9437, 0.992)
##     No Information Rate : 0.652          
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9465         
##                                          
##  Mcnemar's Test P-Value : 0.3711         
##                                          
##             Sensitivity : 0.9859         
##             Specificity : 0.9699         
##          Pos Pred Value : 0.9459         
##          Neg Pred Value : 0.9923         
##              Prevalence : 0.3480         
##          Detection Rate : 0.3431         
##    Detection Prevalence : 0.3627         
##       Balanced Accuracy : 0.9779         
##                                          
##        'Positive' Class : M              
## 
pred.xgb.prob <- predict(xgb.tuned, newdata = test_2, type = 'prob')[, 2]  # probability of class M
roc(test_2$class, pred.xgb.prob, auc=T)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = test_2$class, predictor = pred.xgb.prob,     auc = T)
## 
## Data: pred.xgb.prob in 133 controls (test_2$class B) < 71 cases (test_2$class M).
## Area under the curve: 0.9961
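# side-by-side AUC summary (a sketch; roc.rf and roc.xgb are new names, and the
# objects created above are assumed to still be in the workspace)
roc.rf  <- roc(test$class, rf.pred.prob[, 2])
roc.xgb <- roc(test_2$class, pred.xgb.prob)
data.frame(
  model = c("tree (unpruned)", "tree (pruned)", "random forest", "xgboost"),
  auc   = round(c(auc(roc.b), auc(roc.pruned), auc(roc.rf), auc(roc.xgb)), 4)
)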