# clear the environment and read the data
rm(list=ls())
breast <- read.csv('breast.csv')
str(breast)
## 'data.frame': 699 obs. of 11 variables:
## $ ID : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ thickness: int 5 5 3 6 4 8 1 2 2 4 ...
## $ size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ eSize : int 2 7 2 3 2 7 2 2 2 2 ...
## $ bNuclei : chr "1" "10" "2" "4" ...
## $ chromatin: int 3 3 3 3 3 9 3 3 1 2 ...
## $ nNucleoli: int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitosis : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : int 0 0 0 0 0 1 0 0 0 0 ...
summary(breast)
## ID thickness size shape
## Min. : 61634 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 870688 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 1171710 Median : 4.000 Median : 1.000 Median : 1.000
## Mean : 1071704 Mean : 4.418 Mean : 3.134 Mean : 3.207
## 3rd Qu.: 1238298 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :13454352 Max. :10.000 Max. :10.000 Max. :10.000
## adhesion eSize bNuclei chromatin
## Min. : 1.000 Min. : 1.000 Length:699 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 2.000 Class :character 1st Qu.: 2.000
## Median : 1.000 Median : 2.000 Mode :character Median : 3.000
## Mean : 2.807 Mean : 3.216 Mean : 3.438
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 5.000
## Max. :10.000 Max. :10.000 Max. :10.000
## nNucleoli mitosis class
## Min. : 1.000 Min. : 1.000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:0.0000
## Median : 1.000 Median : 1.000 Median :0.0000
## Mean : 2.867 Mean : 1.589 Mean :0.3448
## 3rd Qu.: 4.000 3rd Qu.: 1.000 3rd Qu.:1.0000
## Max. :10.000 Max. :10.000 Max. :1.0000
# remove ID column
breast$ID <- NULL
# remove rows where bNuclei is '?'
breast <- breast[breast$bNuclei != '?', ]
# convert bNuclei to integers
breast$bNuclei <- factor(breast$bNuclei, levels = 1:10, ordered = TRUE)
breast$bNuclei <- as.integer(breast$bNuclei)
class(breast$bNuclei)
## [1] "integer"
str(breast)
## 'data.frame': 683 obs. of 10 variables:
## $ thickness: int 5 5 3 6 4 8 1 2 2 4 ...
## $ size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ eSize : int 2 7 2 3 2 7 2 2 2 2 ...
## $ bNuclei : int 1 10 2 4 1 10 10 1 1 1 ...
## $ chromatin: int 3 3 3 3 3 9 3 3 1 2 ...
## $ nNucleoli: int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitosis : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class : int 0 0 0 0 0 1 0 0 0 0 ...
# convert class to factor
breast$class <- as.factor(breast$class)
summary(breast)
## thickness size shape adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.00
## Mean : 4.442 Mean : 3.151 Mean : 3.215 Mean : 2.83
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## eSize bNuclei chromatin nNucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.00
## Mean : 3.234 Mean : 3.545 Mean : 3.445 Mean : 2.87
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## mitosis class
## Min. : 1.000 0:444
## 1st Qu.: 1.000 1:239
## Median : 1.000
## Mean : 1.603
## 3rd Qu.: 1.000
## Max. :10.000
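# class balance: 444 benign (0) vs 239 malignant (1), roughly 65/35, so
# accuracy alone can mislead; kappa and AUC are also reported below
prop.table(table(breast$class))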
library(ggplot2)
# boxplot of each predictor by class
vars <- colnames(breast)[-10]
for (var in vars) {
  plt <- ggplot(breast, aes(x = class, y = .data[[var]])) +
    geom_boxplot() +
    labs(y = var)
  print(plt)
}
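# an optional alternative (assumes tidyr >= 1.0 is installed): reshape to long
# format and draw all nine boxplots in a single faceted figure
library(tidyr)
breast.long <- pivot_longer(breast, cols = -class,
                            names_to = 'variable', values_to = 'value')
ggplot(breast.long, aes(x = class, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = 'free_y')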
# split the data to training and test datasets
library(caret)
## Loading required package: lattice
set.seed(2019)
partition <- createDataPartition(breast$class, p = 0.7, list=F)
trains <- breast[partition, ]
test <- breast[-partition, ]
summary(trains$class)
## 0 1
## 311 168
summary(test$class)
## 0 1
## 133 71
# build the trees
library(rpart)
library(rpart.plot)
tree.a <- rpart(class ~ . - mitosis, data = trains, method = "class",
                control = rpart.control(
                  minsplit = 5,
                  minbucket = 1,
                  cp = 0
                ))
rpart.plot(tree.a)
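# with cp = 0 and minbucket = 1, tree.a grows until its leaves are nearly
# pure; a quick size check is the number of rows in its frame (one per node)
nrow(tree.a$frame)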

tree.b <- rpart(class ~ ., data = trains, method = 'class',
                control = rpart.control(
                  minbucket = 2,
                  cp = 0.0005,
                  maxdepth = 20
                ), parms = list(split = "gini"))
tree.b
## n= 479
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 479 168 0 (0.64926931 0.35073069)
## 2) size< 3.5 328 25 0 (0.92378049 0.07621951)
## 4) bNuclei< 5.5 311 10 0 (0.96784566 0.03215434)
## 8) nNucleoli< 3.5 304 5 0 (0.98355263 0.01644737)
## 16) bNuclei< 2.5 282 0 0 (1.00000000 0.00000000) *
## 17) bNuclei>=2.5 22 5 0 (0.77272727 0.22727273)
## 34) thickness< 3.5 15 0 0 (1.00000000 0.00000000) *
## 35) thickness>=3.5 7 2 1 (0.28571429 0.71428571)
## 70) eSize< 2.5 3 1 0 (0.66666667 0.33333333) *
## 71) eSize>=2.5 4 0 1 (0.00000000 1.00000000) *
## 9) nNucleoli>=3.5 7 2 1 (0.28571429 0.71428571)
## 18) thickness< 4.5 2 0 0 (1.00000000 0.00000000) *
## 19) thickness>=4.5 5 0 1 (0.00000000 1.00000000) *
## 5) bNuclei>=5.5 17 2 1 (0.11764706 0.88235294) *
## 3) size>=3.5 151 8 1 (0.05298013 0.94701987)
## 6) size< 4.5 29 6 1 (0.20689655 0.79310345)
## 12) bNuclei< 7.5 13 6 1 (0.46153846 0.53846154)
## 24) adhesion< 3.5 5 1 0 (0.80000000 0.20000000) *
## 25) adhesion>=3.5 8 2 1 (0.25000000 0.75000000)
## 50) shape< 4.5 3 1 0 (0.66666667 0.33333333) *
## 51) shape>=4.5 5 0 1 (0.00000000 1.00000000) *
## 13) bNuclei>=7.5 16 0 1 (0.00000000 1.00000000) *
## 7) size>=4.5 122 2 1 (0.01639344 0.98360656) *
rpart.plot(tree.b)
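# row 5 of the cp table is used below; a common alternative is to inspect
# printcp() and take the cp that minimizes the cross-validated error (xerror)
printcp(tree.b)
cp.min <- tree.b$cptable[which.min(tree.b$cptable[, 'xerror']), 'CP']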

# prune tree.b
tree.b.pruned <- prune(tree.b, cp = tree.b$cptable[5, "CP"])
rpart.plot(tree.b.pruned)

# compare predictive performance on the test set
b.pred <- predict(tree.b, newdata=test, type="class")
b.pruned.pred <- predict(tree.b.pruned, newdata=test, type="class")
confusionMatrix(b.pred, test$class, positive="1", dnn=c("Prediction", "Actual"))
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 129 2
## 1 4 69
##
## Accuracy : 0.9706
## 95% CI : (0.9371, 0.9891)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9356
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9718
## Specificity : 0.9699
## Pos Pred Value : 0.9452
## Neg Pred Value : 0.9847
## Prevalence : 0.3480
## Detection Rate : 0.3382
## Detection Prevalence : 0.3578
## Balanced Accuracy : 0.9709
##
## 'Positive' Class : 1
##
confusionMatrix(b.pruned.pred, test$class, positive="1", dnn=c("Prediction", "Actual"))
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 129 2
## 1 4 69
##
## Accuracy : 0.9706
## 95% CI : (0.9371, 0.9891)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9356
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9718
## Specificity : 0.9699
## Pos Pred Value : 0.9452
## Neg Pred Value : 0.9847
## Prevalence : 0.3480
## Detection Rate : 0.3382
## Detection Prevalence : 0.3578
## Balanced Accuracy : 0.9709
##
## 'Positive' Class : 1
##
# AUC comparison
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc.b <- roc(test$class, predict(tree.b, newdata = test)[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.b <- auc(roc.b)
print(auc.b)
## Area under the curve: 0.9682
roc.pruned <- roc(test$class, predict(tree.b.pruned, newdata = test)[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.pruned <- auc(roc.pruned)
print(auc.pruned)
## Area under the curve: 0.9602
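# optional visual comparison of the two ROC curves (pROC's plot method)
plot(roc.b, col = 'black')
plot(roc.pruned, col = 'grey60', add = TRUE)
legend('bottomright', legend = c('tree.b', 'tree.b.pruned'),
       col = c('black', 'grey60'), lwd = 2)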
# downsampling with trainControl
set.seed(2019)
control <- trainControl(
  method = 'repeatedcv',
  number = 5,
  repeats = 3,
  sampling = 'down'
)
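# for reference, caret's downSample() shows what sampling = 'down' does: it
# rebalances by sampling the majority class down to the minority-class size
# (train() below applies this inside each resample rather than once up front)
down.demo <- downSample(x = trains[, -10], y = trains$class, yname = 'class')
table(down.demo$class)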
rf.grid <- expand.grid(mtry = 1:9)
rf <- train(class ~ .,
            data = trains,
            ntree = 200,
            method = 'rf',
            importance = TRUE,
            trControl = control,
            tuneGrid = rf.grid)
rf
## Random Forest
##
## 479 samples
## 9 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 384, 382, 383, 383, 384, 383, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.9735520 0.9424285
## 2 0.9721704 0.9393218
## 3 0.9714615 0.9378171
## 4 0.9693563 0.9332199
## 5 0.9673017 0.9285667
## 6 0.9693781 0.9332722
## 7 0.9658911 0.9255486
## 8 0.9665855 0.9274086
## 9 0.9645311 0.9226915
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(rf)
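# the underlying randomForest fit; its print method also reports an
# out-of-bag (OOB) error estimate
rf$finalModel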

# confusion matrix of random forest
rf.pred <- predict(rf, newdata = test)
confusionMatrix(rf.pred, test$class, dnn=c("Prediction", "Actual"), positive = '1')
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 129 0
## 1 4 71
##
## Accuracy : 0.9804
## 95% CI : (0.9506, 0.9946)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9574
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.9699
## Pos Pred Value : 0.9467
## Neg Pred Value : 1.0000
## Prevalence : 0.3480
## Detection Rate : 0.3480
## Detection Prevalence : 0.3676
## Balanced Accuracy : 0.9850
##
## 'Positive' Class : 1
##
rf.pred.prob <- predict(rf, newdata = test, type = 'prob')
roc(test$class, rf.pred.prob[, 2], auc=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = test$class, predictor = rf.pred.prob[, 2], auc = T)
##
## Data: rf.pred.prob[, 2] in 133 controls (test$class 0) < 71 cases (test$class 1).
## Area under the curve: 0.9955
# get the importance list
importance <- varImp(rf)
importance
## rf variable importance
##
## Importance
## bNuclei 100.00
## shape 91.54
## chromatin 80.51
## thickness 76.16
## size 73.42
## nNucleoli 55.22
## adhesion 45.70
## eSize 27.34
## mitosis 0.00
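# caret can plot the importance ranking directly
plot(importance, top = 9)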
# recode class 0/1 as B/M (benign/malignant) so the factor levels are valid R names
breast$class <- ifelse(breast$class == "0", "B", "M")
breast$class <- as.factor(breast$class)
trains_2 <- breast[partition, ]
test_2 <- breast[-partition, ]
# xgboost model
xgb.grid <- expand.grid(
  max_depth = c(1, 3, 7),
  min_child_weight = 1,
  gamma = 0,
  nrounds = 1000,
  eta = c(0.01, 0.05, 0.1),
  colsample_bytree = c(0.6, 0.9),
  subsample = 0.6
)
control <- trainControl(
  method = 'cv',
  number = 5,
  sampling = 'down'
)
set.seed(42)
xgb.tuned <- train(class ~ .,
                   data = trains_2,
                   method = 'xgbTree',
                   trControl = control,
                   tuneGrid = xgb.grid)
xgb.tuned
## eXtreme Gradient Boosting
##
## 479 samples
## 9 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 383, 384, 383, 384, 382
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree Accuracy Kappa
## 0.01 1 0.6 0.9770166 0.9502360
## 0.01 1 0.9 0.9728715 0.9409068
## 0.01 3 0.6 0.9686609 0.9318137
## 0.01 3 0.9 0.9707443 0.9360771
## 0.01 7 0.6 0.9707881 0.9363840
## 0.01 7 0.9 0.9644723 0.9230479
## 0.05 1 0.6 0.9623671 0.9173734
## 0.05 1 0.9 0.9623456 0.9178595
## 0.05 3 0.6 0.9644070 0.9222164
## 0.05 3 0.9 0.9707009 0.9363512
## 0.05 7 0.6 0.9728061 0.9408311
## 0.05 7 0.9 0.9644289 0.9228733
## 0.10 1 0.6 0.9686394 0.9317702
## 0.10 1 0.9 0.9624319 0.9181507
## 0.10 3 0.6 0.9686175 0.9316346
## 0.10 3 0.9 0.9539465 0.8991945
## 0.10 7 0.6 0.9686175 0.9315137
## 0.10 7 0.9 0.9624105 0.9182839
##
## Tuning parameter 'nrounds' was held constant at a value of 1000
## parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 0.6
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 1000, max_depth = 1, eta
## = 0.01, gamma = 0, colsample_bytree = 0.6, min_child_weight = 1 and
## subsample = 0.6.
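# tuning profile: accuracy across eta, max_depth and colsample_bytree
plot(xgb.tuned)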
xgb.pred <- predict(xgb.tuned, newdata = test_2)
confusionMatrix(xgb.pred, test_2$class, positive = 'M', dnn = c('Prediction', 'Actual'))
## Confusion Matrix and Statistics
##
## Actual
## Prediction B M
## B 129 1
## M 4 70
##
## Accuracy : 0.9755
## 95% CI : (0.9437, 0.992)
## No Information Rate : 0.652
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9465
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9859
## Specificity : 0.9699
## Pos Pred Value : 0.9459
## Neg Pred Value : 0.9923
## Prevalence : 0.3480
## Detection Rate : 0.3431
## Detection Prevalence : 0.3627
## Balanced Accuracy : 0.9779
##
## 'Positive' Class : M
##
pred.xgb.prob <- predict(xgb.tuned, newdata = test_2, type = 'prob')[, 2]
roc(test_2$class, pred.xgb.prob, auc=T)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
##
## Call:
## roc.default(response = test_2$class, predictor = pred.xgb.prob, auc = T)
##
## Data: pred.xgb.prob in 133 controls (test_2$class B) < 71 cases (test_2$class M).
## Area under the curve: 0.9961
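# recap of the test-set AUCs reported above
data.frame(model = c('tree.b', 'tree.b.pruned', 'random forest', 'xgboost'),
           auc = c(0.9682, 0.9602, 0.9955, 0.9961))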