# Loading Data
url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
bc_data <- read.csv(url, header = FALSE)
class(bc_data)
## [1] "data.frame"
str(bc_data)
## 'data.frame': 699 obs. of 11 variables:
## $ V1 : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ V2 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V3 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V4 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V5 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V6 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V7 : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
## $ V8 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V9 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V10: int 1 1 1 1 1 1 1 1 5 1 ...
## $ V11: int 2 2 2 2 2 4 2 2 2 2 ...
head(bc_data)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025 5 1 1 1 2 1 3 1 1 2
## 2 1002945 5 4 4 5 7 10 3 2 1 2
## 3 1015425 3 1 1 1 2 2 3 1 1 2
## 4 1016277 6 8 8 1 3 4 3 7 1 2
## 5 1017023 4 1 1 3 2 1 3 1 1 2
## 6 1017122 8 10 10 8 7 10 9 7 1 4
colnames(bc_data) <- c(
"sample_code_number",
"clump_thickness",
"uniformity_of_cell_size",
"uniformity_of_cell_shape",
"marginal_adhesion",
"single_epithelial_cell_size",
"bare_nuclei",
"bland_chromatin",
"normal_nucleoli",
"mitosis",
"classes"
)
head(bc_data)
## sample_code_number clump_thickness uniformity_of_cell_size
## 1 1000025 5 1
## 2 1002945 5 4
## 3 1015425 3 1
## 4 1016277 6 8
## 5 1017023 4 1
## 6 1017122 8 10
## uniformity_of_cell_shape marginal_adhesion single_epithelial_cell_size
## 1 1 1 2
## 2 4 5 7
## 3 1 1 2
## 4 8 1 3
## 5 1 3 2
## 6 10 8 7
## bare_nuclei bland_chromatin normal_nucleoli mitosis classes
## 1 1 3 1 1 2
## 2 10 3 2 1 2
## 3 2 3 1 1 2
## 4 4 3 7 1 2
## 5 1 3 1 1 2
## 6 10 9 7 1 4
# Data Transofmation and Cleaning
str(bc_data)
## 'data.frame': 699 obs. of 11 variables:
## $ sample_code_number : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ uniformity_of_cell_size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ uniformity_of_cell_shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitosis : int 1 1 1 1 1 1 1 1 5 1 ...
## $ classes : int 2 2 2 2 2 4 2 2 2 2 ...
bc_data[bc_data == '?'] <- NA
bc_data <- na.omit(bc_data)
bc_data$bare_nuclei <- as.integer(bc_data$bare_nuclei)
#method 1
bc_data$sample_code_number <- NULL
# method 2
#bc_data <- bc_data[, -1]
# method 3
#bc_data <- bc_data[, ! names(bc_data) %in% c('sample_code_number')]
bc_data$classes <- factor(bc_data$classes, levels = c(2,4), labels = c('benign', 'malignant'))
str(bc_data)
## 'data.frame': 683 obs. of 10 variables:
## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ uniformity_of_cell_size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ uniformity_of_cell_shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : int 2 3 4 6 2 3 3 2 2 2 ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitosis : int 1 1 1 1 1 1 1 1 5 1 ...
## $ classes : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
# Split the data into training dataset and testing dataset
set.seed(42)
idx <- sample.int(2, nrow(bc_data), prob = c(0.7,0.3), replace = TRUE)
train_data <-bc_data[idx ==1, ]
test_data <-bc_data[idx ==2, ]
dim(train_data)
## [1] 485 10
dim(test_data)
## [1] 198 10
library(rpart)
fit <- rpart(classes ~., data = train_data, method = 'class')
plot(fit, margin = 0.1)
text(fit)

# install.packages('rpart.plot')
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
rpart.plot(fit)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

ggplot(train_data, aes(x = classes,y=uniformity_of_cell_size, fill=classes)) + geom_boxplot() + geom_abline(intercept = 3.5, slope = 0,colour='red')

boxplot(uniformity_of_cell_size ~ classes, data = train_data)
abline(h = 3.5, col= 'red')

plot(uniformity_of_cell_size~ clump_thickness, data = train_data, col=classes, pch = 19)
abline(h = 3.5, col= 'blue')
abline(v = 6.5, col= 'orange')

predicted <- predict(fit, test_data, type = 'class')
sum(predicted == test_data$classes) / length(test_data$classes)
## [1] 0.9545455
table(test_data$classes, predicted)
## predicted
## benign malignant
## benign 126 7
## malignant 2 63
Confusion Matrix
predicted <- predict(fit, test_data, type = 'prob')
res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
table(test_data$classes, res)
## res
## benign malignant
## benign 126 7
## malignant 2 63
tb <- table(test_data$classes, res)
tb
## res
## benign malignant
## benign 126 7
## malignant 2 63
TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]
TPR <- TP / (TP + FN)
TPR
## [1] 0.984375
FPR <- FP / (FP + TN)
FPR
## [1] 0.1
ROC Curve
predicted <- predict(fit, test_data, type = 'prob')
TPR_ARY <- c(0)
FPR_ARY <- c(0)
for (cost in seq(0,1,0.1) ){
res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
if (length(tb) == 4){
TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]
TPR <- TP / (TP + FN)
FPR <- FP / (FP + TN)
TPR_ARY <- c(TPR_ARY, TPR)
FPR_ARY <- c(FPR_ARY, FPR)
#print(paste(TPR, FPR))
}
}
TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)
plot(FPR_ARY, TPR_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'False Positive Rate', ylab = 'True Positive Rate', main = 'ROC Curve')
lines(c(0,1), c(0,1), col= 'black', lty = 2)

Caret Version of ROC Curve
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
cm <- confusionMatrix(tb)
TPR <- cm$byClass[1]
FPR <- 1 - cm$byClass[2]
predicted <- predict(fit, test_data, type = 'prob')
TPR_ARY <- c(0)
FPR_ARY <- c(0)
for (cost in seq(0,1,0.1) ){
res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
if (length(tb) == 4){
cm <- confusionMatrix(tb)
TPR <- cm$byClass[1]
FPR <- 1 - cm$byClass[2]
TPR_ARY <- c(TPR_ARY, TPR)
FPR_ARY <- c(FPR_ARY, FPR)
}
}
TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)
plot(FPR_ARY, TPR_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'False Positive Rate', ylab = 'True Positive Rate', main = 'ROC Curve')
lines(c(0,1), c(0,1), col= 'black', lty = 2)

Precision-Recall Plot
predicted <- predict(fit, test_data, type = 'prob')
res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
cm <- confusionMatrix(tb)
cm$byClass[5]
## Precision
## 0.9473684
cm$byClass[6]
## Recall
## 0.984375
predicted <- predict(fit, test_data, type = 'prob')
PRECISION_ARY <- c(1)
RECALL_ARY <- c(0)
for (cost in seq(0,1,0.1) ){
res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
if (length(tb) == 4){
cm <- confusionMatrix(tb)
PRECISION <- cm$byClass[5]
RECALL <- cm$byClass[6]
PRECISION_ARY <- c(PRECISION_ARY, PRECISION)
RECALL_ARY <- c(RECALL_ARY, RECALL)
}
}
PRECISION_ARY <- c(PRECISION_ARY, 0)
RECALL_ARY <- c(RECALL_ARY, 1)
plot(RECALL_ARY, PRECISION_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'RECALL', ylab = 'PRECISION', main = 'PRECISION RECALL CURVE')
lines(c(0,1), c(1,0), col= 'black', lty = 2)

Plot ROC Curve with ROCR
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(rpart)
fit <- rpart(classes ~., data = train_data, method = 'class')
predictions <- predict(fit, test_data, type="prob")
#predictions
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(test_data$classes))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

Compare Model
## Decision Tree
library(rpart)
tree <- rpart(classes ~., data = train_data, method = 'class')
## Logistic Regression
lr <- glm(classes ~., data = train_data, family=binomial)
## SVM
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
svc <- svm(classes ~., data = train_data)
predict1 <- predict(tree, test_data, type = 'class')
table(test_data$classes,predict1)
## predict1
## benign malignant
## benign 126 7
## malignant 2 63
predict2 <- predict(lr, test_data)
res2 <- as.factor(ifelse(predict2>=0, 'malignant', 'benign'))
table(test_data$classes,res2)
## res2
## benign malignant
## benign 129 4
## malignant 2 63
predict3 <- predict(svc, test_data)
table(test_data$classes,predict3)
## predict3
## benign malignant
## benign 127 6
## malignant 2 63
library(ROCR)
# tree
predictions1 <- predict(tree, test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
# logistic regression
pred.to.roc2 <- predict2
pred.rocr2 <- prediction(pred.to.roc2, as.factor(test_data$classes))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
# svm
library(e1071)
svc <- svm(classes ~., data = train_data, probability =TRUE)
predictions3 <- predict(svc, test_data, probability=TRUE)
#names(predictions3)
pred.to.roc3 <-attr(predictions3, "probabilities")[, 2]
pred.rocr3 <- prediction(pred.to.roc3, as.factor(test_data$classes))
perf.rocr3 <- performance(pred.rocr3, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr3 <- performance(pred.rocr3, "tpr","fpr")
plot(perf.tpr.rocr1, main='ROC Curve', col=1)
plot(perf.tpr.rocr2, col=2, add=TRUE)
plot(perf.tpr.rocr3, col=3, add=TRUE)
legend(0.6,0.3, c(
paste('rpart', round(perf.rocr1@y.values[[1]],2)),
paste('logistic regression', round(perf.rocr2@y.values[[1]],2)),
paste('svm',round(perf.rocr3@y.values[[1]],2))), 1:3)

Random Forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
forest <- randomForest(classes ~., data = train_data, ntree=100, importance=T, proximity=T)
plot(forest)

#predict(forest, test_data, type='prob')
predicted <- predict(forest, test_data, type='class')
sum(predicted == test_data$classes) / length(test_data$classes)
## [1] 0.969697
table(test_data$classes, predicted)
## predicted
## benign malignant
## benign 129 4
## malignant 2 63
predictions1 <- predict(tree, test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
predictions2 <- predict(forest, test_data, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1, main='ROC Curve', col=1)
plot(perf.tpr.rocr2, col=2, add=TRUE)
legend(0.6,0.3, c(
paste('rpart', round(perf.rocr1@y.values[[1]],3)),
paste('randomforest', round(perf.rocr2@y.values[[1]],3))), 1:2)

varImpPlot(forest)

varImp(forest)
## benign malignant
## clump_thickness 8.060424 8.060424
## uniformity_of_cell_size 6.704466 6.704466
## uniformity_of_cell_shape 6.221985 6.221985
## marginal_adhesion 4.262833 4.262833
## single_epithelial_cell_size 5.848506 5.848506
## bare_nuclei 8.197795 8.197795
## bland_chromatin 5.666025 5.666025
## normal_nucleoli 4.080215 4.080215
## mitosis 1.005600 1.005600