Demo20180613

# Loading Data
url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
bc_data <- read.csv(url, header = FALSE)

class(bc_data)

## [1] "data.frame"

str(bc_data)

## 'data.frame':    699 obs. of  11 variables:
##  $ V1 : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ V2 : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ V3 : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ V4 : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ V5 : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ V6 : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ V7 : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
##  $ V8 : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ V9 : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ V10: int  1 1 1 1 1 1 1 1 5 1 ...
##  $ V11: int  2 2 2 2 2 4 2 2 2 2 ...

head(bc_data)

##        V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025  5  1  1  1  2  1  3  1   1   2
## 2 1002945  5  4  4  5  7 10  3  2   1   2
## 3 1015425  3  1  1  1  2  2  3  1   1   2
## 4 1016277  6  8  8  1  3  4  3  7   1   2
## 5 1017023  4  1  1  3  2  1  3  1   1   2
## 6 1017122  8 10 10  8  7 10  9  7   1   4

colnames(bc_data) <- c(
  "sample_code_number",
  "clump_thickness",
  "uniformity_of_cell_size",
  "uniformity_of_cell_shape",
  "marginal_adhesion",
  "single_epithelial_cell_size",
  "bare_nuclei",
  "bland_chromatin",
  "normal_nucleoli",
  "mitosis",
  "classes"
)

head(bc_data)

##   sample_code_number clump_thickness uniformity_of_cell_size
## 1            1000025               5                       1
## 2            1002945               5                       4
## 3            1015425               3                       1
## 4            1016277               6                       8
## 5            1017023               4                       1
## 6            1017122               8                      10
##   uniformity_of_cell_shape marginal_adhesion single_epithelial_cell_size
## 1                        1                 1                           2
## 2                        4                 5                           7
## 3                        1                 1                           2
## 4                        8                 1                           3
## 5                        1                 3                           2
## 6                       10                 8                           7
##   bare_nuclei bland_chromatin normal_nucleoli mitosis classes
## 1           1               3               1       1       2
## 2          10               3               2       1       2
## 3           2               3               1       1       2
## 4           4               3               7       1       2
## 5           1               3               1       1       2
## 6          10               9               7       1       4

# Data Transofmation and Cleaning
str(bc_data)

## 'data.frame':    699 obs. of  11 variables:
##  $ sample_code_number         : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ clump_thickness            : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ uniformity_of_cell_size    : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ uniformity_of_cell_shape   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ marginal_adhesion          : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ single_epithelial_cell_size: int  2 7 2 3 2 7 2 2 2 2 ...
##  $ bare_nuclei                : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
##  $ bland_chromatin            : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ normal_nucleoli            : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ mitosis                    : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ classes                    : int  2 2 2 2 2 4 2 2 2 2 ...

bc_data[bc_data == '?'] <- NA
bc_data <- na.omit(bc_data)

bc_data$bare_nuclei <- as.integer(bc_data$bare_nuclei)

#method 1
bc_data$sample_code_number <- NULL

# method 2
#bc_data <- bc_data[, -1]

# method 3
#bc_data <- bc_data[, ! names(bc_data) %in% c('sample_code_number')]

bc_data$classes <- factor(bc_data$classes, levels = c(2,4), labels =  c('benign', 'malignant'))

str(bc_data)

## 'data.frame':    683 obs. of  10 variables:
##  $ clump_thickness            : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ uniformity_of_cell_size    : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ uniformity_of_cell_shape   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ marginal_adhesion          : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ single_epithelial_cell_size: int  2 7 2 3 2 7 2 2 2 2 ...
##  $ bare_nuclei                : int  2 3 4 6 2 3 3 2 2 2 ...
##  $ bland_chromatin            : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ normal_nucleoli            : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ mitosis                    : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ classes                    : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...

# Split the data into training dataset and testing dataset
set.seed(42)
idx <- sample.int(2, nrow(bc_data), prob = c(0.7,0.3), replace = TRUE)
train_data <-bc_data[idx ==1, ]
test_data  <-bc_data[idx ==2, ]

dim(train_data)

## [1] 485  10

dim(test_data)

## [1] 198  10

library(rpart)
fit <- rpart(classes ~., data = train_data, method = 'class')

plot(fit, margin = 0.1)
text(fit)

# install.packages('rpart.plot')
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.4.4

rpart.plot(fit)


library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

ggplot(train_data, aes(x = classes,y=uniformity_of_cell_size, fill=classes)) + geom_boxplot() + geom_abline(intercept = 3.5, slope = 0,colour='red')

boxplot(uniformity_of_cell_size ~ classes, data = train_data)
abline(h = 3.5, col= 'red')

plot(uniformity_of_cell_size~ clump_thickness, data = train_data, col=classes, pch = 19)
abline(h = 3.5, col= 'blue')
abline(v = 6.5, col= 'orange')

predicted <- predict(fit, test_data, type = 'class')

sum(predicted == test_data$classes) / length(test_data$classes)

## [1] 0.9545455

table(test_data$classes, predicted)

##            predicted
##             benign malignant
##   benign       126         7
##   malignant      2        63

Confusion Matrix

predicted <- predict(fit, test_data, type = 'prob')
res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
table(test_data$classes, res)

##            res
##             benign malignant
##   benign       126         7
##   malignant      2        63

tb <- table(test_data$classes, res)
tb

##            res
##             benign malignant
##   benign       126         7
##   malignant      2        63

TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]

TPR <- TP / (TP + FN)
TPR

## [1] 0.984375

FPR <- FP / (FP + TN)
FPR

## [1] 0.1

ROC Curve

predicted <- predict(fit, test_data, type = 'prob')

TPR_ARY <- c(0)
FPR_ARY <- c(0)

for (cost in seq(0,1,0.1) ){
  res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
  res <- factor(res)
  tb  <- table(test_data$classes, res)
  if (length(tb) == 4){
    TP <- tb[1,1]
    FP <- tb[1,2]
    FN <- tb[2,1]
    TN <- tb[2,2]
    TPR <- TP / (TP + FN)
    FPR <- FP / (FP + TN)
    TPR_ARY <- c(TPR_ARY, TPR)
    FPR_ARY <- c(FPR_ARY, FPR)
    #print(paste(TPR, FPR))
  }
}

TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)

plot(FPR_ARY, TPR_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'False Positive Rate', ylab = 'True Positive Rate', main = 'ROC Curve')
lines(c(0,1), c(0,1), col= 'black', lty = 2)

Caret Version of ROC Curve

library(caret)

## Warning: package 'caret' was built under R version 3.4.4

## Loading required package: lattice

res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
tb  <- table(test_data$classes, res)
cm  <- confusionMatrix(tb)
TPR <- cm$byClass[1]
FPR <- 1 - cm$byClass[2]

predicted <- predict(fit, test_data, type = 'prob')

TPR_ARY <- c(0)
FPR_ARY <- c(0)

for (cost in seq(0,1,0.1) ){
  res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
  res <- factor(res)
  tb  <- table(test_data$classes, res)
  

  if (length(tb) == 4){
    cm  <- confusionMatrix(tb)
    TPR <- cm$byClass[1]
    FPR <- 1 - cm$byClass[2]
    TPR_ARY <- c(TPR_ARY, TPR)
    FPR_ARY <- c(FPR_ARY, FPR)
  }
}

TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)

plot(FPR_ARY, TPR_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'False Positive Rate', ylab = 'True Positive Rate', main = 'ROC Curve')
lines(c(0,1), c(0,1), col= 'black', lty = 2)

Precision-Recall Plot

predicted <- predict(fit, test_data, type = 'prob')
res <- ifelse(predicted[,1] >= 0.5, 'benign', 'malignant')
res <- factor(res)
tb <- table(test_data$classes, res)
cm <- confusionMatrix(tb)
cm$byClass[5]

## Precision 
## 0.9473684

cm$byClass[6]

##   Recall 
## 0.984375

predicted <- predict(fit, test_data, type = 'prob')

PRECISION_ARY <- c(1)
RECALL_ARY <- c(0)

for (cost in seq(0,1,0.1) ){
  res <- ifelse(predicted[,1] >= cost, 'benign', 'malignant')
  res <- factor(res)
  tb  <- table(test_data$classes, res)
  

  if (length(tb) == 4){
    cm  <- confusionMatrix(tb)
    PRECISION <- cm$byClass[5]
    RECALL    <- cm$byClass[6]
    PRECISION_ARY <- c(PRECISION_ARY, PRECISION)
    RECALL_ARY <- c(RECALL_ARY, RECALL)
  }
}

PRECISION_ARY <- c(PRECISION_ARY, 0)
RECALL_ARY <- c(RECALL_ARY, 1)

plot(RECALL_ARY, PRECISION_ARY,xlim = c(0,1), ylim = c(0,1), type = 'o', col='blue', xlab = 'RECALL', ylab = 'PRECISION', main = 'PRECISION RECALL CURVE')
lines(c(0,1), c(1,0), col= 'black', lty = 2)

Plot ROC Curve with ROCR

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.4.4

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.4.4

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(rpart)
fit <- rpart(classes ~., data = train_data, method = 'class')
predictions <- predict(fit, test_data, type="prob")
#predictions
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(test_data$classes))

perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")

perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")

plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

Compare Model

## Decision Tree
library(rpart)
tree <- rpart(classes ~., data = train_data, method = 'class')

## Logistic Regression
lr <- glm(classes ~., data = train_data, family=binomial)

## SVM
library(e1071)

## Warning: package 'e1071' was built under R version 3.4.4

svc <- svm(classes ~., data = train_data)

predict1 <- predict(tree, test_data, type = 'class')
table(test_data$classes,predict1)

##            predict1
##             benign malignant
##   benign       126         7
##   malignant      2        63

predict2 <- predict(lr, test_data)
res2 <- as.factor(ifelse(predict2>=0, 'malignant', 'benign'))
table(test_data$classes,res2)

##            res2
##             benign malignant
##   benign       129         4
##   malignant      2        63

predict3 <- predict(svc, test_data)
table(test_data$classes,predict3)

##            predict3
##             benign malignant
##   benign       127         6
##   malignant      2        63

library(ROCR)
# tree
predictions1 <- predict(tree, test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

# logistic regression
pred.to.roc2 <- predict2
pred.rocr2 <- prediction(pred.to.roc2, as.factor(test_data$classes))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


# svm
library(e1071)
svc <- svm(classes ~., data = train_data, probability =TRUE)

predictions3 <- predict(svc, test_data,  probability=TRUE)
#names(predictions3)

pred.to.roc3 <-attr(predictions3, "probabilities")[, 2]
pred.rocr3 <- prediction(pred.to.roc3, as.factor(test_data$classes))
perf.rocr3 <- performance(pred.rocr3, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr3 <- performance(pred.rocr3, "tpr","fpr")


plot(perf.tpr.rocr1, main='ROC Curve', col=1)
plot(perf.tpr.rocr2, col=2, add=TRUE)
plot(perf.tpr.rocr3, col=3, add=TRUE)

legend(0.6,0.3, c(
          paste('rpart', round(perf.rocr1@y.values[[1]],2)),
          paste('logistic regression', round(perf.rocr2@y.values[[1]],2)),
          paste('svm',round(perf.rocr3@y.values[[1]],2))), 1:3)

Random Forest

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.4.4

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

forest <- randomForest(classes ~., data = train_data, ntree=100, importance=T, proximity=T)
plot(forest)

#predict(forest, test_data, type='prob')
predicted <- predict(forest, test_data, type='class')
sum(predicted == test_data$classes) / length(test_data$classes)

## [1] 0.969697

table(test_data$classes, predicted)

##            predicted
##             benign malignant
##   benign       129         4
##   malignant      2        63

predictions1 <- predict(tree, test_data, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

predictions2 <- predict(forest, test_data, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(test_data$classes))
perf.rocr1 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")

plot(perf.tpr.rocr1, main='ROC Curve', col=1)
plot(perf.tpr.rocr2, col=2, add=TRUE)
legend(0.6,0.3, c(
          paste('rpart', round(perf.rocr1@y.values[[1]],3)),
          paste('randomforest', round(perf.rocr2@y.values[[1]],3))), 1:2)

varImpPlot(forest)

varImp(forest)

##                               benign malignant
## clump_thickness             8.060424  8.060424
## uniformity_of_cell_size     6.704466  6.704466
## uniformity_of_cell_shape    6.221985  6.221985
## marginal_adhesion           4.262833  4.262833
## single_epithelial_cell_size 5.848506  5.848506
## bare_nuclei                 8.197795  8.197795
## bland_chromatin             5.666025  5.666025
## normal_nucleoli             4.080215  4.080215
## mitosis                     1.005600  1.005600

Demo20180613

David Chiu

2018年6月13日

Confusion Matrix

ROC Curve

Caret Version of ROC Curve

Precision-Recall Plot

Plot ROC Curve with ROCR

Compare Model

Random Forest