Loading Data

# Location of the UCI Breast Cancer Wisconsin data file.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# The file is read without a header row, so the columns get the default
# names V1..V11.
# NOTE(review): V7 is read as a factor because it contains "?" entries (see
# the str() output below); passing na.strings = "?" would read it as integer,
# but that changes the recorded results further down - confirm before changing.
bc_data <- read.csv(file = url, header = FALSE)

class(bc_data)
## [1] "data.frame"
str(bc_data)
## 'data.frame':    699 obs. of  11 variables:
##  $ V1 : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ V2 : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ V3 : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ V4 : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ V5 : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ V6 : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ V7 : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
##  $ V8 : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ V9 : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ V10: int  1 1 1 1 1 1 1 1 5 1 ...
##  $ V11: int  2 2 2 2 2 4 2 2 2 2 ...
head(bc_data)
##        V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025  5  1  1  1  2  1  3  1   1   2
## 2 1002945  5  4  4  5  7 10  3  2   1   2
## 3 1015425  3  1  1  1  2  2  3  1   1   2
## 4 1016277  6  8  8  1  3  4  3  7   1   2
## 5 1017023  4  1  1  3  2  1  3  1   1   2
## 6 1017122  8 10 10  8  7 10  9  7   1   4
# Replace the default V1..V11 names with descriptive column names.
names(bc_data) <- c(
  "sample_code_number",
  "clump_thickness",
  "uniformity_of_cell_size",
  "uniformity_of_cell_shape",
  "marginal_adhesion",
  "single_epithelial_cell_size",
  "bare_nuclei",
  "bland_chromatin",
  "normal_nucleoli",
  "mitosis",
  "classes"
)

head(bc_data)
##   sample_code_number clump_thickness uniformity_of_cell_size
## 1            1000025               5                       1
## 2            1002945               5                       4
## 3            1015425               3                       1
## 4            1016277               6                       8
## 5            1017023               4                       1
## 6            1017122               8                      10
##   uniformity_of_cell_shape marginal_adhesion single_epithelial_cell_size
## 1                        1                 1                           2
## 2                        4                 5                           7
## 3                        1                 1                           2
## 4                        8                 1                           3
## 5                        1                 3                           2
## 6                       10                 8                           7
##   bare_nuclei bland_chromatin normal_nucleoli mitosis classes
## 1           1               3               1       1       2
## 2          10               3               2       1       2
## 3           2               3               1       1       2
## 4           4               3               7       1       2
## 5           1               3               1       1       2
## 6          10               9               7       1       4

Cleaning Data and Transforming Data

# Warm-up for ifelse(): the test 5 < 3 is FALSE, so the `no` value is returned.
a <- ifelse(test = 5 < 3, yes = 1, no = 0)
a
## [1] 0
# method 1: ifelse (kept for reference, not run)
#bc_data$classes <- ifelse(bc_data$classes == "2", "benign",
#ifelse(bc_data$classes == "4", "malignant", NA))


# method 2: factor
# Recode the class codes 2 and 4 as a two-level factor labelled
# benign / malignant.
bc_data[["classes"]] <- factor(
  bc_data[["classes"]],
  levels = c(2, 4),
  labels = c("benign", "malignant")
)

str(bc_data)
## 'data.frame':    699 obs. of  11 variables:
##  $ sample_code_number         : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ clump_thickness            : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ uniformity_of_cell_size    : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ uniformity_of_cell_shape   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ marginal_adhesion          : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ single_epithelial_cell_size: int  2 7 2 3 2 7 2 2 2 2 ...
##  $ bare_nuclei                : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
##  $ bland_chromatin            : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ normal_nucleoli            : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ mitosis                    : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ classes                    : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# method 1
# Single column: turn the "?" placeholders in bare_nuclei into NA.
bc_data$bare_nuclei[which(bc_data$bare_nuclei == "?")] <- NA
#bc_data$bare_nuclei

# Whole data frame: turn every cell equal to "?" into NA.
# NOTE(review): assigning NA does not drop the "?" factor level or make
# bare_nuclei numeric; the C5.0 summary later in this file still lists "?"
# among the bare_nuclei levels. Consider converting the column to integer
# and re-running the transcript.
bc_data[bc_data == "?"] <- NA

# Small apply() demo on a 3 x 2 matrix: MARGIN = 1 applies sum to each row.
m <- matrix(as.numeric(1:6), nrow = 3)
apply(m, MARGIN = 1, FUN = sum)
## [1] 5 7 9
# Count the NA cells in each column (MARGIN = 2 applies sum column by column).
apply(X = is.na(bc_data), MARGIN = 2, FUN = sum)
##          sample_code_number             clump_thickness 
##                           0                           0 
##     uniformity_of_cell_size    uniformity_of_cell_shape 
##                           0                           0 
##           marginal_adhesion single_epithelial_cell_size 
##                           0                           0 
##                 bare_nuclei             bland_chromatin 
##                          16                           0 
##             normal_nucleoli                     mitosis 
##                           0                           0 
##                     classes 
##                           0
# Optional missingness visualisation with the Amelia package (not run).
#install.packages("Amelia")
#library(Amelia)
#missmap(bc_data)

#AmeliaView()


# Row count before removing incomplete rows.
nrow(bc_data)
## [1] 699
# Keep only the rows without any NA.
bc_data <- na.omit(object = bc_data)
# Row count afterwards, followed by a check that no NA remains.
nrow(bc_data)
## [1] 683
sum(is.na(bc_data))
## [1] 0

Data Exploration

# Class counts drawn as a base-graphics bar chart.
tb <- table(bc_data$classes)
barplot(tb)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

# The same class counts with ggplot2, one fill colour per class.
ggplot(data = bc_data, mapping = aes(x = classes, fill = classes)) +
  geom_bar()

# Distribution of clump_thickness: base histogram, then ggplot2 with 10 bins.
hist(bc_data$clump_thickness)

ggplot(data = bc_data, mapping = aes(x = clump_thickness)) +
  geom_histogram(bins = 10)

# clump_thickness split by class: base boxplot, then the ggplot2 version.
boxplot(clump_thickness ~ classes, data = bc_data)

ggplot(data = bc_data,
       mapping = aes(x = classes, y = clump_thickness, fill = classes)) +
  geom_boxplot()

# Brings multiplot() into the session; it is called below.
# NOTE(review): this runs a script fetched from the network at run time;
# consider vendoring a reviewed copy instead.
source("https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R")



# Two per-class boxplots to be shown side by side.
p1 <- ggplot(data = bc_data,
             mapping = aes(x = classes, y = clump_thickness, fill = classes)) +
  geom_boxplot()

p2 <- ggplot(data = bc_data,
             mapping = aes(x = classes, y = uniformity_of_cell_size, fill = classes)) +
  geom_boxplot()


multiplot(p1, p2, cols = 2)
## Loading required package: grid
## Loading required package: grid

Decision Tree

library(rpart)

# Remove the sample identifier so that `classes ~ .` does not use it as a
# predictor.
bc_data[["sample_code_number"]] <- NULL

# Classification tree of `classes` on all remaining columns.
fit <- rpart(
  formula = classes ~ .,
  data = bc_data,
  method = "class"
)

plot(fit, uniform = TRUE, margin = 0.1)
text(fit)

# Predicted class labels for the same rows the tree was fitted on.
predicted <- predict(fit, newdata = bc_data, type = "class")

# Share of rows whose predicted label equals the recorded class.
mean(predicted == bc_data$classes)
## [1] 0.9677892
# Confusion matrix: rows are the recorded classes, columns the predicted ones.
tb <- table(bc_data$classes, predicted)
# Accuracy = correctly classified cases (the diagonal) / all cases.
sum(diag(tb)) / sum(tb)
## [1] 0.9677892
tb
##            predicted
##             benign malignant
##   benign       431        13
##   malignant      9       230
tb
##            predicted
##             benign malignant
##   benign       431        13
##   malignant      9       230
# Confusion-matrix cells with "malignant" as the positive class.
# Cells are addressed by level name (row = recorded, column = predicted) so
# that the four definitions agree with each other: a false positive is a
# benign case predicted malignant, a false negative is a malignant case
# predicted benign, and the true positives / negatives are the matching
# malignant / benign diagonal cells.
TP <- tb["malignant", "malignant"]
FP <- tb["benign", "malignant"]
FN <- tb["malignant", "benign"]
TN <- tb["benign", "benign"]

Early Termination

# Refit the classification tree on all predictors before inspecting its
# complexity-parameter (CP) table.
fit <- rpart(formula = classes ~ ., data = bc_data, method = "class")

#summary(fit)
# CP table: one row per candidate tree size, with its relative error and
# xerror / xstd columns.
printcp(fit)
## 
## Classification tree:
## rpart(formula = classes ~ ., data = bc_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] bare_nuclei              uniformity_of_cell_shape
## [3] uniformity_of_cell_size 
## 
## Root node error: 239/683 = 0.34993
## 
## n= 683 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.790795      0   1.00000 1.00000 0.052153
## 2 0.054393      1   0.20921 0.25941 0.031415
## 3 0.025105      2   0.15481 0.17992 0.026559
## 4 0.012552      3   0.12971 0.17573 0.026269
## 5 0.010000      6   0.09205 0.17992 0.026559
# Plot of the CP table.
plotcp(fit)

plot(fit, uniform = TRUE, margin = 0.1)
text(fit)

# Prune at the CP-table row that has the smallest xerror.
min_split <- which.min(fit$cptable[, "xerror"])
stop_criteria <- fit$cptable[min_split, "CP"]
prune.fit <- prune(fit, cp = stop_criteria)


plot(prune.fit, uniform = TRUE, margin = 0.1)
text(prune.fit)

plotcp(fit)

# Alternative: prune with the CP value taken from the third row of the table.
stop_criteria2 <- fit$cptable[3, "CP"]
prune.fit2 <- prune(fit, cp = stop_criteria2)

plot(prune.fit2, uniform = TRUE, margin = 0.1)
text(prune.fit2)

# Predictions of the tree pruned at CP-table row 3, on the rows it was
# fitted on.
predicted2 <- predict(prune.fit2, newdata = bc_data, type = "class")
# Share of rows whose predicted label equals the recorded class.
sum(predicted2 == bc_data$classes) / length(bc_data$classes)
## [1] 0.9458272
# Confusion matrix for the pruned tree. This must tabulate `predicted2`;
# tabulating `predicted` would show the unpruned tree's matrix, which does
# not match the accuracy printed just above.
table(bc_data$classes, predicted2)
# NOTE: the matrix previously recorded at this point (431 / 13 / 9 / 230)
# belonged to the unpruned tree and has been removed; re-run this statement
# to regenerate the output for the pruned tree.
library(party)
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.4.4
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.4
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
# Tree from party::ctree on the same formula and data.
fit2 <- ctree(formula = classes ~ ., data = bc_data)

plot(fit2)

# Note: this overwrites the `predicted2` produced for the pruned rpart tree.
predicted2 <- predict(fit2, newdata = bc_data, type = "response")
# Share of rows whose predicted label equals the recorded class.
mean(predicted2 == bc_data$classes)
## [1] 0.9736457
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
# C5.0 classification tree on the same formula and data.
treeModel <- C5.0(formula = classes ~ ., data = bc_data)
treeModel
## 
## Call:
## C5.0.formula(formula = classes ~ ., data = bc_data)
## 
## Classification Tree
## Number of samples: 683 
## Number of predictors: 9 
## 
## Tree size: 7 
## 
## Non-standard options: attempt to group attributes
# Full report: the tree itself, its errors on the training cases and the
# attribute usage.
summary(treeModel)
## 
## Call:
## C5.0.formula(formula = classes ~ ., data = bc_data)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Jun 12 17:05:43 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 683 cases (10 attributes) from undefined.data
## 
## Decision tree:
## 
## uniformity_of_cell_size > 2:
## :...uniformity_of_cell_shape > 2: malignant (242/20)
## :   uniformity_of_cell_shape <= 2:
## :   :...clump_thickness <= 5: benign (19/1)
## :       clump_thickness > 5: malignant (4)
## uniformity_of_cell_size <= 2:
## :...clump_thickness <= 5: benign (397/4)
##     clump_thickness > 5:
##     :...normal_nucleoli > 2: malignant (6)
##         normal_nucleoli <= 2:
##         :...bare_nuclei in {?,1,10,2,4,5,7,8,9}: benign (13)
##             bare_nuclei in {3,6}: malignant (2)
## 
## 
## Evaluation on training data (683 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       7   25( 3.7%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     424    20    (a): class benign
##       5   234    (b): class malignant
## 
## 
##  Attribute usage:
## 
##  100.00% uniformity_of_cell_size
##   64.57% clump_thickness
##   38.80% uniformity_of_cell_shape
##    3.07% normal_nucleoli
##    2.20% bare_nuclei
## 
## 
## Time: 0.0 secs
plot(treeModel)

# Predicted class labels for the rows the model was fitted on.
predicted3 <- predict(treeModel, newdata = bc_data, type = "class")

# Share of rows whose predicted label equals the recorded class.
mean(predicted3 == bc_data$classes)
## [1] 0.9633968

Cross Validation

# Demo draw of 6 values from 1..42. It happens before set.seed(), so the
# values are not fixed by the seed below.
sample.int(n = 42, size = 6)
## [1] 11 35  2  7 10  8
nrow(bc_data)
## [1] 683
# Fix the RNG state so that the assignment below is repeatable.
set.seed(42)
# Label every row 1 or 2, drawn with probabilities 0.7 and 0.3.
idx <- sample.int(
  n = 2,
  size = nrow(bc_data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Resulting group sizes, as counts and as proportions.
table(idx)
## idx
##   1   2 
## 485 198
prop.table(table(idx))
## idx
##         1         2 
## 0.7101025 0.2898975
# Rows labelled 1 form the training set, rows labelled 2 the test set.
train_data <- bc_data[idx == 1, ]
test_data <- bc_data[idx == 2, ]

dim(train_data)
## [1] 485  10
dim(test_data)
## [1] 198  10
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) 
## Warning: package 'tidyr' was built under R version 3.4.4
# Stack the two partitions with a `group` label, then reshape the columns
# clump_thickness..mitosis into long form: `x` holds the column name and
# `y` the value.
# NOTE(review): gather() is superseded by pivot_longer() in current tidyr;
# the warning below presumably comes from gathering columns of different
# types (bare_nuclei is a factor) - confirm before migrating.
dataset <- rbind(
  data.frame(group = "train", train_data),
  data.frame(group = "test", test_data)
) %>%
  gather(key = "x", value = "y", clump_thickness:mitosis)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# One density panel per gathered column, with train and test overlaid.
ggplot(data = dataset, mapping = aes(x = y, color = group, fill = group)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~ x, scales = "free", ncol = 3)