Loading Data
# Read the Wisconsin breast-cancer data set directly from the UCI repository.
# header = FALSE: the first line of the file is data, not column names.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
bc_data <- read.csv(file = url, header = FALSE)
class(bc_data)
## [1] "data.frame"
str(bc_data)
## 'data.frame': 699 obs. of 11 variables:
## $ V1 : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ V2 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V3 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V4 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V5 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V6 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V7 : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
## $ V8 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V9 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V10: int 1 1 1 1 1 1 1 1 5 1 ...
## $ V11: int 2 2 2 2 2 4 2 2 2 2 ...
head(bc_data)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025 5 1 1 1 2 1 3 1 1 2
## 2 1002945 5 4 4 5 7 10 3 2 1 2
## 3 1015425 3 1 1 1 2 2 3 1 1 2
## 4 1016277 6 8 8 1 3 4 3 7 1 2
## 5 1017023 4 1 1 3 2 1 3 1 1 2
## 6 1017122 8 10 10 8 7 10 9 7 1 4
# Give the eleven columns descriptive names (listed in file order), then
# preview the first rows to confirm the new headers.
names(bc_data) <- c(
  "sample_code_number",
  "clump_thickness",
  "uniformity_of_cell_size",
  "uniformity_of_cell_shape",
  "marginal_adhesion",
  "single_epithelial_cell_size",
  "bare_nuclei",
  "bland_chromatin",
  "normal_nucleoli",
  "mitosis",
  "classes"
)
head(bc_data, n = 6L)
## sample_code_number clump_thickness uniformity_of_cell_size
## 1 1000025 5 1
## 2 1002945 5 4
## 3 1015425 3 1
## 4 1016277 6 8
## 5 1017023 4 1
## 6 1017122 8 10
## uniformity_of_cell_shape marginal_adhesion single_epithelial_cell_size
## 1 1 1 2
## 2 4 5 7
## 3 1 1 2
## 4 8 1 3
## 5 1 3 2
## 6 10 8 7
## bare_nuclei bland_chromatin normal_nucleoli mitosis classes
## 1 1 3 1 1 2
## 2 10 3 2 1 2
## 3 2 3 1 1 2
## 4 4 3 7 1 2
## 5 1 3 1 1 2
## 6 10 9 7 1 4
Cleaning Data and Transforming Data
# Toy example of ifelse(): the test is FALSE, so the `no` value is returned.
a <- ifelse(test = 5 < 3, yes = 1, no = 0)
a
## [1] 0
# Recode the numeric class label (2 / 4) into a descriptive two-level factor.
# Alternative 1 (kept for reference, not run): nested ifelse()
# bc_data$classes <- ifelse(bc_data$classes == "2", "benign",
#                           ifelse(bc_data$classes == "4", "malignant", NA))
# Alternative 2 (used): factor() with explicit levels and labels
bc_data[["classes"]] <- factor(
  bc_data[["classes"]],
  levels = c(2, 4),
  labels = c("benign", "malignant")
)
str(bc_data)
## 'data.frame': 699 obs. of 11 variables:
## $ sample_code_number : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ clump_thickness : int 5 5 3 6 4 8 1 2 2 4 ...
## $ uniformity_of_cell_size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ uniformity_of_cell_shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: int 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
## $ bland_chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ mitosis : int 1 1 1 1 1 1 1 1 5 1 ...
## $ classes : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# method 1: replace the "?" marker with NA in the bare_nuclei column only
bc_data$bare_nuclei[bc_data$bare_nuclei == '?'] <- NA
#bc_data$bare_nuclei
# method 2: replace the "?" marker with NA wherever it occurs in the data frame
bc_data[bc_data == '?'] <- NA
# NOTE(review): only the "?" entries become NA here; bare_nuclei is never
# converted to a numeric type. The recorded str() output shows it as a Factor,
# so it presumably keeps an unused "?" level and is treated as categorical by
# the models below -- confirm whether it should be numeric instead.
# Toy example of apply(): with MARGIN = 1 the function is applied to each row
# of the 3 x 2 matrix.
m <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
apply(X = m, MARGIN = 1, FUN = sum)
## [1] 5 7 9
# Count the missing values in every column (MARGIN = 2 works column-wise).
apply(X = is.na(bc_data), MARGIN = 2, FUN = sum)
## sample_code_number clump_thickness
## 0 0
## uniformity_of_cell_size uniformity_of_cell_shape
## 0 0
## marginal_adhesion single_epithelial_cell_size
## 0 0
## bare_nuclei bland_chromatin
## 16 0
## normal_nucleoli mitosis
## 0 0
## classes
## 0
#install.packages("Amelia")
#library(Amelia)
#missmap(bc_data)
#AmeliaView()
# Row count before removing incomplete observations.
nrow(bc_data)
## [1] 699
# Drop every row that contains at least one NA.
bc_data <- na.omit(bc_data)
# Row count after the removal.
nrow(bc_data)
## [1] 683
# Confirm that no missing values remain anywhere in the data frame.
sum(is.na(bc_data))
## [1] 0
Data Exploration
# Tabulate how many observations fall into each class and draw the counts
# as a base-graphics bar chart.
tb <- table(bc_data$classes)
barplot(tb)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

# Class frequencies as a bar chart, one fill colour per class.
ggplot(data = bc_data, mapping = aes(x = classes, fill = classes)) +
  geom_bar()

# Distribution of clump_thickness: base-graphics histogram ...
hist(bc_data$clump_thickness)

# ... and the ggplot2 version with 10 bins.
ggplot(data = bc_data, mapping = aes(x = clump_thickness)) +
  geom_histogram(bins = 10)

# clump_thickness split by class: base-graphics box plot ...
boxplot(clump_thickness ~ classes, data = bc_data)

# ... and the ggplot2 version, filled by class.
ggplot(
  data = bc_data,
  mapping = aes(x = classes, y = clump_thickness, fill = classes)
) +
  geom_boxplot()

# NOTE(review): this executes an R script downloaded from a remote URL at run
# time; presumably it defines multiplot(), which is called below -- consider
# keeping a reviewed local copy instead of sourcing it over the network.
source('https://raw.githubusercontent.com/ywchiu/cdc_course/master/script/multiplot.R')
# Box plots of two features by class, stored so they can be drawn together.
p1 <- ggplot(bc_data, aes(x=classes,y=clump_thickness, fill= classes)) +
geom_boxplot()
p2 <- ggplot(bc_data, aes(x=classes,y=uniformity_of_cell_size, fill= classes)) +
geom_boxplot()
# Render both plots through the sourced helper, requesting 2 columns.
multiplot(p1, p2, cols = 2)
## Loading required package: grid

Decision Tree
library(rpart)
# Drop the sample identifier so that `classes ~ .` does not pick it up as a
# predictor.
bc_data[["sample_code_number"]] <- NULL

# Fit a classification tree that predicts classes from all remaining columns.
fit <- rpart(
  classes ~ .,
  data = bc_data,
  method = "class"
)
plot(fit, uniform = TRUE, margin = 0.1)
text(fit)

# Accuracy on the same data the tree was fitted on.
predicted <- predict(fit, bc_data, type = "class")
sum(predicted == bc_data$classes) / nrow(bc_data)
## [1] 0.9677892
# Confusion matrix: rows are the actual classes, columns the predicted ones.
tb <- table(bc_data$classes, predicted)
# Accuracy = correctly classified cases (the diagonal) over all cases.
sum(diag(tb)) / sum(tb)
## [1] 0.9677892
tb
## predicted
## benign malignant
## benign 431 13
## malignant 9 230
tb
## predicted
## benign malignant
## benign 431 13
## malignant 9 230
# Confusion-matrix cells, taking "malignant" as the positive class.
# tb holds the actual classes in rows and the predicted classes in columns,
# so each cell is addressed as tb[actual, predicted]. Indexing by label rather
# than by position keeps all four names tied to the same positive class
# (positional indexing had TP/TN treating "benign" as positive while FP/FN
# treated "malignant" as positive).
TP <- tb["malignant", "malignant"]  # malignant, predicted malignant
FP <- tb["benign", "malignant"]     # benign, wrongly predicted malignant
FN <- tb["malignant", "benign"]     # malignant, wrongly predicted benign
TN <- tb["benign", "benign"]        # benign, predicted benign
Early Termination
# Refit the tree on the full data set and print its complexity-parameter (CP)
# table, which lists the relative and cross-validated error per tree size.
fit <- rpart(
  classes ~ .,
  data = bc_data,
  method = "class"
)
# summary(fit)
printcp(fit)
##
## Classification tree:
## rpart(formula = classes ~ ., data = bc_data, method = "class")
##
## Variables actually used in tree construction:
## [1] bare_nuclei uniformity_of_cell_shape
## [3] uniformity_of_cell_size
##
## Root node error: 239/683 = 0.34993
##
## n= 683
##
## CP nsplit rel error xerror xstd
## 1 0.790795 0 1.00000 1.00000 0.052153
## 2 0.054393 1 0.20921 0.25941 0.031415
## 3 0.025105 2 0.15481 0.17992 0.026559
## 4 0.012552 3 0.12971 0.17573 0.026269
## 5 0.010000 6 0.09205 0.17992 0.026559
# Plot the cross-validated error against the complexity parameter.
plotcp(fit)

# The unpruned tree, for comparison with the pruned one below.
plot(fit, uniform = TRUE, margin = 0.1)
text(fit)

# Prune at the CP value of the table row with the lowest cross-validated
# error (xerror), then draw the resulting tree.
min_split <- which.min(fit[["cptable"]][, "xerror"])
stop_criteria <- fit[["cptable"]][min_split, "CP"]
prune.fit <- prune(fit, cp = stop_criteria)
plot(prune.fit, uniform = TRUE, margin = 0.1)
text(prune.fit)

# Show the CP plot again, then prune at a hand-picked row of the CP table
# (row 3) instead of the row with the minimum cross-validated error.
plotcp(fit)

stop_criteria2 <- fit[["cptable"]][3, "CP"]
prune.fit2 <- prune(fit, cp = stop_criteria2)
plot(prune.fit2, uniform = TRUE, margin = 0.1)
text(prune.fit2)

# Evaluate the manually pruned tree on the data it was fitted on.
predicted2 <- predict(prune.fit2, bc_data, type = "class")
sum(predicted2 == bc_data$classes) / length(bc_data$classes)
## [1] 0.9458272
# Confusion matrix for the pruned tree: rows are the actual classes, columns
# the pruned tree's predictions (predicted2, not the unpruned `predicted`).
table(bc_data$classes, predicted2)
## (recorded output removed: the table previously shown here was computed from
## `predicted`, i.e. the unpruned tree; re-run this chunk to regenerate it)
library(party)
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.4.4
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.4
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
# Fit a conditional inference tree (ctree) on the same predictors and plot it.
fit2 <- ctree(classes ~ ., data = bc_data)
plot(fit2)

# Accuracy on the data the tree was fitted on.
predicted2 <- predict(fit2, bc_data, type = "response")
sum(predicted2 == bc_data$classes) / nrow(bc_data)
## [1] 0.9736457
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
# Fit a C5.0 decision tree on all predictors and print the model overview.
# NOTE(review): the recorded summary splits on `bare_nuclei in {...}` sets
# that include "?", so the column is apparently handled as categorical with a
# leftover "?" level -- confirm that this is intended.
treeModel <- C5.0(classes ~., data = bc_data)
treeModel
##
## Call:
## C5.0.formula(formula = classes ~ ., data = bc_data)
##
## Classification Tree
## Number of samples: 683
## Number of predictors: 9
##
## Tree size: 7
##
## Non-standard options: attempt to group attributes
summary(treeModel)
##
## Call:
## C5.0.formula(formula = classes ~ ., data = bc_data)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Jun 12 17:05:43 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 683 cases (10 attributes) from undefined.data
##
## Decision tree:
##
## uniformity_of_cell_size > 2:
## :...uniformity_of_cell_shape > 2: malignant (242/20)
## : uniformity_of_cell_shape <= 2:
## : :...clump_thickness <= 5: benign (19/1)
## : clump_thickness > 5: malignant (4)
## uniformity_of_cell_size <= 2:
## :...clump_thickness <= 5: benign (397/4)
## clump_thickness > 5:
## :...normal_nucleoli > 2: malignant (6)
## normal_nucleoli <= 2:
## :...bare_nuclei in {?,1,10,2,4,5,7,8,9}: benign (13)
## bare_nuclei in {3,6}: malignant (2)
##
##
## Evaluation on training data (683 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 7 25( 3.7%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 424 20 (a): class benign
## 5 234 (b): class malignant
##
##
## Attribute usage:
##
## 100.00% uniformity_of_cell_size
## 64.57% clump_thickness
## 38.80% uniformity_of_cell_shape
## 3.07% normal_nucleoli
## 2.20% bare_nuclei
##
##
## Time: 0.0 secs
# Draw the C5.0 tree.
plot(treeModel)

# Accuracy on the data the model was fitted on.
predicted3 <- predict(treeModel, bc_data, type = "class")
sum(predicted3 == bc_data$classes) / nrow(bc_data)
## [1] 0.9633968
Cross Validation
# Toy example of sample.int(): draw 6 distinct integers from 1..42.
sample.int(n = 42, size = 6)
## [1] 11 35 2 7 10 8
nrow(bc_data)
## [1] 683
# Label every row 1 (probability 0.7) or 2 (probability 0.3); seeding the
# generator first makes the assignment reproducible.
set.seed(42)
idx <- sample.int(
  n = 2,
  size = nrow(bc_data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
table(idx)
## idx
## 1 2
## 485 198
# Share of rows that received each label.
prop.table(table(idx))
## idx
## 1 2
## 0.7101025 0.2898975
# Rows labelled 1 form the training set, rows labelled 2 the test set.
train_data <- bc_data[which(idx == 1), ]
test_data <- bc_data[which(idx == 2), ]
dim(train_data)
## [1] 485 10
dim(test_data)
## [1] 198 10
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
# Stack the train and test partitions (tagged by a `group` column) and reshape
# the feature columns clump_thickness through mitosis into long form:
# x holds the feature name, y its value.
# NOTE(review): gather() is superseded in current tidyr (pivot_longer() is the
# suggested replacement) -- TODO confirm before modernising, since the feature
# columns do not all share one type (see the warning below).
dataset <- rbind(data.frame(group = "train", train_data),
data.frame(group = "test", test_data)) %>%
gather(x, y, clump_thickness:mitosis)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Overlay the train and test densities of each feature, one panel per feature.
ggplot(dataset, aes(x = y, color = group, fill = group)) +
geom_density(alpha = 0.3) +
facet_wrap( ~ x, scales = "free", ncol = 3)

# Fit the classification tree on the training partition only.
set.seed(42)
fit <- rpart(
  classes ~ .,
  data = train_data,
  method = "class"
)
plot(fit, uniform = TRUE, compress = TRUE, margin = 0.1)
text(fit)

# Confusion matrix on the training partition (rows: actual, columns: predicted).
predicted <- predict(fit, train_data, type = "class")
table(train_data$classes, predicted)
## predicted
## benign malignant
## benign 301 10
## malignant 2 172
# Training accuracy.
sum(train_data$classes == predicted) / nrow(train_data)
## [1] 0.9752577
# Predict the held-out test partition and cross-tabulate actual vs predicted.
predicted2 <- predict(fit, test_data, type = "class")
table(test_data$classes, predicted2)
## predicted2
## benign malignant
## benign 127 6
## malignant 2 63
# Test accuracy.
sum(test_data$classes == predicted2) / nrow(test_data)
## [1] 0.959596
K-fold Cross Validation
nrow(bc_data)
## [1] 683
#bc_data
# Manual k-fold cross-validation: every row gets a random fold label, and each
# fold serves once as the test set while the remaining folds train the tree.
set.seed(42)
n_folds <- 10
idx <- sample.int(n_folds, nrow(bc_data), replace = TRUE)
# Preallocate the result containers. `models` is a list with one slot per
# fold; appending with c(models, fit) would splice the components of each
# rpart object into one flat list instead of storing the fitted models.
models <- vector("list", n_folds)
accuriacies <- numeric(n_folds)
for (i in seq_len(n_folds)) {
  test_set <- bc_data[idx == i, ]
  train_set <- bc_data[idx != i, ]
  fit <- rpart(classes ~ ., data = train_set, method = "class")
  models[[i]] <- fit
  predicted <- predict(fit, test_set, type = "class")
  # Fold accuracy: share of the fold's test rows that were classified correctly.
  accuriacies[i] <- sum(predicted == test_set$classes) / length(test_set$classes)
}
accuriacies
## [1] 0.9701493 0.9090909 0.9545455 0.9342105 0.9740260 0.9130435 0.9622642
## [8] 0.9393939 0.9661017 0.9452055
#dim(test_set)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
# Repeated cross-validation with caret: 10 folds, repeated 3 times, tuning an
# rpart model; printing the result shows the resampled metrics per cp value.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
model <- train(
  classes ~ .,
  data = bc_data,
  method = "rpart",
  trControl = control
)
model
## CART
##
## 683 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 615, 614, 615, 615, 614, 615, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.02092050 0.9316339 0.8496048
## 0.05439331 0.9174820 0.8201925
## 0.79079498 0.8236201 0.5433908
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0209205.
# Plot the resampling results of the tuned model.
plot(model)

# Accuracy of the selected model on the full data set it was trained on.
predicted <- predict(model, bc_data, type = "raw")
sum(predicted == bc_data$classes) / nrow(bc_data)
## [1] 0.9458272