# R in Action (2nd ed): Chapter 17
# Classification
# requires packages rpart, party, randomForest, e1071, rpart.plot, rattle
# install.packages(c("rpart", "party", "randomForest", "e1071", "rpart.plot"))
# install.packages('rattle', dependencies = c("Depends", "Suggests"))
par(ask=TRUE)   # pause between plots
# Listing 17.1 - Prepare the breast cancer data
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep="")
breast <- read.table(url, sep=",", header=FALSE, na.strings="?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
                   "shapeUniformity", "marginalAdhesion",
                   "singleEpithelialCellSize", "bareNuclei",
                   "blandChromatin", "normalNucleoli", "mitosis", "class")
df <- breast[-1]                          # drop the ID column
df$class <- factor(df$class, levels=c(2,4),
                   labels=c("benign", "malignant"))
set.seed(1234)                            # reproducible sampling
train <- sample(nrow(df), 0.7*nrow(df))   # 70% training / 30% validation split
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
##
##    benign malignant
##       329       160
table(df.validate$class)
##
##    benign malignant
##       129        81
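# Optional sketch: inspect the missing values (the "?" entries in bareNuclei)
# that the later listings handle in different ways -- glm() drops incomplete
# rows, randomForest() imputes via na.roughfix(), and svm() needs na.omit().
sum(!complete.cases(df))   # number of rows with at least one NA
colSums(is.na(df))         # NA counts by variable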
# Listing 17.2 - Logistic regression with glm()
fit.logit <- glm(class~., data=df.train, family=binomial())
summary(fit.logit)
##
## Call:
## glm(formula = class ~ ., family = binomial(), data = df.train)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -2.75813  -0.10602  -0.05679   0.01237   2.64317
##
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)              -10.42758    1.47602  -7.065 1.61e-12 ***
## clumpThickness             0.52434    0.15950   3.287  0.00101 **
## sizeUniformity            -0.04805    0.25706  -0.187  0.85171
## shapeUniformity            0.42309    0.26775   1.580  0.11407
## marginalAdhesion           0.29245    0.14690   1.991  0.04650 *
## singleEpithelialCellSize   0.11053    0.17980   0.615  0.53871
## bareNuclei                 0.33570    0.10715   3.133  0.00173 **
## blandChromatin             0.42353    0.20673   2.049  0.04049 *
## normalNucleoli             0.28888    0.13995   2.064  0.03900 *
## mitosis                    0.69057    0.39829   1.734  0.08295 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 612.063  on 482  degrees of freedom
## Residual deviance:  71.346  on 473  degrees of freedom
##   (6 observations deleted due to missingness)
## AIC: 91.346
##
## Number of Fisher Scoring iterations: 8
prob <- predict(fit.logit, df.validate, type="response")   # predicted probability of malignancy
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE),
                     labels=c("benign", "malignant"))
logit.perf <- table(df.validate$class, logit.pred,
                    dnn=c("Actual", "Predicted"))
logit.perf
##            Predicted
## Actual      benign malignant
##   benign       118         2
##   malignant      4        76
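# Optional refinement (a sketch): several predictors above are nonsignificant,
# so a reduced model can be chosen by stepwise AIC selection with step().
# Refitting on complete cases first avoids step() complaints when a variable
# with NAs (bareNuclei) drops out; fit.logit.reduced and df.train.cc are
# names introduced for this sketch.
df.train.cc <- na.omit(df.train)
fit.logit.reduced <- step(glm(class~., data=df.train.cc, family=binomial()))
prob.reduced <- predict(fit.logit.reduced, df.validate, type="response")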
# Listing 17.3 - Creating a classical decision tree with rpart()
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class",
               parms=list(split="information"))
dtree$cptable
##         CP nsplit rel error  xerror       xstd
## 1 0.800000      0   1.00000 1.00000 0.06484605
## 2 0.046875      1   0.20000 0.30625 0.04150018
## 3 0.012500      3   0.10625 0.20625 0.03467089
## 4 0.010000      4   0.09375 0.18125 0.03264401
plotcp(dtree)   # cross-validated error versus complexity parameter

dtree.pruned <- prune(dtree, cp=.0125)   # prune to the 3-split tree (one-SE rule)
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
    fallen.leaves = TRUE, main="Decision Tree")
dtree.pred <- predict(dtree.pruned, df.validate, type="class")
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn=c("Actual", "Predicted"))
dtree.perf
##            Predicted
## Actual      benign malignant
##   benign       122         7
##   malignant      2        79
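# The pruned tree can also return class probabilities rather than labels,
# e.g. for choosing a cutoff other than the majority rule (a sketch, output
# not shown):
head(predict(dtree.pruned, df.validate, type="prob"))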
# Listing 17.4 - Creating a conditional inference tree with ctree()
library(party)

fit.ctree <- ctree(class~., data=df.train)
plot(fit.ctree, main="Conditional Inference Tree")

ctree.pred <- predict(fit.ctree, df.validate, type="response")
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn=c("Actual", "Predicted"))
ctree.perf
##            Predicted
## Actual      benign malignant
##   benign       122         7
##   malignant      3        78
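# ctree() needs no explicit pruning step; for class probabilities rather than
# labels, type="prob" returns one probability vector per observation
# (a sketch, output not shown):
head(predict(fit.ctree, df.validate, type="prob"))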
# Listing 17.5 - Random forest
library(randomForest)
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train,
                           na.action=na.roughfix,   # impute NAs with column medians/modes
                           importance=TRUE)
fit.forest
##
## Call:
##  randomForest(formula = class ~ ., data = df.train, importance = TRUE, na.action = na.roughfix)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
##
##         OOB estimate of error rate: 3.48%
## Confusion matrix:
##           benign malignant class.error
## benign       320         9  0.02735562
## malignant      8       152  0.05000000
importance(fit.forest, type=2)   # type=2: mean decrease in node impurity (Gini)
##                          MeanDecreaseGini
## clumpThickness                  11.847884
## sizeUniformity                  63.362023
## shapeUniformity                 44.921067
## marginalAdhesion                 5.442715
## singleEpithelialCellSize        12.725937
## bareNuclei                      35.489054
## blandChromatin                  18.472646
## normalNucleoli                  20.934481
## mitosis                          1.832045
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn=c("Actual", "Predicted"))
forest.perf
##            Predicted
## Actual      benign malignant
##   benign       117         3
##   malignant      1        79
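# A quick way to visualize the importance scores above, using randomForest's
# built-in plotting helper (a sketch):
varImpPlot(fit.forest, type=2, main="Variable Importance (Gini)")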
# Listing 17.6 - A support vector machine
library(e1071)
set.seed(1234)
fit.svm <- svm(class~., data=df.train)
fit.svm
##
## Call:
## svm(formula = class ~ ., data = df.train)
##
##
## Parameters:
##    SVM-Type:  C-classification
##  SVM-Kernel:  radial
##        cost:  1
##       gamma:  0.1111111
##
## Number of Support Vectors:  76
svm.pred <- predict(fit.svm, na.omit(df.validate))   # svm() can't score rows with NAs
svm.perf <- table(na.omit(df.validate)$class,
                  svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
##            Predicted
## Actual      benign malignant
##   benign       116         4
##   malignant      3        77
# Listing 17.7 Tuning an RBF support vector machine (this can take a while)
set.seed(1234)
tuned <- tune.svm(class~., data=df.train,
                  gamma=10^(-6:1),
                  cost=10^(-10:10))
tuned
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
##  gamma cost
##   0.01    1
##
## - best performance: 0.02904092
fit.svm <- svm(class~., data=df.train, gamma=.01, cost=1)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class,
                  svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
##            Predicted
## Actual      benign malignant
##   benign       117         3
##   malignant      3        77
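# Note: tuning buys little here -- the default radial-kernel fit in Listing
# 17.6 misclassified 7 of the 200 complete validation cases, versus 6 after
# tuning.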
# Listing 17.8 Function for assessing binary classification accuracy
performance <- function(table, n=2){
  if(!all(dim(table) == c(2,2)))
    stop("Must be a 2 x 2 table")
  tn <- table[1,1]   # true negatives
  fp <- table[1,2]   # false positives
  fn <- table[2,1]   # false negatives
  tp <- table[2,2]   # true positives
  sensitivity <- tp/(tp+fn)
  specificity <- tn/(tn+fp)
  ppp <- tp/(tp+fp)   # positive predictive value
  npp <- tn/(tn+fn)   # negative predictive value
  hitrate <- (tp+tn)/(tp+tn+fp+fn)
  result <- paste("Sensitivity = ", round(sensitivity, n),
                  "\nSpecificity = ", round(specificity, n),
                  "\nPositive Predictive Value = ", round(ppp, n),
                  "\nNegative Predictive Value = ", round(npp, n),
                  "\nAccuracy = ", round(hitrate, n), "\n", sep="")
  cat(result)
}
# Listing 17.9 - Performance of breast cancer data classifiers
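# The logistic regression model from listing 17.2 can be assessed the same
# way (values below derived from the logit.perf table above):
performance(logit.perf)
## Sensitivity = 0.95
## Specificity = 0.98
## Positive Predictive Value = 0.97
## Negative Predictive Value = 0.97
## Accuracy = 0.97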
performance(dtree.perf)
## Sensitivity = 0.98
## Specificity = 0.95
## Positive Predictive Value = 0.92
## Negative Predictive Value = 0.98
## Accuracy = 0.96
performance(ctree.perf)
## Sensitivity = 0.96
## Specificity = 0.95
## Positive Predictive Value = 0.92
## Negative Predictive Value = 0.98
## Accuracy = 0.95
performance(forest.perf)
## Sensitivity = 0.99
## Specificity = 0.98
## Positive Predictive Value = 0.96
## Negative Predictive Value = 0.99
## Accuracy = 0.98
performance(svm.perf)
## Sensitivity = 0.96
## Specificity = 0.98
## Positive Predictive Value = 0.96
## Negative Predictive Value = 0.98
## Accuracy = 0.97
# # Using the rattle package for data mining (optional; uncomment to run)
# diabetes <- get(data(PimaIndiansDiabetes, package='mlbench'))
# names(diabetes) <- c("npregnant", "plasma", "bp", "triceps",
#                      "insulin", "bmi", "pedigree", "age", "class")
# diabetes$class <- factor(diabetes$class, levels=c("neg", "pos"),
#                          labels=c("normal", "diabetic"))
# library(rattle)
# rattle()
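# Note: PimaIndiansDiabetes ships with the mlbench package
# (install.packages("mlbench") if needed), and rattle() opens an interactive
# GUI for model building.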