# R in Action (2nd ed): Chapter 17                                            
# Classification                                                              
# requires packages rpart, party, randomForest, e1071, rpart.plot, rattle
# install.packages(c("rpart", "party", "randomForest", "e1071", "rpart.plot"))
# install.packages('rattle', dependencies = c("Depends", "Suggests"))
par(ask=TRUE)

# Listing 17.1 - Prepare the breast cancer data
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds  <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep="")

breast <- read.table(url, sep=",", header=FALSE, na.strings="?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
                   "shapeUniformity", "marginalAdhesion", 
                   "singleEpithelialCellSize", "bareNuclei", 
                   "blandChromatin", "normalNucleoli", "mitosis", "class")

df <- breast[-1]
df$class <- factor(df$class, levels=c(2,4), 
                   labels=c("benign", "malignant"))

set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
## 
##    benign malignant 
##       329       160
table(df.validate$class)
## 
##    benign malignant 
##       129        81
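
# A quick sanity check (added; not in the book's listing): confirm that the
# class proportions are similar in the training and validation sets, and
# count the incomplete rows (bareNuclei values coded "?" were read as NA).
prop.table(table(df.train$class))
prop.table(table(df.validate$class))
sum(!complete.cases(df))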
# Listing 17.2 - Logistic regression with glm()
fit.logit <- glm(class~., data=df.train, family=binomial())
summary(fit.logit)
## 
## Call:
## glm(formula = class ~ ., family = binomial(), data = df.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.75813  -0.10602  -0.05679   0.01237   2.64317  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -10.42758    1.47602  -7.065 1.61e-12 ***
## clumpThickness             0.52434    0.15950   3.287  0.00101 ** 
## sizeUniformity            -0.04805    0.25706  -0.187  0.85171    
## shapeUniformity            0.42309    0.26775   1.580  0.11407    
## marginalAdhesion           0.29245    0.14690   1.991  0.04650 *  
## singleEpithelialCellSize   0.11053    0.17980   0.615  0.53871    
## bareNuclei                 0.33570    0.10715   3.133  0.00173 ** 
## blandChromatin             0.42353    0.20673   2.049  0.04049 *  
## normalNucleoli             0.28888    0.13995   2.064  0.03900 *  
## mitosis                    0.69057    0.39829   1.734  0.08295 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 612.063  on 482  degrees of freedom
## Residual deviance:  71.346  on 473  degrees of freedom
##   (6 observations deleted due to missingness)
## AIC: 91.346
## 
## Number of Fisher Scoring iterations: 8
prob <- predict(fit.logit, df.validate, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE), 
                     labels=c("benign", "malignant"))
logit.perf <- table(df.validate$class, logit.pred,
                    dnn=c("Actual", "Predicted"))
logit.perf
##            Predicted
## Actual      benign malignant
##   benign       118         2
##   malignant      4        76
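
# Optional refinement (a sketch, not part of Listing 17.2): several of the
# predictors above are nonsignificant, so a reduced model can be selected by
# stepwise AIC and used for prediction in exactly the same way. Fitting on
# complete cases keeps the rows in use constant while step() compares models.
df.train.cc <- na.omit(df.train)
fit.logit.reduced <- step(glm(class~., data=df.train.cc, family=binomial()))
prob.reduced <- predict(fit.logit.reduced, df.validate, type="response")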
# Listing 17.3 - Creating a classical decision tree with rpart()
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class",      
               parms=list(split="information"))
dtree$cptable
##         CP nsplit rel error  xerror       xstd
## 1 0.800000      0   1.00000 1.00000 0.06484605
## 2 0.046875      1   0.20000 0.30625 0.04150018
## 3 0.012500      3   0.10625 0.20625 0.03467089
## 4 0.010000      4   0.09375 0.18125 0.03264401
plotcp(dtree)

dtree.pruned <- prune(dtree, cp=.0125) 

library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,  
    fallen.leaves = TRUE, main="Decision Tree")

dtree.pred <- predict(dtree.pruned, df.validate, type="class")
dtree.perf <- table(df.validate$class, dtree.pred, 
                    dnn=c("Actual", "Predicted"))
dtree.perf
##            Predicted
## Actual      benign malignant
##   benign       122         7
##   malignant      2        79
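
# A possible extension (added sketch): predict() for rpart objects can also
# return class membership probabilities, which is useful when a cutoff other
# than the majority class is wanted.
dtree.prob <- predict(dtree.pruned, df.validate, type="prob")
head(dtree.prob)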
# Listing 17.4 - Creating a conditional inference tree with ctree()
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich

fit.ctree <- ctree(class~., data=df.train)
plot(fit.ctree, main="Conditional Inference Tree")

ctree.pred <- predict(fit.ctree, df.validate, type="response")
ctree.perf <- table(df.validate$class, ctree.pred, 
                    dnn=c("Actual", "Predicted"))
ctree.perf
##            Predicted
## Actual      benign malignant
##   benign       122         7
##   malignant      3        78
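
# Note (added): like rpart(), ctree() handles missing predictor values
# internally, so ctree.perf covers all 210 validation cases; glm() above
# returned NA predictions for the 10 incomplete validation cases.
sum(ctree.perf)
## [1] 210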
# Listing 17.5 - Random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train,        
                           na.action=na.roughfix,
                           importance=TRUE)             
fit.forest
## 
## Call:
##  randomForest(formula = class ~ ., data = df.train, importance = TRUE,      na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 3.48%
## Confusion matrix:
##           benign malignant class.error
## benign       320         9  0.02735562
## malignant      8       152  0.05000000
importance(fit.forest, type=2)                          
##                          MeanDecreaseGini
## clumpThickness                  11.847884
## sizeUniformity                  63.362023
## shapeUniformity                 44.921067
## marginalAdhesion                 5.442715
## singleEpithelialCellSize        12.725937
## bareNuclei                      35.489054
## blandChromatin                  18.472646
## normalNucleoli                  20.934481
## mitosis                          1.832045
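
# A graphical view (added sketch): varImpPlot() from randomForest plots the
# same importance measures; type=2 again gives the mean decrease in the
# Gini index.
varImpPlot(fit.forest, type=2, main="Variable Importance")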
forest.pred <- predict(fit.forest, df.validate)         
forest.perf <- table(df.validate$class, forest.pred, 
                     dnn=c("Actual", "Predicted"))
forest.perf
##            Predicted
## Actual      benign malignant
##   benign       117         3
##   malignant      1        79
# Listing 17.6 - A support vector machine
library(e1071)
set.seed(1234)
fit.svm <- svm(class~., data=df.train)
fit.svm
## 
## Call:
## svm(formula = class ~ ., data = df.train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.1111111 
## 
## Number of Support Vectors:  76
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, 
                  svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
##            Predicted
## Actual      benign malignant
##   benign       116         4
##   malignant      3        77
# Listing 17.7 - Tuning an RBF support vector machine (this can take a while)
set.seed(1234)
tuned <- tune.svm(class~., data=df.train,
                  gamma=10^(-6:1),
                  cost=10^(-10:10))
tuned
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##   0.01    1
## 
## - best performance: 0.02904092
fit.svm <- svm(class~., data=df.train, gamma=.01, cost=1)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class,
                  svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
##            Predicted
## Actual      benign malignant
##   benign       117         3
##   malignant      3        77
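
# If class probabilities are wanted rather than hard labels, svm() must be
# refit with probability=TRUE (an added sketch under that assumption):
fit.svm.prob <- svm(class~., data=df.train, gamma=.01, cost=1,
                    probability=TRUE)
svm.prob <- attr(predict(fit.svm.prob, na.omit(df.validate),
                         probability=TRUE), "probabilities")
head(svm.prob)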
# Listing 17.8 - Function for assessing binary classification accuracy
performance <- function(table, n=2){
  if(!all(dim(table) == c(2,2)))
    stop("Must be a 2 x 2 table")
  # cell counts, assuming rows = Actual and columns = Predicted
  tn <- table[1,1]                    # true negatives
  fp <- table[1,2]                    # false positives
  fn <- table[2,1]                    # false negatives
  tp <- table[2,2]                    # true positives
  sensitivity <- tp/(tp+fn)           # true positive rate
  specificity <- tn/(tn+fp)           # true negative rate
  ppv <- tp/(tp+fp)                   # positive predictive value
  npv <- tn/(tn+fn)                   # negative predictive value
  hitrate <- (tp+tn)/(tp+tn+fp+fn)    # overall accuracy
  result <- paste("Sensitivity = ", round(sensitivity, n),
                  "\nSpecificity = ", round(specificity, n),
                  "\nPositive Predictive Value = ", round(ppv, n),
                  "\nNegative Predictive Value = ", round(npv, n),
                  "\nAccuracy = ", round(hitrate, n), "\n", sep="")
  cat(result)
}


# Listing 17.9 - Performance of breast cancer data classifiers
performance(dtree.perf)
## Sensitivity = 0.98
## Specificity = 0.95
## Positive Predictive Value = 0.92
## Negative Predictive Value = 0.98
## Accuracy = 0.96
performance(ctree.perf)
## Sensitivity = 0.96
## Specificity = 0.95
## Positive Predictive Value = 0.92
## Negative Predictive Value = 0.98
## Accuracy = 0.95
performance(forest.perf)
## Sensitivity = 0.99
## Specificity = 0.98
## Positive Predictive Value = 0.96
## Negative Predictive Value = 0.99
## Accuracy = 0.98
performance(svm.perf)
## Sensitivity = 0.96
## Specificity = 0.98
## Positive Predictive Value = 0.96
## Negative Predictive Value = 0.98
## Accuracy = 0.97
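
# The logistic model from Listing 17.2 can be assessed the same way (added);
# these figures follow directly from the logit.perf table above.
performance(logit.perf)
## Sensitivity = 0.95
## Specificity = 0.98
## Positive Predictive Value = 0.97
## Negative Predictive Value = 0.97
## Accuracy = 0.97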
# Using the rattle package for data mining (commented out; requires the
# mlbench and rattle packages)
# diabetes <- get(data(PimaIndiansDiabetes, package="mlbench"))
# names(diabetes) <- c("npregnant", "plasma", "bp", "triceps",
#                      "insulin", "bmi", "pedigree", "age", "class")
# diabetes$class <- factor(diabetes$class, levels=c("neg", "pos"),
#                          labels=c("normal", "diabetic"))
# library(rattle)
# rattle()