Classification: decision tree, random forest, and SVM

Reference book: R in Action
Related packages:

Decision tree: rpart, rpart.plot, party
Random forest: randomForest
SVM: e1071

This document uses the breast cancer dataset.
The dataset contains the following variables:

Attribute	Domain
Sample code number	id number
Clump Thickness	1 - 10
Uniformity of Cell Size	1 - 10
Uniformity of Cell Shape	1 - 10
Marginal Adhesion	1 - 10
Single Epithelial Cell Size	1 - 10
Bare Nuclei	1 - 10
Bland Chromatin	1 - 10
Normal Nucleoli	1 - 10
Mitoses	1 - 10
Class	(2 for benign, 4 for malignant)

Handling missing values with the mi package

### Data loading ###
# Read the raw table, then relabel its columns with syntactically valid
# versions of the attribute names.
# NOTE(review): `names` is not defined in the visible code; presumably a
# data frame with an `Attribute` column holding the attribute names -- confirm.
# NOTE(review): read.csv() defaults to header = TRUE. The class counts
# reported later total 698 rows; if the file has no header line, its first
# record is being consumed as column names -- confirm against the raw file.
breast <- read.csv("breast cancer.csv")
colnames(breast) <- make.names(names$Attribute)

### Imputing the missing values ###
library(mi)
# Wrap the raw data in mi's missing-data container.
mdf <- missing_data.frame(breast)
# Treat Bare.Nuclei as an ordered categorical variable during imputation.
mdf <- change(mdf, y = "Bare.Nuclei", what = "type",
              to = "ordered-categorical")
# Run the imputation and extract one completed data frame.
# NOTE(review): no seed is set before mi(), so the imputed values may differ
# between runs -- confirm whether reproducibility is required here.
df0 <- mi(mdf)
df <- complete(df0, 1)

### Data subsetting ###
# Drop columns 1 and 12.
# NOTE(review): column 1 is presumably the sample id and column 12 the
# missingness indicator added by complete() -- confirm against names(df).
df <- df[, -c(1, 12)]
# Bare.Nuclei was imputed as an ordered categorical variable. Calling
# as.numeric() directly on a factor returns the internal level codes, which
# equal the real values only when every level 1..10 is present. Converting
# through character recovers the labelled values in all cases.
df$Bare.Nuclei <- as.numeric(as.character(df$Bare.Nuclei))
# Recode the outcome: 2 -> "benign", 4 -> "malignant".
df$Class <- factor(df$Class, levels = c(2, 4),
                   labels = c("benign", "malignant"))

### Use 70% of the rows as training data ###
set.seed(86)
# Draw the training row indices without replacement; the remaining rows
# form the validation set.
train <- sample(nrow(df), 0.7 * nrow(df))
df.train <- df[train, ]
df.validate <- df[-train, ]
# Class balance in the training set.
table(df.train[["Class"]])

## 
##    benign malignant 
##       320       168

# Class balance in the validation set.
table(df.validate[["Class"]])

## 
##    benign malignant 
##       137        73

Logistic Regression

# Logistic regression of Class on every other column of the training set.
fit.logit <- glm(Class ~ ., family = binomial(), data = df.train)
summary(fit.logit)  # coefficient estimates and deviance summary

## 
## Call:
## glm(formula = Class ~ ., family = binomial(), data = df.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.85303  -0.08090  -0.03041   0.01937   2.04975  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -11.788749   1.790760  -6.583 4.61e-11 ***
## Clump.Thickness               0.785756   0.214010   3.672 0.000241 ***
## Uniformity.of.Cell.Size       0.002869   0.272166   0.011 0.991588    
## Uniformity.of.Cell.Shape      0.223394   0.287442   0.777 0.437052    
## Marginal.Adhesion             0.188299   0.175988   1.070 0.284641    
## Single.Epithelial.Cell.Size   0.305127   0.193810   1.574 0.115405    
## Bare.Nuclei                   0.563376   0.136880   4.116 3.86e-05 ***
## Bland.Chromatin               0.419041   0.213236   1.965 0.049397 *  
## Normal.Nucleoli               0.102033   0.134335   0.760 0.447528    
## Mitoses                       0.396719   0.408032   0.972 0.330915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 628.371  on 487  degrees of freedom
## Residual deviance:  57.037  on 478  degrees of freedom
## AIC: 77.037
## 
## Number of Fisher Scoring iterations: 8

# Predicted probabilities for the validation rows.
prob <- predict(fit.logit, newdata = df.validate, type = "response")
# Classify each validation case: probability above 0.5 -> "malignant".
logit.pred <- factor(prob > 0.5,
                     levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))
# Confusion matrix of actual versus predicted class.
table(df.validate$Class, logit.pred, dnn = c("Actual", "Predicted"))

##            Predicted
## Actual      benign malignant
##   benign       133         4
##   malignant      7        66

Decision tree

library(rpart)
set.seed(86)
# Grow a classification tree, splitting on the information criterion.
dtree <- rpart(Class ~ ., data = df.train, method = "class",
               parms = list(split = "information"))
# Complexity table: CP, number of splits and error estimates per tree size.
dtree$cptable

##           CP nsplit rel error    xerror       xstd
## 1 0.80952381      0 1.0000000 1.0000000 0.06247560
## 2 0.04464286      1 0.1904762 0.2500000 0.03687847
## 3 0.01000000      3 0.1011905 0.1547619 0.02953170

plotcp(dtree)  # plot the complexity table to choose a pruning level

# Prune at cp = 0.01, the row of the complexity table with the lowest xerror.
dtree.pruned <- prune(dtree, cp = 0.01)
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104, fallen.leaves = TRUE,
    main = "Decision Tree")

# Classify the validation rows and tabulate actual versus predicted class.
dtree.pred <- predict(dtree.pruned, newdata = df.validate, type = "class")
table(df.validate$Class, dtree.pred, dnn = c("Actual", "Predicted"))

##            Predicted
## Actual      benign malignant
##   benign       127        10
##   malignant      6        67

Random forest

For more detail, see the work on random forests by Leo Breiman and Adele Cutler.

library(randomForest)

set.seed(86)
# Grow the forest on the training set, keeping variable-importance measures.
fit.forest <- randomForest(Class ~ ., data = df.train,
                           na.action = na.roughfix, importance = TRUE)
# Variable importance, type 2 (reported as MeanDecreaseGini).
importance(fit.forest, type = 2)

##                             MeanDecreaseGini
## Clump.Thickness                    15.713212
## Uniformity.of.Cell.Size            52.062307
## Uniformity.of.Cell.Shape           50.491615
## Marginal.Adhesion                   4.360324
## Single.Epithelial.Cell.Size        23.345561
## Bare.Nuclei                        38.446229
## Bland.Chromatin                    23.405485
## Normal.Nucleoli                    10.754592
## Mitoses                             1.209941

# Classify the validation rows with the forest and tabulate actual versus
# predicted class.
forest.pred <- predict(fit.forest, newdata = df.validate)
table(df.validate$Class, forest.pred, dnn = c("Actual", "Predicted"))

##            Predicted
## Actual      benign malignant
##   benign       133         4
##   malignant      6        67