Reference book: R in Action
Related packages: mi, rpart, rpart.plot, randomForest
This analysis uses the breast cancer database.
The following variables are contained in the dataset:
| Attribute | Domain |
|---|---|
| Sample code number | id number |
| Clump Thickness | 1 - 10 |
| Uniformity of Cell Size | 1 - 10 |
| Uniformity of Cell Shape | 1 - 10 |
| Marginal Adhesion | 1 - 10 |
| Single Epithelial Cell Size | 1 - 10 |
| Bare Nuclei | 1 - 10 |
| Bland Chromatin | 1 - 10 |
| Normal Nucleoli | 1 - 10 |
| Mitoses | 1 - 10 |
| Class | (2 for benign, 4 for malignant) |
### Data loading ###
# Load the raw data. read.csv() is called with its defaults, so the first
# line of the file is consumed as a header row.
# NOTE(review): the train/validation tables further down total 698 rows; if
# the CSV has no header line, the first record is being dropped -- confirm.
breast <- read.csv("breast cancer.csv")

# Column labels, in the order of the attribute table above. They are spelled
# out here instead of being read from `names$Attribute`: no object called
# `names` is created in this script, so that expression resolves to the base
# function names() and fails when `$` is applied to it.
attribute_labels <- c(
  "Sample code number",
  "Clump Thickness",
  "Uniformity of Cell Size",
  "Uniformity of Cell Shape",
  "Marginal Adhesion",
  "Single Epithelial Cell Size",
  "Bare Nuclei",
  "Bland Chromatin",
  "Normal Nucleoli",
  "Mitoses",
  "Class"
)

# make.names() converts the labels to syntactic R names (spaces become dots),
# e.g. "Bare Nuclei" -> "Bare.Nuclei".
colnames(breast) <- make.names(attribute_labels)
### Addressing the missing values
library(mi)

# Wrap the raw data in mi's missing_data.frame so the package can assign a
# type and an imputation model to every column.
mdf <- missing_data.frame(breast)

# Override the type chosen for Bare.Nuclei: treat it as ordered-categorical.
mdf <- change(
  mdf,
  y = "Bare.Nuclei",
  what = "type",
  to = "ordered-categorical"
)

# Run the multiple-imputation chains. The imputation is stochastic, so a seed
# is passed to make the completed data (and everything fitted on it below)
# reproducible from run to run.
df0 <- mi(mdf, seed = 86)

# Extract a single completed data set from the imputation object.
df <- complete(df0, 1)
### Data subsetting ###
# Drop column 1 (the sample id, which carries no predictive information) and
# column 12.
# NOTE(review): column 12 is not one of the 11 named attributes; presumably
# it is a missingness indicator appended by complete() -- confirm.
df <- df[, -c(1, 12)]

# Convert Bare.Nuclei back to numeric. Going through as.character() matters:
# as.numeric() applied directly to a factor returns the internal level codes,
# which only coincide with the real values when every level 1..10 is present
# and in order. For a column that is already numeric this is a no-op.
df$Bare.Nuclei <- as.numeric(as.character(df$Bare.Nuclei))

# Recode the outcome: 2 -> "benign", 4 -> "malignant". "benign" is the first
# (reference) level.
df$Class <- factor(
  df$Class,
  levels = c(2, 4),
  labels = c("benign", "malignant")
)
### Use 70% of the data for training and the remaining 30% for validation ###
# Fix the RNG state so the random split below is reproducible.
set.seed(86)

# Draw the row indices of the training set: 70% of the rows, sampled without
# replacement (a fractional size is truncated by sample()).
train <- sample(nrow(df), 0.7 * nrow(df))

# Rows drawn above form the training set; all remaining rows are held out
# for validation.
df.train <- df[train, ]
df.validate <- df[-train, ]

# Class balance in the training set.
table(df.train$Class)
##
## benign malignant
## 320 168
# Class balance in the validation set.
table(df.validate$Class)
##
## benign malignant
## 137 73
# Logistic regression of Class on all remaining columns of the training set.
# With a factor response, the first level ("benign") is treated as failure,
# so the model estimates the probability of "malignant".
fit.logit <- glm(
  Class ~ .,
  family = binomial(),
  data = df.train
)

# Coefficient table, deviances and AIC for the fitted model.
summary(fit.logit)
##
## Call:
## glm(formula = Class ~ ., family = binomial(), data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.85303 -0.08090 -0.03041 0.01937 2.04975
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.788749 1.790760 -6.583 4.61e-11 ***
## Clump.Thickness 0.785756 0.214010 3.672 0.000241 ***
## Uniformity.of.Cell.Size 0.002869 0.272166 0.011 0.991588
## Uniformity.of.Cell.Shape 0.223394 0.287442 0.777 0.437052
## Marginal.Adhesion 0.188299 0.175988 1.070 0.284641
## Single.Epithelial.Cell.Size 0.305127 0.193810 1.574 0.115405
## Bare.Nuclei 0.563376 0.136880 4.116 3.86e-05 ***
## Bland.Chromatin 0.419041 0.213236 1.965 0.049397 *
## Normal.Nucleoli 0.102033 0.134335 0.760 0.447528
## Mitoses 0.396719 0.408032 0.972 0.330915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 628.371 on 487 degrees of freedom
## Residual deviance: 57.037 on 478 degrees of freedom
## AIC: 77.037
##
## Number of Fisher Scoring iterations: 8
# Predicted probability of "malignant" for each validation case.
prob <- predict(fit.logit, newdata = df.validate, type = "response")

# Classify with a 0.5 cut-off: above it "malignant", otherwise "benign".
logit.pred <- factor(
  prob > 0.5,
  levels = c(FALSE, TRUE),
  labels = c("benign", "malignant")
)

# Confusion matrix: true class in rows, predicted class in columns.
table(Actual = df.validate$Class, Predicted = logit.pred)
## Predicted
## Actual benign malignant
## benign 133 4
## malignant 7 66
library(rpart)

# rpart's cross-validation is random; fix the seed so the cptable below is
# reproducible.
set.seed(86)

# Grow a classification tree on the training set, choosing splits by
# information gain.
dtree <- rpart(
  Class ~ .,
  data = df.train,
  method = "class",
  parms = list(split = "information")
)

# Complexity table: one row per candidate subtree, with its cp, number of
# splits, relative error and cross-validated error (xerror, xstd).
dtree$cptable
## CP nsplit rel error xerror xstd
## 1 0.80952381 0 1.0000000 1.0000000 0.06247560
## 2 0.04464286 1 0.1904762 0.2500000 0.03687847
## 3 0.01000000 3 0.1011905 0.1547619 0.02953170
# Plot cross-validated error against cp to help decide how far to prune.
plotcp(dtree)

# Prune the tree back at complexity parameter 0.01.
dtree.pruned <- prune(dtree, cp = 0.01)

library(rpart.plot)

# Draw the pruned tree with the leaves aligned at the bottom of the plot.
prp(
  dtree.pruned,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Decision Tree"
)

# Predicted class for each validation case.
dtree.pred <- predict(dtree.pruned, newdata = df.validate, type = "class")

# Confusion matrix: true class in rows, predicted class in columns.
table(Actual = df.validate$Class, Predicted = dtree.pred)
## Predicted
## Actual benign malignant
## benign 127 10
## malignant 6 67
For more detail on random forests, see the work of Leo Breiman and Adele Cutler.
library(randomForest)

# The forest is built from random bootstrap samples and random variable
# subsets; fix the seed so the fit is reproducible.
set.seed(86)

# Grow the random forest on the training set. importance = TRUE stores the
# variable-importance measures; na.roughfix fills any missing values before
# fitting.
fit.forest <- randomForest(
  Class ~ .,
  data = df.train,
  na.action = na.roughfix,
  importance = TRUE
)

# Variable importance, type 2: mean decrease in node impurity (Gini).
importance(fit.forest, type = 2)
## MeanDecreaseGini
## Clump.Thickness 15.713212
## Uniformity.of.Cell.Size 52.062307
## Uniformity.of.Cell.Shape 50.491615
## Marginal.Adhesion 4.360324
## Single.Epithelial.Cell.Size 23.345561
## Bare.Nuclei 38.446229
## Bland.Chromatin 23.405485
## Normal.Nucleoli 10.754592
## Mitoses 1.209941
# Predicted class for each validation case from the random forest.
forest.pred <- predict(fit.forest, newdata = df.validate)

# Confusion matrix: true class in rows, predicted class in columns.
table(Actual = df.validate$Class, Predicted = forest.pred)
## Predicted
## Actual benign malignant
## benign 133 4
## malignant 6 67