For the fully functional html version, please visit http://www.rpubs.com/jasonchanhku/poison
library(xgboost)    #gradient boosted trees
library(knitr)      #kable() for table output
library(caret)      #confusionMatrix()
library(ROCR)       #ROC curve via prediction() and performance()
library(DiagrammeR) #needed by xgb.plot.tree()
The dataset comes as two lists, train and test, each containing label and data. As xgboost can only take in numeric vectors, the features have all been one-hot encoded. Bear in mind that label contains the class levels, where 0 is “edible” and 1 is “poisonous”. The data has already been prepared into train and test sets, loaded as below:
#data loading
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
train <- agaricus.train
test <- agaricus.test
str(train)
## List of 2
## $ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. ..@ i : int [1:143286] 2 6 8 11 18 20 21 24 28 32 ...
## .. ..@ p : int [1:127] 0 369 372 3306 5845 6489 6513 8380 8384 10991 ...
## .. ..@ Dim : int [1:2] 6513 126
## .. ..@ Dimnames:List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:126] "cap-shape=bell" "cap-shape=conical" "cap-shape=convex" "cap-shape=flat" ...
## .. ..@ x : num [1:143286] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..@ factors : list()
## $ label: num [1:6513] 1 0 0 1 0 0 0 1 0 0 ...
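Before previewing, a quick sanity check on the dimensions and class balance can be run directly on these objects (a minimal sketch using the data loaded above):
#dimensions of the sparse feature matrix (6513 rows x 126 one-hot encoded columns)
dim(train$data)
#class balance of the label (0 = edible, 1 = poisonous)
table(train$label)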
A preview of the dataset is possible by coercing the sparse data matrix into a dense matrix. The resulting matrix is bound to be wide (126 columns) due to the one-hot encoding.
#coercing it into a matrix
mat <- as.matrix(train$data)
#preview of the matrix
kable(data.frame(head(mat)))
| cap.shape.bell | cap.shape.conical | cap.shape.convex | cap.shape.flat | cap.shape.knobbed | cap.shape.sunken | cap.surface.fibrous | cap.surface.grooves | cap.surface.scaly | cap.surface.smooth | cap.color.brown | cap.color.buff | cap.color.cinnamon | cap.color.gray | cap.color.green | cap.color.pink | cap.color.purple | cap.color.red | cap.color.white | cap.color.yellow | bruises..bruises | bruises..no | odor.almond | odor.anise | odor.creosote | odor.fishy | odor.foul | odor.musty | odor.none | odor.pungent | odor.spicy | gill.attachment.attached | gill.attachment.descending | gill.attachment.free | gill.attachment.notched | gill.spacing.close | gill.spacing.crowded | gill.spacing.distant | gill.size.broad | gill.size.narrow | gill.color.black | gill.color.brown | gill.color.buff | gill.color.chocolate | gill.color.gray | gill.color.green | gill.color.orange | gill.color.pink | gill.color.purple | gill.color.red | gill.color.white | gill.color.yellow | stalk.shape.enlarging | stalk.shape.tapering | stalk.root.bulbous | stalk.root.club | stalk.root.cup | stalk.root.equal | stalk.root.rhizomorphs | stalk.root.rooted | stalk.root.missing | stalk.surface.above.ring.fibrous | stalk.surface.above.ring.scaly | stalk.surface.above.ring.silky | stalk.surface.above.ring.smooth | stalk.surface.below.ring.fibrous | stalk.surface.below.ring.scaly | stalk.surface.below.ring.silky | stalk.surface.below.ring.smooth | stalk.color.above.ring.brown | stalk.color.above.ring.buff | stalk.color.above.ring.cinnamon | stalk.color.above.ring.gray | stalk.color.above.ring.orange | stalk.color.above.ring.pink | stalk.color.above.ring.red | stalk.color.above.ring.white | stalk.color.above.ring.yellow | stalk.color.below.ring.brown | stalk.color.below.ring.buff | stalk.color.below.ring.cinnamon | stalk.color.below.ring.gray | stalk.color.below.ring.orange | stalk.color.below.ring.pink | stalk.color.below.ring.red | stalk.color.below.ring.white | stalk.color.below.ring.yellow | veil.type.partial | veil.type.universal | veil.color.brown | veil.color.orange | veil.color.white | veil.color.yellow | ring.number.none | ring.number.one | ring.number.two | ring.type.cobwebby | ring.type.evanescent | ring.type.flaring | ring.type.large | ring.type.none | ring.type.pendant | ring.type.sheathing | ring.type.zone | spore.print.color.black | spore.print.color.brown | spore.print.color.buff | spore.print.color.chocolate | spore.print.color.green | spore.print.color.orange | spore.print.color.purple | spore.print.color.white | spore.print.color.yellow | population.abundant | population.clustered | population.numerous | population.scattered | population.several | population.solitary | habitat.grasses | habitat.leaves | habitat.meadows | habitat.paths | habitat.urban | habitat.waste | habitat.woods |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
As for the parameters, since the dataset is rather small, they are set as follows:

- nrounds = 2
- max.depth = 2
- nthread = 2

The model is trained using xgboost as below:
#building the classifier using xgboost
model <- xgboost(data = train$data, label = train$label, nrounds = 2,
                 objective = "binary:logistic", verbose = 1,
                 max.depth = 2, nthread = 2, eta = 1)
## [0] train-error:0.046522
## [1] train-error:0.022263
#building the predictor
pred <- predict(model, test$data)
#preview of pred
head(pred)
## [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = model)
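If a graphical device is not available, the fitted trees can also be inspected as a plain-text dump (a small sketch using xgboost's xgb.dump):
#text dump of the two boosted trees
xgb.dump(model)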
Note that pred returns a vector of probabilities; any probability greater than 0.5 is taken as a predicted class label of 1, which means “poisonous”.
#perform the transformation
pred_t <- as.numeric(pred > 0.5)
head(pred_t)
## [1] 0 1 0 0 0 1
To evaluate the model, the average error (misclassification rate) is a simple metric to use alongside the confusion matrix.
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
## [1] "test-error= 0.0217256362507759"
#confusion matrix (labels coerced to factors, as caret requires)
confusionMatrix(factor(test$label), factor(pred_t), dnn = c("actual", "predicted"), positive = "1")
## Confusion Matrix and Statistics
##
## predicted
## actual 0 1
## 0 813 22
## 1 13 763
##
## Accuracy : 0.9783
## 95% CI : (0.9699, 0.9848)
## No Information Rate : 0.5127
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9565
## Mcnemar's Test P-Value : 0.1763
##
## Sensitivity : 0.9720
## Specificity : 0.9843
## Pos Pred Value : 0.9832
## Neg Pred Value : 0.9737
## Prevalence : 0.4873
## Detection Rate : 0.4736
## Detection Prevalence : 0.4817
## Balanced Accuracy : 0.9781
##
## 'Positive' Class : 1
##
#ROCR prediction object: predicted probabilities against the actual labels
pred_obj <- prediction(predictions = pred, labels = test$label)
perf <- performance(pred_obj, measure = "tpr", x.measure = "fpr")
plot(perf, main = "ROC Curve for Poisonous Mushrooms", col = "blue", lwd = 3)
abline(a = 0, b = 1, lwd = 2, lty = 2)
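The area under the ROC curve can be extracted from the same prediction object (a short sketch using ROCR's "auc" measure):
#area under the ROC curve
auc <- performance(pred_obj, measure = "auc")@y.values[[1]]
print(paste("AUC =", round(auc, 4)))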
To plot the feature importance, an importance matrix must first be constructed:
#constructing importance matrix
importance_matrix <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]], model = model)
#plotting
xgb.plot.importance(importance_matrix)
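The same information can also be shown as a table; xgb.importance returns one row per feature with its Gain, Cover, and Frequency (kable is already loaded):
#top features ranked by Gain
kable(head(importance_matrix))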
To improve the model, some parameters can be tuned. Since we constrained the model earlier, let's see what happens when the max.depth, eta, and nthread restrictions are removed so that xgboost falls back to its defaults.
model <- xgboost(data = train$data, label = train$label, nrounds = 2, objective = "binary:logistic")
## [0] train-error:0.000614
## [1] train-error:0.001228
pred <- predict(model, test$data)
pred_t <- ifelse(pred > 0.5, 1, 0)
confusionMatrix(factor(test$label), factor(pred_t), dnn = c("actual", "predicted"), positive = "1")
## Confusion Matrix and Statistics
##
## predicted
## actual 0 1
## 0 835 0
## 1 0 776
##
## Accuracy : 1
## 95% CI : (0.9977, 1)
## No Information Rate : 0.5183
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4817
## Detection Rate : 0.4817
## Detection Prevalence : 0.4817
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
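Rather than removing restrictions by hand, a more systematic route is k-fold cross-validation with xgb.cv. The sketch below runs it on the training set; the fold count and nrounds are illustrative choices, not values from the original analysis, and field names such as evaluation_log may vary slightly across xgboost versions.
#5-fold cross-validation over the training set (illustrative parameter values)
cv <- xgb.cv(data = train$data, label = train$label, nrounds = 10, nfold = 5,
             objective = "binary:logistic", max.depth = 2, eta = 1,
             metrics = "error", verbose = 0)
#mean train/test error per boosting round
cv$evaluation_log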