library(tidyverse)
library(dplyr)
library(rsample)
library(caret)
library(e1071)
library(ROCR)
library(partykit)
library(ggplot2) # to visualize data
library(gridExtra) # to display multiple graph
library(inspectdf) # for EDA
library(tidymodels) # to build tidy modelsHi!! Welcome to my LBB in this LBB i’m gonna use Mushroom dataset and make a classification whether if its edible or poisonous using Naive Bayes and Decision Tree. Enjoy!!
#> [1] 8124 23
#> [1] "class" "cap-shape"
#> [3] "cap-surface" "cap-color"
#> [5] "bruises" "odor"
#> [7] "gill-attachment" "gill-spacing"
#> [9] "gill-size" "gill-color"
#> [11] "stalk-shape" "stalk-root"
#> [13] "stalk-surface-above-ring" "stalk-surface-below-ring"
#> [15] "stalk-color-above-ring" "stalk-color-below-ring"
#> [17] "veil-type" "veil-color"
#> [19] "ring-number" "ring-type"
#> [21] "spore-print-color" "population"
#> [23] "habitat"
#> Rows: 8,124
#> Columns: 23
#> $ class <chr> "p", "e", "e", "p", "e", "e", "e", "e", "p"…
#> $ `cap-shape` <chr> "x", "x", "b", "x", "x", "x", "b", "b", "x"…
#> $ `cap-surface` <chr> "s", "s", "s", "y", "s", "y", "s", "y", "y"…
#> $ `cap-color` <chr> "n", "y", "w", "w", "g", "y", "w", "w", "w"…
#> $ bruises <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, …
#> $ odor <chr> "p", "a", "l", "p", "n", "a", "a", "l", "p"…
#> $ `gill-attachment` <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
#> $ `gill-spacing` <chr> "c", "c", "c", "c", "w", "c", "c", "c", "c"…
#> $ `gill-size` <chr> "n", "b", "b", "n", "b", "b", "b", "b", "n"…
#> $ `gill-color` <chr> "k", "k", "n", "n", "k", "n", "g", "n", "p"…
#> $ `stalk-shape` <chr> "e", "e", "e", "e", "t", "e", "e", "e", "e"…
#> $ `stalk-root` <chr> "e", "c", "c", "e", "e", "c", "c", "c", "e"…
#> $ `stalk-surface-above-ring` <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s"…
#> $ `stalk-surface-below-ring` <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s"…
#> $ `stalk-color-above-ring` <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `stalk-color-below-ring` <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `veil-type` <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p"…
#> $ `veil-color` <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `ring-number` <chr> "o", "o", "o", "o", "o", "o", "o", "o", "o"…
#> $ `ring-type` <chr> "p", "p", "p", "p", "e", "p", "p", "p", "p"…
#> $ `spore-print-color` <chr> "k", "n", "n", "k", "n", "k", "k", "n", "k"…
#> $ population <chr> "s", "n", "n", "s", "a", "n", "n", "s", "v"…
#> $ habitat <chr> "u", "g", "m", "u", "g", "g", "m", "m", "g"…
Variable Definitions :
the data type is still not correct so we need to change it. we want to change class level from e and p to edibale and poisonous so we can understand it easier. we won’t need column that only have 1 factor so we remove it.
data <- data %>% mutate_all(as.factor) %>%
mutate(class = factor(class, levels = c("e","p"), labels = c("edible", "poisonous"))) %>%
select(-c("gill-attachment","veil-type"))Lets check if there is any missing value
#> class cap-shape cap-surface
#> 0 0 0
#> cap-color bruises odor
#> 0 0 0
#> gill-spacing gill-size gill-color
#> 0 0 0
#> stalk-shape stalk-root stalk-surface-above-ring
#> 0 0 0
#> stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring
#> 0 0 0
#> veil-color ring-number ring-type
#> 0 0 0
#> spore-print-color population habitat
#> 0 0 0
#> [1] FALSE
great!! there is no more missing value
Split data to data train and data test
set.seed(123)
index <- sample(nrow(data_clean), 0.8*nrow(data_clean))
data_train <- data_clean[index,]
data_test <- data_clean[-index,]check proportion data train
#>
#> edible poisonous
#> 0.5145407 0.4854593
check proportion data train
#>
#> edible poisonous
#> 0.5316923 0.4683077
Great! the proportion only slightly inbalance so we don’t need to perform up or down sampling, we can procced to build the model
we use all the predictors available in the data
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction edible poisonous
#> edible 858 93
#> poisonous 6 668
#>
#> Accuracy : 0.9391
#> 95% CI : (0.9263, 0.9502)
#> No Information Rate : 0.5317
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.8768
#>
#> Mcnemar's Test P-Value : < 0.00000000000000022
#>
#> Sensitivity : 0.9931
#> Specificity : 0.8778
#> Pos Pred Value : 0.9022
#> Neg Pred Value : 0.9911
#> Prevalence : 0.5317
#> Detection Rate : 0.5280
#> Detection Prevalence : 0.5852
#> Balanced Accuracy : 0.9354
#>
#> 'Positive' Class : edible
#>
# result
naive_table <- select(data_test, class) %>%
bind_cols(class_pred = pred_naive) %>%
bind_cols(class_eprob = round(prob_naive[,1],4)) %>%
bind_cols(class_pprob = round(prob_naive[,2],4))
# performance evaluation - confusion matrix
naive_table %>%
conf_mat(class, class_pred) %>%
autoplot(type = "heatmap")naive_table %>%
summarise(
accuracy = accuracy_vec(class, class_pred),
sensitivity = sens_vec(class, class_pred),
specificity = spec_vec(class, class_pred),
precision = precision_vec(class, class_pred)
)naive_roc <- data.frame(prediction=round(prob_naive[,1],4),
trueclass=as.numeric(naive_table$class=="edible"))
head(naive_roc)naive_roc <- ROCR::prediction(naive_roc$prediction, naive_roc$trueclass)
# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
main = "ROC")
abline(a = 0, b = 1) #### AUC
auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n#> [1] 0.992321
based on model above we get an accuracy 94% and lets see if the model improv if we add ‘laplace=1’ in our formula.
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction edible poisonous
#> edible 860 75
#> poisonous 4 686
#>
#> Accuracy : 0.9514
#> 95% CI : (0.9398, 0.9613)
#> No Information Rate : 0.5317
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.9018
#>
#> Mcnemar's Test P-Value : 0.000000000000003391
#>
#> Sensitivity : 0.9954
#> Specificity : 0.9014
#> Pos Pred Value : 0.9198
#> Neg Pred Value : 0.9942
#> Prevalence : 0.5317
#> Detection Rate : 0.5292
#> Detection Prevalence : 0.5754
#> Balanced Accuracy : 0.9484
#>
#> 'Positive' Class : edible
#>
# result
naive_table <- select(data_test, class) %>%
bind_cols(class_pred = pred_naive) %>%
bind_cols(class_eprob = round(prob_naive[,1],4)) %>%
bind_cols(class_pprob = round(prob_naive[,2],4))
# performance evaluation - confusion matrix
naive_table %>%
conf_mat(class, class_pred) %>%
autoplot(type = "heatmap")naive_table %>%
summarise(
accuracy = accuracy_vec(class, class_pred),
sensitivity = sens_vec(class, class_pred),
specificity = spec_vec(class, class_pred),
precision = precision_vec(class, class_pred)
)naive_roc <- data.frame(prediction=round(prob_naive[,1],4),
trueclass=as.numeric(naive_table$class=="edible"))
head(naive_roc)naive_roc <- ROCR::prediction(naive_roc$prediction, naive_roc$trueclass)
# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
main = "ROC")
abline(a = 0, b = 1)auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n#> [1] 0.9966783
final_n <- naive_table %>%
summarise(
accuracy = accuracy_vec(class, class_pred),
sensitivity = sens_vec(class, class_pred),
specificity = spec_vec(class, class_pred),
precision = precision_vec(class, class_pred)
)%>%
cbind(AUC = auc_ROCR_n)the model accuracy above increase from 94% to 95% when add laplace = 1 in our formula. in our case, mushrooms is classified whether its safe or not for us to eat it. there are mushrooms that we cant or not safe to eat it because its poisonous. so we need a high accuracy model and more importantly we need the specificity have a high result as well because we dont want if our model predict the mushrooms edible but the truth its poisonous
now let try bulid our model using Decision Tree and compere it with naive bayes model
library(rpart)
library(rattle)
library(rpart.plot)
# model building
model_dtree <- rpart(formula = class ~ ., data = data_train, method = "class")
fancyRpartPlot(model_dtree, sub = NULL)#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction edible poisonous
#> edible 864 12
#> poisonous 0 749
#>
#> Accuracy : 0.9926
#> 95% CI : (0.9871, 0.9962)
#> No Information Rate : 0.5317
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.9852
#>
#> Mcnemar's Test P-Value : 0.001496
#>
#> Sensitivity : 1.0000
#> Specificity : 0.9842
#> Pos Pred Value : 0.9863
#> Neg Pred Value : 1.0000
#> Prevalence : 0.5317
#> Detection Rate : 0.5317
#> Detection Prevalence : 0.5391
#> Balanced Accuracy : 0.9921
#>
#> 'Positive' Class : edible
#>
# result
dtree_table <- select(data_test, class) %>%
bind_cols(class_pred = pred_dt) %>%
bind_cols(class_eprob = round(prob_dt[,1],4)) %>%
bind_cols(class_pprob = round(prob_dt[,2],4))
# performance evaluation - confusion matrix
naive_table %>%
conf_mat(class, class_pred) %>%
autoplot(type = "heatmap")dtree_table %>%
summarise(
accuracy = accuracy_vec(class, class_pred),
sensitivity = sens_vec(class, class_pred),
specificity = spec_vec(class, class_pred),
precision = precision_vec(class, class_pred)
)dtree_roc <- data.frame(prediction=round(prob_dt[,1],4),
trueclass=as.numeric(dtree_table$class=="edible"))
head(dtree_roc)dtree_roc <- ROCR::prediction(dtree_roc$prediction, dtree_roc$trueclass)
# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
main = "ROC")
abline(a = 0, b = 1)auc_ROCR_d <- performance(dtree_roc, measure = "auc")
auc_ROCR_d <- auc_ROCR_d@y.values[[1]]
auc_ROCR_d#> [1] 0.9921156
final_d <- dtree_table %>%
summarise(
accuracy = accuracy_vec(class, class_pred),
sensitivity = sens_vec(class, class_pred),
specificity = spec_vec(class, class_pred),
precision = precision_vec(class, class_pred)
)%>%
cbind(AUC = auc_ROCR_d)Based on result above we get accuracy of 99% and specificity of 98%.
Based on the metrics table above, the predictive model built using Decision Tree algorithm gave the best result. The model gave highest accuracy 99% while also maintain sensitivity, specificity, and precision above 90%. It also gave the highest AUC at 99%. Therefore the best model to predict Mushrooms whether its edible or poisonous based on characteristic features is the Decision Tree model.