review
read data diabetes
cross validation
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(100)

# Stratified 80/20 split on the outcome column `diabetes`.
idx <- initial_split(
  diabetes,
  prop = 0.8,
  strata = "diabetes"
)
diabetes_train <- training(idx)
diabetes_test <- testing(idx)

# Class proportions of the outcome in the training partition.
prop.table(table(diabetes_train[["diabetes"]]))
## neg pos
## 0.650974 0.349026
##
## neg pos
## 0.6513158 0.3486842
##
## neg pos
## 0.6510417 0.3489583
modeling
- naive bayes
- Pada naive Bayes, semua prediktor dianggap memiliki bobot yang sama (tidak ada satu variabel pun yang lebih penting dibanding yang lain).
predict
# Two prediction types are requested here: "class" and "raw".
# "class" is stored as the predicted labels; "raw" is stored as
# `bayes_pred_prob` (presumably one probability column per class -- confirm).
# NOTE(review): `diabetes_bayes` is not fitted anywhere in the visible part of
# this file -- confirm the model is created before this point.
bayes_pred <- predict(diabetes_bayes, diabetes_test, type = "class")
bayes_pred_prob <- predict(diabetes_bayes, diabetes_test, type = "raw")

# confusion matrix
## Accuracy
## 0.75
- decision tree
# Fit a tree with ctree() using every other column of the training data as a
# predictor of `diabetes`, then draw it with the "simple" plot type.
diabetes_tree <- ctree(formula = diabetes ~ ., data = diabetes_train)
plot(diabetes_tree, type = "simple")

# predict
# Default predict() output is kept as `tree_pred`; type = "prob" is kept
# separately as `tree_pred_prob`.
tree_pred <- predict(diabetes_tree, diabetes_test)
tree_pred_prob <- predict(diabetes_tree, diabetes_test, type = "prob")
## Accuracy
## 0.6710526
- random forest
# Resampling scheme for tuning: 5-fold cross-validation, repeated 2 times.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2
)

# Train a random forest (200 trees) on all predictors using the scheme above.
diabetes_forest <- train(
  diabetes ~ .,
  data = diabetes_train,
  method = "rf",
  ntree = 200,
  trControl = ctrl
)

# Plot the fitted train object.
plot(diabetes_forest)
## rf variable importance
##
## Overall
## glucose 100.000
## mass 49.388
## pedigree 29.281
## age 21.953
## pressure 11.606
## pregnant 7.495
## triceps 2.478
## insulin 0.000
# Plot the final model stored inside the caret fit, then add a legend whose
# entries are the column names of that model's `err.rate` matrix.
plot(diabetes_forest[["finalModel"]])
legend(
  "topright",
  legend = colnames(diabetes_forest[["finalModel"]][["err.rate"]]),
  fill = seq_len(3)
)
## Accuracy
## 0.7631579
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve for the naive Bayes model: true-positive rate against
# false-positive rate, scored on the test set.
# NOTE(review): column 2 of `bayes_pred_prob` is assumed to be the positive
# class -- confirm against the column names.
pred <- prediction(
  predictions = bayes_pred_prob[, 2],
  labels = diabetes_test[["diabetes"]]
)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
## [1] 0.8151325