library(dplyr)
library(rsample)
library(smotefamily)
library(partykit)
library(caret)
# Load the diabetes dataset from the project's input folder
diab <- read.csv(file = "data_input/diabetes.csv")
# Preview the first six rows
head(diab, n = 6L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
Sesuaikan tipe data pada dataset diab:
# Convert the target column to a factor so it can serve as the class label
diab <- mutate(diab, diabetes = as.factor(diabetes))
# Inspect the column types after the conversion
glimpse(diab)
## Rows: 768
## Columns: 9
## $ pregnant <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
## $ glucose <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
## $ pressure <int> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 72, 0,…
## $ triceps <int> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, 47, 0…
## $ insulin <int> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0, 230…
## $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 0.0, 37…
## $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
## $ age <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
## $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
Split data diab menjadi diab_train dan diab_test dengan proporsi 80:20
# Split diab into an 80/20 train/test partition.
# Fix the RNG seed so the same rows land in each partition on every run.
# NOTE: the outputs recorded in this file came from an unseeded run, so the
# numbers printed after re-running may differ from them.
set.seed(100)
# floor() makes the training size an explicit whole number of rows
n_train <- floor(nrow(diab) * 0.8)
index_diab <- sample(x = nrow(diab), size = n_train)
diab_train <- diab[index_diab, ]
diab_test <- diab[-index_diab, ]
# Class proportions of the target in the training partition
prop.table(table(diab_train$diabetes))
##
## neg pos
## 0.6514658 0.3485342
# Downsampling: balance the classes by randomly sampling the training data
# down to the size of the smaller class.
# Seed the RNG so the same rows are kept on every run.
set.seed(100)
diab_train_down <- downSample(
  x = diab_train %>% select(-diabetes),
  y = diab_train$diabetes,
  yname = "diabetes" # name of the target column in the result
)
# Class proportions after downsampling
prop.table(table(diab_train_down$diabetes))
##
## neg pos
## 0.5 0.5
# Upsampling: balance the classes by randomly re-sampling rows of the
# smaller class until both classes have the same size.
# Seed the RNG so the same rows are duplicated on every run.
set.seed(100)
diab_train_up <- upSample(
  x = diab_train %>% select(-diabetes),
  y = diab_train$diabetes,
  yname = "diabetes" # name of the target column in the result
)
# Class proportions after upsampling
prop.table(table(diab_train_up$diabetes))
##
## neg pos
## 0.5 0.5
# SMOTE: balance the classes by generating synthetic minority-class rows.
# Seed the RNG so the synthetic rows are the same on every run.
set.seed(100)
# Select predictors/target by name rather than by column position, matching
# the down/up-sampling steps and staying correct if columns are reordered.
diab_train_smote <- SMOTE(
  X = diab_train %>% select(-diabetes),
  target = diab_train$diabetes,
  dup_size = 1
)
# Keep only the balanced data frame from the returned object
diab_train_smote <- diab_train_smote$data
# The label lives in a column named `class`; make it a factor for modelling
diab_train_smote$class <- as.factor(diab_train_smote$class)
# Class proportions after SMOTE
prop.table(table(diab_train_smote$class))
##
## neg pos
## 0.4830918 0.5169082
# Fit one tree per balanced training set, using every other column as a
# predictor. The SMOTE data names its target `class`; the others `diabetes`.
diabetes_tree_down <- ctree(diabetes ~ ., data = diab_train_down)
diabetes_tree_up <- ctree(diabetes ~ ., data = diab_train_up)
diabetes_tree_smote <- ctree(class ~ ., data = diab_train_smote)
# Predict class labels on the test set with the tree fitted on downsampled data
pred_diab_test_down <- predict(
  diabetes_tree_down,
  newdata = diab_test,
  type = "response"
)
# Confusion matrix on the test set, with "pos" as the positive class
confusionMatrix(
  data = pred_diab_test_down,
  reference = diab_test$diabetes,
  positive = "pos"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 50 4
## pos 50 50
##
## Accuracy : 0.6494
## 95% CI : (0.5684, 0.7244)
## No Information Rate : 0.6494
## P-Value [Acc > NIR] : 0.537
##
## Kappa : 0.3561
##
## Mcnemar's Test P-Value : 9.141e-10
##
## Sensitivity : 0.9259
## Specificity : 0.5000
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.9259
## Prevalence : 0.3506
## Detection Rate : 0.3247
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7130
##
## 'Positive' Class : pos
##
# Predict class labels on the test set with the tree fitted on upsampled data
pred_diab_test_up <- predict(
  diabetes_tree_up,
  newdata = diab_test,
  type = "response"
)
# Confusion matrix on the test set, with "pos" as the positive class
confusionMatrix(
  data = pred_diab_test_up,
  reference = diab_test$diabetes,
  positive = "pos"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 74 13
## pos 26 41
##
## Accuracy : 0.7468
## 95% CI : (0.6705, 0.8133)
## No Information Rate : 0.6494
## P-Value [Acc > NIR] : 0.006192
##
## Kappa : 0.4731
##
## Mcnemar's Test P-Value : 0.054664
##
## Sensitivity : 0.7593
## Specificity : 0.7400
## Pos Pred Value : 0.6119
## Neg Pred Value : 0.8506
## Prevalence : 0.3506
## Detection Rate : 0.2662
## Detection Prevalence : 0.4351
## Balanced Accuracy : 0.7496
##
## 'Positive' Class : pos
##
# Predict class labels on the test set with the tree fitted on SMOTE data
pred_diab_test_smote <- predict(
  diabetes_tree_smote,
  newdata = diab_test,
  type = "response"
)
# Confusion matrix on the test set, with "pos" as the positive class
confusionMatrix(
  data = pred_diab_test_smote,
  reference = diab_test$diabetes,
  positive = "pos"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 87 19
## pos 13 35
##
## Accuracy : 0.7922
## 95% CI : (0.7195, 0.8533)
## No Information Rate : 0.6494
## P-Value [Acc > NIR] : 8.061e-05
##
## Kappa : 0.5317
##
## Mcnemar's Test P-Value : 0.3768
##
## Sensitivity : 0.6481
## Specificity : 0.8700
## Pos Pred Value : 0.7292
## Neg Pred Value : 0.8208
## Prevalence : 0.3506
## Detection Rate : 0.2273
## Detection Prevalence : 0.3117
## Balanced Accuracy : 0.7591
##
## 'Positive' Class : pos
##
Note: use a larger training dataset to make the models less sensitive to the choice of random seed.