Eksplorasi data, pra-proses data, dan feature engineering (https://rpubs.com/statedu/eda)
Pemodelan menggunakan Regresi Logistik (https://rpubs.com/statedu/regresi-logistik)
Pemodelan menggunakan Decision Tree (https://rpubs.com/statedu/decision-tree)
Pemodelan menggunakan Random Forest (https://rpubs.com/statedu/random-forest)
Pemodelan menggunakan Bagging (https://rpubs.com/statedu/bagging)
Pemodelan menggunakan AdaBoost (https://rpubs.com/statedu/adaboost)
# Read the member data set from the Excel workbook
mdata <- readxl::read_excel(
  path = "C:/Users/hp/Documents/mdata.xlsx"
)
# Inspect the structure of the data
str(mdata)
## tibble [5,000 x 15] (S3: tbl_df/tbl/data.frame)
## $ member_id : chr [1:5000] "ID237398" "ID502009" "ID011841" "ID645389" ...
## $ gender : num [1:5000] 1 1 1 1 1 1 1 0 1 1 ...
## $ visit_last_2mo : num [1:5000] 6 6 3 2 4 2 0 2 4 1 ...
## $ visit_last_3mo : num [1:5000] 6 2 2 1 3 4 1 2 3 0 ...
## $ monthly_income : num [1:5000] 3172700 4141600 5292900 2606600 4520100 ...
## $ marital_status : num [1:5000] 1 1 0 0 0 1 1 1 1 1 ...
## $ payment_channel : chr [1:5000] "0" "0" "2" "1" ...
## $ buy_groceries : num [1:5000] 1 0 0 0 0 0 1 0 0 0 ...
## $ buy_toiletries : num [1:5000] 1 1 1 0 1 1 1 1 0 0 ...
## $ buy_food : num [1:5000] 1 0 1 1 1 1 1 1 1 0 ...
## $ buy_electronic : num [1:5000] 0 0 1 0 0 0 0 0 0 0 ...
## $ buy_home_appliances: num [1:5000] 1 1 1 0 1 0 0 0 0 0 ...
## $ response : chr [1:5000] "0" "0" "0" "0" ...
## $ rata2_belanja : num [1:5000] 178413 824491 1038215 272914 890233 ...
## $ konsistensi : chr [1:5000] "1" "1" "1" "1" ...
# Drop the member_id column: it is a row identifier, not a predictor
mdata$member_id <- NULL
# Fix the RNG seed so the train/test split below is reproducible
set.seed(123)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Put 80% of the rows in the training set, sampled on the response column.
# The column is referenced through mdata directly instead of attach(mdata):
# attach() places every column on the search path, where a bare `response`
# can be silently masked by any global object of the same name.
data.partition <- createDataPartition(mdata$response, p = 0.8, list = FALSE)
# Convert the tibble to a plain data.frame and split it by the sampled rows
training <- data.frame(mdata)[data.partition, ]
testing <- data.frame(mdata)[-data.partition, ]
Menggunakan package caret untuk memodelkan data
library(rpart)
# Tune the rpart complexity parameter (cp) with cross-validation via caret
tree <- train(
  response ~ .,
  data = training,
  method = "rpart",
  trControl = trainControl(method = "cv")
)
# Show the resampling results and the selected cp
tree
## CART
##
## 4001 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3601, 3601, 3601, 3601, 3600, 3601, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01880787 0.8345461 0.6612965
## 0.05729167 0.8053036 0.6066364
## 0.28038194 0.7408036 0.4538036
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01880787.
# Refit the classification tree with the best tuning parameter.
# cp is read from the caret fit (`tree$bestTune`) instead of being pasted from
# the printed summary: the printed number is rounded, and a hard-coded literal
# goes stale as soon as the data, seed or resampling scheme change.
best.tree <- rpart(
  response ~ .,
  data = training,
  method = "class",
  control = rpart.control(cp = tree$bestTune$cp)
)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.3
# Draw the fitted tree; `extra` selects what is shown in each node
# (see the rpart.plot documentation for the meaning of each code)
rpart.plot(best.tree, extra = 5)
# Class probabilities for the hold-out set (one column per class).
# The response column is excluded by name rather than by position, so the
# call keeps working if columns are added, removed or reordered upstream.
prediksi.prob <- predict(
  best.tree,
  testing[, names(testing) != "response", drop = FALSE],
  type = "prob"
)
# Label an observation "1" when its probability of class "1" exceeds 0.5
prediksi <- ifelse(prediksi.prob[, "1"] > 0.5, "1", "0")
# Build both factors with the same explicit levels: as.factor() on the
# predictions would drop a level if only one class were predicted, and
# confusionMatrix() needs matching levels in prediction and reference.
confusionMatrix(
  factor(prediksi, levels = colnames(prediksi.prob)),
  factor(testing$response, levels = colnames(prediksi.prob))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 503 80
## 1 65 351
##
## Accuracy : 0.8549
## 95% CI : (0.8315, 0.8761)
## No Information Rate : 0.5686
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7029
##
## Mcnemar's Test P-Value : 0.245
##
## Sensitivity : 0.8856
## Specificity : 0.8144
## Pos Pred Value : 0.8628
## Neg Pred Value : 0.8437
## Prevalence : 0.5686
## Detection Rate : 0.5035
## Detection Prevalence : 0.5836
## Balanced Accuracy : 0.8500
##
## 'Positive' Class : 0
##