Eksplorasi data, pra-proses data, dan feature engineering (https://rpubs.com/statedu/eda)
Pemodelan menggunakan Regresi Logistik (https://rpubs.com/statedu/regresi-logistik)
Pemodelan menggunakan Decision Tree (https://rpubs.com/statedu/decision-tree)
Pemodelan menggunakan Random Forest (https://rpubs.com/statedu/random-forest)
Pemodelan menggunakan Bagging (https://rpubs.com/statedu/bagging)
Pemodelan menggunakan AdaBoost (https://rpubs.com/statedu/adaboost)
# Load the member dataset from the Excel workbook, then inspect its structure
# (column names, types and the first few values of each column).
mdata <- readxl::read_excel("C:/Users/hp/Documents/mdata.xlsx")
str(mdata)
## tibble [5,000 x 15] (S3: tbl_df/tbl/data.frame)
## $ member_id : chr [1:5000] "ID237398" "ID502009" "ID011841" "ID645389" ...
## $ gender : num [1:5000] 1 1 1 1 1 1 1 0 1 1 ...
## $ visit_last_2mo : num [1:5000] 6 6 3 2 4 2 0 2 4 1 ...
## $ visit_last_3mo : num [1:5000] 6 2 2 1 3 4 1 2 3 0 ...
## $ monthly_income : num [1:5000] 3172700 4141600 5292900 2606600 4520100 ...
## $ marital_status : num [1:5000] 1 1 0 0 0 1 1 1 1 1 ...
## $ payment_channel : chr [1:5000] "0" "0" "2" "1" ...
## $ buy_groceries : num [1:5000] 1 0 0 0 0 0 1 0 0 0 ...
## $ buy_toiletries : num [1:5000] 1 1 1 0 1 1 1 1 0 0 ...
## $ buy_food : num [1:5000] 1 0 1 1 1 1 1 1 1 0 ...
## $ buy_electronic : num [1:5000] 0 0 1 0 0 0 0 0 0 0 ...
## $ buy_home_appliances: num [1:5000] 1 1 1 0 1 0 0 0 0 0 ...
## $ response : chr [1:5000] "0" "0" "0" "0" ...
## $ rata2_belanja : num [1:5000] 178413 824491 1038215 272914 890233 ...
## $ konsistensi : chr [1:5000] "1" "1" "1" "1" ...
# Drop the member_id column so it is not picked up as a predictor by
# `response ~ .` formulas.
mdata$member_id <- NULL

# Fix the RNG seed so the train/test split below is reproducible.
set.seed(123)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

# Split on the response column with 80% of the rows going to training.
# The response is referenced explicitly as mdata$response instead of relying
# on attach(), which would copy every column onto the search path and make
# name resolution implicit. `list = FALSE` returns the row indices in a form
# usable for the bracket indexing below (spelled out rather than the
# reassignable shortcut `F`).
data.partition <- createDataPartition(mdata$response, p = 0.8, list = FALSE)

# Rows selected by the partition form the training set; the rest are held out.
training <- data.frame(mdata)[data.partition, ]
testing <- data.frame(mdata)[-data.partition, ]
Menggunakan package caret untuk memodelkan data
library(caret)

# Logistic regression on all explanatory variables.
# Resampling scheme: 10-fold cross-validation.
trainctrl <- trainControl(method = "cv", number = 10)

# Fit a binomial GLM through caret using every remaining column as a predictor.
fit <- train(
  response ~ .,
  data = training,
  method = "glm",
  family = binomial(),
  trControl = trainctrl
)

# Print the resampling summary of the fitted model.
fit
## Generalized Linear Model
##
## 4001 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3601, 3601, 3601, 3601, 3600, 3601, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5900991 0.1314192
# Show the coefficient table, deviance and AIC of the full model.
summary(fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7291 -1.1067 -0.4671 1.1920 2.2445
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.494e+00 2.201e-01 -11.329 < 2e-16 ***
## gender -5.276e-02 6.763e-02 -0.780 0.435317
## visit_last_2mo 4.799e-02 1.698e-02 2.826 0.004707 **
## visit_last_3mo 6.666e-02 1.817e-02 3.668 0.000245 ***
## monthly_income 9.850e-09 1.453e-08 0.678 0.497848
## marital_status 2.420e-02 7.069e-02 0.342 0.732110
## payment_channel1 -8.269e-03 7.901e-02 -0.105 0.916652
## payment_channel2 4.587e-02 8.179e-02 0.561 0.574956
## buy_groceries 7.808e-02 8.383e-02 0.931 0.351644
## buy_toiletries 8.551e-02 7.599e-02 1.125 0.260440
## buy_food 3.594e-02 1.077e-01 0.334 0.738632
## buy_electronic 5.687e-02 1.222e-01 0.465 0.641753
## buy_home_appliances 1.635e-01 7.723e-02 2.117 0.034298 *
## rata2_belanja -1.356e-07 1.314e-07 -1.032 0.302073
## konsistensi1 1.822e+00 1.615e-01 11.282 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5472.1 on 4000 degrees of freedom
## Residual deviance: 5125.5 on 3986 degrees of freedom
## AIC: 5155.5
##
## Number of Fisher Scoring iterations: 4
# Reduced model: refit the binomial GLM using only three of the predictors
# (visit_last_2mo, visit_last_3mo and konsistensi), with the same
# cross-validation control object as the full model.
fit1 <- train(
  response ~ visit_last_2mo + visit_last_3mo + konsistensi,
  data = training,
  method = "glm",
  family = binomial(),
  trControl = trainctrl
)

# Print the resampling summary of the reduced model.
fit1
## Generalized Linear Model
##
## 4001 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3601, 3601, 3602, 3600, 3601, 3601, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5873573 0.1194535
# Show the coefficient table, deviance and AIC of the reduced model.
summary(fit1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6334 -1.1106 -0.4622 1.2022 2.2012
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.32975 0.15009 -15.522 < 2e-16 ***
## visit_last_2mo 0.04894 0.01694 2.889 0.003863 **
## visit_last_3mo 0.06631 0.01812 3.660 0.000252 ***
## konsistensi1 1.79324 0.15882 11.291 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5472.1 on 4000 degrees of freedom
## Residual deviance: 5135.2 on 3997 degrees of freedom
## AIC: 5143.2
##
## Number of Fisher Scoring iterations: 4
# Predict the held-out test data with the reduced model.
# The response column is dropped by name rather than by position, so the
# selection stays correct if columns are added, removed or reordered upstream.
predictor_cols <- setdiff(names(testing), "response")
prediksi <- predict(fit1, newdata = testing[, predictor_cols, drop = FALSE])

# Compare predictions against the observed response. The reference factor is
# built with the same levels as the predictions so the two factors always
# agree on their level set, even if a class happens to be absent from the
# test split.
confusionMatrix(
  prediksi,
  factor(testing$response, levels = levels(prediksi))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 428 300
## 1 140 131
##
## Accuracy : 0.5596
## 95% CI : (0.5281, 0.5906)
## No Information Rate : 0.5686
## P-Value [Acc > NIR] : 0.7283
##
## Kappa : 0.0602
##
## Mcnemar's Test P-Value : 3.455e-14
##
## Sensitivity : 0.7535
## Specificity : 0.3039
## Pos Pred Value : 0.5879
## Neg Pred Value : 0.4834
## Prevalence : 0.5686
## Detection Rate : 0.4284
## Detection Prevalence : 0.7287
## Balanced Accuracy : 0.5287
##
## 'Positive' Class : 0
##