library(haven)
library(nnet)
library(readxl)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data.SP <- read_excel("DATA MULTINOM.xlsx", sheet = "Segmentasi Pelanggan")
data.SP
str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
## $ Gender : chr [1:6665] "Male" "Female" "Male" "Male" ...
## $ Ever_Married : chr [1:6665] "No" "Yes" "Yes" "Yes" ...
## $ Age : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
## $ Graduated : chr [1:6665] "No" "Yes" "Yes" "No" ...
## $ Profession : chr [1:6665] "Healthcare" "Engineer" "Lawyer" "Artist" ...
## $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
## $ Spending_Score : chr [1:6665] "Low" "Low" "High" "Average" ...
## $ Family_Size : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
## $ Var_1 : chr [1:6665] "Cat_4" "Cat_6" "Cat_6" "Cat_6" ...
## $ Segmentation : chr [1:6665] "D" "B" "B" "C" ...
#merubah var ke factor
data.SP$Gender <- as.factor(data.SP$Gender)
data.SP$Ever_Married <- as.factor(data.SP$Ever_Married)
data.SP$Graduated <- as.factor(data.SP$Graduated)
data.SP$Profession <- as.factor(data.SP$Profession)
data.SP$Spending_Score <- as.factor(data.SP$Spending_Score)
data.SP$Var_1 <- as.factor(data.SP$Var_1)
data.SP$Segmentation <- as.factor(data.SP$Segmentation)
str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 1 1 1 1 2 ...
## $ Ever_Married : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 2 2 1 1 ...
## $ Age : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
## $ Graduated : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 2 2 1 ...
## $ Profession : Factor w/ 9 levels "Artist","Doctor",..: 6 3 8 1 6 6 3 1 3 6 ...
## $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
## $ Spending_Score : Factor w/ 3 levels "Average","High",..: 3 3 2 1 3 3 3 1 3 3 ...
## $ Family_Size : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
## $ Var_1 : Factor w/ 7 levels "Cat_1","Cat_2",..: 4 6 6 6 6 6 7 6 6 4 ...
## $ Segmentation : Factor w/ 4 levels "A","B","C","D": 4 2 2 3 3 4 4 3 1 4 ...
boxplot(Age~Segmentation, data = data.SP,
main = "Boxplot umur dan segmentasi pelanggan",
col = c("lightgreen", "lightblue", "mistyrose"))
# EDA
barplot (table(data.SP$Segmentation),
main = "bar plot Segmentasi Pelanggan",
xlab = "Segmentasi Pelanggan",
ylab = "jumlah",
col = c("lightgreen", "lightblue", "mistyrose"))
barplot (table(data.SP$Segmentation, data.SP$Spending_Score),
main = "bar plot segmentasi pelanggan dan spending score",
xlab = "segmentasi pelanggan",
ylab = "jumlah",
col = c("lightgreen", "lightblue", "mistyrose"),
beside = TRUE)
legend("topright",
c("low", "High", "average"),
fill = c("lightgreen", "lightblue", "mistyrose")
)
## sumary
library(arsenal)
tab <- tableby(Segmentation~., data = data.SP)
summary(tab, text = TRUE)
##
##
## | | A (N=1616) | B (N=1572) | C (N=1720) | D (N=1757) | Total (N=6665) | p value|
## |:----------------|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|-------:|
## |Gender | | | | | | 0.003|
## |- Female | 740 (45.8%) | 732 (46.6%) | 795 (46.2%) | 721 (41.0%) | 2988 (44.8%) | |
## |- Male | 876 (54.2%) | 840 (53.4%) | 925 (53.8%) | 1036 (59.0%) | 3677 (55.2%) | |
## |Ever_Married | | | | | | < 0.001|
## |- No | 676 (41.8%) | 419 (26.7%) | 338 (19.7%) | 1288 (73.3%) | 2721 (40.8%) | |
## |- Yes | 940 (58.2%) | 1153 (73.3%) | 1382 (80.3%) | 469 (26.7%) | 3944 (59.2%) | |
## |Age | | | | | | < 0.001|
## |- Mean (SD) | 44.421 (16.173) | 48.113 (14.694) | 49.328 (14.479) | 32.957 (15.262) | 43.536 (16.524) | |
## |- Range | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | |
## |Graduated | | | | | | < 0.001|
## |- No | 591 (36.6%) | 421 (26.8%) | 286 (16.6%) | 1118 (63.6%) | 2416 (36.2%) | |
## |- Yes | 1025 (63.4%) | 1151 (73.2%) | 1434 (83.4%) | 639 (36.4%) | 4249 (63.8%) | |
## |Profession | | | | | | < 0.001|
## |- Artist | 470 (29.1%) | 660 (42.0%) | 948 (55.1%) | 114 (6.5%) | 2192 (32.9%) | |
## |- Doctor | 168 (10.4%) | 127 (8.1%) | 123 (7.2%) | 174 (9.9%) | 592 (8.9%) | |
## |- Engineer | 221 (13.7%) | 159 (10.1%) | 64 (3.7%) | 138 (7.9%) | 582 (8.7%) | |
## |- Entertainment | 320 (19.8%) | 191 (12.2%) | 125 (7.3%) | 173 (9.8%) | 809 (12.1%) | |
## |- Executive | 98 (6.1%) | 149 (9.5%) | 164 (9.5%) | 94 (5.4%) | 505 (7.6%) | |
## |- Healthcare | 88 (5.4%) | 85 (5.4%) | 124 (7.2%) | 780 (44.4%) | 1077 (16.2%) | |
## |- Homemaker | 48 (3.0%) | 44 (2.8%) | 19 (1.1%) | 64 (3.6%) | 175 (2.6%) | |
## |- Lawyer | 157 (9.7%) | 133 (8.5%) | 124 (7.2%) | 86 (4.9%) | 500 (7.5%) | |
## |- Marketing | 46 (2.8%) | 24 (1.5%) | 29 (1.7%) | 134 (7.6%) | 233 (3.5%) | |
## |Work_Experience | | | | | | < 0.001|
## |- Mean (SD) | 2.889 (3.627) | 2.394 (3.247) | 2.224 (3.037) | 2.997 (3.611) | 2.629 (3.405) | |
## |- Range | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | |
## |Spending_Score | | | | | | < 0.001|
## |- Average | 270 (16.7%) | 499 (31.7%) | 795 (46.2%) | 98 (5.6%) | 1662 (24.9%) | |
## |- High | 214 (13.2%) | 315 (20.0%) | 370 (21.5%) | 105 (6.0%) | 1004 (15.1%) | |
## |- Low | 1132 (70.0%) | 758 (48.2%) | 555 (32.3%) | 1554 (88.4%) | 3999 (60.0%) | |
## |Family_Size | | | | | | < 0.001|
## |- Mean (SD) | 2.432 (1.480) | 2.683 (1.411) | 2.962 (1.374) | 3.241 (1.681) | 2.841 (1.525) | |
## |- Range | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | |
## |Var_1 | | | | | | < 0.001|
## |- Cat_1 | 24 (1.5%) | 21 (1.3%) | 23 (1.3%) | 36 (2.0%) | 104 (1.6%) | |
## |- Cat_2 | 71 (4.4%) | 89 (5.7%) | 86 (5.0%) | 116 (6.6%) | 362 (5.4%) | |
## |- Cat_3 | 175 (10.8%) | 150 (9.5%) | 118 (6.9%) | 191 (10.9%) | 634 (9.5%) | |
## |- Cat_4 | 258 (16.0%) | 188 (12.0%) | 92 (5.3%) | 311 (17.7%) | 849 (12.7%) | |
## |- Cat_5 | 16 (1.0%) | 20 (1.3%) | 17 (1.0%) | 21 (1.2%) | 74 (1.1%) | |
## |- Cat_6 | 1030 (63.7%) | 1067 (67.9%) | 1344 (78.1%) | 1035 (58.9%) | 4476 (67.2%) | |
## |- Cat_7 | 42 (2.6%) | 37 (2.4%) | 40 (2.3%) | 47 (2.7%) | 166 (2.5%) | |
acak <- createDataPartition(data.SP$Segmentation, p=0.8, list=FALSE)
data_train <- data.SP[acak,]
data_test <- data.SP[-acak,]
summary(data_train)
## Gender Ever_Married Age Graduated Profession
## Female:2422 No :2190 Min. :18.00 No :1928 Artist :1742
## Male :2911 Yes:3143 1st Qu.:31.00 Yes:3405 Healthcare : 862
## Median :41.00 Entertainment: 651
## Mean :43.64 Doctor : 477
## 3rd Qu.:53.00 Engineer : 457
## Max. :89.00 Lawyer : 417
## (Other) : 727
## Work_Experience Spending_Score Family_Size Var_1 Segmentation
## Min. : 0.000 Average:1320 Min. :1.000 Cat_1: 85 A:1293
## 1st Qu.: 0.000 High : 808 1st Qu.:2.000 Cat_2: 298 B:1258
## Median : 1.000 Low :3205 Median :2.000 Cat_3: 496 C:1376
## Mean : 2.615 Mean :2.839 Cat_4: 697 D:1406
## 3rd Qu.: 4.000 3rd Qu.:4.000 Cat_5: 62
## Max. :14.000 Max. :9.000 Cat_6:3565
## Cat_7: 130
summary(data_test)
## Gender Ever_Married Age Graduated Profession
## Female:566 No :531 Min. :18.00 No :488 Artist :450
## Male :766 Yes:801 1st Qu.:31.00 Yes:844 Healthcare :215
## Median :41.00 Entertainment:158
## Mean :43.11 Engineer :125
## 3rd Qu.:52.00 Doctor :115
## Max. :89.00 Executive : 97
## (Other) :172
## Work_Experience Spending_Score Family_Size Var_1 Segmentation
## Min. : 0.000 Average:342 Min. :1.000 Cat_1: 19 A:323
## 1st Qu.: 0.000 High :196 1st Qu.:2.000 Cat_2: 64 B:314
## Median : 1.000 Low :794 Median :2.500 Cat_3:138 C:344
## Mean : 2.685 Mean :2.848 Cat_4:152 D:351
## 3rd Qu.: 4.000 3rd Qu.:4.000 Cat_5: 12
## Max. :14.000 Max. :9.000 Cat_6:911
## Cat_7: 36
model.SP <- multinom(Segmentation~., data = data_train)
## # weights: 96 (69 variable)
## initial value 7393.107828
## iter 10 value 6369.667414
## iter 20 value 6035.063466
## iter 30 value 5878.763050
## iter 40 value 5844.961427
## iter 50 value 5840.875099
## iter 60 value 5840.564525
## iter 70 value 5840.533674
## final value 5840.533083
## converged
summary(model.SP)
## Call:
## multinom(formula = Segmentation ~ ., data = data_train)
##
## Coefficients:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B -0.4283262 -0.2186115 0.01601032 0.01980924 0.3866280
## C -0.9085080 -0.3752771 -0.09030023 0.02630782 0.8548292
## D -0.8343470 0.2598026 -0.21434561 -0.02295938 -0.4302646
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B -0.4277846 -0.7313216 -0.9383434
## C -0.7117334 -1.8327770 -1.5478749
## D 0.9911519 0.8234278 0.6347058
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B -0.06976615 0.09752317 -0.3962112 -0.9783796
## C -0.29704525 0.47969891 -1.5570698 -1.4027101
## D 1.67984112 2.85705444 1.5391379 1.8985918
## ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B -0.5283807 -0.03286067 -0.4287972 -0.9680013
## C -0.5063767 -0.04185710 -0.7947333 -1.7199668
## D 2.2798178 0.03622472 -0.1767483 0.5416523
## Family_Size Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6
## B 0.1476870 0.47624532 -0.06017823 -0.2208119 0.4987125 0.1153367
## C 0.3052380 0.43335181 -0.01694129 -0.8157069 0.2317707 0.4619689
## D 0.1001686 -0.07324749 -0.36516893 -0.1653382 -0.6396496 -0.1028683
## Var_1Cat_7
## B -0.1068873
## C 0.1110324
## D -0.4037213
##
## Std. Errors:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B 0.4157176 0.09101823 0.1235443 0.003904535 0.09604601
## C 0.4500204 0.09485061 0.1385390 0.004125147 0.10753786
## D 0.4514081 0.09805709 0.1289574 0.004651917 0.09648975
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B 0.1548611 0.1502176 0.1323352
## C 0.1634887 0.1911678 0.1487386
## D 0.1788079 0.1832287 0.1629675
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B 0.1934586 0.1966894 0.2654634 0.1928357
## C 0.1982932 0.1900196 0.3598920 0.2021920
## D 0.2331878 0.1865164 0.2564159 0.2467372
## ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B 0.2956932 0.01289386 0.1564656 0.1267418
## C 0.3050035 0.01367874 0.1596351 0.1353038
## D 0.2420811 0.01284578 0.2145613 0.1687526
## Family_Size Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B 0.03307550 0.3781625 0.3538461 0.3474185 0.5016648 0.3326341 0.4233360
## C 0.03456831 0.4095465 0.3862873 0.3870741 0.5588105 0.3611670 0.4526103
## D 0.03259602 0.3867674 0.3598448 0.3505326 0.5317437 0.3379306 0.4264134
##
## Residual Deviance: 11681.07
## AIC: 11819.07
library(car)
## Loading required package: carData
vif(model.SP)
## Warning in vif.default(model.SP): No intercept: vifs may not be sensible.
## GVIF Df GVIF^(1/(2*Df))
## Gender 3.976892 1 1.994215
## Ever_Married 9.667996 1 3.109340
## Age 34.419459 1 5.866810
## Graduated 5.992289 1 2.447915
## Profession 189.172610 8 1.387726
## Work_Experience 2.448620 1 1.564807
## Spending_Score 23.721237 2 2.206909
## Family_Size 9.265946 1 3.044002
## Var_1 871.157406 6 1.757956
z <- summary(model.SP)$coefficients/summary(model.SP)$standard.errors
p <- (1-pnorm(abs(z), 0.1))*2
data.frame(p)
model.SP2 <- multinom(Segmentation~.-Spending_Score, data = data_train)
## # weights: 88 (63 variable)
## initial value 7393.107828
## iter 10 value 6396.287179
## iter 20 value 6125.539208
## iter 30 value 6011.860988
## iter 40 value 5978.258329
## iter 50 value 5976.093471
## iter 60 value 5975.916430
## final value 5975.910032
## converged
summary(model.SP2)
## Call:
## multinom(formula = Segmentation ~ . - Spending_Score, data = data_train)
##
## Coefficients:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B -1.5641453 -0.2300263 0.5264565 0.01889156 0.3954291
## C -2.8563143 -0.3872174 0.9348297 0.02443910 0.8729774
## D -0.2437396 0.2783187 -0.5129394 -0.02311838 -0.4284313
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B -0.3883998 -0.7074396 -0.9005865
## C -0.6311860 -1.7895388 -1.4762634
## D 0.9823071 0.8024699 0.6297287
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B -0.09297969 0.03500562 -0.3677093 -1.058893
## C -0.45277605 0.33681790 -1.5563471 -1.628460
## D 1.45086719 2.82305608 1.5054816 1.846770
## ProfessionMarketing Work_Experience Family_Size Var_1Cat_2 Var_1Cat_3
## B -0.6376131 -0.02956244 0.2023769 0.5513853 -0.04409545
## C -0.8075547 -0.03840099 0.3961676 0.5425072 -0.01434540
## D 2.2650721 0.03437806 0.1042155 -0.1138639 -0.39014501
## Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B -0.1486042 0.4871423 0.1685284 -0.11795422
## C -0.7540704 0.1954148 0.5177851 0.05015725
## D -0.2082483 -0.6230623 -0.1275978 -0.42814179
##
## Std. Errors:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B 0.3862803 0.09011925 0.1029269 0.003862444 0.09522617
## C 0.4126256 0.09255520 0.1104434 0.004014716 0.10558442
## D 0.4107473 0.09760378 0.1104226 0.004617516 0.09632717
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B 0.1530460 0.1484033 0.1304411
## C 0.1585061 0.1865292 0.1435636
## D 0.1781065 0.1828797 0.1627603
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B 0.1765455 0.1959535 0.2638225 0.1851403
## C 0.1782405 0.1878996 0.3561844 0.1903369
## D 0.2167941 0.1861343 0.2565495 0.2408259
## ProfessionMarketing Work_Experience Family_Size Var_1Cat_2 Var_1Cat_3
## B 0.2947840 0.01280345 0.03221376 0.3742516 0.3504124
## C 0.3036744 0.01340298 0.03321924 0.3974474 0.3741214
## D 0.2421139 0.01282702 0.03267065 0.3863339 0.3599848
## Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B 0.3437350 0.4963451 0.3291377 0.4198450
## C 0.3754024 0.5454959 0.3493941 0.4393662
## D 0.3504844 0.5326834 0.3379578 0.4262211
##
## Residual Deviance: 11951.82
## AIC: 12077.82
model.SP$AIC
## [1] 11819.07
model.SP2$AIC
## [1] 12077.82
#prediksi data pada test
predict_prob = predict(model.SP, data_test, type = "prob")
head(predict_prob,10)
## A B C D
## 1 0.05713456 0.02317009 0.009573713 0.910121639
## 2 0.47802631 0.26988845 0.094945901 0.157139330
## 3 0.07793862 0.27254727 0.640067841 0.009446265
## 4 0.30219613 0.31861772 0.321718335 0.057467817
## 5 0.15812749 0.40392015 0.414888923 0.023063436
## 6 0.05872954 0.03801525 0.041748318 0.861506890
## 7 0.09734635 0.28246934 0.609089547 0.011094770
## 8 0.08995961 0.30341785 0.599531777 0.007090763
## 9 0.46285073 0.13119594 0.064763827 0.341189508
## 10 0.20908934 0.29751801 0.403331677 0.090060978
head(data.frame(predict_prob, data_test$Segmentation),10)
prediksi.test <- predict(model.SP, data_test,type = "class")
data_test$Segmentation<-as.factor(data_test$Segmentation)
confusionMatrix(as.factor(prediksi.test),
data_test$Segmentation)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D
## A 152 103 39 75
## B 48 59 35 17
## C 68 120 231 9
## D 55 32 39 250
##
## Overall Statistics
##
## Accuracy : 0.5195
## 95% CI : (0.4923, 0.5467)
## No Information Rate : 0.2635
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3571
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D
## Sensitivity 0.4706 0.18790 0.6715 0.7123
## Specificity 0.7849 0.90177 0.8006 0.8716
## Pos Pred Value 0.4119 0.37107 0.5397 0.6649
## Neg Pred Value 0.8224 0.78261 0.8750 0.8944
## Prevalence 0.2425 0.23574 0.2583 0.2635
## Detection Rate 0.1141 0.04429 0.1734 0.1877
## Detection Prevalence 0.2770 0.11937 0.3213 0.2823
## Balanced Accuracy 0.6278 0.54483 0.7361 0.7919
data.frame(summary(model.SP)$coefficients)
#odds ratio
data.frame(exp(summary(model.SP)$coefficients))
segmentasi B terhadap segmentasi A
intercept 0.5452851 artinya, responden dengan jenis kelamin laki-laki, sudah menikah, sudah lulus, profesi dokter, dll. ods memilih Segmentasi pelayanan B sebesar 0.54522851 kali dibanding memilih segmentasi pelayan A
gander male 0.8445335 artinya, responden dengan jenis kelamin laki-laki, odds memilih segmentasi pelayanan B sebanyak 0.8445335 kali dibanding segmentasi pelayan A
status sudah menikah 1.0220087, responden dengan status sudah menikah, odds memilih segmentasi pelayanan B sebanyak 1.0220087 kali dibanding segmentasi pelayan A