library(haven)
library(nnet)
library(readxl)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data.SP <- read_excel("DATA MULTINOM.xlsx", sheet = "Segmentasi Pelanggan")
data.SP
str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
## $ Gender : chr [1:6665] "Male" "Female" "Male" "Male" ...
## $ Ever_Married : chr [1:6665] "No" "Yes" "Yes" "Yes" ...
## $ Age : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
## $ Graduated : chr [1:6665] "No" "Yes" "Yes" "No" ...
## $ Profession : chr [1:6665] "Healthcare" "Engineer" "Lawyer" "Artist" ...
## $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
## $ Spending_Score : chr [1:6665] "Low" "Low" "High" "Average" ...
## $ Family_Size : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
## $ Var_1 : chr [1:6665] "Cat_4" "Cat_6" "Cat_6" "Cat_6" ...
## $ Segmentation : chr [1:6665] "D" "B" "B" "C" ...
#merubah var ke factor
data.SP$Gender <- as.factor(data.SP$Gender)
data.SP$Ever_Married <- as.factor(data.SP$Ever_Married)
data.SP$Graduated <- as.factor(data.SP$Graduated)
data.SP$Profession <- as.factor(data.SP$Profession)
data.SP$Spending_Score <- as.factor(data.SP$Spending_Score)
data.SP$Var_1 <- as.factor(data.SP$Var_1)
data.SP$Segmentation <- as.factor(data.SP$Segmentation)
str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 1 1 1 1 2 ...
## $ Ever_Married : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 2 2 1 1 ...
## $ Age : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
## $ Graduated : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 2 2 1 ...
## $ Profession : Factor w/ 9 levels "Artist","Doctor",..: 6 3 8 1 6 6 3 1 3 6 ...
## $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
## $ Spending_Score : Factor w/ 3 levels "Average","High",..: 3 3 2 1 3 3 3 1 3 3 ...
## $ Family_Size : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
## $ Var_1 : Factor w/ 7 levels "Cat_1","Cat_2",..: 4 6 6 6 6 6 7 6 6 4 ...
## $ Segmentation : Factor w/ 4 levels "A","B","C","D": 4 2 2 3 3 4 4 3 1 4 ...
boxplot(Age~Segmentation, data = data.SP,
main = "Boxplot umur dan segmentasi pelanggan",
col = c("lightgreen", "lightblue", "mistyrose"))
# EDA
barplot (table(data.SP$Segmentation),
main = "bar plot Segmentasi Pelanggan",
xlab = "Segmentasi Pelanggan",
ylab = "jumlah",
col = c("lightgreen", "lightblue", "mistyrose"))
barplot (table(data.SP$Segmentation, data.SP$Spending_Score),
main = "bar plot segmentasi pelanggan dan spending score",
xlab = "segmentasi pelanggan",
ylab = "jumlah",
col = c("lightgreen", "lightblue", "mistyrose"),
beside = TRUE)
legend("topright",
c("low", "High", "average"),
fill = c("lightgreen", "lightblue", "mistyrose")
)
## sumary
library(arsenal)
tab <- tableby(Segmentation~., data = data.SP)
summary(tab, text = TRUE)
##
##
## | | A (N=1616) | B (N=1572) | C (N=1720) | D (N=1757) | Total (N=6665) | p value|
## |:----------------|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|-------:|
## |Gender | | | | | | 0.003|
## |- Female | 740 (45.8%) | 732 (46.6%) | 795 (46.2%) | 721 (41.0%) | 2988 (44.8%) | |
## |- Male | 876 (54.2%) | 840 (53.4%) | 925 (53.8%) | 1036 (59.0%) | 3677 (55.2%) | |
## |Ever_Married | | | | | | < 0.001|
## |- No | 676 (41.8%) | 419 (26.7%) | 338 (19.7%) | 1288 (73.3%) | 2721 (40.8%) | |
## |- Yes | 940 (58.2%) | 1153 (73.3%) | 1382 (80.3%) | 469 (26.7%) | 3944 (59.2%) | |
## |Age | | | | | | < 0.001|
## |- Mean (SD) | 44.421 (16.173) | 48.113 (14.694) | 49.328 (14.479) | 32.957 (15.262) | 43.536 (16.524) | |
## |- Range | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | |
## |Graduated | | | | | | < 0.001|
## |- No | 591 (36.6%) | 421 (26.8%) | 286 (16.6%) | 1118 (63.6%) | 2416 (36.2%) | |
## |- Yes | 1025 (63.4%) | 1151 (73.2%) | 1434 (83.4%) | 639 (36.4%) | 4249 (63.8%) | |
## |Profession | | | | | | < 0.001|
## |- Artist | 470 (29.1%) | 660 (42.0%) | 948 (55.1%) | 114 (6.5%) | 2192 (32.9%) | |
## |- Doctor | 168 (10.4%) | 127 (8.1%) | 123 (7.2%) | 174 (9.9%) | 592 (8.9%) | |
## |- Engineer | 221 (13.7%) | 159 (10.1%) | 64 (3.7%) | 138 (7.9%) | 582 (8.7%) | |
## |- Entertainment | 320 (19.8%) | 191 (12.2%) | 125 (7.3%) | 173 (9.8%) | 809 (12.1%) | |
## |- Executive | 98 (6.1%) | 149 (9.5%) | 164 (9.5%) | 94 (5.4%) | 505 (7.6%) | |
## |- Healthcare | 88 (5.4%) | 85 (5.4%) | 124 (7.2%) | 780 (44.4%) | 1077 (16.2%) | |
## |- Homemaker | 48 (3.0%) | 44 (2.8%) | 19 (1.1%) | 64 (3.6%) | 175 (2.6%) | |
## |- Lawyer | 157 (9.7%) | 133 (8.5%) | 124 (7.2%) | 86 (4.9%) | 500 (7.5%) | |
## |- Marketing | 46 (2.8%) | 24 (1.5%) | 29 (1.7%) | 134 (7.6%) | 233 (3.5%) | |
## |Work_Experience | | | | | | < 0.001|
## |- Mean (SD) | 2.889 (3.627) | 2.394 (3.247) | 2.224 (3.037) | 2.997 (3.611) | 2.629 (3.405) | |
## |- Range | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | 0.000 - 14.000 | |
## |Spending_Score | | | | | | < 0.001|
## |- Average | 270 (16.7%) | 499 (31.7%) | 795 (46.2%) | 98 (5.6%) | 1662 (24.9%) | |
## |- High | 214 (13.2%) | 315 (20.0%) | 370 (21.5%) | 105 (6.0%) | 1004 (15.1%) | |
## |- Low | 1132 (70.0%) | 758 (48.2%) | 555 (32.3%) | 1554 (88.4%) | 3999 (60.0%) | |
## |Family_Size | | | | | | < 0.001|
## |- Mean (SD) | 2.432 (1.480) | 2.683 (1.411) | 2.962 (1.374) | 3.241 (1.681) | 2.841 (1.525) | |
## |- Range | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | 1.000 - 9.000 | |
## |Var_1 | | | | | | < 0.001|
## |- Cat_1 | 24 (1.5%) | 21 (1.3%) | 23 (1.3%) | 36 (2.0%) | 104 (1.6%) | |
## |- Cat_2 | 71 (4.4%) | 89 (5.7%) | 86 (5.0%) | 116 (6.6%) | 362 (5.4%) | |
## |- Cat_3 | 175 (10.8%) | 150 (9.5%) | 118 (6.9%) | 191 (10.9%) | 634 (9.5%) | |
## |- Cat_4 | 258 (16.0%) | 188 (12.0%) | 92 (5.3%) | 311 (17.7%) | 849 (12.7%) | |
## |- Cat_5 | 16 (1.0%) | 20 (1.3%) | 17 (1.0%) | 21 (1.2%) | 74 (1.1%) | |
## |- Cat_6 | 1030 (63.7%) | 1067 (67.9%) | 1344 (78.1%) | 1035 (58.9%) | 4476 (67.2%) | |
## |- Cat_7 | 42 (2.6%) | 37 (2.4%) | 40 (2.3%) | 47 (2.7%) | 166 (2.5%) | |
acak <- createDataPartition(data.SP$Segmentation, p=0.8, list=FALSE)
data_train <- data.SP[acak,]
data_test <- data.SP[-acak,]
summary(data_train)
## Gender Ever_Married Age Graduated Profession
## Female:2415 No :2175 Min. :18.00 No :1943 Artist :1731
## Male :2918 Yes:3158 1st Qu.:30.00 Yes:3390 Healthcare : 872
## Median :41.00 Entertainment: 642
## Mean :43.46 Doctor : 479
## 3rd Qu.:53.00 Engineer : 469
## Max. :89.00 Lawyer : 407
## (Other) : 733
## Work_Experience Spending_Score Family_Size Var_1 Segmentation
## Min. : 0.000 Average:1317 Min. :1.000 Cat_1: 80 A:1293
## 1st Qu.: 0.000 High : 813 1st Qu.:2.000 Cat_2: 292 B:1258
## Median : 1.000 Low :3203 Median :2.000 Cat_3: 521 C:1376
## Mean : 2.661 Mean :2.849 Cat_4: 682 D:1406
## 3rd Qu.: 4.000 3rd Qu.:4.000 Cat_5: 63
## Max. :14.000 Max. :9.000 Cat_6:3566
## Cat_7: 129
summary(data_test)
## Gender Ever_Married Age Graduated Profession
## Female:573 No :546 Min. :18.00 No :473 Artist :461
## Male :759 Yes:786 1st Qu.:32.00 Yes:859 Healthcare :205
## Median :41.00 Entertainment:167
## Mean :43.85 Doctor :113
## 3rd Qu.:53.00 Engineer :113
## Max. :89.00 Executive :101
## (Other) :172
## Work_Experience Spending_Score Family_Size Var_1 Segmentation
## Min. : 0.000 Average:345 Min. :1.000 Cat_1: 24 A:323
## 1st Qu.: 0.000 High :191 1st Qu.:2.000 Cat_2: 70 B:314
## Median : 1.000 Low :796 Median :3.000 Cat_3:113 C:344
## Mean : 2.501 Mean :2.808 Cat_4:167 D:351
## 3rd Qu.: 4.000 3rd Qu.:4.000 Cat_5: 11
## Max. :14.000 Max. :9.000 Cat_6:910
## Cat_7: 37
model.SP <- multinom(Segmentation~., data = data_train)
## # weights: 96 (69 variable)
## initial value 7393.107828
## iter 10 value 6323.223291
## iter 20 value 5995.341444
## iter 30 value 5853.231447
## iter 40 value 5841.372966
## iter 50 value 5839.421169
## iter 60 value 5839.241930
## iter 70 value 5839.218344
## final value 5839.218041
## converged
summary(model.SP)
## Call:
## multinom(formula = Segmentation ~ ., data = data_train)
##
## Coefficients:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B -0.7829477 -0.1725633 -0.1087873 0.02250967 0.3179974
## C -1.0140258 -0.3168519 -0.1789823 0.03139992 0.7595498
## D -1.2076361 0.2694101 -0.2740524 -0.02006700 -0.4651378
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B -0.4357373 -0.6655753 -0.8960501
## C -0.6254240 -1.9181040 -1.5402628
## D 1.0183015 0.9425994 0.5501784
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B -0.2885316 0.03406108 -0.296866 -1.164717
## C -0.6005189 0.51194744 -1.259967 -1.625160
## D 1.6056741 2.83248032 1.673951 1.782276
## ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B -0.8585493 -0.03667921 -0.1911240 -0.8925563
## C -0.7286192 -0.03355002 -0.6431719 -1.7235030
## D 2.0976688 0.02149467 -0.1061714 0.6141355
## Family_Size Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6
## B 0.15824294 0.58576463 0.27038445 0.1053514 0.4533940 0.3797321
## C 0.33047949 0.15529313 -0.12528210 -0.8135734 -0.1915739 0.3374775
## D 0.09238458 -0.02501649 0.03982441 0.1657007 -0.3680539 0.2052599
## Var_1Cat_7
## B 0.3236526
## C -0.1633475
## D -0.1713025
##
## Std. Errors:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B 0.4336048 0.09082366 0.1227013 0.003867200 0.09594864
## C 0.4483221 0.09481643 0.1393627 0.004115655 0.10760767
## D 0.4542228 0.09769145 0.1282253 0.004582115 0.09642399
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B 0.1544440 0.1493899 0.1309554
## C 0.1616172 0.1969389 0.1481352
## D 0.1802663 0.1817508 0.1671445
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B 0.1918869 0.1967144 0.2629814 0.1928150
## C 0.1977630 0.1901777 0.3265132 0.2033881
## D 0.2318992 0.1879629 0.2568228 0.2424870
## ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B 0.2973080 0.01260492 0.1550838 0.1256158
## C 0.2990630 0.01332721 0.1581793 0.1341464
## D 0.2350315 0.01271250 0.2146279 0.1685413
## Family_Size Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B 0.03270522 0.3973692 0.3754848 0.3703446 0.5041944 0.3565513 0.4347410
## C 0.03420059 0.4080892 0.3862729 0.3866658 0.5487239 0.3617049 0.4558000
## D 0.03258215 0.3874903 0.3599556 0.3526483 0.5186441 0.3397939 0.4345334
##
## Residual Deviance: 11678.44
## AIC: 11816.44
library(car)
## Loading required package: carData
vif(model.SP)
## Warning in vif.default(model.SP): No intercept: vifs may not be sensible.
## GVIF Df GVIF^(1/(2*Df))
## Gender 3.971065 1 1.992753
## Ever_Married 9.471209 1 3.077533
## Age 33.335299 1 5.773673
## Graduated 5.934203 1 2.436022
## Profession 176.799555 8 1.381871
## Work_Experience 2.423281 1 1.556689
## Spending_Score 23.788305 2 2.208467
## Family_Size 9.221925 1 3.036762
## Var_1 874.747141 6 1.758559
z <- summary(model.SP)$coefficients/summary(model.SP)$standard.errors
p <- (1-pnorm(abs(z), 0.1))*2
data.frame(p)
model.SP2 <- multinom(Segmentation~.-Spending_Score, data = data_train)
## # weights: 88 (63 variable)
## initial value 7393.107828
## iter 10 value 6362.258344
## iter 20 value 6082.772792
## iter 30 value 6006.244231
## iter 40 value 5986.100669
## iter 50 value 5983.942573
## iter 60 value 5983.763405
## final value 5983.752902
## converged
summary(model.SP2)
## Call:
## multinom(formula = Segmentation ~ . - Spending_Score, data = data_train)
##
## Coefficients:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B -1.9048961 -0.1868985 0.3870945 0.02169582 0.3331082
## C -3.0473021 -0.3336610 0.8793520 0.02899665 0.7869425
## D -0.4945989 0.2813984 -0.5923520 -0.02012104 -0.4660476
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B -0.4150851 -0.6341401 -0.8725484
## C -0.5673661 -1.8636663 -1.4863985
## D 1.0146357 0.9192285 0.5374927
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B -0.1743129 -0.02146698 -0.240551 -1.150412
## C -0.6407548 0.35598659 -1.197279 -1.773837
## D 1.3898240 2.80429648 1.647010 1.756538
## ProfessionMarketing Work_Experience Family_Size Var_1Cat_2 Var_1Cat_3
## B -0.9011750 -0.03361670 0.2134207 0.7103984 0.34867075
## C -0.9578485 -0.03089333 0.4225866 0.3523850 -0.02145108
## D 2.0757367 0.01883543 0.0930950 -0.1005685 -0.02648870
## Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B 0.21580654 0.54964309 0.4814703 0.41285226
## C -0.66902398 -0.04926557 0.4849103 -0.07709632
## D 0.09297849 -0.39110509 0.1437769 -0.25638085
##
## Std. Errors:
## (Intercept) GenderMale Ever_MarriedYes Age GraduatedYes
## B 0.4049524 0.08995445 0.1023195 0.003817823 0.09511616
## C 0.4095068 0.09240252 0.1109079 0.003990032 0.10551305
## D 0.4129535 0.09723205 0.1104496 0.004545626 0.09628718
## ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B 0.1530420 0.1476145 0.1295145
## C 0.1566503 0.1920940 0.1432259
## D 0.1796604 0.1814186 0.1670446
## ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B 0.1740337 0.1958678 0.2607126 0.1848027
## C 0.1766855 0.1875920 0.3206436 0.1906030
## D 0.2159090 0.1874572 0.2565239 0.2375751
## ProfessionMarketing Work_Experience Family_Size Var_1Cat_2 Var_1Cat_3
## B 0.2956409 0.01253319 0.03189155 0.3938871 0.3724979
## C 0.2970551 0.01306874 0.03289807 0.3949248 0.3731594
## D 0.2346564 0.01267453 0.03262355 0.3886499 0.3618387
## Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B 0.3670942 0.4995356 0.3534843 0.4314751
## C 0.3738963 0.5355636 0.3487619 0.4428576
## D 0.3544972 0.5194655 0.3417803 0.4353276
##
## Residual Deviance: 11967.51
## AIC: 12093.51
model.SP$AIC
## [1] 11816.44
model.SP2$AIC
## [1] 12093.51
#prediksi data pada test
predict_prob = predict(model.SP, data_test, type = "prob")
head(predict_prob,10)
## A B C D
## 1 0.09578585 0.09990274 0.16067679 0.64363462
## 2 0.05788375 0.02614664 0.01202210 0.90394751
## 3 0.15981218 0.30284882 0.50799264 0.02934635
## 4 0.07597482 0.05287305 0.03304099 0.83811115
## 5 0.11181183 0.09220985 0.20435355 0.59162477
## 6 0.40153163 0.26035535 0.23219071 0.10592231
## 7 0.15428853 0.38519246 0.43889293 0.02162608
## 8 0.44340817 0.26657076 0.06981236 0.22020871
## 9 0.35356722 0.14698342 0.04363421 0.45581515
## 10 0.07528047 0.05858886 0.10785423 0.75827644
head(data.frame(predict_prob, data_test$Segmentation),10)
prediksi.test <- predict(model.SP, data_test,type = "class")
data_test$Segmentation<-as.factor(data_test$Segmentation)
confusionMatrix(as.factor(prediksi.test),
data_test$Segmentation)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D
## A 153 102 40 74
## B 68 74 37 14
## C 49 108 226 21
## D 53 30 41 242
##
## Overall Statistics
##
## Accuracy : 0.5218
## 95% CI : (0.4945, 0.5489)
## No Information Rate : 0.2635
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3606
##
## Mcnemar's Test P-Value : 1.035e-10
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D
## Sensitivity 0.4737 0.23567 0.6570 0.6895
## Specificity 0.7859 0.88310 0.8198 0.8736
## Pos Pred Value 0.4146 0.38342 0.5594 0.6612
## Neg Pred Value 0.8235 0.78929 0.8728 0.8872
## Prevalence 0.2425 0.23574 0.2583 0.2635
## Detection Rate 0.1149 0.05556 0.1697 0.1817
## Detection Prevalence 0.2770 0.14489 0.3033 0.2748
## Balanced Accuracy 0.6298 0.55939 0.7384 0.7815
data.frame(summary(model.SP)$coefficients)
#odds ratio
data.frame(exp(summary(model.SP)$coefficients))
segmentasi B terhadap segmentasi A
intercept 0.5452851 artinya, responden dengan jenis kelamin laki-laki, sudah menikah, sudah lulus, profesi dokter, dll. ods memilih Segmentasi pelayanan B sebesar 0.54522851 kali dibanding memilih segmentasi pelayan A
gander male 0.8445335 artinya, responden dengan jenis kelamin laki-laki, odds memilih segmentasi pelayanan B sebanyak 0.8445335 kali dibanding segmentasi pelayan A
status sudah menikah 1.0220087, responden dengan status sudah menikah, odds memilih segmentasi pelayanan B sebanyak 1.0220087 kali dibanding segmentasi pelayan A