package

library(haven)
library(nnet)
library(readxl)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

input data

data.SP <- read_excel("DATA MULTINOM.xlsx", sheet = "Segmentasi Pelanggan")
data.SP
str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Gender         : chr [1:6665] "Male" "Female" "Male" "Male" ...
##  $ Ever_Married   : chr [1:6665] "No" "Yes" "Yes" "Yes" ...
##  $ Age            : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
##  $ Graduated      : chr [1:6665] "No" "Yes" "Yes" "No" ...
##  $ Profession     : chr [1:6665] "Healthcare" "Engineer" "Lawyer" "Artist" ...
##  $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
##  $ Spending_Score : chr [1:6665] "Low" "Low" "High" "Average" ...
##  $ Family_Size    : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
##  $ Var_1          : chr [1:6665] "Cat_4" "Cat_6" "Cat_6" "Cat_6" ...
##  $ Segmentation   : chr [1:6665] "D" "B" "B" "C" ...

#merubah var ke factor

data.SP$Gender <- as.factor(data.SP$Gender)
data.SP$Ever_Married <- as.factor(data.SP$Ever_Married)
data.SP$Graduated <- as.factor(data.SP$Graduated)
data.SP$Profession <- as.factor(data.SP$Profession)
data.SP$Spending_Score <- as.factor(data.SP$Spending_Score)
data.SP$Var_1 <- as.factor(data.SP$Var_1)
data.SP$Segmentation <- as.factor(data.SP$Segmentation)


str(data.SP)
## tibble [6,665 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Gender         : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 1 1 1 1 2 ...
##  $ Ever_Married   : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 2 2 1 1 ...
##  $ Age            : num [1:6665] 22 67 67 56 32 33 61 55 26 19 ...
##  $ Graduated      : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 2 2 1 ...
##  $ Profession     : Factor w/ 9 levels "Artist","Doctor",..: 6 3 8 1 6 6 3 1 3 6 ...
##  $ Work_Experience: num [1:6665] 1 1 0 0 1 1 0 1 1 4 ...
##  $ Spending_Score : Factor w/ 3 levels "Average","High",..: 3 3 2 1 3 3 3 1 3 3 ...
##  $ Family_Size    : num [1:6665] 4 1 2 2 3 3 3 4 3 4 ...
##  $ Var_1          : Factor w/ 7 levels "Cat_1","Cat_2",..: 4 6 6 6 6 6 7 6 6 4 ...
##  $ Segmentation   : Factor w/ 4 levels "A","B","C","D": 4 2 2 3 3 4 4 3 1 4 ...
boxplot(Age~Segmentation, data = data.SP,
        main = "Boxplot umur dan segmentasi pelanggan",
        col = c("lightgreen", "lightblue", "mistyrose"))

# EDA

barplot (table(data.SP$Segmentation),
         main = "bar plot Segmentasi Pelanggan",
         xlab = "Segmentasi Pelanggan",
         ylab = "jumlah",
         col = c("lightgreen", "lightblue", "mistyrose"))

barplot (table(data.SP$Segmentation, data.SP$Spending_Score),
         main = "bar plot segmentasi pelanggan dan spending score",
         xlab = "segmentasi pelanggan",
         ylab = "jumlah",
         col = c("lightgreen", "lightblue", "mistyrose"),
         beside = TRUE)

legend("topright",
c("low", "High", "average"),
fill = c("lightgreen", "lightblue", "mistyrose")
)

## sumary

library(arsenal)

tab <- tableby(Segmentation~., data = data.SP)
summary(tab, text = TRUE)
## 
## 
## |                 |   A (N=1616)    |   B (N=1572)    |   C (N=1720)    |   D (N=1757)    | Total (N=6665)  | p value|
## |:----------------|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|-------:|
## |Gender           |                 |                 |                 |                 |                 |   0.003|
## |-  Female        |   740 (45.8%)   |   732 (46.6%)   |   795 (46.2%)   |   721 (41.0%)   |  2988 (44.8%)   |        |
## |-  Male          |   876 (54.2%)   |   840 (53.4%)   |   925 (53.8%)   |  1036 (59.0%)   |  3677 (55.2%)   |        |
## |Ever_Married     |                 |                 |                 |                 |                 | < 0.001|
## |-  No            |   676 (41.8%)   |   419 (26.7%)   |   338 (19.7%)   |  1288 (73.3%)   |  2721 (40.8%)   |        |
## |-  Yes           |   940 (58.2%)   |  1153 (73.3%)   |  1382 (80.3%)   |   469 (26.7%)   |  3944 (59.2%)   |        |
## |Age              |                 |                 |                 |                 |                 | < 0.001|
## |-  Mean (SD)     | 44.421 (16.173) | 48.113 (14.694) | 49.328 (14.479) | 32.957 (15.262) | 43.536 (16.524) |        |
## |-  Range         | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 | 18.000 - 89.000 |        |
## |Graduated        |                 |                 |                 |                 |                 | < 0.001|
## |-  No            |   591 (36.6%)   |   421 (26.8%)   |   286 (16.6%)   |  1118 (63.6%)   |  2416 (36.2%)   |        |
## |-  Yes           |  1025 (63.4%)   |  1151 (73.2%)   |  1434 (83.4%)   |   639 (36.4%)   |  4249 (63.8%)   |        |
## |Profession       |                 |                 |                 |                 |                 | < 0.001|
## |-  Artist        |   470 (29.1%)   |   660 (42.0%)   |   948 (55.1%)   |   114 (6.5%)    |  2192 (32.9%)   |        |
## |-  Doctor        |   168 (10.4%)   |   127 (8.1%)    |   123 (7.2%)    |   174 (9.9%)    |   592 (8.9%)    |        |
## |-  Engineer      |   221 (13.7%)   |   159 (10.1%)   |    64 (3.7%)    |   138 (7.9%)    |   582 (8.7%)    |        |
## |-  Entertainment |   320 (19.8%)   |   191 (12.2%)   |   125 (7.3%)    |   173 (9.8%)    |   809 (12.1%)   |        |
## |-  Executive     |    98 (6.1%)    |   149 (9.5%)    |   164 (9.5%)    |    94 (5.4%)    |   505 (7.6%)    |        |
## |-  Healthcare    |    88 (5.4%)    |    85 (5.4%)    |   124 (7.2%)    |   780 (44.4%)   |  1077 (16.2%)   |        |
## |-  Homemaker     |    48 (3.0%)    |    44 (2.8%)    |    19 (1.1%)    |    64 (3.6%)    |   175 (2.6%)    |        |
## |-  Lawyer        |   157 (9.7%)    |   133 (8.5%)    |   124 (7.2%)    |    86 (4.9%)    |   500 (7.5%)    |        |
## |-  Marketing     |    46 (2.8%)    |    24 (1.5%)    |    29 (1.7%)    |   134 (7.6%)    |   233 (3.5%)    |        |
## |Work_Experience  |                 |                 |                 |                 |                 | < 0.001|
## |-  Mean (SD)     |  2.889 (3.627)  |  2.394 (3.247)  |  2.224 (3.037)  |  2.997 (3.611)  |  2.629 (3.405)  |        |
## |-  Range         | 0.000 - 14.000  | 0.000 - 14.000  | 0.000 - 14.000  | 0.000 - 14.000  | 0.000 - 14.000  |        |
## |Spending_Score   |                 |                 |                 |                 |                 | < 0.001|
## |-  Average       |   270 (16.7%)   |   499 (31.7%)   |   795 (46.2%)   |    98 (5.6%)    |  1662 (24.9%)   |        |
## |-  High          |   214 (13.2%)   |   315 (20.0%)   |   370 (21.5%)   |   105 (6.0%)    |  1004 (15.1%)   |        |
## |-  Low           |  1132 (70.0%)   |   758 (48.2%)   |   555 (32.3%)   |  1554 (88.4%)   |  3999 (60.0%)   |        |
## |Family_Size      |                 |                 |                 |                 |                 | < 0.001|
## |-  Mean (SD)     |  2.432 (1.480)  |  2.683 (1.411)  |  2.962 (1.374)  |  3.241 (1.681)  |  2.841 (1.525)  |        |
## |-  Range         |  1.000 - 9.000  |  1.000 - 9.000  |  1.000 - 9.000  |  1.000 - 9.000  |  1.000 - 9.000  |        |
## |Var_1            |                 |                 |                 |                 |                 | < 0.001|
## |-  Cat_1         |    24 (1.5%)    |    21 (1.3%)    |    23 (1.3%)    |    36 (2.0%)    |   104 (1.6%)    |        |
## |-  Cat_2         |    71 (4.4%)    |    89 (5.7%)    |    86 (5.0%)    |   116 (6.6%)    |   362 (5.4%)    |        |
## |-  Cat_3         |   175 (10.8%)   |   150 (9.5%)    |   118 (6.9%)    |   191 (10.9%)   |   634 (9.5%)    |        |
## |-  Cat_4         |   258 (16.0%)   |   188 (12.0%)   |    92 (5.3%)    |   311 (17.7%)   |   849 (12.7%)   |        |
## |-  Cat_5         |    16 (1.0%)    |    20 (1.3%)    |    17 (1.0%)    |    21 (1.2%)    |    74 (1.1%)    |        |
## |-  Cat_6         |  1030 (63.7%)   |  1067 (67.9%)   |  1344 (78.1%)   |  1035 (58.9%)   |  4476 (67.2%)   |        |
## |-  Cat_7         |    42 (2.6%)    |    37 (2.4%)    |    40 (2.3%)    |    47 (2.7%)    |   166 (2.5%)    |        |

partisi data

acak <- createDataPartition(data.SP$Segmentation, p=0.8, list=FALSE)
data_train <- data.SP[acak,]
data_test <- data.SP[-acak,]
summary(data_train)
##     Gender     Ever_Married      Age        Graduated          Profession  
##  Female:2422   No :2190     Min.   :18.00   No :1928   Artist       :1742  
##  Male  :2911   Yes:3143     1st Qu.:31.00   Yes:3405   Healthcare   : 862  
##                             Median :41.00              Entertainment: 651  
##                             Mean   :43.64              Doctor       : 477  
##                             3rd Qu.:53.00              Engineer     : 457  
##                             Max.   :89.00              Lawyer       : 417  
##                                                        (Other)      : 727  
##  Work_Experience  Spending_Score  Family_Size      Var_1      Segmentation
##  Min.   : 0.000   Average:1320   Min.   :1.000   Cat_1:  85   A:1293      
##  1st Qu.: 0.000   High   : 808   1st Qu.:2.000   Cat_2: 298   B:1258      
##  Median : 1.000   Low    :3205   Median :2.000   Cat_3: 496   C:1376      
##  Mean   : 2.615                  Mean   :2.839   Cat_4: 697   D:1406      
##  3rd Qu.: 4.000                  3rd Qu.:4.000   Cat_5:  62               
##  Max.   :14.000                  Max.   :9.000   Cat_6:3565               
##                                                  Cat_7: 130
summary(data_test)
##     Gender    Ever_Married      Age        Graduated         Profession 
##  Female:566   No :531      Min.   :18.00   No :488   Artist       :450  
##  Male  :766   Yes:801      1st Qu.:31.00   Yes:844   Healthcare   :215  
##                            Median :41.00             Entertainment:158  
##                            Mean   :43.11             Engineer     :125  
##                            3rd Qu.:52.00             Doctor       :115  
##                            Max.   :89.00             Executive    : 97  
##                                                      (Other)      :172  
##  Work_Experience  Spending_Score  Family_Size      Var_1     Segmentation
##  Min.   : 0.000   Average:342    Min.   :1.000   Cat_1: 19   A:323       
##  1st Qu.: 0.000   High   :196    1st Qu.:2.000   Cat_2: 64   B:314       
##  Median : 1.000   Low    :794    Median :2.500   Cat_3:138   C:344       
##  Mean   : 2.685                  Mean   :2.848   Cat_4:152   D:351       
##  3rd Qu.: 4.000                  3rd Qu.:4.000   Cat_5: 12               
##  Max.   :14.000                  Max.   :9.000   Cat_6:911               
##                                                  Cat_7: 36

Create model

model.SP <- multinom(Segmentation~., data = data_train)
## # weights:  96 (69 variable)
## initial  value 7393.107828 
## iter  10 value 6369.667414
## iter  20 value 6035.063466
## iter  30 value 5878.763050
## iter  40 value 5844.961427
## iter  50 value 5840.875099
## iter  60 value 5840.564525
## iter  70 value 5840.533674
## final  value 5840.533083 
## converged
summary(model.SP)
## Call:
## multinom(formula = Segmentation ~ ., data = data_train)
## 
## Coefficients:
##   (Intercept) GenderMale Ever_MarriedYes         Age GraduatedYes
## B  -0.4283262 -0.2186115      0.01601032  0.01980924    0.3866280
## C  -0.9085080 -0.3752771     -0.09030023  0.02630782    0.8548292
## D  -0.8343470  0.2598026     -0.21434561 -0.02295938   -0.4302646
##   ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B       -0.4277846         -0.7313216              -0.9383434
## C       -0.7117334         -1.8327770              -1.5478749
## D        0.9911519          0.8234278               0.6347058
##   ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B         -0.06976615           0.09752317          -0.3962112       -0.9783796
## C         -0.29704525           0.47969891          -1.5570698       -1.4027101
## D          1.67984112           2.85705444           1.5391379        1.8985918
##   ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B          -0.5283807     -0.03286067         -0.4287972        -0.9680013
## C          -0.5063767     -0.04185710         -0.7947333        -1.7199668
## D           2.2798178      0.03622472         -0.1767483         0.5416523
##   Family_Size  Var_1Cat_2  Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6
## B   0.1476870  0.47624532 -0.06017823 -0.2208119  0.4987125  0.1153367
## C   0.3052380  0.43335181 -0.01694129 -0.8157069  0.2317707  0.4619689
## D   0.1001686 -0.07324749 -0.36516893 -0.1653382 -0.6396496 -0.1028683
##   Var_1Cat_7
## B -0.1068873
## C  0.1110324
## D -0.4037213
## 
## Std. Errors:
##   (Intercept) GenderMale Ever_MarriedYes         Age GraduatedYes
## B   0.4157176 0.09101823       0.1235443 0.003904535   0.09604601
## C   0.4500204 0.09485061       0.1385390 0.004125147   0.10753786
## D   0.4514081 0.09805709       0.1289574 0.004651917   0.09648975
##   ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B        0.1548611          0.1502176               0.1323352
## C        0.1634887          0.1911678               0.1487386
## D        0.1788079          0.1832287               0.1629675
##   ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B           0.1934586            0.1966894           0.2654634        0.1928357
## C           0.1982932            0.1900196           0.3598920        0.2021920
## D           0.2331878            0.1865164           0.2564159        0.2467372
##   ProfessionMarketing Work_Experience Spending_ScoreHigh Spending_ScoreLow
## B           0.2956932      0.01289386          0.1564656         0.1267418
## C           0.3050035      0.01367874          0.1596351         0.1353038
## D           0.2420811      0.01284578          0.2145613         0.1687526
##   Family_Size Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B  0.03307550  0.3781625  0.3538461  0.3474185  0.5016648  0.3326341  0.4233360
## C  0.03456831  0.4095465  0.3862873  0.3870741  0.5588105  0.3611670  0.4526103
## D  0.03259602  0.3867674  0.3598448  0.3505326  0.5317437  0.3379306  0.4264134
## 
## Residual Deviance: 11681.07 
## AIC: 11819.07

uji molticol

library(car)
## Loading required package: carData
vif(model.SP)
## Warning in vif.default(model.SP): No intercept: vifs may not be sensible.
##                       GVIF Df GVIF^(1/(2*Df))
## Gender            3.976892  1        1.994215
## Ever_Married      9.667996  1        3.109340
## Age              34.419459  1        5.866810
## Graduated         5.992289  1        2.447915
## Profession      189.172610  8        1.387726
## Work_Experience   2.448620  1        1.564807
## Spending_Score   23.721237  2        2.206909
## Family_Size       9.265946  1        3.044002
## Var_1           871.157406  6        1.757956

uji simultan

z <- summary(model.SP)$coefficients/summary(model.SP)$standard.errors
p <- (1-pnorm(abs(z), 0.1))*2
data.frame(p)

model 2 (membuang variabel yang terindikasi terjadi multikolinieritas)

model.SP2 <- multinom(Segmentation~.-Spending_Score, data = data_train)
## # weights:  88 (63 variable)
## initial  value 7393.107828 
## iter  10 value 6396.287179
## iter  20 value 6125.539208
## iter  30 value 6011.860988
## iter  40 value 5978.258329
## iter  50 value 5976.093471
## iter  60 value 5975.916430
## final  value 5975.910032 
## converged
summary(model.SP2)
## Call:
## multinom(formula = Segmentation ~ . - Spending_Score, data = data_train)
## 
## Coefficients:
##   (Intercept) GenderMale Ever_MarriedYes         Age GraduatedYes
## B  -1.5641453 -0.2300263       0.5264565  0.01889156    0.3954291
## C  -2.8563143 -0.3872174       0.9348297  0.02443910    0.8729774
## D  -0.2437396  0.2783187      -0.5129394 -0.02311838   -0.4284313
##   ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B       -0.3883998         -0.7074396              -0.9005865
## C       -0.6311860         -1.7895388              -1.4762634
## D        0.9823071          0.8024699               0.6297287
##   ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B         -0.09297969           0.03500562          -0.3677093        -1.058893
## C         -0.45277605           0.33681790          -1.5563471        -1.628460
## D          1.45086719           2.82305608           1.5054816         1.846770
##   ProfessionMarketing Work_Experience Family_Size Var_1Cat_2  Var_1Cat_3
## B          -0.6376131     -0.02956244   0.2023769  0.5513853 -0.04409545
## C          -0.8075547     -0.03840099   0.3961676  0.5425072 -0.01434540
## D           2.2650721      0.03437806   0.1042155 -0.1138639 -0.39014501
##   Var_1Cat_4 Var_1Cat_5 Var_1Cat_6  Var_1Cat_7
## B -0.1486042  0.4871423  0.1685284 -0.11795422
## C -0.7540704  0.1954148  0.5177851  0.05015725
## D -0.2082483 -0.6230623 -0.1275978 -0.42814179
## 
## Std. Errors:
##   (Intercept) GenderMale Ever_MarriedYes         Age GraduatedYes
## B   0.3862803 0.09011925       0.1029269 0.003862444   0.09522617
## C   0.4126256 0.09255520       0.1104434 0.004014716   0.10558442
## D   0.4107473 0.09760378       0.1104226 0.004617516   0.09632717
##   ProfessionDoctor ProfessionEngineer ProfessionEntertainment
## B        0.1530460          0.1484033               0.1304411
## C        0.1585061          0.1865292               0.1435636
## D        0.1781065          0.1828797               0.1627603
##   ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer
## B           0.1765455            0.1959535           0.2638225        0.1851403
## C           0.1782405            0.1878996           0.3561844        0.1903369
## D           0.2167941            0.1861343           0.2565495        0.2408259
##   ProfessionMarketing Work_Experience Family_Size Var_1Cat_2 Var_1Cat_3
## B           0.2947840      0.01280345  0.03221376  0.3742516  0.3504124
## C           0.3036744      0.01340298  0.03321924  0.3974474  0.3741214
## D           0.2421139      0.01282702  0.03267065  0.3863339  0.3599848
##   Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
## B  0.3437350  0.4963451  0.3291377  0.4198450
## C  0.3754024  0.5454959  0.3493941  0.4393662
## D  0.3504844  0.5326834  0.3379578  0.4262211
## 
## Residual Deviance: 11951.82 
## AIC: 12077.82
model.SP$AIC
## [1] 11819.07
model.SP2$AIC
## [1] 12077.82

#prediksi data pada test

predict_prob = predict(model.SP, data_test, type = "prob")
head(predict_prob,10)
##             A          B           C           D
## 1  0.05713456 0.02317009 0.009573713 0.910121639
## 2  0.47802631 0.26988845 0.094945901 0.157139330
## 3  0.07793862 0.27254727 0.640067841 0.009446265
## 4  0.30219613 0.31861772 0.321718335 0.057467817
## 5  0.15812749 0.40392015 0.414888923 0.023063436
## 6  0.05872954 0.03801525 0.041748318 0.861506890
## 7  0.09734635 0.28246934 0.609089547 0.011094770
## 8  0.08995961 0.30341785 0.599531777 0.007090763
## 9  0.46285073 0.13119594 0.064763827 0.341189508
## 10 0.20908934 0.29751801 0.403331677 0.090060978
head(data.frame(predict_prob, data_test$Segmentation),10)

confution matrix

prediksi.test <- predict(model.SP, data_test,type = "class")
data_test$Segmentation<-as.factor(data_test$Segmentation)
confusionMatrix(as.factor(prediksi.test),
                data_test$Segmentation)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D
##          A 152 103  39  75
##          B  48  59  35  17
##          C  68 120 231   9
##          D  55  32  39 250
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5195          
##                  95% CI : (0.4923, 0.5467)
##     No Information Rate : 0.2635          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3571          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D
## Sensitivity            0.4706  0.18790   0.6715   0.7123
## Specificity            0.7849  0.90177   0.8006   0.8716
## Pos Pred Value         0.4119  0.37107   0.5397   0.6649
## Neg Pred Value         0.8224  0.78261   0.8750   0.8944
## Prevalence             0.2425  0.23574   0.2583   0.2635
## Detection Rate         0.1141  0.04429   0.1734   0.1877
## Detection Prevalence   0.2770  0.11937   0.3213   0.2823
## Balanced Accuracy      0.6278  0.54483   0.7361   0.7919

interpretasi (odds ratio)

data.frame(summary(model.SP)$coefficients)

#odds ratio

data.frame(exp(summary(model.SP)$coefficients))

interpretasi

segmentasi B terhadap segmentasi A

intercept 0.5452851 artinya, responden dengan jenis kelamin laki-laki, sudah menikah, sudah lulus, profesi dokter, dll. ods memilih Segmentasi pelayanan B sebesar 0.54522851 kali dibanding memilih segmentasi pelayan A

gander male 0.8445335 artinya, responden dengan jenis kelamin laki-laki, odds memilih segmentasi pelayanan B sebanyak 0.8445335 kali dibanding segmentasi pelayan A

status sudah menikah 1.0220087, responden dengan status sudah menikah, odds memilih segmentasi pelayanan B sebanyak 1.0220087 kali dibanding segmentasi pelayan A