Load Library

library(nnet)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#Import dataset
df <- read.csv("health_insurance.csv")
df <- janitor::clean_names(df)
str(df)
## 'data.frame':    1453 obs. of  6 variables:
##  $ product       : chr  "C" "A" "C" "A" ...
##  $ age           : int  57 21 66 36 23 31 37 37 55 66 ...
##  $ household     : int  2 7 7 4 0 5 3 0 3 2 ...
##  $ position_level: int  2 2 2 2 2 1 3 3 3 4 ...
##  $ gender        : chr  "Male" "Male" "Male" "Female" ...
##  $ absent        : int  10 7 1 6 11 14 12 25 3 18 ...
#data prepocessing
df$product <- as.factor(df$product)
df$gender <- as.factor(df$gender)


#eksplorasi data

#distribusi pemilihan produk asuransi
ggplot(df, aes(x = product, fill = product)) +
  geom_bar() +
  labs(title = "Distribusi Pemilihan Produk Asuransi", x = "Produk", y = "Jumlah") +
  theme_minimal()

#usi berdasarkan produk
ggplot(df, aes(x = product, y = age, fill = product)) +
  geom_boxplot() +
  labs(title = "Usia berdasarkan Produk", x = "Produk", y = "Usia") +
  theme_minimal()

#proporsi gender dalam pemilihan produk
ggplot(df, aes(x = gender, fill = product)) +
  geom_bar(position = "fill") +
  labs(title = "Proporsi Gender dalam Pemilihan Produk", x = "Gender", y = "Proporsi") +
  theme_minimal()

#split data & modeling
set.seed(123)
train_index <- createDataPartition(df$product, p = 0.8, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]

#model regresi logistik multinomial
model <- multinom(product ~ age + household + position_level + gender + absent, data = train)
## # weights:  24 (14 variable)
## initial  value 1278.784704 
## iter  10 value 816.403274
## iter  20 value 609.019776
## final  value 608.936017 
## converged
summary(model)
## Call:
## multinom(formula = product ~ age + household + position_level + 
##     gender + absent, data = train)
## 
## Coefficients:
##   (Intercept)       age  household position_level genderMale genderNon-binary
## B   -4.821029 0.2440100 -0.9736457     -0.3260307 -2.1801556        0.2993189
## C  -10.423406 0.2719315  0.1821444     -0.1591155  0.1883674       -1.1883887
##        absent
## B 0.010914879
## C 0.006672787
## 
## Std. Errors:
##   (Intercept)        age  household position_level genderMale genderNon-binary
## B   0.5728851 0.01724552 0.07772138     0.09929768  0.2513283         1.226988
## C   0.6922356 0.01758332 0.05581736     0.09097226  0.2164298         2.012470
##       absent
## B 0.01414291
## C 0.01365930
## 
## Residual Deviance: 1217.872 
## AIC: 1245.872
#odds ratio
coefs <- coef(model)
OR <- exp(coefs)
round(OR, 2)
##   (Intercept)  age household position_level genderMale genderNon-binary absent
## B        0.01 1.28      0.38           0.72       0.11             1.35   1.01
## C        0.00 1.31      1.20           0.85       1.21             0.30   1.01
#prdiksi dan evaluasi model
prediksi <- predict(model, newdata = test)
confusionMatrix(prediksi, test$product)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  A  B  C
##          A 82 13  9
##          B 14 74 13
##          C  3  4 77
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8062          
##                  95% CI : (0.7559, 0.8502)
##     No Information Rate : 0.3426          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7095          
##                                           
##  Mcnemar's Test P-Value : 0.05029         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C
## Sensitivity            0.8283   0.8132   0.7778
## Specificity            0.8842   0.8636   0.9632
## Pos Pred Value         0.7885   0.7327   0.9167
## Neg Pred Value         0.9081   0.9096   0.8927
## Prevalence             0.3426   0.3149   0.3426
## Detection Rate         0.2837   0.2561   0.2664
## Detection Prevalence   0.3599   0.3495   0.2907
## Balanced Accuracy      0.8562   0.8384   0.8705