Load Library
library(nnet)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#Import dataset
df <- read.csv("health_insurance.csv")
df <- janitor::clean_names(df)
str(df)
## 'data.frame': 1453 obs. of 6 variables:
## $ product : chr "C" "A" "C" "A" ...
## $ age : int 57 21 66 36 23 31 37 37 55 66 ...
## $ household : int 2 7 7 4 0 5 3 0 3 2 ...
## $ position_level: int 2 2 2 2 2 1 3 3 3 4 ...
## $ gender : chr "Male" "Male" "Male" "Female" ...
## $ absent : int 10 7 1 6 11 14 12 25 3 18 ...
#data prepocessing
df$product <- as.factor(df$product)
df$gender <- as.factor(df$gender)
#eksplorasi data
#distribusi pemilihan produk asuransi
ggplot(df, aes(x = product, fill = product)) +
geom_bar() +
labs(title = "Distribusi Pemilihan Produk Asuransi", x = "Produk", y = "Jumlah") +
theme_minimal()

#usi berdasarkan produk
ggplot(df, aes(x = product, y = age, fill = product)) +
geom_boxplot() +
labs(title = "Usia berdasarkan Produk", x = "Produk", y = "Usia") +
theme_minimal()

#proporsi gender dalam pemilihan produk
ggplot(df, aes(x = gender, fill = product)) +
geom_bar(position = "fill") +
labs(title = "Proporsi Gender dalam Pemilihan Produk", x = "Gender", y = "Proporsi") +
theme_minimal()

#split data & modeling
set.seed(123)
train_index <- createDataPartition(df$product, p = 0.8, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]
#model regresi logistik multinomial
model <- multinom(product ~ age + household + position_level + gender + absent, data = train)
## # weights: 24 (14 variable)
## initial value 1278.784704
## iter 10 value 816.403274
## iter 20 value 609.019776
## final value 608.936017
## converged
summary(model)
## Call:
## multinom(formula = product ~ age + household + position_level +
## gender + absent, data = train)
##
## Coefficients:
## (Intercept) age household position_level genderMale genderNon-binary
## B -4.821029 0.2440100 -0.9736457 -0.3260307 -2.1801556 0.2993189
## C -10.423406 0.2719315 0.1821444 -0.1591155 0.1883674 -1.1883887
## absent
## B 0.010914879
## C 0.006672787
##
## Std. Errors:
## (Intercept) age household position_level genderMale genderNon-binary
## B 0.5728851 0.01724552 0.07772138 0.09929768 0.2513283 1.226988
## C 0.6922356 0.01758332 0.05581736 0.09097226 0.2164298 2.012470
## absent
## B 0.01414291
## C 0.01365930
##
## Residual Deviance: 1217.872
## AIC: 1245.872
#odds ratio
coefs <- coef(model)
OR <- exp(coefs)
round(OR, 2)
## (Intercept) age household position_level genderMale genderNon-binary absent
## B 0.01 1.28 0.38 0.72 0.11 1.35 1.01
## C 0.00 1.31 1.20 0.85 1.21 0.30 1.01
#prdiksi dan evaluasi model
prediksi <- predict(model, newdata = test)
confusionMatrix(prediksi, test$product)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C
## A 82 13 9
## B 14 74 13
## C 3 4 77
##
## Overall Statistics
##
## Accuracy : 0.8062
## 95% CI : (0.7559, 0.8502)
## No Information Rate : 0.3426
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7095
##
## Mcnemar's Test P-Value : 0.05029
##
## Statistics by Class:
##
## Class: A Class: B Class: C
## Sensitivity 0.8283 0.8132 0.7778
## Specificity 0.8842 0.8636 0.9632
## Pos Pred Value 0.7885 0.7327 0.9167
## Neg Pred Value 0.9081 0.9096 0.8927
## Prevalence 0.3426 0.3149 0.3426
## Detection Rate 0.2837 0.2561 0.2664
## Detection Prevalence 0.3599 0.3495 0.2907
## Balanced Accuracy 0.8562 0.8384 0.8705