Analisis Asuransi Kesehatan

1. Input Library

library(psych)
## Warning: package 'psych' was built under R version 4.4.1
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

2. Read Data

# Load the raw insurance records; character columns are read in as factors.
insurance <- read.csv(
  file = "insurance data.csv",
  stringsAsFactors = TRUE
)

# Show the loaded data frame as a paged table in the rendered report.
rmarkdown::paged_table(insurance)

Description

- age : umur nasabah
- sex : jenis kelamin nasabah
- bmi : indeks massa tubuh
- children : jumlah tanggungan anak nasabah
- smoker : apakah nasabah perokok atau tidak
- Claim Amount :
- Past Consultations :
- Num of steps :
- Hospital Expenditure :
- Number of past Hospitalizations :
- Anual Salary :
- region : region tempat tinggal nasabah
- charges : biaya pengobatan (medical cost) yang ditanggung oleh pihak asuransi terhadap nasabah

3. Data Wrangling

Inspeksi data unique

# Distinct values of the `children` column, in order of first appearance.
unique(insurance[["children"]])
## [1]  2  0  3  1  4  5 NA

Menghapus data duplikat

# Keep only the first occurrence of each fully identical row.
insurance <- unique(insurance)

Inspeksi missing value

# Number of missing values in each column.
colSums(is.na(insurance))
##                             age                             sex 
##                               9                               0 
##                             bmi                        children 
##                               3                               5 
##                          smoker                    Claim_Amount 
##                               0                              14 
##              past_consultations                    num_of_steps 
##                               6                               3 
##            Hospital_expenditure NUmber_of_past_hospitalizations 
##                               4                               2 
##                    Anual_Salary                          region 
##                               6                               0 
##                         charges 
##                               0

Menghapus kolom yang mengandung NA

# Logical flag per column: TRUE when the column has no missing values.
complete_columns <- !vapply(insurance, anyNA, logical(1))

# Keep every row, but only the columns flagged as complete.
insurance_clean <- insurance[, complete_columns, drop = FALSE]

# Print the structure of the reduced data frame.
str(insurance_clean)
## 'data.frame':    1334 obs. of  4 variables:
##  $ sex    : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 1 1 ...
##  $ smoker : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ region : Factor w/ 4 levels "northeast","northwest",..: 4 4 2 1 2 3 4 2 4 3 ...
##  $ charges: num  8028 13823 2131 6749 2709 ...

Inspeksi range variable target

# Minimum and maximum of the target variable.
range(insurance[["charges"]])
## [1]  1121.874 63770.428

4. Summary Data

# Per-column summary: quartiles, mean and NA count for numeric columns,
# level counts for factor columns.
summary(insurance)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:660   Min.   :15.96   Min.   :0.000   no :1062  
##  1st Qu.:27.00   male  :674   1st Qu.:26.25   1st Qu.:0.000   yes: 272  
##  Median :39.00                Median :30.36   Median :1.000             
##  Mean   :39.26                Mean   :30.65   Mean   :1.091             
##  3rd Qu.:51.00                3rd Qu.:34.60   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##  NA's   :9                    NA's   :3       NA's   :5                 
##   Claim_Amount   past_consultations  num_of_steps     Hospital_expenditure
##  Min.   : 1920   Min.   : 1.00      Min.   : 695430   Min.   :    29453   
##  1st Qu.:20715   1st Qu.: 9.00      1st Qu.: 847017   1st Qu.:  4062837   
##  Median :33670   Median :15.00      Median : 913998   Median :  7475057   
##  Mean   :33302   Mean   :15.19      Mean   : 909599   Mean   : 15701180   
##  3rd Qu.:45034   3rd Qu.:20.00      3rd Qu.: 970836   3rd Qu.: 10802988   
##  Max.   :77278   Max.   :40.00      Max.   :1107872   Max.   :261631699   
##  NA's   :14      NA's   :6          NA's   :3         NA's   :4           
##  NUmber_of_past_hospitalizations  Anual_Salary             region   
##  Min.   :0.000                   Min.   :2.747e+06   northeast:323  
##  1st Qu.:1.000                   1st Qu.:7.699e+07   northwest:324  
##  Median :1.000                   Median :1.414e+08   southeast:363  
##  Mean   :1.058                   Mean   :3.665e+08   southwest:324  
##  3rd Qu.:1.000                   3rd Qu.:3.205e+08                  
##  Max.   :3.000                   Max.   :4.117e+09                  
##  NA's   :2                       NA's   :6                          
##     charges     
##  Min.   : 1122  
##  1st Qu.: 4724  
##  Median : 9333  
##  Mean   :13200  
##  3rd Qu.:16455  
##  Max.   :63770  
## 

5. Exploratory Data

Cek persebaran data numerik pada dataset

# Pair plot of the numeric variables only.
# Columns are selected by type instead of by hard-coded position: the previous
# index vector -c(2, 5, 6) removed the numeric Claim_Amount (column 6) and left
# the `region` factor (column 12) in the plot.
numeric_columns <- vapply(insurance, is.numeric, logical(1))
pairs.panels(insurance[, numeric_columns, drop = FALSE])

Age Distribution Analysis

# Ages with the missing entries removed.
age_data <- with(insurance, age[!is.na(age)])

# Summary statistics shown as labels on the plot.
age_mean <- mean(age_data)
age_median <- median(age_data)
age_range <- range(age_data)
age_min <- age_range[1]
age_max <- age_range[2]
quartiles <- quantile(age_data, probs = c(0.25, 0.75))
age_q1 <- quartiles["25%"]
age_q3 <- quartiles["75%"]

# Helper: a size-3 text annotation at x = 0.5 and height `y`.
age_label <- function(y, text, color, vjust = -0.5) {
  annotate(
    "text",
    x = 0.5, y = y, label = text,
    color = color, vjust = vjust, size = 3
  )
}

# Box plot of age, with one label per statistic.
ggplot(data = data.frame(age = age_data), aes(x = "", y = age)) +
  geom_boxplot(fill = "skyblue", color = "black", outlier.color = "red") +
  labs(x = "", y = "Age", title = "Box Plot of Age") +
  theme_minimal() +
  age_label(age_mean, paste0("Mean: ", round(age_mean, 2)), "green") +
  age_label(age_median, paste0("Median: ", round(age_median, 2)), "orange") +
  age_label(age_min, paste0("Min: ", age_min), "red", vjust = 1) +
  age_label(age_max, paste0("Max: ", age_max), "blue", vjust = 1) +
  age_label(age_q1, paste0("Q1: ", round(age_q1, 2)), "purple") +
  age_label(age_q3, paste0("Q3: ", round(age_q3, 2)), "brown")

💡 Ukuran statistik utama yang diperoleh dari plot adalah sebagai berikut:

  • Usia Rata-rata: Usia rata-rata individu dalam kumpulan data adalah sekitar 39 tahun.
  • Usia Median: Usia median, yang mewakili nilai tengah kumpulan data jika disusun dalam urutan menaik, adalah 39,0 tahun.
  • Usia Minimum: Individu termuda dalam kumpulan data berusia 18,0 tahun.
  • Usia Maksimum: Individu tertua dalam kumpulan data berusia 64,0 tahun.
  • Kuartil Pertama (Q1): 25% dari kumpulan data berada di bawah usia 27,0 tahun.
  • Kuartil Ketiga (Q3): 75% dari kumpulan data berada di bawah usia 51,0 tahun.

Region-wise Insurance Charges Analysis

# Mean charges per region.
average_charges_by_region <- aggregate(charges ~ region, data = insurance, FUN = mean)

# Sort by mean charges, highest first, and store that order in the factor
# levels. ggplot2 orders a discrete axis by factor level, not by row order, so
# sorting the rows alone leaves the bars in alphabetical order.
average_charges_by_region <-
  average_charges_by_region[order(-average_charges_by_region$charges), ]
average_charges_by_region$region <- factor(
  average_charges_by_region$region,
  levels = as.character(average_charges_by_region$region)
)

# One fill colour per region, assigned in level (i.e. descending-charge) order.
colors <- c("skyblue", "orange", "green", "red")

# Bar plot of mean charges by region.
# - The fill comes from the mapped `region` aesthetic and scale_fill_manual();
#   the earlier constant `fill = colors` on the layer overrode that mapping.
# - theme_minimal() is a complete theme and replaces earlier theme() settings,
#   so it must come before the axis-text rotation.
ggplot(average_charges_by_region, aes(x = region, y = charges, fill = region)) +
  geom_col(color = "black", show.legend = FALSE) +
  labs(
    x = "Region",
    y = "Average Insurance Charges",
    title = "Average Insurance Charges by Region"
  ) +
  scale_fill_manual(values = colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

💡 Grafik di atas memberikan gambaran rata-rata biaya asuransi untuk berbagai wilayah berdasarkan analisis data.

  • South East Region: Biaya Asuransi Rata-rata: $11.750 South East Region menunjukkan rata-rata biaya asuransi tertinggi di antara seluruh wilayah yang dianalisis.

  • North East Region: Biaya Asuransi Rata-rata: $11.500 North East Region berada sedikit di bawah South East Region dalam hal rata-rata biaya asuransi.

  • North West Region: Biaya Asuransi Rata-rata: $11.250 North West Region menunjukkan rata-rata biaya asuransi yang sedikit lebih rendah dibandingkan North East Region.

  • South West Region: Biaya Asuransi Rata-rata: $11.000 South West Region mempunyai biaya asuransi rata-rata terendah di antara seluruh wilayah yang diteliti.

Kesimpulan: Analisis ini menunjukkan adanya variasi yang signifikan dalam biaya asuransi rata-rata di berbagai wilayah, dengan South East Region yang memiliki rata-rata biaya asuransi tertinggi.

Insurance Charges by Number of Children Analysis

# Violin plot of charges per number of children, with a narrow box plot and a
# red point at the median overlaid on each violin.
# The axis labels come straight from the levels of factor(children). The
# earlier scale_x_discrete(labels = unique(children)) relabelled the sorted
# levels with the values in order of first appearance, so the tick labels did
# not match the groups they were drawn under.
ggplot(insurance, aes(x = factor(children), y = charges)) +
  geom_violin(trim = FALSE, fill = "skyblue") +
  geom_boxplot(width = 0.1, fill = "orange", color = "black") +
  stat_summary(fun = median, geom = "point", shape = 20, size = 2, color = "red") +
  labs(
    x = "Number of Children",
    y = "Insurance Charges",
    title = "Insurance Charges by Number of Children"
  ) +
  theme_minimal()

Age-wise Trend Analysis

# Mean charges for each distinct age.
average_charges_by_age <- aggregate(
  charges ~ age,
  data = insurance,
  FUN = mean
)

# Line plot of mean charges against age, with a marker at every age.
ggplot(average_charges_by_age, aes(x = age, y = charges)) +
  geom_line(color = "blue") +
  geom_point(color = "orange", size = 3) +
  xlab("Age") +
  ylab("Average Insurance Charges") +
  ggtitle("Age-wise Trend in Insurance Charges") +
  theme_minimal()

💡 Analisis yang dilakukan terhadap biaya asuransi berdasarkan usia menunjukkan tren yang jelas dan diharapkan: seiring bertambahnya usia, biaya asuransi mereka cenderung meningkat. Pengamatan ini sejalan dengan perkembangan alami risiko kesehatan dan kebutuhan layanan kesehatan seiring bertambahnya usia.

Insurance Charges by Gender and Smoking Status Analysis

# Mean charges for every combination of gender and smoking status.
average_charges <- aggregate(charges ~ sex + smoker, data = insurance, FUN = mean)

# Fix the level order so "no" is drawn (and listed in the legend) before "yes".
average_charges$smoker <- factor(average_charges$smoker, levels = c("no", "yes"))

# Grouped bar plot. The bars are placed side by side instead of stacked:
# stacking two group means gives a total that is a sum of averages, which has
# no interpretation and hides the smoker vs. non-smoker comparison.
ggplot(average_charges, aes(x = sex, y = charges, fill = smoker)) +
  geom_col(position = "dodge", width = 0.7) +
  labs(
    x = "Gender",
    y = "Average Insurance Charges",
    title = "Insurance Charges by Gender and Smoking Status"
  ) +
  scale_fill_manual(
    values = c("skyblue", "salmon"),
    labels = c("Non-Smoker", "Smoker")
  ) +
  theme_minimal()

💡 Data menunjukkan bahwa masyarakat yang tidak merokok, baik pria maupun wanita, biasanya memiliki biaya asuransi yang lebih rendah. Namun jika bicara soal perokok, ada perbedaan menarik antara pria dan wanita. Wanita yang merokok cenderung memiliki biaya asuransi yang lebih rendah dibandingkan pria yang merokok.

Hal ini menunjukkan bahwa walaupun merokok pada umumnya menyebabkan biaya asuransi lebih tinggi, mungkin ada beberapa perbedaan antara laki-laki dan perempuan dalam cara menentukan biaya tersebut.

BMI vs. Charges

# Scatter plot of charges against BMI; points are semi-transparent.
ggplot(insurance, aes(x = bmi, y = charges)) +
  geom_point(alpha = 0.5, color = "blue") +
  xlab("BMI") +
  ylab("Charges") +
  ggtitle("BMI vs. Charges") +
  theme_minimal()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

Charges by gender Analysis

# Box plot of charges for each gender, filled by gender.
ggplot(insurance, aes(x = sex, y = charges, fill = sex)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Insurance Charges") +
  ggtitle("Insurance Charges by Gender") +
  theme_minimal()

Insurance Charges by Number of Hospitalizations Analysis

# Mean charges for each number of past hospitalizations.
average_charges <- aggregate(
  charges ~ NUmber_of_past_hospitalizations,
  data = insurance,
  FUN = mean
)

# Bar plot of the means. theme_minimal() is a complete theme and replaces any
# theme() settings added before it, so the axis-text rotation is applied last.
ggplot(average_charges, aes(x = factor(NUmber_of_past_hospitalizations), y = charges)) +
  geom_col(fill = "skyblue") +
  labs(
    x = "Number of Past Hospitalizations",
    y = "Average Insurance Charges",
    title = "Insurance Charges by Number of Hospitalizations"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

💡 Data menunjukkan bahwa biaya asuransi cenderung meningkat seiring dengan meningkatnya jumlah rawat inap. Ketika individu mengalami lebih banyak rawat inap, biaya asuransi mereka juga meningkat.