Health Cost Prediction

Problem Statement :- Predict the cost of health insurance based on Patient and certain Demographic Features

HEALTH COST

Setting up the environment and importing data

library(ggplot2)
library(dplyr)
library(Hmisc)
library(cowplot)
library(WVPlots)
set.seed(123)
Data <- read.csv("insurance.csv")
sample_n(Data, 10)

##      age    sex    bmi children smoker    region   charges
## 385   44   male 22.135        2     no northeast  8302.536
## 1054  47   male 29.800        3    yes southwest 25309.489
## 547   28   male 35.435        0     no northeast  3268.847
## 1179  23 female 34.865        0     no northeast  2899.489
## 1255  34 female 27.720        0     no southeast  4415.159
## 61    43   male 27.360        3     no northeast  8606.217
## 704   34 female 26.410        1     no northwest  5385.338
## 1188  62 female 32.680        0     no northwest 13844.797
## 734   48 female 27.265        1     no northeast  9447.250
## 607   27 female 25.175        0     no northeast  3558.620

Understanding the data

Age: insurance contractor age, years
Sex: insurance contractor gender, [female, male]
BMI: The BMI is an attempt to quantify the amount of tissue mass (muscle, fat, and bone) in an individual, and then categorize that person as underweight, normal weight, overweight, or obese based on that value.
children: Number of Children of that Patient
Smoker: Does he smokes - [Yes,No]
Region: Region in US he belongs to
Charges: Individual medical costs billed by health insurance, $predicted_value

describe(Data)

## Data 
## 
##  7  Variables      1338  Observations
## ---------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0       47    0.999    39.21    16.21       18       19 
##      .25      .50      .75      .90      .95 
##       27       39       51       59       62 
## 
## lowest : 18 19 20 21 22, highest: 60 61 62 63 64
## ---------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##     1338        0        2 
##                         
## Value      female   male
## Frequency     662    676
## Proportion  0.495  0.505
## ---------------------------------------------------------------------------
## bmi 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0      548        1    30.66    6.893    21.26    22.99 
##      .25      .50      .75      .90      .95 
##    26.30    30.40    34.69    38.62    41.11 
## 
## lowest : 15.960 16.815 17.195 17.290 17.385, highest: 48.070 49.060 50.380 52.580 53.130
## ---------------------------------------------------------------------------
## children 
##        n  missing distinct     Info     Mean      Gmd 
##     1338        0        6    0.899    1.095    1.275 
##                                               
## Value          0     1     2     3     4     5
## Frequency    574   324   240   157    25    18
## Proportion 0.429 0.242 0.179 0.117 0.019 0.013
## ---------------------------------------------------------------------------
## smoker 
##        n  missing distinct 
##     1338        0        2 
##                       
## Value         no   yes
## Frequency   1064   274
## Proportion 0.795 0.205
## ---------------------------------------------------------------------------
## region 
##        n  missing distinct 
##     1338        0        4 
##                                                   
## Value      northeast northwest southeast southwest
## Frequency        324       325       364       325
## Proportion     0.242     0.243     0.272     0.243
## ---------------------------------------------------------------------------
## charges 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0     1337        1    13270    12301     1758     2347 
##      .25      .50      .75      .90      .95 
##     4740     9382    16640    34832    41182 
## 
## lowest :  1121.874  1131.507  1135.941  1136.399  1137.011
## highest: 55135.402 58571.074 60021.399 62592.873 63770.428
## ---------------------------------------------------------------------------

EXPLORATORY DATA ANALYSIS

x <- ggplot(Data, aes(age, charges)) +
  geom_jitter(color = "blue", alpha = 0.5) +
    theme_light()

y <- ggplot(Data, aes(bmi, charges)) +
  geom_jitter(color = "green", alpha = 0.5) +
  theme_light()

p <- plot_grid(x, y) 
title <- ggdraw() + draw_label("1. Correlation between Charges and Age / BMI", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))

x <- ggplot(Data, aes(sex, charges)) +
  geom_jitter(aes(color = sex), alpha = 0.7) +
  theme_light()

y <- ggplot(Data, aes(children, charges)) +
  geom_jitter(aes(color = children), alpha = 0.7) +
  theme_light()

p <- plot_grid(x, y) 
title <- ggdraw() + draw_label("2. Correlation between Charges and Sex / Children covered by insurance", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))

x <- ggplot(Data, aes(smoker, charges)) +
  geom_jitter(aes(color = smoker), alpha = 0.7) +
  theme_light()

y <- ggplot(Data, aes(region, charges)) +
  geom_jitter(aes(color = region), alpha = 0.7) +
  theme_light()

p <- plot_grid(x, y) 
title <- ggdraw() + draw_label("3. Correlation between Charges and Smoker / Region", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))

Plot 1: As Age and BMI go up Charges for health insurance also trends up.
Plot 2: No obvious connection between Charges and Age. Charges for insurance with 4-5 chilren covered seems to go down (doesn’t make sense, does it?)
Plot 3: Charges for Smokers are higher for non-smokers (no surprise here). No obvious connection between Charges and Region.

Linear Regression Model

Preparation and splitting the data

n_train <- round(0.8 * nrow(Data))
train_indices <- sample(1:nrow(Data), n_train)
Data_train <- Data[train_indices, ]
Data_test <- Data[-train_indices, ]

formula_0 <- as.formula("charges ~ age + sex + bmi + children + smoker + region")

Train and Test the Model

model_0 <- lm(formula_0, data = Data_train)
summary(model_0)

## 
## Call:
## lm(formula = formula_0, data = Data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11883.6  -2725.8   -905.5   1356.5  29816.5 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11855.48    1082.65 -10.950  < 2e-16 ***
## age                252.60      13.01  19.412  < 2e-16 ***
## sexmale           -394.90     366.20  -1.078 0.281106    
## bmi                329.81      31.62  10.431  < 2e-16 ***
## children           582.87     154.71   3.767 0.000174 ***
## smokeryes        24349.42     458.84  53.068  < 2e-16 ***
## regionnorthwest   -235.14     527.20  -0.446 0.655670    
## regionsoutheast   -702.47     531.02  -1.323 0.186169    
## regionsouthwest   -358.74     527.11  -0.681 0.496294    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5956 on 1061 degrees of freedom
## Multiple R-squared:  0.7615, Adjusted R-squared:  0.7597 
## F-statistic: 423.5 on 8 and 1061 DF,  p-value: < 2.2e-16

#Saving R-squared
r_sq_0 <- summary(model_0)$r.squared

#predict data on test set
prediction_0 <- predict(model_0, newdata = Data_test)
#calculating the residuals
residuals_0 <- Data_test$charges - prediction_0
#calculating Root Mean Squared Error
rmse_0 <- sqrt(mean(residuals_0^2))

As we can see, summary of a model showed us that some of the variable are not significant (sex), while smoking* seems to have a huge influence on charges Let us Train a model without non-significant variables and check if performance can be improved*.

Train and Test New Model

formula_1 <- as.formula("charges ~ age + bmi + children + smoker + region")

model_1 <- lm(formula_1, data = Data_train)
summary(model_1)

## 
## Call:
## lm(formula = formula_1, data = Data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12080.9  -2699.0   -834.1   1419.1  29634.6 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11999.29    1074.49 -11.167  < 2e-16 ***
## age                253.26      13.00  19.484  < 2e-16 ***
## bmi                327.66      31.56  10.383  < 2e-16 ***
## children           579.35     154.69   3.745  0.00019 ***
## smokeryes        24318.89     458.00  53.098  < 2e-16 ***
## regionnorthwest   -250.22     527.05  -0.475  0.63506    
## regionsoutheast   -706.15     531.05  -1.330  0.18390    
## regionsouthwest   -366.00     527.11  -0.694  0.48762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5957 on 1062 degrees of freedom
## Multiple R-squared:  0.7613, Adjusted R-squared:  0.7597 
## F-statistic: 483.8 on 7 and 1062 DF,  p-value: < 2.2e-16

r_sq_1 <- summary(model_1)$r.squared

prediction_1 <- predict(model_1, newdata = Data_test)

residuals_1 <- Data_test$charges - prediction_1
rmse_1 <- sqrt(mean(residuals_1^2))

Compare the models

print(paste0("R-squared for first model:", round(r_sq_0, 4)))

## [1] "R-squared for first model:0.7615"

print(paste0("R-squared for new model: ", round(r_sq_1, 4)))

## [1] "R-squared for new model: 0.7613"

print(paste0("RMSE for first model: ", round(rmse_0, 2)))

## [1] "RMSE for first model: 6511.83"

print(paste0("RMSE for new model: ", round(rmse_1, 2)))

## [1] "RMSE for new model: 6495.05"

As we can see, performance is quite similar between two models so I will keep the new model since it’s a little bit simpler.

Model Performance

Data_test$prediction <- predict(model_1, newdata = Data_test)
ggplot(Data_test, aes(x = prediction, y = charges)) + 
  geom_point(color = "blue", alpha = 0.7) + 
  geom_abline(color = "red") +
  ggtitle("Prediction vs. Real values")

Data_test$residuals <- Data_test$charges - Data_test$prediction

ggplot(data = Data_test, aes(x = prediction, y = residuals)) +
  geom_pointrange(aes(ymin = 0, ymax = residuals), color = "blue", alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = 3, color = "red") +
  ggtitle("Residuals vs. Linear model prediction")

ggplot(Data_test, aes(x = residuals)) + 
  geom_histogram(bins = 15, fill = "blue") +
  ggtitle("Histogram of residuals")

GainCurvePlot(Data_test, "prediction", "charges", "Model")

We can see the errors in the model are close to zero so model predicts quite well.

Applying on new data

Let’s imagine 3 different people and see what charges on health care will be for them.

Bob: 19 years old, BMI 27.9, has no children, smokes, from northwest region.
Lisa: 40 years old, BMI 50, 2 children, doesn’t smoke, from southeast region.
John: 30 years old. BMI 31.2, no children, doesn’t smoke, from northeast region.

Bob <- data.frame(age = 19,
                  bmi = 27.9,
                  children = 0,
                  smoker = "yes",
                  region = "northwest")
print(paste0("Health care charges for Bob: ", round(predict(model_1, Bob), 2)))

## [1] "Health care charges for Bob: 26023.23"

Lisa <- data.frame(age = 40,
                   bmi = 50,
                   children = 2,
                   smoker = "no",
                   region = "southeast")
print(paste0("Health care charges for Lisa: ", round(predict(model_1, Lisa), 2)))

## [1] "Health care charges for Lisa: 14967.05"

John <- data.frame(age = 30,
                   bmi = 31.2,
                   children = 0,
                   smoker = "no",
                   region = "northeast")
print(paste0("Health care charges for John: ", round(predict(model_1, John), 2)))

## [1] "Health care charges for John: 5821.76"