Problem Statement :- Predict the cost of health insurance based on Patient and certain Demographic Features![]()
HEALTH COST
library(ggplot2)
library(dplyr)
library(Hmisc)
library(cowplot)
library(WVPlots)
set.seed(123)
Data <- read.csv("insurance.csv")
sample_n(Data, 10)
## age sex bmi children smoker region charges
## 385 44 male 22.135 2 no northeast 8302.536
## 1054 47 male 29.800 3 yes southwest 25309.489
## 547 28 male 35.435 0 no northeast 3268.847
## 1179 23 female 34.865 0 no northeast 2899.489
## 1255 34 female 27.720 0 no southeast 4415.159
## 61 43 male 27.360 3 no northeast 8606.217
## 704 34 female 26.410 1 no northwest 5385.338
## 1188 62 female 32.680 0 no northwest 13844.797
## 734 48 female 27.265 1 no northeast 9447.250
## 607 27 female 25.175 0 no northeast 3558.620
Age: insurance contractor age, years
Sex: insurance contractor gender, [female, male]
BMI: The BMI is an attempt to quantify the amount of tissue mass (muscle, fat, and bone) in an individual, and then categorize that person as underweight, normal weight, overweight, or obese based on that value.
children: Number of Children of that Patient
Smoker: Does he smokes - [Yes,No]
Region: Region in US he belongs to
Charges: Individual medical costs billed by health insurance, $predicted_value
describe(Data)
## Data
##
## 7 Variables 1338 Observations
## ---------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 47 0.999 39.21 16.21 18 19
## .25 .50 .75 .90 .95
## 27 39 51 59 62
##
## lowest : 18 19 20 21 22, highest: 60 61 62 63 64
## ---------------------------------------------------------------------------
## sex
## n missing distinct
## 1338 0 2
##
## Value female male
## Frequency 662 676
## Proportion 0.495 0.505
## ---------------------------------------------------------------------------
## bmi
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 548 1 30.66 6.893 21.26 22.99
## .25 .50 .75 .90 .95
## 26.30 30.40 34.69 38.62 41.11
##
## lowest : 15.960 16.815 17.195 17.290 17.385, highest: 48.070 49.060 50.380 52.580 53.130
## ---------------------------------------------------------------------------
## children
## n missing distinct Info Mean Gmd
## 1338 0 6 0.899 1.095 1.275
##
## Value 0 1 2 3 4 5
## Frequency 574 324 240 157 25 18
## Proportion 0.429 0.242 0.179 0.117 0.019 0.013
## ---------------------------------------------------------------------------
## smoker
## n missing distinct
## 1338 0 2
##
## Value no yes
## Frequency 1064 274
## Proportion 0.795 0.205
## ---------------------------------------------------------------------------
## region
## n missing distinct
## 1338 0 4
##
## Value northeast northwest southeast southwest
## Frequency 324 325 364 325
## Proportion 0.242 0.243 0.272 0.243
## ---------------------------------------------------------------------------
## charges
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 1337 1 13270 12301 1758 2347
## .25 .50 .75 .90 .95
## 4740 9382 16640 34832 41182
##
## lowest : 1121.874 1131.507 1135.941 1136.399 1137.011
## highest: 55135.402 58571.074 60021.399 62592.873 63770.428
## ---------------------------------------------------------------------------
x <- ggplot(Data, aes(age, charges)) +
geom_jitter(color = "blue", alpha = 0.5) +
theme_light()
y <- ggplot(Data, aes(bmi, charges)) +
geom_jitter(color = "green", alpha = 0.5) +
theme_light()
p <- plot_grid(x, y)
title <- ggdraw() + draw_label("1. Correlation between Charges and Age / BMI", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))
x <- ggplot(Data, aes(sex, charges)) +
geom_jitter(aes(color = sex), alpha = 0.7) +
theme_light()
y <- ggplot(Data, aes(children, charges)) +
geom_jitter(aes(color = children), alpha = 0.7) +
theme_light()
p <- plot_grid(x, y)
title <- ggdraw() + draw_label("2. Correlation between Charges and Sex / Children covered by insurance", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))
x <- ggplot(Data, aes(smoker, charges)) +
geom_jitter(aes(color = smoker), alpha = 0.7) +
theme_light()
y <- ggplot(Data, aes(region, charges)) +
geom_jitter(aes(color = region), alpha = 0.7) +
theme_light()
p <- plot_grid(x, y)
title <- ggdraw() + draw_label("3. Correlation between Charges and Smoker / Region", fontface='bold')
plot_grid(title, p, ncol=1, rel_heights=c(0.1, 1))
Plot 1: As Age and BMI go up Charges for health insurance also trends up.
Plot 2: No obvious connection between Charges and Age. Charges for insurance with 4-5 chilren covered seems to go down (doesn’t make sense, does it?)
Plot 3: Charges for Smokers are higher for non-smokers (no surprise here). No obvious connection between Charges and Region.
n_train <- round(0.8 * nrow(Data))
train_indices <- sample(1:nrow(Data), n_train)
Data_train <- Data[train_indices, ]
Data_test <- Data[-train_indices, ]
formula_0 <- as.formula("charges ~ age + sex + bmi + children + smoker + region")
model_0 <- lm(formula_0, data = Data_train)
summary(model_0)
##
## Call:
## lm(formula = formula_0, data = Data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11883.6 -2725.8 -905.5 1356.5 29816.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11855.48 1082.65 -10.950 < 2e-16 ***
## age 252.60 13.01 19.412 < 2e-16 ***
## sexmale -394.90 366.20 -1.078 0.281106
## bmi 329.81 31.62 10.431 < 2e-16 ***
## children 582.87 154.71 3.767 0.000174 ***
## smokeryes 24349.42 458.84 53.068 < 2e-16 ***
## regionnorthwest -235.14 527.20 -0.446 0.655670
## regionsoutheast -702.47 531.02 -1.323 0.186169
## regionsouthwest -358.74 527.11 -0.681 0.496294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5956 on 1061 degrees of freedom
## Multiple R-squared: 0.7615, Adjusted R-squared: 0.7597
## F-statistic: 423.5 on 8 and 1061 DF, p-value: < 2.2e-16
#Saving R-squared
r_sq_0 <- summary(model_0)$r.squared
#predict data on test set
prediction_0 <- predict(model_0, newdata = Data_test)
#calculating the residuals
residuals_0 <- Data_test$charges - prediction_0
#calculating Root Mean Squared Error
rmse_0 <- sqrt(mean(residuals_0^2))
As we can see, summary of a model showed us that some of the variable are not significant (sex), while smoking* seems to have a huge influence on charges Let us Train a model without non-significant variables and check if performance can be improved*.
formula_1 <- as.formula("charges ~ age + bmi + children + smoker + region")
model_1 <- lm(formula_1, data = Data_train)
summary(model_1)
##
## Call:
## lm(formula = formula_1, data = Data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12080.9 -2699.0 -834.1 1419.1 29634.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11999.29 1074.49 -11.167 < 2e-16 ***
## age 253.26 13.00 19.484 < 2e-16 ***
## bmi 327.66 31.56 10.383 < 2e-16 ***
## children 579.35 154.69 3.745 0.00019 ***
## smokeryes 24318.89 458.00 53.098 < 2e-16 ***
## regionnorthwest -250.22 527.05 -0.475 0.63506
## regionsoutheast -706.15 531.05 -1.330 0.18390
## regionsouthwest -366.00 527.11 -0.694 0.48762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5957 on 1062 degrees of freedom
## Multiple R-squared: 0.7613, Adjusted R-squared: 0.7597
## F-statistic: 483.8 on 7 and 1062 DF, p-value: < 2.2e-16
r_sq_1 <- summary(model_1)$r.squared
prediction_1 <- predict(model_1, newdata = Data_test)
residuals_1 <- Data_test$charges - prediction_1
rmse_1 <- sqrt(mean(residuals_1^2))
print(paste0("R-squared for first model:", round(r_sq_0, 4)))
## [1] "R-squared for first model:0.7615"
print(paste0("R-squared for new model: ", round(r_sq_1, 4)))
## [1] "R-squared for new model: 0.7613"
print(paste0("RMSE for first model: ", round(rmse_0, 2)))
## [1] "RMSE for first model: 6511.83"
print(paste0("RMSE for new model: ", round(rmse_1, 2)))
## [1] "RMSE for new model: 6495.05"
As we can see, performance is quite similar between two models so I will keep the new model since it’s a little bit simpler.
Data_test$prediction <- predict(model_1, newdata = Data_test)
ggplot(Data_test, aes(x = prediction, y = charges)) +
geom_point(color = "blue", alpha = 0.7) +
geom_abline(color = "red") +
ggtitle("Prediction vs. Real values")
Data_test$residuals <- Data_test$charges - Data_test$prediction
ggplot(data = Data_test, aes(x = prediction, y = residuals)) +
geom_pointrange(aes(ymin = 0, ymax = residuals), color = "blue", alpha = 0.7) +
geom_hline(yintercept = 0, linetype = 3, color = "red") +
ggtitle("Residuals vs. Linear model prediction")
ggplot(Data_test, aes(x = residuals)) +
geom_histogram(bins = 15, fill = "blue") +
ggtitle("Histogram of residuals")
GainCurvePlot(Data_test, "prediction", "charges", "Model")
We can see the errors in the model are close to zero so model predicts quite well.
Let’s imagine 3 different people and see what charges on health care will be for them.
Bob: 19 years old, BMI 27.9, has no children, smokes, from northwest region.
Lisa: 40 years old, BMI 50, 2 children, doesn’t smoke, from southeast region.
John: 30 years old. BMI 31.2, no children, doesn’t smoke, from northeast region.
Bob <- data.frame(age = 19,
bmi = 27.9,
children = 0,
smoker = "yes",
region = "northwest")
print(paste0("Health care charges for Bob: ", round(predict(model_1, Bob), 2)))
## [1] "Health care charges for Bob: 26023.23"
Lisa <- data.frame(age = 40,
bmi = 50,
children = 2,
smoker = "no",
region = "southeast")
print(paste0("Health care charges for Lisa: ", round(predict(model_1, Lisa), 2)))
## [1] "Health care charges for Lisa: 14967.05"
John <- data.frame(age = 30,
bmi = 31.2,
children = 0,
smoker = "no",
region = "northeast")
print(paste0("Health care charges for John: ", round(predict(model_1, John), 2)))
## [1] "Health care charges for John: 5821.76"