Health insurance is insurance that covers all or a portion of the risk that a person would incur medical costs, distributing the risk among several people. An insurer can create a regular financial structure, such as a monthly premium or payroll tax, to provide the funds to pay for the medical benefits stated in the insurance agreement by assessing the overall risk of health care and health system spending over the risk pool. A central organization, such as a government agency, for-profit corporation, or private company, is in charge of administering the benefit.
Health insurance is described as “coverage that allows for the payments of benefits as a result of sickness or accident,” according to the Health Insurance Association of America. It covers losses resulting from mishaps, medical bills, disabilities, or unintentional death and mutilation (p. 225).
The purpose of this data processing is to forecast insurance costs with accuracy.
Insurance Dataset for Regression model
We are going to use health insurance dataset to work on regression model
### Loading required packages
library(magrittr)
library(knitr)
library(pryr)
library(magrittr)
library(tidyverse)
library(GGally)
library(vtreat)
## Loading required packages
library(knitr)
# Reading the dataset
insure_data<-read.csv("insurance.csv")
## missing data
colSums(is.na(insure_data))
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
## head of dataset
kable(head(insure_data),caption="Accuracy Comparison")
| age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|
| 19 | female | 27.900 | 0 | yes | southwest | 16884.924 |
| 18 | male | 33.770 | 1 | no | southeast | 1725.552 |
| 28 | male | 33.000 | 3 | no | southeast | 4449.462 |
| 33 | male | 22.705 | 0 | no | northwest | 21984.471 |
| 32 | male | 28.880 | 0 | no | northwest | 3866.855 |
| 31 | female | 25.740 | 0 | no | southeast | 3756.622 |
## string type of data
str(insure_data)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
library(dplyr)
library(ggplot2)
library(gridExtra)
library(plotly)
library(corrgram)
library(corrplot)
## summary statistics for the response variable charges
insure_data %>%summarise(mean = mean(charges),median = median(charges),SD = sd(charges),IQR = IQR(charges)) %>% show()
## mean median SD IQR
## 1 13270.42 9382.033 12110.01 11899.63
## summary of all columns
summary(insure_data)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
## plot of the distribution for the response variable charges
ggplot(insure_data, aes(charges)) +
geom_histogram(aes(y=..density..), alpha = 0.7) +
geom_density(col = "red") +
labs(title = "Medical Costs Distribution") +
theme(plot.title = element_text(hjust = 0.5))
## plot between gender and charges
## costs and gender box plot
ggplot(insure_data, aes(x = sex, y = charges, color = sex)) +
geom_boxplot() +
labs(title = "Medical Costs by Gender") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
## histogram
ggplot(insure_data, aes(x = charges, fill = sex)) +
geom_histogram(position = "identity", alpha = 0.4) +
labs(title = "Medical Costs by Gender") +
theme(plot.title = element_text(hjust = 0.5))
## charges in different regions
ggplot(insure_data, aes(x = region, y = charges, color = region)) +
geom_boxplot() +
labs(title = "Medical Costs by Region") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
## Medical Costs by no of children
insure_data %>%
group_by(children) %>%
summarise(median = median(charges), mean = mean(charges),total = n()) %>%
arrange(desc(median))
## # A tibble: 6 × 4
## children median mean total
## <int> <dbl> <dbl> <int>
## 1 4 11034. 13851. 25
## 2 3 10601. 15355. 157
## 3 0 9857. 12366. 574
## 4 2 9265. 15074. 240
## 5 5 8590. 8786. 18
## 6 1 8484. 12731. 324
## Plot on Medical costs by no of children
ggplot(insure_data, aes(x = as.factor(children), y = charges, color = as.factor(children))) +
geom_boxplot() +
labs(title = "Medical Costs By Number Of Children",
x = "Number of Children") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
#distribution of cost by smoking
cs_pl1 <- ggplot(insure_data, aes(x = smoker, y = charges, fill = smoker)) +
geom_boxplot()+
labs(title = "Medical Costs Distribution By Smoking") +
theme(plot.title = element_text(hjust = 0.8),
legend.position = "none")
cs_pl2 <- ggplot(insure_data, aes(charges, fill = smoker)) +
geom_histogram(alpha = 0.7, position = "identity")+
labs(title = "Medical Costs Distribution By Smoking") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(cs_pl1, cs_pl2, nrow = 2)
#create has_children
insure_data$has_children <- ifelse(insure_data$children >0,"Yes","No")
insure_data$has_children <- as.factor(insure_data$has_children)
#plot charges vs has_child
summary(insure_data$has_children) %>%
show()
## No Yes
## 574 764
cc_pl1 <- ggplot(insure_data, aes(y = charges, x = has_children, color = has_children)) +
geom_boxplot() +
labs(title = "Medical Costs and Having Children",
x = "Has Children?") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
cc_pl2 <- ggplot(insure_data, aes(x = charges, fill = has_children)) +
geom_histogram(position = "identity", alpha = 0.5) +
labs(title = "Medical Costs and Having Children",
x = "Has Children?") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(cc_pl1, cc_pl2, nrow = 2)
## correlation plots
names(insure_data)
## [1] "age" "sex" "bmi" "children" "smoker"
## [6] "region" "charges" "has_children"
str(insure_data)
## 'data.frame': 1338 obs. of 8 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children : int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
## $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...
library(corrplot)
corrplot(cor(insure_data[,c(1,3,4,7)]), # Correlation matrix
method = "shade", # Correlation plot method
type = "full", # Correlation plot style (also "upper" and "lower")
diag = TRUE, # If TRUE (default), adds the diagonal
tl.col = "black", # Labels color
bg = "white", # Background color
title = "", # Main title
col = NULL) # Color palette
## plot to check relation between age and charges and smoker?
ggplot(insure_data, aes(x = age, y = charges, color = smoker)) +
geom_point()+
labs(title = "Medical Costs And Age By smoking") +
theme(plot.title = element_text(hjust = 0.5))
# ## plot to check relation between age and charges and has children?
# ggplot(insure_data, aes(x = age, y = charges, color = has_children)) +
# geom_point()+
# labs(title = "Medical Costs And Age By Has Children") +
# theme(plot.title = element_text(hjust = 0.5))
## plot to check relation between bmi and charges and smoker?
ggplot(insure_data, aes(x = bmi, y = charges, color = smoker)) +
geom_point() +
labs(title = "Medical Costs And Body Mass Index By smoking",
x = "Body Mass Index") +
theme(plot.title = element_text(hjust = 0.5))
## based on the BMI we can derive one more variable as BMI class based on WHO weight classification based on BMI
insure_data$BMI_Class <- ifelse(insure_data$bmi < 18.5, "underweight",
ifelse( insure_data$bmi >= 18.5 & insure_data$bmi < 25, "normal",
ifelse(insure_data$bmi >= 25 & insure_data$bmi < 30, "overweight",
ifelse(insure_data$bmi >= 30 & insure_data$bmi < 40, "obese",
ifelse(insure_data$bmi >= 40, "morbid obese", NA)))))
table(insure_data$BMI_Class)
##
## morbid obese normal obese overweight underweight
## 91 225 616 386 20
insure_data$BMI_Class <- as.factor(insure_data$BMI_Class)
## plot to check the BMI_Class to charges by smoker?
ggplot(insure_data, aes(x = BMI_Class, y = charges, color = BMI_Class)) +
geom_boxplot() +
labs(title = "Distribution of Medical Costs and Weight Status By Smoking",
x = "Weight Status") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none") +
facet_wrap(~smoker)
library(MASS)
lm_model <- lm(charges ~ . , insure_data)
summary(lm_model)
##
## Call:
## lm(formula = charges ~ ., data = insure_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11672.0 -3449.9 -240.9 1607.7 28606.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6984.05 3275.82 -2.132 0.0332 *
## age 256.48 11.81 21.711 <2e-16 ***
## sexmale -159.11 330.19 -0.482 0.6300
## bmi 187.56 74.81 2.507 0.0123 *
## children 394.85 221.57 1.782 0.0750 .
## smokeryes 23851.62 409.72 58.215 <2e-16 ***
## regionnorthwest -409.56 472.96 -0.866 0.3867
## regionsoutheast -898.82 476.25 -1.887 0.0593 .
## regionsouthwest -986.69 475.18 -2.076 0.0380 *
## has_childrenYes 254.10 539.41 0.471 0.6377
## BMI_Classnormal -1595.16 1684.32 -0.947 0.3438
## BMI_Classobese 941.30 946.21 0.995 0.3200
## BMI_Classoverweight -1687.39 1339.59 -1.260 0.2080
## BMI_Classunderweight -1917.83 2394.85 -0.801 0.4234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6009 on 1324 degrees of freedom
## Multiple R-squared: 0.7562, Adjusted R-squared: 0.7538
## F-statistic: 315.8 on 13 and 1324 DF, p-value: < 2.2e-16
StepAIC_model <- stepAIC(lm_model, direction = "backward", trace = FALSE)
summary(StepAIC_model)
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + BMI_Class,
## data = insure_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12359.9 -3527.7 -326.2 1543.7 28131.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6867.0 3242.9 -2.118 0.03440 *
## age 257.5 11.8 21.817 < 2e-16 ***
## bmi 166.6 73.5 2.266 0.02361 *
## children 473.1 136.6 3.464 0.00055 ***
## smokeryes 23826.2 407.6 58.448 < 2e-16 ***
## BMI_Classnormal -1777.2 1679.6 -1.058 0.29019
## BMI_Classobese 858.8 944.8 0.909 0.36355
## BMI_Classoverweight -1832.2 1336.6 -1.371 0.17068
## BMI_Classunderweight -1989.2 2391.5 -0.832 0.40569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6011 on 1329 degrees of freedom
## Multiple R-squared: 0.7551, Adjusted R-squared: 0.7536
## F-statistic: 512.1 on 8 and 1329 DF, p-value: < 2.2e-16
# Randomforest model
library(randomForest)
set.seed(7)
str(insure_data)
## 'data.frame': 1338 obs. of 9 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children : int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
## $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...
## $ BMI_Class : Factor w/ 5 levels "morbid obese",..: 4 3 3 2 4 4 3 4 4 4 ...
rf_model <- randomForest(charges ~ ., data=insure_data, ntree=500,
keep.forest=FALSE, importance=TRUE)
rf_model
##
## Call:
## randomForest(formula = charges ~ ., data = insure_data, ntree = 500, keep.forest = FALSE, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 23222360
## % Var explained: 84.15