Project Description

Health insurance is insurance that covers all or a portion of the risk that a person would incur medical costs, distributing the risk among several people. An insurer can create a regular financial structure, such as a monthly premium or payroll tax, to provide the funds to pay for the medical benefits stated in the insurance agreement by assessing the overall risk of health care and health system spending over the risk pool. A central organization, such as a government agency, for-profit corporation, or private company, is in charge of administering the benefit.

Health insurance is described as “coverage that allows for the payments of benefits as a result of sickness or accident,” according to the Health Insurance Association of America. It covers losses resulting from mishaps, medical bills, disabilities, or unintentional death and mutilation (p. 225).

The purpose of this data processing is to forecast insurance costs with accuracy.

Data set:

Insurance Dataset for Regression model

1. Regression

We are going to use health insurance dataset to work on regression model

### Loading required packages
library(magrittr)
library(knitr)
library(pryr)
library(magrittr)
library(tidyverse)
library(GGally)
library(vtreat)

Data processing

## Loading required packages
library(knitr)
# Reading the dataset
insure_data<-read.csv("insurance.csv")
## missing data
colSums(is.na(insure_data))
##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0
## head of dataset
kable(head(insure_data),caption="Accuracy Comparison")
Accuracy Comparison
age sex bmi children smoker region charges
19 female 27.900 0 yes southwest 16884.924
18 male 33.770 1 no southeast 1725.552
28 male 33.000 3 no southeast 4449.462
33 male 22.705 0 no northwest 21984.471
32 male 28.880 0 no northwest 3866.855
31 female 25.740 0 no southeast 3756.622
## string type of data
str(insure_data)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Exploratory Data Analysis

library(dplyr)
library(ggplot2)
library(gridExtra)
library(plotly)
library(corrgram)
library(corrplot)
## summary statistics for the response variable charges
insure_data %>%summarise(mean = mean(charges),median = median(charges),SD = sd(charges),IQR = IQR(charges)) %>% show()
##       mean   median       SD      IQR
## 1 13270.42 9382.033 12110.01 11899.63
## summary of all columns
summary(insure_data)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
## plot of the distribution for the response variable charges
ggplot(insure_data, aes(charges)) +
  geom_histogram(aes(y=..density..), alpha = 0.7)  +
  geom_density(col = "red") +
  labs(title = "Medical Costs Distribution") +
  theme(plot.title = element_text(hjust = 0.5))

## plot between gender and charges
## costs and gender box plot
ggplot(insure_data, aes(x = sex, y = charges, color = sex)) +
  geom_boxplot() +
  labs(title = "Medical Costs by Gender") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

## histogram
ggplot(insure_data, aes(x = charges, fill = sex)) +
  geom_histogram(position = "identity", alpha = 0.4) +
  labs(title = "Medical Costs by Gender") +
  theme(plot.title = element_text(hjust = 0.5))

## charges in different regions
ggplot(insure_data, aes(x = region, y = charges, color = region)) +
  geom_boxplot() +
  labs(title = "Medical Costs by Region") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

## Medical Costs by no of children
insure_data %>%
  group_by(children) %>%
  summarise(median = median(charges), mean = mean(charges),total = n()) %>%
  arrange(desc(median)) 
## # A tibble: 6 × 4
##   children median   mean total
##      <int>  <dbl>  <dbl> <int>
## 1        4 11034. 13851.    25
## 2        3 10601. 15355.   157
## 3        0  9857. 12366.   574
## 4        2  9265. 15074.   240
## 5        5  8590.  8786.    18
## 6        1  8484. 12731.   324
## Plot on Medical costs by no of children
ggplot(insure_data, aes(x = as.factor(children), y = charges, color = as.factor(children))) +
  geom_boxplot() +
  labs(title = "Medical Costs By Number Of Children",
       x = "Number of Children") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

#distribution of cost by smoking
cs_pl1 <- ggplot(insure_data, aes(x = smoker, y = charges, fill = smoker)) +
  geom_boxplot()+
  labs(title = "Medical Costs Distribution By Smoking") +
  theme(plot.title = element_text(hjust = 0.8),
        legend.position = "none")

cs_pl2 <- ggplot(insure_data, aes(charges, fill = smoker)) +
  geom_histogram(alpha = 0.7, position = "identity")+
  labs(title = "Medical Costs Distribution By Smoking") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(cs_pl1, cs_pl2, nrow = 2)

#create has_children 
insure_data$has_children <- ifelse(insure_data$children >0,"Yes","No")
insure_data$has_children <- as.factor(insure_data$has_children)
#plot charges vs has_child
summary(insure_data$has_children) %>%
  show()
##  No Yes 
## 574 764
cc_pl1 <- ggplot(insure_data, aes(y = charges, x = has_children, color = has_children)) +
  geom_boxplot() +
  labs(title = "Medical Costs and Having Children",
       x = "Has Children?") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

cc_pl2 <- ggplot(insure_data, aes(x = charges, fill = has_children)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  labs(title = "Medical Costs and Having Children",
       x = "Has Children?") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(cc_pl1, cc_pl2, nrow = 2)

## correlation plots
names(insure_data)
## [1] "age"          "sex"          "bmi"          "children"     "smoker"      
## [6] "region"       "charges"      "has_children"
str(insure_data)
## 'data.frame':    1338 obs. of  8 variables:
##  $ age         : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex         : chr  "female" "male" "male" "male" ...
##  $ bmi         : num  27.9 33.8 33 22.7 28.9 ...
##  $ children    : int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker      : chr  "yes" "no" "no" "no" ...
##  $ region      : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges     : num  16885 1726 4449 21984 3867 ...
##  $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...
library(corrplot)
corrplot(cor(insure_data[,c(1,3,4,7)]),        # Correlation matrix
         method = "shade", # Correlation plot method
         type = "full",    # Correlation plot style (also "upper" and "lower")
         diag = TRUE,      # If TRUE (default), adds the diagonal
         tl.col = "black", # Labels color
         bg = "white",     # Background color
         title = "",       # Main title
         col = NULL)       # Color palette

## plot to check relation between age and charges and smoker?
ggplot(insure_data, aes(x = age, y = charges, color = smoker)) +
  geom_point()+
  labs(title = "Medical Costs And Age By smoking") +
  theme(plot.title = element_text(hjust = 0.5))

# ## plot to check relation between age and charges and has children?
# ggplot(insure_data, aes(x = age, y = charges, color = has_children)) +
#   geom_point()+
#   labs(title = "Medical Costs And Age By Has Children") +
#   theme(plot.title = element_text(hjust = 0.5))

## plot to check relation between bmi and charges and smoker?
ggplot(insure_data, aes(x = bmi, y = charges, color = smoker)) +
  geom_point() +
  labs(title = "Medical Costs And Body Mass Index By smoking",
       x = "Body Mass Index") +
  theme(plot.title = element_text(hjust = 0.5))

## based on the BMI we can derive one more variable as BMI class based on WHO weight classification based on BMI
insure_data$BMI_Class <- ifelse(insure_data$bmi < 18.5, "underweight", 
                                ifelse( insure_data$bmi >= 18.5 & insure_data$bmi < 25, "normal",
                                        ifelse(insure_data$bmi >= 25 & insure_data$bmi < 30, "overweight",
                                               ifelse(insure_data$bmi >= 30 & insure_data$bmi < 40, "obese", 
                                                      ifelse(insure_data$bmi >= 40, "morbid obese", NA)))))
table(insure_data$BMI_Class)
## 
## morbid obese       normal        obese   overweight  underweight 
##           91          225          616          386           20
insure_data$BMI_Class <- as.factor(insure_data$BMI_Class)

## plot to check the BMI_Class to charges by smoker?
ggplot(insure_data, aes(x = BMI_Class, y = charges, color = BMI_Class)) +
  geom_boxplot() +
  labs(title = "Distribution of Medical Costs and Weight Status By Smoking",
       x = "Weight Status") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  facet_wrap(~smoker)

Data Modelling

Linear Regression

library(MASS)
lm_model <- lm(charges ~ .  , insure_data)
summary(lm_model)
## 
## Call:
## lm(formula = charges ~ ., data = insure_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11672.0  -3449.9   -240.9   1607.7  28606.0 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6984.05    3275.82  -2.132   0.0332 *  
## age                    256.48      11.81  21.711   <2e-16 ***
## sexmale               -159.11     330.19  -0.482   0.6300    
## bmi                    187.56      74.81   2.507   0.0123 *  
## children               394.85     221.57   1.782   0.0750 .  
## smokeryes            23851.62     409.72  58.215   <2e-16 ***
## regionnorthwest       -409.56     472.96  -0.866   0.3867    
## regionsoutheast       -898.82     476.25  -1.887   0.0593 .  
## regionsouthwest       -986.69     475.18  -2.076   0.0380 *  
## has_childrenYes        254.10     539.41   0.471   0.6377    
## BMI_Classnormal      -1595.16    1684.32  -0.947   0.3438    
## BMI_Classobese         941.30     946.21   0.995   0.3200    
## BMI_Classoverweight  -1687.39    1339.59  -1.260   0.2080    
## BMI_Classunderweight -1917.83    2394.85  -0.801   0.4234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6009 on 1324 degrees of freedom
## Multiple R-squared:  0.7562, Adjusted R-squared:  0.7538 
## F-statistic: 315.8 on 13 and 1324 DF,  p-value: < 2.2e-16
StepAIC_model <- stepAIC(lm_model, direction = "backward", trace = FALSE)
summary(StepAIC_model)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + BMI_Class, 
##     data = insure_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12359.9  -3527.7   -326.2   1543.7  28131.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -6867.0     3242.9  -2.118  0.03440 *  
## age                     257.5       11.8  21.817  < 2e-16 ***
## bmi                     166.6       73.5   2.266  0.02361 *  
## children                473.1      136.6   3.464  0.00055 ***
## smokeryes             23826.2      407.6  58.448  < 2e-16 ***
## BMI_Classnormal       -1777.2     1679.6  -1.058  0.29019    
## BMI_Classobese          858.8      944.8   0.909  0.36355    
## BMI_Classoverweight   -1832.2     1336.6  -1.371  0.17068    
## BMI_Classunderweight  -1989.2     2391.5  -0.832  0.40569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6011 on 1329 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7536 
## F-statistic: 512.1 on 8 and 1329 DF,  p-value: < 2.2e-16

Randomforest Regression

# Randomforest model
library(randomForest)
set.seed(7)
str(insure_data)
## 'data.frame':    1338 obs. of  9 variables:
##  $ age         : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex         : chr  "female" "male" "male" "male" ...
##  $ bmi         : num  27.9 33.8 33 22.7 28.9 ...
##  $ children    : int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker      : chr  "yes" "no" "no" "no" ...
##  $ region      : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges     : num  16885 1726 4449 21984 3867 ...
##  $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...
##  $ BMI_Class   : Factor w/ 5 levels "morbid obese",..: 4 3 3 2 4 4 3 4 4 4 ...
rf_model <- randomForest(charges ~ ., data=insure_data, ntree=500,
                       keep.forest=FALSE, importance=TRUE)
rf_model
## 
## Call:
##  randomForest(formula = charges ~ ., data = insure_data, ntree = 500,      keep.forest = FALSE, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 23222360
##                     % Var explained: 84.15