Medical Insurance Forecast by Using Regression

1. Regression

We are going to use health insurance dataset to work on regression model

### Loading required packages
library(magrittr)
library(knitr)
library(pryr)
library(magrittr)
library(tidyverse)
library(GGally)
library(vtreat)

Data processing

## Loading required packages
library(knitr)
# Reading the dataset
insure_data<-read.csv("insurance.csv")
## missing data
colSums(is.na(insure_data))

##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0

## head of dataset
kable(head(insure_data),caption="Accuracy Comparison")

Accuracy Comparison
age	sex	bmi	children	smoker	region	charges
19	female	27.900	0	yes	southwest	16884.924
18	male	33.770	1	no	southeast	1725.552
28	male	33.000	3	no	southeast	4449.462
33	male	22.705	0	no	northwest	21984.471
32	male	28.880	0	no	northwest	3866.855
31	female	25.740	0	no	southeast	3756.622

## string type of data
str(insure_data)

## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Exploratory Data Analysis

library(dplyr)
library(ggplot2)
library(gridExtra)
library(plotly)
library(corrgram)
library(corrplot)
## summary statistics for the response variable charges
insure_data %>%summarise(mean = mean(charges),median = median(charges),SD = sd(charges),IQR = IQR(charges)) %>% show()

##       mean   median       SD      IQR
## 1 13270.42 9382.033 12110.01 11899.63

## summary of all columns
summary(insure_data)

##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

## plot of the distribution for the response variable charges
ggplot(insure_data, aes(charges)) +
  geom_histogram(aes(y=..density..), alpha = 0.7)  +
  geom_density(col = "red") +
  labs(title = "Medical Costs Distribution") +
  theme(plot.title = element_text(hjust = 0.5))

## plot between gender and charges
## costs and gender box plot
ggplot(insure_data, aes(x = sex, y = charges, color = sex)) +
  geom_boxplot() +
  labs(title = "Medical Costs by Gender") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

## histogram
ggplot(insure_data, aes(x = charges, fill = sex)) +
  geom_histogram(position = "identity", alpha = 0.4) +
  labs(title = "Medical Costs by Gender") +
  theme(plot.title = element_text(hjust = 0.5))

## charges in different regions
ggplot(insure_data, aes(x = region, y = charges, color = region)) +
  geom_boxplot() +
  labs(title = "Medical Costs by Region") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

## Medical Costs by no of children
insure_data %>%
  group_by(children) %>%
  summarise(median = median(charges), mean = mean(charges),total = n()) %>%
  arrange(desc(median))

## # A tibble: 6 × 4
##   children median   mean total
##      <int>  <dbl>  <dbl> <int>
## 1        4 11034. 13851.    25
## 2        3 10601. 15355.   157
## 3        0  9857. 12366.   574
## 4        2  9265. 15074.   240
## 5        5  8590.  8786.    18
## 6        1  8484. 12731.   324

## Plot on Medical costs by no of children
ggplot(insure_data, aes(x = as.factor(children), y = charges, color = as.factor(children))) +
  geom_boxplot() +
  labs(title = "Medical Costs By Number Of Children",
       x = "Number of Children") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

#distribution of cost by smoking
cs_pl1 <- ggplot(insure_data, aes(x = smoker, y = charges, fill = smoker)) +
  geom_boxplot()+
  labs(title = "Medical Costs Distribution By Smoking") +
  theme(plot.title = element_text(hjust = 0.8),
        legend.position = "none")

cs_pl2 <- ggplot(insure_data, aes(charges, fill = smoker)) +
  geom_histogram(alpha = 0.7, position = "identity")+
  labs(title = "Medical Costs Distribution By Smoking") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(cs_pl1, cs_pl2, nrow = 2)

#create has_children 
insure_data$has_children <- ifelse(insure_data$children >0,"Yes","No")
insure_data$has_children <- as.factor(insure_data$has_children)
#plot charges vs has_child
summary(insure_data$has_children) %>%
  show()

##  No Yes 
## 574 764

cc_pl1 <- ggplot(insure_data, aes(y = charges, x = has_children, color = has_children)) +
  geom_boxplot() +
  labs(title = "Medical Costs and Having Children",
       x = "Has Children?") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

cc_pl2 <- ggplot(insure_data, aes(x = charges, fill = has_children)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  labs(title = "Medical Costs and Having Children",
       x = "Has Children?") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(cc_pl1, cc_pl2, nrow = 2)

## correlation plots
names(insure_data)

## [1] "age"          "sex"          "bmi"          "children"     "smoker"      
## [6] "region"       "charges"      "has_children"

str(insure_data)

## 'data.frame':    1338 obs. of  8 variables:
##  $ age         : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex         : chr  "female" "male" "male" "male" ...
##  $ bmi         : num  27.9 33.8 33 22.7 28.9 ...
##  $ children    : int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker      : chr  "yes" "no" "no" "no" ...
##  $ region      : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges     : num  16885 1726 4449 21984 3867 ...
##  $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...

library(corrplot)
corrplot(cor(insure_data[,c(1,3,4,7)]),        # Correlation matrix
         method = "shade", # Correlation plot method
         type = "full",    # Correlation plot style (also "upper" and "lower")
         diag = TRUE,      # If TRUE (default), adds the diagonal
         tl.col = "black", # Labels color
         bg = "white",     # Background color
         title = "",       # Main title
         col = NULL)       # Color palette

## plot to check relation between age and charges and smoker?
ggplot(insure_data, aes(x = age, y = charges, color = smoker)) +
  geom_point()+
  labs(title = "Medical Costs And Age By smoking") +
  theme(plot.title = element_text(hjust = 0.5))

# ## plot to check relation between age and charges and has children?
# ggplot(insure_data, aes(x = age, y = charges, color = has_children)) +
#   geom_point()+
#   labs(title = "Medical Costs And Age By Has Children") +
#   theme(plot.title = element_text(hjust = 0.5))

## plot to check relation between bmi and charges and smoker?
ggplot(insure_data, aes(x = bmi, y = charges, color = smoker)) +
  geom_point() +
  labs(title = "Medical Costs And Body Mass Index By smoking",
       x = "Body Mass Index") +
  theme(plot.title = element_text(hjust = 0.5))

## based on the BMI we can derive one more variable as BMI class based on WHO weight classification based on BMI
insure_data$BMI_Class <- ifelse(insure_data$bmi < 18.5, "underweight", 
                                ifelse( insure_data$bmi >= 18.5 & insure_data$bmi < 25, "normal",
                                        ifelse(insure_data$bmi >= 25 & insure_data$bmi < 30, "overweight",
                                               ifelse(insure_data$bmi >= 30 & insure_data$bmi < 40, "obese", 
                                                      ifelse(insure_data$bmi >= 40, "morbid obese", NA)))))
table(insure_data$BMI_Class)

## 
## morbid obese       normal        obese   overweight  underweight 
##           91          225          616          386           20

insure_data$BMI_Class <- as.factor(insure_data$BMI_Class)

## plot to check the BMI_Class to charges by smoker?
ggplot(insure_data, aes(x = BMI_Class, y = charges, color = BMI_Class)) +
  geom_boxplot() +
  labs(title = "Distribution of Medical Costs and Weight Status By Smoking",
       x = "Weight Status") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  facet_wrap(~smoker)

Data Modelling

Linear Regression

library(MASS)
lm_model <- lm(charges ~ .  , insure_data)
summary(lm_model)

## 
## Call:
## lm(formula = charges ~ ., data = insure_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11672.0  -3449.9   -240.9   1607.7  28606.0 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6984.05    3275.82  -2.132   0.0332 *  
## age                    256.48      11.81  21.711   <2e-16 ***
## sexmale               -159.11     330.19  -0.482   0.6300    
## bmi                    187.56      74.81   2.507   0.0123 *  
## children               394.85     221.57   1.782   0.0750 .  
## smokeryes            23851.62     409.72  58.215   <2e-16 ***
## regionnorthwest       -409.56     472.96  -0.866   0.3867    
## regionsoutheast       -898.82     476.25  -1.887   0.0593 .  
## regionsouthwest       -986.69     475.18  -2.076   0.0380 *  
## has_childrenYes        254.10     539.41   0.471   0.6377    
## BMI_Classnormal      -1595.16    1684.32  -0.947   0.3438    
## BMI_Classobese         941.30     946.21   0.995   0.3200    
## BMI_Classoverweight  -1687.39    1339.59  -1.260   0.2080    
## BMI_Classunderweight -1917.83    2394.85  -0.801   0.4234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6009 on 1324 degrees of freedom
## Multiple R-squared:  0.7562, Adjusted R-squared:  0.7538 
## F-statistic: 315.8 on 13 and 1324 DF,  p-value: < 2.2e-16

StepAIC_model <- stepAIC(lm_model, direction = "backward", trace = FALSE)
summary(StepAIC_model)

## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + BMI_Class, 
##     data = insure_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12359.9  -3527.7   -326.2   1543.7  28131.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -6867.0     3242.9  -2.118  0.03440 *  
## age                     257.5       11.8  21.817  < 2e-16 ***
## bmi                     166.6       73.5   2.266  0.02361 *  
## children                473.1      136.6   3.464  0.00055 ***
## smokeryes             23826.2      407.6  58.448  < 2e-16 ***
## BMI_Classnormal       -1777.2     1679.6  -1.058  0.29019    
## BMI_Classobese          858.8      944.8   0.909  0.36355    
## BMI_Classoverweight   -1832.2     1336.6  -1.371  0.17068    
## BMI_Classunderweight  -1989.2     2391.5  -0.832  0.40569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6011 on 1329 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7536 
## F-statistic: 512.1 on 8 and 1329 DF,  p-value: < 2.2e-16

Randomforest Regression

# Randomforest model
library(randomForest)
set.seed(7)
str(insure_data)

## 'data.frame':    1338 obs. of  9 variables:
##  $ age         : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex         : chr  "female" "male" "male" "male" ...
##  $ bmi         : num  27.9 33.8 33 22.7 28.9 ...
##  $ children    : int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker      : chr  "yes" "no" "no" "no" ...
##  $ region      : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges     : num  16885 1726 4449 21984 3867 ...
##  $ has_children: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 2 1 ...
##  $ BMI_Class   : Factor w/ 5 levels "morbid obese",..: 4 3 3 2 4 4 3 4 4 4 ...

rf_model <- randomForest(charges ~ ., data=insure_data, ntree=500,
                       keep.forest=FALSE, importance=TRUE)
rf_model

## 
## Call:
##  randomForest(formula = charges ~ ., data = insure_data, ntree = 500,      keep.forest = FALSE, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 23222360
##                     % Var explained: 84.15

Medical Insurance Forecast by Using Regression

Hemanth Mydugolam

2022-11-20

Project Description

Data set:

1. Regression

Data processing

Exploratory Data Analysis

Data Modelling

Linear Regression

Randomforest Regression