Source

The dataset of this study is sourced from the medical cost personal dataset available in Kaggle (Dataset).

Data Analysis

Ins = read.csv('insurance.csv', header=TRUE)
str(Ins) # examine the structure of the data
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

There are altogether 1338 observations and 7 variables (listed below).

Each variable is associated with a certain data type. Age and children are integer; sex, smoker and region are character; bmi and charges are number.

head(Ins, 10)

The first 10 rows are extracted to get a sense on the data.

summary(Ins)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Ins %>% group_by(sex) %>% summarize(count_values = n())
Ins %>% group_by(smoker) %>% summarize(count_values = n())
Ins %>% group_by(region) %>% summarize(count_values = n())

Summary of the variables:

Data Visualization

# Boxplot
library(ggplot2)

ggplot(Ins, aes(x = sex, y = charges)) + geom_boxplot(aes(color = sex))

ggplot(Ins, aes(x = smoker, y = charges)) + geom_boxplot(aes(color = smoker))

ggplot(Ins, aes(x = region, y = charges)) + geom_boxplot(aes(color = region))

Here, three boxplots are plotted for charges vs sex, smoker and region. It is observed that smoker has a significantly larger charges compared to non-smoker.

ggplot(Ins, aes(x = charges, color = sex, fill = sex)) +
  geom_density(alpha = 0.5)

ggplot(Ins, aes(x = charges, color = smoker, fill = smoker)) +
  geom_density(alpha = 0.5)

ggplot(Ins, aes(x = charges, color = region, fill = region)) +
  geom_density(alpha = 0.5)

Density plots are constructed in a similar way.

ggplot(Ins, aes(age)) +
  geom_histogram(aes(fill=age), 
                   bins=8,
                   color="black",
                   fill="blue",
                   size=.1) +   # change number of bins
  labs(title="Distribution of age") 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(Ins, aes(bmi)) +
  geom_histogram(aes(fill=bmi), 
                   bins=15,
                   color="black",
                   fill="yellow",
                   size=.1) +   # change number of bins
  labs(title="Distribution of bmi") 

ggplot(Ins, aes(charges)) +
  geom_histogram(aes(fill=charges), 
                   bins=15,
                   color="black",
                   fill="green",
                   size=.1) +   # change number of bins
  labs(title="Distribution of charges") 

The distribution of charges is right-skewed (i.e. positive skewness).

Testing for Normality

QQ-Plot

qqnorm(Ins$bmi, pch = 1, frame = FALSE, main = "QQ plot for bmi")
qqline(Ins$bmi, col = "steelblue", lwd = 2)

qqnorm(Ins$charges, pch = 1, frame = FALSE, main = "QQ plot for charges")
qqline(Ins$charges, col = "steelblue", lwd = 2)

Shapiro-Wilk Test

Shapiro-Wilk Test is widely used to test the normality of the data (where the data is continuous and univariate). The more the test statistic W is close to 1, the more likely the sample data is drawn by a normal distribution.

shapiro.test(Ins$bmi)
## 
##  Shapiro-Wilk normality test
## 
## data:  Ins$bmi
## W = 0.99389, p-value = 2.605e-05
shapiro.test(Ins$charges)
## 
##  Shapiro-Wilk normality test
## 
## data:  Ins$charges
## W = 0.81469, p-value < 2.2e-16

From the Shapiro-Wilk tests, even though the p-values are less than 0.05, which indicate it is statistically significant to reject the null hypothesis that the samples are drawn from normal distribution, but for bmi, the W value is close to 1, suggesting some relation to normal distribution.

Correlation between variables

In studying the correlation, Spearman correlation test (used for non-parametric distribution; Pearson correlation test can be used if normal distribution is assumed) could be carried out to determine the correlation between the variables, in particular, the correlations between charges and other variables are of the greatest concern.

Under the Spearman correlation test, the value of rho indicates the correlation (rho = 1 when positively correlated; rho = -1 when negatively correlated; rho = 0 when no correlation).

cor.test(Ins$charges, Ins$bmi, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  Ins$charges and Ins$bmi
## S = 351558456, p-value = 1.193e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1193959
cor.test(Ins$charges, Ins$age, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  Ins$charges and Ins$age
## S = 185881923, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5343921
cor.test(Ins$charges, Ins$children, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  Ins$charges and Ins$children
## S = 345992058, p-value = 9.847e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1333389

From the Spearman correlation tests above, the three p-values are less than 0.05 (which is the common significance level), indicating that there are correlation between the variables tested, be it weak or strong. For charges and age, they have a strong positive correlation, with rho = 0.53. This potentially implies that medical charge is higher for a person who is older. The correlations between charges and the other two variables (i.e. bmi and children) are weak, with rho being around 0.11-0.14. One limitation on bmi is that it may not be so linearly correlated to charges since both higher or lower bmi may suggest the person is unhealthy, implying a higher medical charge.

chisq.test(Ins$sex,Ins$smoker)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Ins$sex and Ins$smoker
## X-squared = 7.3929, df = 1, p-value = 0.006548
chisq.test(Ins$sex,Ins$region)
## 
##  Pearson's Chi-squared test
## 
## data:  Ins$sex and Ins$region
## X-squared = 0.43514, df = 3, p-value = 0.9329
chisq.test(Ins$smoker,Ins$region)
## 
##  Pearson's Chi-squared test
## 
## data:  Ins$smoker and Ins$region
## X-squared = 7.3435, df = 3, p-value = 0.06172

For categorical variables, their correlation dependence could be tested with Pearson’s Chi-squared test. For sex and smoker, the p-value is less than 0.05, rejecting the null hypothesis that the two variables are independent. This may imply a certain behavioural pattern found in the data set, for example, male tends to be a smoker than female. For the other two tests on sex & region and smoker & region, it is not statistically significant to reject the null hypothesis.

Model Evaluation

Genaralized Linear Model (GLM)

#Fit by normal distribution in GLM, which is essentially the same as linear model

GLM.a1 = glm(Ins$charges~., family = gaussian(link="identity"), data = Ins)
GLM.a2 = glm(Ins$charges~ age, family = gaussian(link="identity"), data = Ins)
GLM.a3 = glm(Ins$charges~. + sex*smoker, family = gaussian(link="identity"), data = Ins)

AIC(GLM.a1, GLM.a2, GLM.a3, k = 2)
BIC(GLM.a1, GLM.a2, GLM.a3)
summary(GLM.a1)
## 
## Call:
## glm(formula = Ins$charges ~ ., family = gaussian(link = "identity"), 
##     data = Ins)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexmale           -131.3      332.9  -0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 36749084)
## 
##     Null deviance: 1.9607e+11  on 1337  degrees of freedom
## Residual deviance: 4.8840e+10  on 1329  degrees of freedom
## AIC: 27116
## 
## Number of Fisher Scoring iterations: 2
summary(GLM.a2)
## 
## Call:
## glm(formula = Ins$charges ~ age, family = gaussian(link = "identity"), 
##     data = Ins)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 133640741)
## 
##     Null deviance: 1.9607e+11  on 1337  degrees of freedom
## Residual deviance: 1.7854e+11  on 1336  degrees of freedom
## AIC: 28836
## 
## Number of Fisher Scoring iterations: 2
summary(GLM.a3)
## 
## Call:
## glm(formula = Ins$charges ~ . + sex * smoker, family = gaussian(link = "identity"), 
##     data = Ins)
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -11579.33     993.40 -11.656  < 2e-16 ***
## age                  256.87      11.87  21.645  < 2e-16 ***
## sexmale             -599.10     371.06  -1.615 0.106645    
## bmi                  335.22      28.56  11.738  < 2e-16 ***
## children             464.43     137.50   3.378 0.000752 ***
## smokeryes          22532.36     621.94  36.229  < 2e-16 ***
## regionnorthwest     -326.84     475.12  -0.688 0.491630    
## regionsoutheast    -1033.25     477.44  -2.164 0.030632 *  
## regionsouthwest     -983.30     476.75  -2.062 0.039356 *  
## sexmale:smokeryes   2345.02     829.99   2.825 0.004793 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 36557009)
## 
##     Null deviance: 1.9607e+11  on 1337  degrees of freedom
## Residual deviance: 4.8548e+10  on 1328  degrees of freedom
## AIC: 27109
## 
## Number of Fisher Scoring iterations: 2

Assuming medical charges follow normal distribution, three GLMs are fitted to test against different predictor variables.

GLM.a1: include all the variables (age, sex, smoker, bmi, region, children) as the predictor variables

GLM.a2: include one variable - age, as the predictor variable

GLM.a3: include all the variables as the predictor variables, plus an interaction term sex * smoker

The following aspects are considered when evaluating the model:

  1. Trade-off of goodness of fit and complexity of the model The model may have a higher explanatory power when more predictor variables are added, but at the same time it would come at the expense of a more complex model. To balance the complexity and explanatory power, AIC or BIC could be used in which penalty would be introduced when extra predictor variable is added. The difference between AIC and BIC is that BIC would penalize more. If the marginal decrease of AIC is less than 2, the model is deemed as working as good as the best model. For GLM.a2, it is the simplest model, however, its AIC is the highest as well (lower AIC means a better trade-off between goodness of fit and complexity of model) GLM.a3 has the lowest AIC and its marginal decrease of AIC when compared to GLM.a1 is less than 2, indicating that it is a better model.

  2. Interaction term In the previous section, some correlation is found for sex and smoker, suggesting an interaction term of sex * smoker may be added to the model (and hence added into GLM.a3). For GLM.a3, it does have a better AIC and thus this model is proposed. However, it should be noted that

  3. Significance of the coefficients Looking at the summary, the model suggests that age, bmi, children and smoker are significant predictors of medical insurance charge. The interaction term sex*smoker also shows a moderate level of significance. For sex, the p-value is larger than 0.05, indicating that the effect is not statistically significant.

#Fit by gamma distribution in GLM

GLM.b1 = glm(Ins$charges~., family = Gamma(link="inverse"), data = Ins)
GLM.b2 = glm(Ins$charges~ age, family = Gamma(link="inverse"), data = Ins)
GLM.b3 = glm(Ins$charges~. + sex*smoker, family = Gamma(link="inverse"), data = Ins)

AIC(GLM.b1, GLM.b2, GLM.b3, k = 2)
BIC(GLM.b1, GLM.b2, GLM.b3)
summary(GLM.b1)
## 
## Call:
## glm(formula = Ins$charges ~ ., family = Gamma(link = "inverse"), 
##     data = Ins)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.932e-04  6.213e-06  31.090  < 2e-16 ***
## age             -9.693e-07  7.485e-08 -12.950  < 2e-16 ***
## sexmale          1.224e-06  1.865e-06   0.656  0.51194    
## bmi             -1.133e-06  1.592e-07  -7.113 1.85e-12 ***
## children        -2.157e-06  8.009e-07  -2.693  0.00717 ** 
## smokeryes       -8.207e-05  2.570e-06 -31.929  < 2e-16 ***
## regionnorthwest  1.786e-06  2.736e-06   0.653  0.51408    
## regionsoutheast  4.282e-06  2.465e-06   1.737  0.08256 .  
## regionsouthwest  2.918e-06  2.689e-06   1.085  0.27813    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3856602)
## 
##     Null deviance: 1056.0  on 1337  degrees of freedom
## Residual deviance:  478.7  on 1329  degrees of freedom
## AIC: 26869
## 
## Number of Fisher Scoring iterations: 6
summary(GLM.b2)
## 
## Call:
## glm(formula = Ins$charges ~ age, family = Gamma(link = "inverse"), 
##     data = Ins)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.385e-04  6.968e-06   19.87   <2e-16 ***
## age         -1.466e-06  1.431e-07  -10.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.9339199)
## 
##     Null deviance: 1056.04  on 1337  degrees of freedom
## Residual deviance:  956.64  on 1336  degrees of freedom
## AIC: 27859
## 
## Number of Fisher Scoring iterations: 6
summary(GLM.b3)
## 
## Call:
## glm(formula = Ins$charges ~ . + sex * smoker, family = Gamma(link = "inverse"), 
##     data = Ins)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.892e-04  6.524e-06  29.000  < 2e-16 ***
## age               -9.685e-07  7.490e-08 -12.931  < 2e-16 ***
## sexmale            9.080e-06  4.498e-06   2.019   0.0437 *  
## bmi               -1.129e-06  1.607e-07  -7.026 3.38e-12 ***
## children          -2.043e-06  8.024e-07  -2.547   0.0110 *  
## smokeryes         -7.749e-05  3.476e-06 -22.294  < 2e-16 ***
## regionnorthwest    1.862e-06  2.734e-06   0.681   0.4960    
## regionsoutheast    4.493e-06  2.464e-06   1.823   0.0685 .  
## regionsouthwest    2.887e-06  2.709e-06   1.065   0.2869    
## sexmale:smokeryes -9.529e-06  4.960e-06  -1.921   0.0549 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3859279)
## 
##     Null deviance: 1056.04  on 1337  degrees of freedom
## Residual deviance:  477.27  on 1328  degrees of freedom
## AIC: 26867
## 
## Number of Fisher Scoring iterations: 6
# Calculate R-squared
#deviance <- summary(logit_model)$deviance
#null_deviance <- summary(logit_model)$null.deviance
#rsquared <- 1 - (deviance / null_deviance)

# Print the R-squared value
#print(rsquared)

In previous section, it is tested that the medical insurance charges may not follow the normal distribution. Hence, this leads to the thinking that to fit the model with another distribution which may closely reflect the characteristics of the medical charge. Thus, gamma distribution is proposed and tested in this case. Similar to the normal distribution, three GLMs are tested to examine which one works as a better model.

The AIC of GLM.b3 is only 2.2 lower than GLM.b1, suggesting it may not be optimal to add the interaction term to the model. Referring to the summary, the p-value of the interaction term is slightly larger than 0.05, indicating that its effect on charges is not statistically significant. When comparing BIC, GLM.b1 has the lowest BIC. In this regard, GLM.b1 will be proposed as the model for medical insurance charge.

Comparing the AIC and BIC, GLM fitted with gamma distribution has a lower AIC and BIC, indicating it is a better fit model.

Gradient Boosting Machine (GBM)

library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)
## Loading required package: lattice
library(ggplot2)
library(lattice)

# set seed for reproducing the consistent result
set.seed(100)

# split into training data and testing data
partition <- createDataPartition(Ins$charges, p=0.8, list = FALSE)
train <- Ins[partition, ]
test <- Ins[-partition, ]
test_x <- test[,1:6]
test_y <- test[,7]

# Train the gradient boosting regression model
GBM1 <- gbm(charges ~ age + bmi + children,
             data = train,
             distribution = "gaussian",
             cv.folds = 10,
             shrinkage = .1,
             n.trees = 200)

# Make predictions on the test set
pred_y <- predict(GBM1, newdata = test_x)
## Using 44 trees...
# Calculate evaluation metrics
mse <- mean((test_y - pred_y)^2)
mae <- mean(abs(test_y - pred_y))
rmse <- sqrt(mse)

# Print evaluation metrics
cat("MAE:", mae, "\n", "MSE:", mse, "\n", "RMSE:", rmse, "\n", "\n")
## MAE: 9090.069 
##  MSE: 130295185 
##  RMSE: 11414.69 
## 
# Plot actual and predicted values
plot(test_y, pch = 18, col = "red", xlab = "Observation", 
     ylab = "charges", main = "Actual vs. Predicted")
lines(pred_y, lwd = 1, col = "blue")
legend("topleft", legend = c("Actual", "Predicted"), 
       col = c("red", "blue"), pch = c(18, NA), lty = c(NA, 1), lwd = 1, cex = 0.8)

Random Forest

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)

# set seed for reproducing the consistent result
set.seed(100)

# split into training data and testing data
partition <- createDataPartition(Ins$charges, p=0.8, list = FALSE)
train <- Ins[partition, ]
test <- Ins[-partition, ]

rf <- randomForest(charges~., data=train, proximity=TRUE) 
print(rf)
## 
## Call:
##  randomForest(formula = charges ~ ., data = train, proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 23557400
##                     % Var explained: 83.87