library(dplyr)
library(ggplot2)
library(tidyr)
library(patchwork)
library(reshape2)
library(tidyselect)
library(fmsb)
library(tibble)
library(glmnet)
library(randomForest)

Variables Used:

Age: Age of insured patient.
Sex: Sex of insured patient.
BMI: Body Mass Index of patient.
Children: Number of children the patient has.
Smoker: Whether the patient smokes or not.
Region: Region of residence.
Charges: Total Insurance Cost.

PC <- read.csv("patient_charges.csv", stringsAsFactors = TRUE)
# any(is.na(PC))  # quick check: no missing values in the dataset
Summary Statistics
summary(PC)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
  • Charges and the number of children are right-skewed.
  • Age is roughly uniformly distributed.
  • BMI is approximately normally distributed, and most patients fall above the normal BMI range.
PC %>%
  purrr::keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density() + ggthemes::theme_fivethirtyeight()

  • Feature engineering: creating categorical versions of bmi and age.
  • These categories will be useful for visualization; a quick check of the assignments follows the code below.
PC <- PC %>%
  mutate(age_category = cut(age, breaks = c(18, 35, 55, Inf),
          labels = c("Young Adult", "Middle-Aged Adult", "Senior Adult"), 
          include.lowest = TRUE),
         weight_condition = cut(bmi, breaks = c(0, 18.4, 24.9, 29.9, Inf),
          labels = c("Underweight", "Normal Weight", "Overweight", "Obese")))

Exploratory Data Analysis

ggplot(PC, aes(x=age_category, fill=age_category)) + geom_bar()

cormat <- PC %>% select(age, bmi, children, charges) %>% cor() %>% round(2)
melted_cormat <- reshape2::melt(cormat)

ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank())

g1 <- ggplot(PC, aes(x=sex, y=charges, fill=sex)) + 
  geom_bar(stat="summary", fun="mean")

g2 <- ggplot(PC, aes(x=children, y=charges, color=children)) + 
  geom_jitter(width=.1) + theme(axis.title.y=element_blank()) + theme(legend.position="none")

combined <- g1 + g2
combined

p1 <- ggplot(PC, aes(x=age_category, y=charges, color=weight_condition)) + 
  geom_jitter(width=.1)

p2 <- ggplot(PC, aes(x=smoker, y=charges, color=weight_condition)) +
  geom_jitter(width=.1) + theme(axis.title.y=element_blank())

combined <- p1 + p2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")

a1 <- ggplot(data=PC, aes(x=age_category, y=charges)) + 
  geom_bar(stat="summary", fun="mean", fill="skyblue") +
  scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
  ggtitle(label="Mean")

a2 <- ggplot(data=PC, aes(x=age_category, y=charges)) + 
  geom_bar(stat="summary", fun="median", fill="orange") +
  scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
  ggtitle(label="Median") +
  theme(axis.title.y=element_blank())


combined <- a1 + a2
combined

PC %>%
  ggplot(aes(x=weight_condition,y=charges,fill=smoker)) +
  geom_boxplot(position="dodge2") +
  scale_fill_discrete(labels=c("No","Yes")) +
  theme(axis.title.x = element_blank(),
        legend.position = c(0.2,0.85),
        legend.background = element_rect(fill="gray91", color="gray"))

PC %>% group_by(smoker) %>%
  summarize(smoker_avg = mean(charges),
            count = n())
## # A tibble: 2 × 3
##   smoker smoker_avg count
##   <fct>       <dbl> <int>
## 1 no          8434.  1064
## 2 yes        32050.   274
PC %>%
  ggplot(aes(x=age_category,y=bmi, fill=age_category)) + 
  geom_boxplot()

ggplot(PC, aes(x=region,y=charges, fill=region)) +
  geom_bar(stat="summary", fun="mean") +
  ggtitle("Average Charges By Region")

region_on_weight <- PC %>% 
  select(region, charges, weight_condition) %>%
  group_by(region, weight_condition) %>%
  summarize(mean = mean(charges), .groups="drop") %>%
  pivot_wider(names_from = "weight_condition", values_from = mean) %>%
  remove_rownames() %>% 
  column_to_rownames(var="region") %>% # simultaneously removing non-numeric variable
  mutate(across(everything(), ~ replace_na(., 0)))


create_beautiful_radarchart <- function(data, color = "#800000", 
                                        vlabels = colnames(data), vlcex = 0.7,
                                        caxislabels = NULL, title = NULL, ...){
  radarchart(
    data, axistype = 1,
    pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
    cglcol = "grey", cglty = 1, cglwd = 0.8,
    axislabcol = "grey",
    vlcex = vlcex, vlabels = vlabels,
    caxislabels = caxislabels, title = title, ...
  )
}


# fmsb::radarchart expects the first two rows of its data to be each axis' maximum and minimum;
# rev(range(pretty(x))) returns (max, min) for every column, which gets prepended via rbind() below
range <- as.data.frame(lapply(region_on_weight, function(x) rev(range(pretty(x)))))
colnames(range) <- colnames(region_on_weight)

colors <- c("#00AFBB", "#E0115F", "#800000", "orange")
titles <- c("North East", "North West", "South East", "South West")



for(i in 1:4){
  create_beautiful_radarchart(
    data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
    color = colors[i], title = titles[i], 
    seg=4)
}
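
To view all four radar charts on a single device (a layout variation, not in the original), the same loop can be wrapped in a par() call:
op <- par(mfrow = c(2, 2), mar = c(1, 1, 2, 1))
for(i in 1:4){
  create_beautiful_radarchart(
    data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
    color = colors[i], title = titles[i], 
    seg=4)
}
par(op)  # restore previous graphics settings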

Inference Testing

PC %>% select(sex, smoker) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  .
## X-squared = 7.3929, df = 1, p-value = 0.006548
PC %>% select(sex, smoker) %>%
  table() %>% psych::Yule() # very small association
## [1] 0.1879285
PC %>% select(sex, region) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 0.43514, df = 3, p-value = 0.9329
PC %>% select(smoker, region) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 7.3435, df = 3, p-value = 0.06172

Manual Clustering

b1 <- PC %>%
  ggplot(aes(x=bmi,y=charges,color=weight_condition)) + geom_point() +
  geom_segment(aes(x = 25,
                   y = 58000,
                   xend = 29,
                   yend = 52000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Cluster?")

b2 <- PC %>%
  ggplot(aes(x=bmi,y=charges,color=smoker)) + geom_point() +
  scale_color_manual(values=c("dodgerblue", "tomato2")) +
  geom_segment(aes(x = 25,
                   y = 58000,
                   xend = 29,
                   yend = 52000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Smokers") +
  geom_segment(aes(x = 22,
                   y = 39000,
                   xend = 23,
                   yend = 32000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=45000, label= "Impact of Smoking to \n Charges on other \n Weight Conditions")



combined <- b1 + b2 & theme(legend.position = "bottom")
combined

Modeling

Removing the categorized variables

PC <- PC %>% select(-age_category, -weight_condition)

Splitting the Data

# separating training and testing
train_rows <- sample(1:nrow(PC), 0.75*nrow(PC))

# for Ridge & Lasso
variables = model.matrix(charges~.,PC)[,-1]
PCTarget = PC$charges

x_train <- variables[train_rows,]
x_test <- variables[-train_rows,]

y_train <- PCTarget[train_rows]
y_test <- PCTarget[-train_rows]


# for all other models
PC_Train <- PC[train_rows,]
PC_Test <- PC[-train_rows,]
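
Note that the split above is random and unseeded, so the selected rows (and all downstream test metrics) will vary between runs; adding a seed before sampling (a suggestion, not in the original code) makes the split reproducible:
set.seed(42)  # any fixed value works; 42 is an arbitrary choice
train_rows <- sample(1:nrow(PC), 0.75*nrow(PC))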

Linear Regression

  • Constructing a linear regression using all fields, then applying backward step-wise reduction based on AIC.
  • Then performing an ANOVA to confirm that the reduced model is not significantly worse than the full model.
# Modeling and performing stepwise reduction.
firstMod <- lm(charges ~ . , data=PC_Train)

stepfirst <- step(firstMod, direction="backward", trace=FALSE)

reducedMod <- lm(as.formula(stepfirst), data=PC_Train)


nobs(firstMod) # Once again making sure there were no NA values by counting the number of observations the model used. 
## [1] 1003
nobs(reducedMod)
## [1] 1003
# With the comparison's p-value greater than an alpha of 0.05, there is no significant difference between the two models, so we discard the full model and keep the one with fewer variables.
anova(firstMod, reducedMod, test = "LRT")
## Analysis of Variance Table
## 
## Model 1: charges ~ age + sex + bmi + children + smoker + region
## Model 2: charges ~ age + bmi + children + smoker
##   Res.Df        RSS Df  Sum of Sq Pr(>Chi)
## 1    994 3.6112e+10                       
## 2    998 3.6235e+10 -4 -123063928   0.4952
# Comparing the AIC values of the two models, the reduced model also has a lower AIC (by roughly 5 points), further supporting it.
stats::extractAIC(firstMod)
## [1]     9.00 17469.33
stats::extractAIC(reducedMod)
## [1]     5.00 17464.74
coef(reducedMod)
## (Intercept)         age         bmi    children   smokeryes 
## -12857.9497    263.9421    338.1761    398.3956  23813.0511
# Region and sex were dropped by the stepwise selection.


PC_Test <- PC_Test %>%
      mutate(LinearPredict=predict(reducedMod, newdata=PC_Test, type="response"))


summary(reducedMod)
## 
## Call:
## lm(formula = as.formula(stepfirst), data = PC_Train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11141.1  -2868.3   -867.3   1514.7  29493.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12857.95    1090.19 -11.794   <2e-16 ***
## age            263.94      13.69  19.282   <2e-16 ***
## bmi            338.18      31.74  10.656   <2e-16 ***
## children       398.40     157.09   2.536   0.0114 *  
## smokeryes    23813.05     469.34  50.738   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6026 on 998 degrees of freedom
## Multiple R-squared:  0.7572, Adjusted R-squared:  0.7562 
## F-statistic: 777.9 on 4 and 998 DF,  p-value: < 2.2e-16
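As a quick sanity check on the reduced model's coefficients, here is a hand prediction for a hypothetical patient (40 years old, BMI 30, 2 children, non-smoker; coefficient values rounded from the summary above):
# intercept + age term + bmi term + children term + smoker term
-12857.95 + 263.94*40 + 338.18*30 + 398.40*2 + 23813.05*0
# ~ 8642, reasonably close to the non-smoker average charge of ~8434 computed earlier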
Linear Regression - Leave One Out Cross Validation
library(caret)

train.control <- trainControl(method = "LOOCV")

model <- train(charges ~., data=PC_Train, method = "lm",
               trControl = train.control)

# a second LOOCV model using only the features kept by the stepwise-reduced regression above
model2 <- train(charges ~age+bmi+children+smoker, data=PC_Train, method = "lm",
               trControl = train.control)


PC_Test <- PC_Test %>%
      mutate(LinearPredict2=predict(model, newdata=PC_Test, type="raw"),
             LinearPredict3=predict(model2, newdata=PC_Test, type="raw"))
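The LOOCV error estimates themselves are stored on the caret train objects (a quick look; the $results element holds the cross-validated RMSE, Rsquared, and MAE for each model):
model$results
model2$results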
Linear Regression - Diagnostics
  • The three most influential observations according to the linear model's Cook's distance plot (the fourth plot produced by plot.lm) are 2 females and 1 male; all are smokers and all fall in the obese BMI category.
plot(lm(charges ~ . , data=PC), 4)

PC[c(544,578,1301),]
##      age    sex    bmi children smoker    region  charges
## 544   54 female 47.410        0    yes southeast 63770.43
## 578   31 female 38.095        1    yes northeast 58571.07
## 1301  45   male 30.360        0    yes southeast 62592.87

Ridge Regression - Lasso Regression (Alpha Range)

The following loop uses 10-fold cross-validation on the training data to find the optimal lambda for each elastic-net mixing parameter alpha = 0, 0.1, …, 0.9, 1.0.

# Note: cv.glmnet doesn't accept formula notation the way lm() or glm() do.
# alpha = 0: ridge regression
# alpha = 1: lasso regression

# creating an empty list to hold each model
list.of.fits = list()

for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  
  list.of.fits[[fit.name]] <- 
    cv.glmnet(x_train, y_train, type.measure="mse", alpha=i/10, family="gaussian")
}

results = data.frame()

for (i in 0:10){
  fit.name <- paste0("alpha", i/10)
  
  # Use each model to predict the target variable given the testing data
  predicted <- 
    predict(list.of.fits[[fit.name]],
            s=list.of.fits[[fit.name]]$lambda.min, newx=x_test)
  
  mse <- mean((y_test - predicted)^2)
  
  # store the results
  temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
  results <- rbind(results, temp)
}

# The test MSEs are very close across alpha values (alpha = 0.1 happens to be lowest in this split); a pure lasso model (alpha = 1) is fit as the final penalized model below.
results
##    alpha      mse fit.name
## 1    0.0 38569098   alpha0
## 2    0.1 38272284 alpha0.1
## 3    0.2 38288424 alpha0.2
## 4    0.3 38299126 alpha0.3
## 5    0.4 38317495 alpha0.4
## 6    0.5 38383874 alpha0.5
## 7    0.6 38315401 alpha0.6
## 8    0.7 38316885 alpha0.7
## 9    0.8 38347918 alpha0.8
## 10   0.9 38402702 alpha0.9
## 11   1.0 38434523   alpha1
### Final Lasso Model
LassoMod <- cv.glmnet(x_train, y_train, type.measure="mse",
  alpha=1.0, family="gaussian")

#lambda for min MSE
bestLam <- LassoMod$lambda.min

LassoPredict=predict(LassoMod, s=bestLam, newx=x_test)

PC_Test$LassoPredict = as.vector(LassoPredict)

coef(LassoMod)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                         s1
## (Intercept)     -4592.7469
## age               198.7064
## sexmale             .     
## bmi               184.1072
## children            .     
## smokeryes       21258.3546
## regionnorthwest     .     
## regionsoutheast     .     
## regionsouthwest     .
# At the default lambda (lambda.1se), the lasso keeps only age, bmi, and smoker; sex, children, and region are shrunk to zero.
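Note that coef() on a cv.glmnet object reports coefficients at lambda.1se by default; to inspect the coefficients at the lambda actually used for the predictions above (lambda.min), pass s explicitly:
coef(LassoMod, s = bestLam)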

Random Forest Regression

RFMod<- randomForest(charges ~ .,data = PC_Train, ntree = 1000, proximity=TRUE)

varImpPlot(RFMod) 

PC_Test <- PC_Test %>%
      mutate(RFPredict=predict(RFMod, newdata=PC_Test, type="response"))
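
The forest's out-of-bag error can also be checked without the test set; printing the fitted randomForest object reports the OOB mean of squared residuals and the percentage of variance explained:
print(RFMod)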

Comparing the Models

LinearMSE <- mean((PC_Test$charges - PC_Test$LinearPredict)^2)

LinearMSE2 <- mean((PC_Test$charges - PC_Test$LinearPredict2)^2)

LinearMSE3 <- mean((PC_Test$charges - PC_Test$LinearPredict3)^2)


LassoMSE <- mean((PC_Test$charges - PC_Test$LassoPredict)^2)

rfMSE <- mean((PC_Test$charges - PC_Test$RFPredict)^2)
# Finding the lowest MSE among the predictions for each model
LinearMSE < LinearMSE2
## [1] FALSE
LinearMSE2 < LinearMSE3
## [1] FALSE
LinearMSE3 < LassoMSE
## [1] TRUE
LinearMSE3 < rfMSE
## [1] FALSE
# Random Forest has lowest MSE
rfMSE < LinearMSE3
## [1] TRUE
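The pairwise comparisons above can also be condensed into a single table of test MSEs (a sketch using the prediction columns already attached to PC_Test):
pred_cols <- c("LinearPredict", "LinearPredict2", "LinearPredict3", "LassoPredict", "RFPredict")
sapply(PC_Test[pred_cols], function(p) mean((PC_Test$charges - p)^2))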
head(PC_Test)
##    age    sex    bmi children smoker    region   charges LinearPredict
## 1   19 female 27.900        0    yes southwest 16884.924    25405.1133
## 3   28   male 33.000        3     no southeast  4449.462     6887.4256
## 4   33   male 22.705        0     no northwest 21984.471     3530.4262
## 9   37   male 29.830        2     no northeast  6406.411     7792.4903
## 13  23   male 34.400        0     no southwest  1826.843     4845.9750
## 16  19   male 24.600        1     no southwest  1837.237      874.4767
##    LinearPredict2 LinearPredict3 LassoPredict RFPredict
## 1      25412.8202     25412.8202   25456.7669 19166.367
## 3       6357.1132      6357.1132    6453.3634  5729.687
## 4       3609.5358      3609.5358    3758.8221  5243.550
## 9       8067.6870      8067.6870    7999.7201  8451.910
## 13      4767.9462      4767.9462    4942.2303  4626.161
## 16       628.3802       628.3802     876.1544  3410.832

Conclusion

Random forest regression achieved the lowest test MSE and is the preferred model for this data.