library(dplyr)
library(ggplot2)
library(tidyr)
library(patchwork)
library(reshape2)
library(tidyselect)
library(fmsb)
library(tibble)
library(glmnet)
library(randomForest)

Variables Used:

Age: Age of insured patient.
Sex: Sex of insured patient.
BMI: Body Mass Index of patient.
Children: Number of children the patient has.
Smoker: Whether the patient smokes or not.
Region: Region of residence.
Charges: Total Insurance Cost.

PC <- read.csv("patient_charges.csv", stringsAsFactors = TRUE)
# any(is.na(PC))  # quick check: no missing values in the dataset
Summary Statistics
summary(PC)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
  • Charges and the number of children are right-skewed.
  • Age is roughly uniformly distributed.
  • BMI is approximately normally distributed, and most patients fall above the normal BMI range.
PC %>%
  purrr::keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density() + ggthemes::theme_fivethirtyeight()

  • Feature engineering: creating categorical versions of bmi and age.
  • These categories will be useful for visualization; a quick check of the assignments follows the code below.
PC <- PC %>%
  mutate(age_category = cut(age, breaks = c(18, 35, 55, Inf),
          labels = c("Young Adult", "Middle-Aged Adult", "Senior Adult"), 
          include.lowest = TRUE),
         weight_condition = cut(bmi, breaks = c(0, 18.4, 24.9, 29.9, Inf),
          labels = c("Underweight", "Normal Weight", "Overweight", "Obese")))

Exploratory Data Analysis

ggplot(PC, aes(x=age_category, fill=age_category)) + geom_bar()

cormat <- PC %>% select(age, bmi, children, charges) %>% cor() %>% round(2)
melted_cormat <- reshape2::melt(cormat)

ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank())

g1 <- ggplot(PC, aes(x=sex, y=charges, fill=sex)) + 
  geom_bar(stat="summary", fun="mean")

g2 <- ggplot(PC, aes(x=children, y=charges, color=children)) + 
  geom_jitter(width=.1) + theme(axis.title.y=element_blank()) + theme(legend.position="none")

combined <- g1 + g2
combined

p1 <- ggplot(PC, aes(x=age_category, y=charges, color=weight_condition)) + 
  geom_jitter(width=.1)

p2 <- ggplot(PC, aes(x=smoker, y=charges, color=weight_condition)) +
  geom_jitter(width=.1) + theme(axis.title.y=element_blank())

combined <- p1 + p2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")

a1 <- ggplot(data=PC, aes(x=age_category, y=charges)) + 
  geom_bar(stat="summary", fun="mean", fill="skyblue") +
  scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
  ggtitle(label="Mean")

a2 <- ggplot(data=PC, aes(x=age_category, y=charges)) + 
  geom_bar(stat="summary", fun="median", fill="orange") +
  scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
  ggtitle(label="Median") +
  theme(axis.title.y=element_blank())


combined <- a1 + a2
combined

PC %>%
  ggplot(aes(x=weight_condition,y=charges,fill=smoker)) +
  geom_boxplot(position="dodge2") +
  scale_fill_discrete(labels=c("No","Yes")) +
  theme(axis.title.x = element_blank(),
        legend.position = c(0.2,0.85),
        legend.background = element_rect(fill="gray91", color="gray"))

PC %>% group_by(smoker) %>%
  summarize(smoker_avg = mean(charges),
            count = n())
## # A tibble: 2 × 3
##   smoker smoker_avg count
##   <fct>       <dbl> <int>
## 1 no          8434.  1064
## 2 yes        32050.   274
PC %>%
  ggplot(aes(x=age_category,y=bmi, fill=age_category)) + 
  geom_boxplot()

ggplot(PC, aes(x=region,y=charges, fill=region)) +
  geom_bar(stat="summary", fun="mean") +
  ggtitle("Average Charges By Region")

region_on_weight <- PC %>% 
  select(region, charges, weight_condition) %>%
  group_by(region, weight_condition) %>%
  summarize(mean = mean(charges), .groups="drop") %>%
  pivot_wider(names_from = "weight_condition", values_from = mean) %>%
  remove_rownames() %>% 
  column_to_rownames(var="region") %>% # simultaneously removing non-numeric variable
  mutate(across(everything(), ~ replace_na(., 0)))


create_beautiful_radarchart <- function(data, color = "#800000", 
                                        vlabels = colnames(data), vlcex = 0.7,
                                        caxislabels = NULL, title = NULL, ...){
  radarchart(
    data, axistype = 1,
    pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
    cglcol = "grey", cglty = 1, cglwd = 0.8,
    axislabcol = "grey",
    vlcex = vlcex, vlabels = vlabels,
    caxislabels = caxislabels, title = title, ...
  )
}


# fmsb::radarchart expects the first two rows of its data to be each axis' maximum and minimum;
# rev(range(pretty(x))) returns (max, min) for every column, which gets prepended via rbind() below
range <- as.data.frame(lapply(region_on_weight, function(x) rev(range(pretty(x)))))
colnames(range) <- colnames(region_on_weight)

colors <- c("#00AFBB", "#E0115F", "#800000", "orange")
titles <- c("North East", "North West", "South East", "South West")



for(i in 1:4){
  create_beautiful_radarchart(
    data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
    color = colors[i], title = titles[i], 
    seg=4)
}
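
To view all four radar charts on a single device (a layout variation, not in the original), the same loop can be wrapped in a par() call:
op <- par(mfrow = c(2, 2), mar = c(1, 1, 2, 1))
for(i in 1:4){
  create_beautiful_radarchart(
    data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
    color = colors[i], title = titles[i], 
    seg=4)
}
par(op)  # restore previous graphics settings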

Inference Testing

PC %>% select(sex, smoker) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  .
## X-squared = 7.3929, df = 1, p-value = 0.006548
PC %>% select(sex, smoker) %>%
  table() %>% psych::Yule() # very small association
## [1] 0.1879285
PC %>% select(sex, region) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 0.43514, df = 3, p-value = 0.9329
PC %>% select(smoker, region) %>%
  table() %>% chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 7.3435, df = 3, p-value = 0.06172

Manual Clustering

b1 <- PC %>%
  ggplot(aes(x=bmi,y=charges,color=weight_condition)) + geom_point() +
  geom_segment(aes(x = 25,
                   y = 58000,
                   xend = 29,
                   yend = 52000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Cluster?")

b2 <- PC %>%
  ggplot(aes(x=bmi,y=charges,color=smoker)) + geom_point() +
  scale_color_manual(values=c("dodgerblue", "tomato2")) +
  geom_segment(aes(x = 25,
                   y = 58000,
                   xend = 29,
                   yend = 52000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Smokers") +
  geom_segment(aes(x = 22,
                   y = 39000,
                   xend = 23,
                   yend = 32000),
               arrow = arrow(length = unit(0.35, "cm")),
               color = "black") +
  annotate("text", x=22, y=45000, label= "Impact of Smoking to \n Charges on other \n Weight Conditions")



combined <- b1 + b2 & theme(legend.position = "bottom")
combined

Modeling

Removing the categorized variables

PC <- PC %>% select(-age_category, -weight_condition)

Splitting the Data

# separating training and testing
train_rows <- sample(1:nrow(PC), 0.75*nrow(PC))

# for Ridge & Lasso
variables = model.matrix(charges~.,PC)[,-1]
PCTarget = PC$charges

x_train <- variables[train_rows,]
x_test <- variables[-train_rows,]

y_train <- PCTarget[train_rows]
y_test <- PCTarget[-train_rows]


# for all other models
PC_Train <- PC[train_rows,]
PC_Test <- PC[-train_rows,]
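
Note that the split above is random and unseeded, so the selected rows (and all downstream test metrics) will vary between runs; adding a seed before sampling (a suggestion, not in the original code) makes the split reproducible:
set.seed(42)  # any fixed value works; 42 is an arbitrary choice
train_rows <- sample(1:nrow(PC), 0.75*nrow(PC))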

Linear Regression

  • Constructing a linear regression using all fields, then applying backward step-wise reduction based on AIC.
  • Then performing an ANOVA to confirm that the reduced model is not significantly worse than the full model.
# Modeling and performing stepwise reduction.
firstMod <- lm(charges ~ . , data=PC_Train)

stepfirst <- step(firstMod, direction="backward", trace=FALSE)

reducedMod <- lm(as.formula(stepfirst), data=PC_Train)


nobs(firstMod) # Once again making sure there were no NA values by counting the number of observations the model used. 
## [1] 1003
nobs(reducedMod)
## [1] 1003
# With the comparison's p-value greater than an alpha of 0.05, there is no significant difference between the two models, so we discard the full model and keep the one with fewer variables.
anova(firstMod, reducedMod, test = "LRT")
## Analysis of Variance Table
## 
## Model 1: charges ~ age + sex + bmi + children + smoker + region
## Model 2: charges ~ age + bmi + children + smoker
##   Res.Df        RSS Df  Sum of Sq Pr(>Chi)
## 1    994 3.6112e+10                       
## 2    998 3.6235e+10 -4 -123063928   0.4952
# Comparing the AIC values of the two models, the reduced model also has a lower AIC (by roughly 5 points), further supporting it.
stats::extractAIC(firstMod)
## [1]     9.00 17469.33
stats::extractAIC(reducedMod)
## [1]     5.00 17464.74
coef(reducedMod)
## (Intercept)         age         bmi    children   smokeryes 
## -12857.9497    263.9421    338.1761    398.3956  23813.0511
# Region and sex were dropped by the stepwise selection.


PC_Test <- PC_Test %>%
      mutate(LinearPredict=predict(reducedMod, newdata=PC_Test, type="response"))


summary(reducedMod)
## 
## Call:
## lm(formula = as.formula(stepfirst), data = PC_Train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11141.1  -2868.3   -867.3   1514.7  29493.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12857.95    1090.19 -11.794   <2e-16 ***
## age            263.94      13.69  19.282   <2e-16 ***
## bmi            338.18      31.74  10.656   <2e-16 ***
## children       398.40     157.09   2.536   0.0114 *  
## smokeryes    23813.05     469.34  50.738   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6026 on 998 degrees of freedom
## Multiple R-squared:  0.7572, Adjusted R-squared:  0.7562 
## F-statistic: 777.9 on 4 and 998 DF,  p-value: < 2.2e-16
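As a quick sanity check on the reduced model's coefficients, here is a hand prediction for a hypothetical patient (40 years old, BMI 30, 2 children, non-smoker; coefficient values rounded from the summary above):
# intercept + age term + bmi term + children term + smoker term
-12857.95 + 263.94*40 + 338.18*30 + 398.40*2 + 23813.05*0
# ~ 8642, reasonably close to the non-smoker average charge of ~8434 computed earlier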
Linear Regression - Leave One Out Cross Validation
library(caret)

train.control <- trainControl(method = "LOOCV")

model <- train(charges ~., data=PC_Train, method = "lm",
               trControl = train.control)

# a second LOOCV model using only the features kept by the stepwise-reduced regression above
model2 <- train(charges ~age+bmi+children+smoker, data=PC_Train, method = "lm",
               trControl = train.control)


PC_Test <- PC_Test %>%
      mutate(LinearPredict2=predict(model, newdata=PC_Test, type="raw"),
             LinearPredict3=predict(model2, newdata=PC_Test, type="raw"))
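The LOOCV error estimates themselves are stored on the caret train objects (a quick look; the $results element holds the cross-validated RMSE, Rsquared, and MAE for each model):
model$results
model2$results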
Linear Regression - Diagnostics
  • The three most influential observations according to the linear model's Cook's distance plot (the fourth plot produced by plot.lm) are 2 females and 1 male; all are smokers and all fall in the obese BMI category.
plot(lm(charges ~ . , data=PC), 4)

PC[c(544,578,1301),]
##      age    sex    bmi children smoker    region  charges
## 544   54 female 47.410        0    yes southeast 63770.43
## 578   31 female 38.095        1    yes northeast 58571.07
## 1301  45   male 30.360        0    yes southeast 62592.87

Ridge Regression - Lasso Regression (Alpha Range)

The following loop uses 10-fold cross-validation on the training data to find the optimal lambda for each elastic-net mixing parameter alpha = 0, 0.1, …, 0.9, 1.0.

# Note: cv.glmnet doesn't accept formula notation the way lm() or glm() do.
# alpha = 0: ridge regression
# alpha = 1: lasso regression

# creating an empty list to hold each model
list.of.fits = list()

for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  
  list.of.fits[[fit.name]] <- 
    cv.glmnet(x_train, y_train, type.measure="mse", alpha=i/10, family="gaussian")
}

results = data.frame()

for (i in 0:10){
  fit.name <- paste0("alpha", i/10)
  
  # Use each model to predict the target variable given the testing data
  predicted <- 
    predict(list.of.fits[[fit.name]],
            s=list.of.fits[[fit.name]]$lambda.min, newx=x_test)
  
  mse <- mean((y_test - predicted)^2)
  
  # store the results
  temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
  results <- rbind(results, temp)
}

# The test MSEs are very close across alpha values (alpha = 0.1 happens to be lowest in this split); a pure lasso model (alpha = 1) is fit as the final penalized model below.
results
##    alpha      mse fit.name
## 1    0.0 38569098   alpha0
## 2    0.1 38272284 alpha0.1
## 3    0.2 38288424 alpha0.2
## 4    0.3 38299126 alpha0.3
## 5    0.4 38317495 alpha0.4
## 6    0.5 38383874 alpha0.5
## 7    0.6 38315401 alpha0.6
## 8    0.7 38316885 alpha0.7
## 9    0.8 38347918 alpha0.8
## 10   0.9 38402702 alpha0.9
## 11   1.0 38434523   alpha1
### Final Lasso Model
LassoMod <- cv.glmnet(x_train, y_train, type.measure="mse",
  alpha=1.0, family="gaussian")

#lambda for min MSE
bestLam <- LassoMod$lambda.min

LassoPredict=predict(LassoMod, s=bestLam, newx=x_test)

PC_Test$LassoPredict = as.vector(LassoPredict)

coef(LassoMod)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                         s1
## (Intercept)     -4592.7469
## age               198.7064
## sexmale             .     
## bmi               184.1072
## children            .     
## smokeryes       21258.3546
## regionnorthwest     .     
## regionsoutheast     .     
## regionsouthwest     .
# At the default lambda (lambda.1se), the lasso keeps only age, bmi, and smoker; sex, children, and region are shrunk to zero.
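Note that coef() on a cv.glmnet object reports coefficients at lambda.1se by default; to inspect the coefficients at the lambda actually used for the predictions above (lambda.min), pass s explicitly:
coef(LassoMod, s = bestLam)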

Random Forest Regression

RFMod<- randomForest(charges ~ .,data = PC_Train, ntree = 1000, proximity=TRUE)

varImpPlot(RFMod) 

PC_Test <- PC_Test %>%
      mutate(RFPredict=predict(RFMod, newdata=PC_Test, type="response"))
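
The forest's out-of-bag error can also be checked without the test set; printing the fitted randomForest object reports the OOB mean of squared residuals and the percentage of variance explained:
print(RFMod)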

Comparing the Models

LinearMSE <- mean((PC_Test$charges - PC_Test$LinearPredict)^2)

LinearMSE2 <- mean((PC_Test$charges - PC_Test$LinearPredict2)^2)

LinearMSE3 <- mean((PC_Test$charges - PC_Test$LinearPredict3)^2)


LassoMSE <- mean((PC_Test$charges - PC_Test$LassoPredict)^2)

rfMSE <- mean((PC_Test$charges - PC_Test$RFPredict)^2)
# Finding the lowest MSE among the predictions for each model
LinearMSE < LinearMSE2
## [1] FALSE
LinearMSE2 < LinearMSE3
## [1] FALSE
LinearMSE3 < LassoMSE
## [1] TRUE
LinearMSE3 < rfMSE
## [1] FALSE
# Random Forest has lowest MSE
rfMSE < LinearMSE3
## [1] TRUE
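The pairwise comparisons above can also be condensed into a single table of test MSEs (a sketch using the prediction columns already attached to PC_Test):
pred_cols <- c("LinearPredict", "LinearPredict2", "LinearPredict3", "LassoPredict", "RFPredict")
sapply(PC_Test[pred_cols], function(p) mean((PC_Test$charges - p)^2))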
head(PC_Test)
##    age    sex    bmi children smoker    region   charges LinearPredict
## 1   19 female 27.900        0    yes southwest 16884.924    25405.1133
## 3   28   male 33.000        3     no southeast  4449.462     6887.4256
## 4   33   male 22.705        0     no northwest 21984.471     3530.4262
## 9   37   male 29.830        2     no northeast  6406.411     7792.4903
## 13  23   male 34.400        0     no southwest  1826.843     4845.9750
## 16  19   male 24.600        1     no southwest  1837.237      874.4767
##    LinearPredict2 LinearPredict3 LassoPredict RFPredict
## 1      25412.8202     25412.8202   25456.7669 19166.367
## 3       6357.1132      6357.1132    6453.3634  5729.687
## 4       3609.5358      3609.5358    3758.8221  5243.550
## 9       8067.6870      8067.6870    7999.7201  8451.910
## 13      4767.9462      4767.9462    4942.2303  4626.161
## 16       628.3802       628.3802     876.1544  3410.832

Conclusion

Random forest regression achieved the lowest test MSE and is the preferred model for this data.