library(dplyr)
library(ggplot2)
library(tidyr)
library(patchwork)
library(reshape2)
library(tidyselect)
library(fmsb)
library(tibble)
library(glmnet)
library(randomForest)
Age: age of the insured patient.
Sex: sex of the insured patient.
BMI: body mass index of the patient.
Children: number of children the patient has.
Smoker: whether the patient smokes or not.
Region: region of residence.
Charges: total insurance cost (the target variable).
PC <- read.csv("patient_charges.csv",stringsAsFactors = T)
# any(is.na(PC))
summary(PC)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
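As a quick sketch complementing the commented-out `any(is.na(PC))` check above, missing values can also be counted per column:
# per-column missing-value counts (all zeros expected for this dataset)
colSums(is.na(PC))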
PC %>%
purrr::keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density() + ggthemes::theme_fivethirtyeight()
Next, we derive categorical versions of `bmi` and `age`.
PC <- PC %>%
  mutate(age_category = cut(age, breaks = c(18, 35, 55, Inf),
                            labels = c("Young Adult", "Middle-Aged Adult", "Senior Adult"),
                            include.lowest = TRUE),
         weight_condition = cut(bmi, breaks = c(0, 18.4, 24.9, 29.9, Inf),
                                labels = c("Underweight", "Normal Weight", "Overweight", "Obese")))
ggplot(PC, aes(x=age_category, fill=age_category)) + geom_bar()
cormat <- PC %>% select(age, bmi, children, charges) %>% cor() %>% round(2)
melted_cormat <- reshape2::melt(cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank())
Moving on to visualizations against charges, our target variable.
On average, men have higher insurance expenses. Plotting charges against the number of children also reveals no protruding data points or clusters. We will analyze these two fields further.
g1 <- ggplot(PC, aes(x=sex, y=charges, fill=sex)) +
  geom_bar(stat="summary", fun="mean")
g2 <- ggplot(PC, aes(x=children, y=charges, color=children)) +
geom_jitter(width=.1) + theme(axis.title.y=element_blank()) + theme(legend.position="none")
combined <- g1 + g2
combined
Next, charges against `age_category` and `smoker`, colored by `weight_condition`.
p1 <- ggplot(PC, aes(x=age_category, y=charges, color=weight_condition)) +
geom_jitter(width=.1)
p2 <- ggplot(PC, aes(x=smoker, y=charges, color=weight_condition)) +
geom_jitter(width=.1) + theme(axis.title.y=element_blank())
combined <- p1 + p2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")
Next, mean and median charges by `age_category`. I include the median because the mean could be influenced too heavily by outliers.
a1 <- ggplot(data=PC, aes(x=age_category, y=charges)) +
geom_bar(stat="summary", fun="mean", fill="skyblue") +
scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
ggtitle(label="Mean")
a2 <- ggplot(data=PC, aes(x=age_category, y=charges)) +
geom_bar(stat="summary", fun="median", fill="orange") +
scale_y_continuous(labels = scales::label_number(suffix = "k", scale = 1e-3)) +
ggtitle(label="Median") +
theme(axis.title.y=element_blank())
combined <- a1 + a2
combined
PC %>%
ggplot(aes(x=weight_condition,y=charges,fill=smoker)) +
geom_boxplot(position="dodge2") +
scale_fill_discrete(labels=c("No","Yes")) +
theme(axis.title.x = element_blank(),
legend.position = c(0.2,0.85),
legend.background = element_rect(fill="gray91", color="gray"))
PC %>% group_by(smoker) %>%
summarize(smoker_avg = mean(charges),
count = n())
## # A tibble: 2 × 3
## smoker smoker_avg count
## <fct> <dbl> <int>
## 1 no 8434. 1064
## 2 yes 32050. 274
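To put a formal test behind this gap, the two group means can be compared directly; a minimal sketch (not part of the original analysis) using Welch's two-sample t-test:
# Welch's t-test of mean charges for smokers vs non-smokers
t.test(charges ~ smoker, data = PC)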
PC %>%
ggplot(aes(x=age_category,y=bmi, fill=age_category)) +
geom_boxplot()
Incorporating regions: the Southeast region has the highest charges on average, while the Southwest has the lowest.
ggplot(PC, aes(x=region,y=charges, fill=region)) +
geom_bar(stat="summary", fun="mean") +
ggtitle("Average Charges By Region")
region_on_weight <- PC %>%
select(region, charges, weight_condition) %>%
group_by(region, weight_condition) %>%
summarize(mean = mean(charges), .groups="drop") %>%
pivot_wider(names_from = "weight_condition", values_from = mean) %>%
  remove_rownames() %>%
  column_to_rownames(var="region") %>%  # simultaneously removing the non-numeric variable
mutate(across(everything(), ~ replace_na(., 0)))
create_beautiful_radarchart <- function(data, color = "#800000",
vlabels = colnames(data), vlcex = 0.7,
caxislabels = NULL, title = NULL, ...){
radarchart(
data, axistype = 1,
pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
cglcol = "grey", cglty = 1, cglwd = 0.8,
axislabcol = "grey",
vlcex = vlcex, vlabels = vlabels,
caxislabels = caxislabels, title = title, ...
)
}
range <- as.data.frame(lapply(region_on_weight, function(x) rev(range(pretty(x)))))
colnames(range) <- colnames(region_on_weight)
colors <- c("#00AFBB", "#E0115F", "#800000", "orange")
titles <- c("North East", "North West", "South East", "South West")
for(i in 1:4){
create_beautiful_radarchart(
data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
color = colors[i], title = titles[i],
seg=4)
}
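To view the four regional radar charts side by side rather than one at a time, the loop can be wrapped in a 2x2 base-graphics layout; a minimal sketch using par():
op <- par(mfrow = c(2, 2), mar = c(1, 1, 2, 1))  # 2x2 grid with tighter margins
for(i in 1:4){
  create_beautiful_radarchart(
    data = rbind(range, region_on_weight[i,]), caxislabels = c(0,5000,10000,15000,20000),
    color = colors[i], title = titles[i], seg = 4)
}
par(op)  # restore the previous graphics settings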
PC %>% select(sex, smoker) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 7.3929, df = 1, p-value = 0.006548
PC %>% select(sex, smoker) %>%
  table() %>% psych::Yule() # Yule's Q is about 0.19: a weak association
## [1] 0.1879285
PC %>% select(sex, region) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 0.43514, df = 3, p-value = 0.9329
PC %>% select(smoker, region) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 7.3435, df = 3, p-value = 0.06172
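Yule's Q only applies to 2x2 tables; for the 2x4 tables above, an effect size such as Cramér's V could be reported instead. A minimal sketch with a hand-rolled helper (a hypothetical function, not from the original analysis):
cramers_v <- function(tab) {
  chi2 <- suppressWarnings(chisq.test(tab)$statistic)  # Pearson chi-squared statistic
  n <- sum(tab)
  k <- min(dim(tab))
  unname(sqrt(chi2 / (n * (k - 1))))
}
cramers_v(table(PC$smoker, PC$region))  # roughly 0.07 given the X-squared of 7.34 above: a weak association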
b1 <- PC %>%
  ggplot(aes(x=bmi, y=charges, color=weight_condition)) + geom_point() +
  annotate("segment", x = 25, y = 58000, xend = 29, yend = 52000,
           arrow = arrow(length = unit(0.35, "cm")), color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Cluster?")
b2 <- PC %>%
  ggplot(aes(x=bmi, y=charges, color=smoker)) + geom_point() +
  scale_color_manual(values=c("dodgerblue", "tomato2")) +
  annotate("segment", x = 25, y = 58000, xend = 29, yend = 52000,
           arrow = arrow(length = unit(0.35, "cm")), color = "black") +
  annotate("text", x=22, y=62000, label= "Obese Smokers") +
  annotate("segment", x = 22, y = 39000, xend = 23, yend = 32000,
           arrow = arrow(length = unit(0.35, "cm")), color = "black") +
  annotate("text", x=22, y=45000, label= "Impact of smoking on\ncharges for other\nweight conditions")
combined <- b1 + b2 & theme(legend.position = "bottom")
combined
PC <- PC %>% select(-age_category, -weight_condition)
# separating training and testing (set a seed beforehand for a reproducible split)
train_rows <- sample(1:nrow(PC), 0.75*nrow(PC))
# for Ridge & Lasso: model.matrix builds the dummy-coded predictor matrix (dropping the intercept column)
variables <- model.matrix(charges ~ ., PC)[, -1]
PCTarget <- PC$charges
x_train <- variables[train_rows,]
x_test <- variables[-train_rows,]
y_train <- PCTarget[train_rows]
y_test <- PCTarget[-train_rows]
# for all other models
PC_Train <- PC[train_rows,]
PC_Test <- PC[-train_rows,]
# Modeling and performing stepwise reduction.
firstMod <- lm(charges ~ . , data=PC_Train)
stepfirst <- step(firstMod, direction="backward", trace=FALSE)
reducedMod <- lm(as.formula(stepfirst), data=PC_Train)
nobs(firstMod) # Once again making sure there were no NA values by counting the number of observations the model used.
## [1] 1003
nobs(reducedMod)
## [1] 1003
# The likelihood-ratio test below has a p-value greater than 0.05, so there is no significant difference between the full and reduced models; we therefore discard the first model and keep the one with fewer variables.
anova(firstMod, reducedMod, test = "LRT")
## Analysis of Variance Table
##
## Model 1: charges ~ age + sex + bmi + children + smoker + region
## Model 2: charges ~ age + bmi + children + smoker
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 994 3.6112e+10
## 2 998 3.6235e+10 -4 -123063928 0.4952
# Comparing the AIC values for each model, the reduced model also has a lower AIC (by roughly 5 points), further supporting the simpler model.
stats::extractAIC(firstMod)
## [1] 9.00 17469.33
stats::extractAIC(reducedMod)
## [1] 5.00 17464.74
coef(reducedMod)
## (Intercept) age bmi children smokeryes
## -12857.9497 263.9421 338.1761 398.3956 23813.0511
# Region & Sex as variables were not used.
PC_Test <- PC_Test %>%
mutate(LinearPredict=predict(reducedMod, newdata=PC_Test, type="response"))
summary(reducedMod)
##
## Call:
## lm(formula = as.formula(stepfirst), data = PC_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11141.1 -2868.3 -867.3 1514.7 29493.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12857.95 1090.19 -11.794 <2e-16 ***
## age 263.94 13.69 19.282 <2e-16 ***
## bmi 338.18 31.74 10.656 <2e-16 ***
## children 398.40 157.09 2.536 0.0114 *
## smokeryes 23813.05 469.34 50.738 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6026 on 998 degrees of freedom
## Multiple R-squared: 0.7572, Adjusted R-squared: 0.7562
## F-statistic: 777.9 on 4 and 998 DF, p-value: < 2.2e-16
library(caret)
train.control <- trainControl(method = "LOOCV")
model <- train(charges ~., data=PC_Train, method = "lm",
trControl = train.control)
# a second LOOCV model using only the features kept by the stepwise-reduced linear regression above
model2 <- train(charges ~age+bmi+children+smoker, data=PC_Train, method = "lm",
trControl = train.control)
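The LOOCV performance of the two fits can be read straight off the train objects; a minimal sketch using caret's standard $results slot (resampled RMSE and R-squared):
model$results   # LOOCV metrics for the full-feature model
model2$results  # LOOCV metrics for the reduced-feature model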
PC_Test <- PC_Test %>%
  mutate(LinearPredict2=predict(model, newdata=PC_Test, type="raw"),
         LinearPredict3=predict(model2, newdata=PC_Test, type="raw"))
# Cook's distance plot to flag influential observations
plot(lm(charges ~ ., data=PC), 4)
PC[c(544,578,1301),]
## age sex bmi children smoker region charges
## 544 54 female 47.410 0 yes southeast 63770.43
## 578 31 female 38.095 1 yes northeast 58571.07
## 1301 45 male 30.360 0 yes southeast 62592.87
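As a quick robustness sketch (not part of the original modeling), the full-data model used for the Cook's distance plot can be refit without these three high-Cook's-distance rows to see whether the coefficients move materially:
influential <- c(544, 578, 1301)
refit <- lm(charges ~ ., data = PC[-influential, ])
# compare coefficients with and without the flagged observations
round(cbind(full_data = coef(lm(charges ~ ., data = PC)),
            without_influential = coef(refit)), 2)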
The following loop uses 10-fold cross-validation to determine the optimal value of lambda for each alpha in 0, 0.1, …, 0.9, 1.0, using the training dataset.
# Note: cv.glmnet doesn't accept formula notation the way lm() or glm() do; it needs a numeric predictor matrix.
# alpha = 0: ridge regression
# alpha = 1: lasso regression
# creating an empty list to hold each model
list.of.fits <- list()
for (i in 0:10) {
fit.name <- paste0("alpha", i/10)
list.of.fits[[fit.name]] <-
cv.glmnet(x_train, y_train, type.measure="mse", alpha=i/10, family="gaussian")
}
results <- data.frame()
for (i in 0:10){
fit.name <- paste0("alpha", i/10)
# Use each model to predict the target variable given the testing data
predicted <-
predict(list.of.fits[[fit.name]],
s=list.of.fits[[fit.name]]$lambda.min, newx=x_test)
mse <- mean((y_test - predicted)^2)
# store the results
temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
results <- rbind(results, temp)
}
# The test MSEs are very close across alpha values (the minimum shown is at alpha = 0.1); we proceed with alpha = 1 (Lasso) as the final penalized model.
results
## alpha mse fit.name
## 1 0.0 38569098 alpha0
## 2 0.1 38272284 alpha0.1
## 3 0.2 38288424 alpha0.2
## 4 0.3 38299126 alpha0.3
## 5 0.4 38317495 alpha0.4
## 6 0.5 38383874 alpha0.5
## 7 0.6 38315401 alpha0.6
## 8 0.7 38316885 alpha0.7
## 9 0.8 38347918 alpha0.8
## 10 0.9 38402702 alpha0.9
## 11 1.0 38434523 alpha1
### Final Lasso Model
LassoMod <- cv.glmnet(x_train, y_train, type.measure="mse",
alpha=1.0, family="gaussian")
#lambda for min MSE
bestLam <- LassoMod$lambda.min
LassoPredict <- predict(LassoMod, s=bestLam, newx=x_test)
PC_Test$LassoPredict <- as.vector(LassoPredict)
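The cross-validation curve behind this choice can also be inspected; a minimal sketch using glmnet's built-in plot method for cv.glmnet objects:
plot(LassoMod)   # CV MSE versus log(lambda), with lambda.min and lambda.1se marked
log(bestLam)     # position of the selected lambda on the plot's log scale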
coef(LassoMod)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -4592.7469
## age 198.7064
## sexmale .
## bmi 184.1072
## children .
## smokeryes 21258.3546
## regionnorthwest .
## regionsoutheast .
## regionsouthwest .
# At the printed lambda, Lasso kept only age, bmi, and smoker; the sex, children, and region coefficients were shrunk to zero.
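Note that coef() on a cv.glmnet object defaults to s = "lambda.1se", so the sparse coefficients printed above are not necessarily the ones used for LassoPredict (which used lambda.min). A minimal sketch to view the coefficients at the lambda actually used:
coef(LassoMod, s = "lambda.min")  # may be less sparse than the lambda.1se coefficients above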
# Random forest with 1000 trees; proximity = TRUE stores the proximity matrix
RFMod <- randomForest(charges ~ ., data = PC_Train, ntree = 1000, proximity = TRUE)
varImpPlot(RFMod)
PC_Test <- PC_Test %>%
mutate(RFPredict=predict(RFMod, newdata=PC_Test, type="response"))
LinearMSE <- mean((PC_Test$charges - PC_Test$LinearPredict)^2)
LinearMSE2 <- mean((PC_Test$charges - PC_Test$LinearPredict2)^2)
LinearMSE3 <- mean((PC_Test$charges - PC_Test$LinearPredict3)^2)
LassoMSE <- mean((PC_Test$charges - PC_Test$LassoPredict)^2)
rfMSE <- mean((PC_Test$charges - PC_Test$RFPredict)^2)
# Finding the lowest MSE among the predictions for each model
LinearMSE < LinearMSE2
## [1] FALSE
LinearMSE2 < LinearMSE3
## [1] FALSE
LinearMSE3 < LassoMSE
## [1] TRUE
LinearMSE3 < rfMSE
## [1] FALSE
# Random Forest has lowest MSE
rfMSE < LinearMSE3
## [1] TRUE
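The pairwise checks above can be collapsed into a single sorted table; a minimal sketch using the MSE objects already computed (the model labels are descriptive names added here, not from the original):
tibble::tibble(
  model = c("Stepwise LM", "LOOCV LM (full)", "LOOCV LM (reduced)", "Lasso", "Random Forest"),
  mse   = c(LinearMSE, LinearMSE2, LinearMSE3, LassoMSE, rfMSE)
) %>%
  arrange(mse)  # smallest test MSE first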
head(PC_Test)
## age sex bmi children smoker region charges LinearPredict
## 1 19 female 27.900 0 yes southwest 16884.924 25405.1133
## 3 28 male 33.000 3 no southeast 4449.462 6887.4256
## 4 33 male 22.705 0 no northwest 21984.471 3530.4262
## 9 37 male 29.830 2 no northeast 6406.411 7792.4903
## 13 23 male 34.400 0 no southwest 1826.843 4845.9750
## 16 19 male 24.600 1 no southwest 1837.237 874.4767
## LinearPredict2 LinearPredict3 LassoPredict RFPredict
## 1 25412.8202 25412.8202 25456.7669 19166.367
## 3 6357.1132 6357.1132 6453.3634 5729.687
## 4 3609.5358 3609.5358 3758.8221 5243.550
## 9 8067.6870 8067.6870 7999.7201 8451.910
## 13 4767.9462 4767.9462 4942.2303 4626.161
## 16 628.3802 628.3802 876.1544 3410.832
Random Forest regression performs best on this data, with the lowest test MSE.
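To put that error in the units of the target, a minimal sketch comparing the winning model's RMSE to the spread of charges in the test set:
sqrt(rfMSE)          # test RMSE of the random forest, in dollars
sd(PC_Test$charges)  # standard deviation of test-set charges, for scale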