Background Context
- (US Health Insurance Data set): This data set contains: 1338 rows of
insured data, where the Insurance charges are given against the
following attributes of the insured:
- Age: age of primary beneficiary
- Sex: insurance contractor gender, female, male
- BMI: Body mass index, providing an understanding of body, weights
that are relatively high or low relative to height,objective index of
body weight (kg / m ^ 2) using the ratio of height to weight, ideally
18.5 to 24.9
- Number of Children: Number of children covered by health insurance /
Number of dependents
- Smoker: Smoking
- Region: the beneficiary’s residential area in the US, northeast,
southeast, southwest, northwest
- Charges: Individual medical costs billed by health insurance
Importing Libraries
library(dplyr)
library(ggplot2)
library(readr)
library(GGally)
library(reshape2)
library(caret)
library(lmtest)
library(Metrics)
library(polycor)
Importing Dataset
- df <- read.csv(Health_Insurance.csv)
Data overview
## Brief overview of raw data
head(df)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
# Checking the Structure of the dataset
str(df)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
Simple EDA
# Summary Statistics
summary(df)
## age sex bmi children
## Min. :18.00 Length:1337 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.29 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.22 Mean :30.66 Mean :1.096
## 3rd Qu.:51.00 3rd Qu.:34.70 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1337 Length:1337 Min. : 1122
## Class :character Class :character 1st Qu.: 4746
## Mode :character Mode :character Median : 9386
## Mean :13279
## 3rd Qu.:16658
## Max. :63770
# Checking for Outliers
ggplot(df, aes(x = bmi)) +
geom_boxplot(fill = "pink") +
ggtitle("Outliers in BMI")

ggplot(df, aes(x = age)) +
geom_boxplot(fill = "lightblue") +
ggtitle("Outliers in Age")

ggplot(df, aes(x = children)) +
geom_boxplot(fill = "purple") +
ggtitle("Outliers in Children")

ggplot(df, aes(x = charges)) +
geom_boxplot(fill = "violet") +
ggtitle("Outliers in Charges")

## There seems to be outliers in both the (BMI & Charges) columns
## Handling Outliers on data (outside whiskers)
insurance <- df %>%
filter(charges < 21000, bmi < 46)
## Plotting new data frame without (outliers)
boxplot(insurance$age, insurance$bmi, insurance$children, insurance$charges,
names = c("age", "bmi", "children", "charges"),
main = "Box plot of Numerical Columns")

## Plotting Clean Data Frame's (Distribution)
ggplot(insurance, aes(x = charges)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(col = "red", linewidth = 1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Creating a Table for (Charges by Region)
charges_by_region <- aggregate(charges ~ region, data = insurance, FUN = sum)
# Plotting a Vertical Bar Chart for Charges by Region
ggplot(charges_by_region, aes(x = region, y = charges)) +
geom_bar(stat = "identity", fill = "blue") +
labs(x = "Region", y = "Total Charges", title = "Total Charges by Region") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Creating a Horizontal Bar chart for (Smokers by Region)
smokers_by_region <- table(insurance$region, insurance$smoker)
barplot(smokers_by_region, beside = TRUE, legend = TRUE,
args.legend = list(title = "Smoker"), horiz = TRUE,
col = c("grey", "blue"), xlab = "Count", ylab = "Region",
main = "Count of Smokers by Region")

# Creating a pie chart for (Smoker's Distribution)
smoker_counts <- table(insurance$smoker)
pie(smoker_counts, labels = names(smoker_counts),
main = "Distribution of Smokers", col = c("red", "green"))

# Plotting a Violin Chart for (Charges by # of Children)
ggplot(insurance, aes(x = as.factor(children), y = charges)) +
geom_violin(trim = FALSE, fill = "blue", color = "black") +
labs(x = "Number of Children", y = "Charges", title = "Charges by Number of Children")

# Creating a visualization for Distribution of Sex
ggplot(insurance, aes(x = sex)) +
geom_bar(fill = "blue") +
labs(x = "Sex", y = "Count", title = "Distribution of Sex")

# Plotting a Histogram for the Distribution of BMI
hist(insurance$bmi, breaks = 20, main = "Distribution of BMI", xlab = "BMI", col = "red")

# Plotting the Correlation between (Age and Charges) 'Positive'
plot(insurance$age, insurance$charges, xlab = "Age", ylab = "Charges",
main = "Relationship between Age and Charges", col = "blue", pch = 19)

# Visualization for Charges for (non-smokers vs smokers)
boxplot(charges ~ smoker, data = insurance, main = "Charges Distribution for Smokers and Non-Smokers",
xlab = "Smoker", ylab = "Charges", col = c("red", "green"))

# Ultimately smokers are charged ~ 11,000+
Data Wrangling for Statistical Modeling
# One-hot encoding for regions. Assuming the regions are northeast, northwest, southeast, southwest
insurance <- insurance %>%
mutate(region_northeast = as.integer(region == 'northeast'),
region_northwest = as.integer(region == 'northwest'),
region_southeast = as.integer(region == 'southeast'),
region_southwest = as.integer(region == 'southwest'))
# Label encoding for smoker and sex
insurance$smoker_encoded <- ifelse(insurance$smoker == "yes", 1, 0)
insurance$sex_encoded <- ifelse(insurance$sex == "male", 1, 0)
# Dropping the original smoker, sex columns, region
insurance <- insurance %>%
select(-smoker, -sex, -region)
head(insurance)
## age bmi children charges region_northeast region_northwest
## 1 19 27.90 0 16884.924 0 0
## 2 18 33.77 1 1725.552 0 0
## 3 28 33.00 3 4449.462 0 0
## 4 32 28.88 0 3866.855 0 1
## 5 31 25.74 0 3756.622 0 0
## 6 46 33.44 1 8240.590 0 0
## region_southeast region_southwest smoker_encoded sex_encoded
## 1 0 1 1 0
## 2 1 0 0 1
## 3 1 0 0 1
## 4 0 0 0 1
## 5 1 0 0 0
## 6 1 0 0 0
Correlation Analysis in R
# Calculate the correlation matrix
numeric_cols <- insurance %>% select(where(is.numeric))
corr_matrix <- cor(numeric_cols, use = "complete.obs")
# Melting the correlation matrix
melted_corr_matrix <- melt(corr_matrix)
# Plotting with data labels
ggplot(data = melted_corr_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), color = "white", size = 3) + # Data labels
scale_fill_gradient2(low = "cornflowerblue", high = "brown1", mid = "cornsilk3",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
axis.text.y = element_text(size = 12)) +
labs(x = '', y = '', title = 'Correlation Matrix') +
coord_fixed()

## There seems to be a high correlation between: (charges and Smoking)
## (charges and age)
Identifying features based on Correlation
# Calculate the correlation matrix
numeric_cols <- insurance %>% select(where(is.numeric))
corr_matrix <- cor(numeric_cols, use = "complete.obs")
# Set the correlation threshold
threshold <- 0.3
# Find indices where correlation with 'charges' is above the threshold
high_corr_indices <- which(abs(corr_matrix['charges',]) > threshold, arr.ind = TRUE)
# Extract the names of the relevant features, excluding 'charges' itself
relevant_features <- names(high_corr_indices)[names(high_corr_indices) != 'charges']
# Print the relevant features
print(relevant_features)
## [1] "age" "smoker_encoded"
Training a Linear Regression Model
set.seed(2) # For reproducibility
# Determine the Features & Target Variables
X <- insurance %>% select(-charges) # Exclude target variable from features
y <- insurance$charges # Target variable
# Step 2: Split the Dataset to Train & Test (80/20)
list <- createDataPartition(y, p = .8, list = TRUE, times = 1)
X_train <- X[list[[1]], ]
y_train <- y[list[[1]]]
X_test <- X[-list[[1]], ]
y_test <- y[-list[[1]]]
# Step 3: Train the Model using X_train and y_train
model <- lm(y_train ~ ., data = as.data.frame(cbind(X_train, y_train)))
# Step 4: Coefficient Matrix
coefficients <- as.data.frame(coef(model))
colnames(coefficients) <- 'Coefficient'
# Display the coefficients
print(coefficients)
## Coefficient
## (Intercept) -4190.47505
## age 243.37215
## bmi 48.68423
## children 466.28330
## region_northeast 915.30136
## region_northwest 385.97623
## region_southeast 99.74915
## region_southwest NA
## smoker_encoded 13182.71086
## sex_encoded -395.23951
Predicting Test Data
# Step 1: Predicting Test Data
y_pred <- predict(model, newdata = X_test)
# Step 2: Create a Data Frame for Actual vs. Predicted Values
comparison_df <- data.frame(Y_Test = y_test, Y_Pred = y_pred)
# Step 3: Display the First 5 Rows of the Comparison
head(comparison_df)
## Y_Test Y_Pred
## 1 16884.924 14974.597
## 6 8240.590 9198.677
## 8 6406.411 7719.174
## 9 2721.321 3690.391
## 10 1826.843 2686.582
## 11 11090.718 11476.721
Evaluating the Model (Finding the Error ‘MSE’)
# Calculate R-squared
rss <- sum((y_pred - y_test)^2)
tss <- sum((y_test - mean(y_test))^2)
r_squared <- 1 - (rss/tss)
r_squared_rounded <- round(r_squared, 3)
r_squared_rounded
## [1] 0.702
## (~ 70.2%) Accuracy.. Perhaps Polynomial Regression model
# Mean of the target variable (8159)
mean_charges <- mean(insurance$charges)
# Mean Absolute Error (MAE)
mae <- mean(abs(y_pred - y_test))
# Mean Squared Error (MSE)
mse <- mean((y_pred - y_test)^2)
# Root Mean Squared Error (RMSE)
rmse <- sqrt(mse)
# Create a data frame to display the metrics
metrics_df <- data.frame(Metrics = c(MEAN = mean_charges, MAE = mae, MSE = mse, RMSE = rmse))
metrics_df
## Metrics
## MEAN 8159.150
## MAE 1295.889
## MSE 7768856.554
## RMSE 2787.267
# The MAE and RMSE values suggest the linear regression model has a reasonable level of accuracy, however there is room for improvement.
# MAE 1295.889 : the model's average predictions are off by approximately ~1,300 units
# MSE 7768856.554 :
# RMSE 2787.267 : the variability in prediction errors up to roughly 2,800 units
Residuals (Difference between y_test and y_pred)
# Calculate Residuals
test_residuals <- y_test - y_pred
# Scatter Plot of Actual vs. Predicted Values
ggplot(data.frame(Y_Test = y_test, Y_Pred = y_pred), aes(x = Y_Test, y = Y_Pred)) +
geom_point() +
geom_smooth(method = 'lm', color = 'blue', se = FALSE) +
labs(x = 'Y-Test', y = 'Y-Pred') +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter Plot of Actual Values vs. Residuals
ggplot(data.frame(Y_Test = y_test, Residuals = test_residuals), aes(x = Y_Test, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = 'red', linetype = 'dashed') +
labs(x = 'Y-Test', y = 'Residuals') +
theme_minimal()

# Histogram (Density Plot) of Residuals
ggplot(data.frame(Residuals = test_residuals), aes(x = Residuals)) +
geom_histogram(aes(y = ..density..), bins = 25, fill = 'skyblue', color = 'black') +
geom_density(color = 'red', alpha = 0.7) +
labs(x = 'Residuals', y = 'Density') +
theme_minimal()

Saving & Testing the trained Linear Regression Model
# Saving trained linear regression model
test_lrm <- lm(charges ~ ., data = insurance)
saveRDS(test_lrm, 'insurance_lrm.rds')
# Loading the model
load_model <- readRDS('insurance_lrm.rds')
head(insurance)
## age bmi children charges region_northeast region_northwest
## 1 19 27.90 0 16884.924 0 0
## 2 18 33.77 1 1725.552 0 0
## 3 28 33.00 3 4449.462 0 0
## 4 32 28.88 0 3866.855 0 1
## 5 31 25.74 0 3756.622 0 0
## 6 46 33.44 1 8240.590 0 0
## region_southeast region_southwest smoker_encoded sex_encoded
## 1 0 1 1 0
## 2 1 0 0 1
## 3 1 0 0 1
## 4 0 0 0 1
## 5 1 0 0 0
## 6 1 0 0 0
# Predicting sample 3 (age: 28, bmi: 33.00, children: 3, charges: 4449.462, region_southeast: 1, smoker: 0, sex: 1)
insurance_encoded <- data.frame(age = 28, bmi = 33.00, children = 3, region_northeast = 0, region_northwest = 0, region_southeast = 1, region_southwest = 0, smoker_encoded = 0, sex_encoded = 1)
# Note: The names of the variables should exactly match those used in the model.
prediction <- predict(load_model, newdata = insurance_encoded)
print(prediction)
## 1
## 5392.575
### Charges Prediction: 5392.575
### Actual Charges: (4449.462)
Creating a Polynomial Regression
# Selecting numeric predictors and excluding 'charges'
predictors <- insurance %>% select(-charges)
# Adding polynomial features
predictors_poly <- as.data.frame(model.matrix(~ .^2, data = predictors)[,-1])
set.seed(1) # For reproducibility
indexes <- createDataPartition(insurance$charges, p = .8, list = FALSE)
X_train <- predictors_poly[indexes, ]
y_train <- insurance$charges[indexes]
X_test <- predictors_poly[-indexes, ]
y_test <- insurance$charges[-indexes]
model_poly <- lm(y_train ~ ., data = X_train)
# Predictions on the test set
predictions_poly <- predict(model_poly, newdata = X_test)
## Warning in predict.lm(model_poly, newdata = X_test): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Calculate R-squared value
R2_poly <- cor(y_test, predictions_poly)^2
R2_poly
## [1] 0.8019775
The Polynomial Regression has an R-Squared of : (~ 80.1%)
- That’s a (10%) improvement from the linear regression model
before!