Background Context

Importing Libraries

library(dplyr)
library(ggplot2)
library(readr)
library(GGally)
library(reshape2)
library(caret)
library(lmtest)
library(Metrics)
library(polycor)

Importing Dataset

Data overview

## Brief overview of raw data
head(df)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622
# Checking the Structure of the dataset
str(df) 
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Simple EDA

# Summary Statistics
summary(df)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1337        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.29   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.22                      Mean   :30.66   Mean   :1.096  
##  3rd Qu.:51.00                      3rd Qu.:34.70   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1337        Length:1337        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4746  
##  Mode  :character   Mode  :character   Median : 9386  
##                                        Mean   :13279  
##                                        3rd Qu.:16658  
##                                        Max.   :63770
# Checking for Outliers 

ggplot(df, aes(x = bmi)) + 
  geom_boxplot(fill = "pink") + 
  ggtitle("Outliers in BMI")

ggplot(df, aes(x = age)) + 
  geom_boxplot(fill = "lightblue") + 
  ggtitle("Outliers in Age")

ggplot(df, aes(x = children)) + 
  geom_boxplot(fill = "purple") + 
  ggtitle("Outliers in Children")

ggplot(df, aes(x = charges)) + 
  geom_boxplot(fill = "violet") + 
  ggtitle("Outliers in Charges")

## There seems to be outliers in both the (BMI & Charges) columns
## Handling Outliers on data (outside whiskers)

insurance <- df %>%
  filter(charges < 21000, bmi < 46)
## Plotting new data frame without (outliers)
boxplot(insurance$age, insurance$bmi, insurance$children, insurance$charges, 
        names = c("age", "bmi", "children", "charges"), 
        main = "Box plot of Numerical Columns")

## Plotting Clean Data Frame's (Distribution)
ggplot(insurance, aes(x = charges)) + 
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) + 
  geom_density(col = "red", linewidth = 1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Creating a Table for (Charges by Region)
charges_by_region <- aggregate(charges ~ region, data = insurance, FUN = sum)

# Plotting a Vertical Bar Chart for Charges by Region
ggplot(charges_by_region, aes(x = region, y = charges)) + 
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "Region", y = "Total Charges", title = "Total Charges by Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Creating a Horizontal Bar chart for (Smokers by Region)
smokers_by_region <- table(insurance$region, insurance$smoker)

barplot(smokers_by_region, beside = TRUE, legend = TRUE, 
        args.legend = list(title = "Smoker"), horiz = TRUE, 
        col = c("grey", "blue"), xlab = "Count", ylab = "Region", 
        main = "Count of Smokers by Region")

# Creating a pie chart for (Smoker's Distribution)
smoker_counts <- table(insurance$smoker)

pie(smoker_counts, labels = names(smoker_counts), 
    main = "Distribution of Smokers", col = c("red", "green"))

# Plotting a Violin Chart for (Charges by # of Children)
ggplot(insurance, aes(x = as.factor(children), y = charges)) + 
  geom_violin(trim = FALSE, fill = "blue", color = "black") +
  labs(x = "Number of Children", y = "Charges", title = "Charges by Number of Children")

# Creating a visualization for Distribution of Sex
ggplot(insurance, aes(x = sex)) + 
  geom_bar(fill = "blue") +
  labs(x = "Sex", y = "Count", title = "Distribution of Sex")

# Plotting a Histogram for the Distribution of BMI 
hist(insurance$bmi, breaks = 20, main = "Distribution of BMI", xlab = "BMI", col = "red")

# Plotting the Correlation between (Age and Charges) 'Positive'
plot(insurance$age, insurance$charges, xlab = "Age", ylab = "Charges", 
     main = "Relationship between Age and Charges", col = "blue", pch = 19)

# Visualization for Charges for (non-smokers vs smokers)
boxplot(charges ~ smoker, data = insurance, main = "Charges Distribution for Smokers and Non-Smokers",
        xlab = "Smoker", ylab = "Charges", col = c("red", "green"))

# Ultimately smokers are charged ~ 11,000+

Data Wrangling for Statistical Modeling

# One-hot encoding for regions. Assuming the regions are northeast, northwest, southeast, southwest
insurance <- insurance %>%
  mutate(region_northeast = as.integer(region == 'northeast'),
         region_northwest = as.integer(region == 'northwest'),
         region_southeast = as.integer(region == 'southeast'),
         region_southwest = as.integer(region == 'southwest'))

# Label encoding for smoker and sex
insurance$smoker_encoded <- ifelse(insurance$smoker == "yes", 1, 0)
insurance$sex_encoded <- ifelse(insurance$sex == "male", 1, 0)

# Dropping the original smoker, sex columns, region
insurance <- insurance %>%
  select(-smoker, -sex, -region)
head(insurance)
##   age   bmi children   charges region_northeast region_northwest
## 1  19 27.90        0 16884.924                0                0
## 2  18 33.77        1  1725.552                0                0
## 3  28 33.00        3  4449.462                0                0
## 4  32 28.88        0  3866.855                0                1
## 5  31 25.74        0  3756.622                0                0
## 6  46 33.44        1  8240.590                0                0
##   region_southeast region_southwest smoker_encoded sex_encoded
## 1                0                1              1           0
## 2                1                0              0           1
## 3                1                0              0           1
## 4                0                0              0           1
## 5                1                0              0           0
## 6                1                0              0           0

Correlation Analysis in R

# Calculate the correlation matrix
numeric_cols <- insurance %>% select(where(is.numeric))
corr_matrix <- cor(numeric_cols, use = "complete.obs")

# Melting the correlation matrix
melted_corr_matrix <- melt(corr_matrix)

# Plotting with data labels
ggplot(data = melted_corr_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white", size = 3) + # Data labels 
  scale_fill_gradient2(low = "cornflowerblue", high = "brown1", mid = "cornsilk3", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12)) +
  labs(x = '', y = '', title = 'Correlation Matrix') +
  coord_fixed()

## There seems to be a high correlation between: (charges and Smoking)
##                                               (charges and age)

Identifying features based on Correlation

# Calculate the correlation matrix
numeric_cols <- insurance %>% select(where(is.numeric))
corr_matrix <- cor(numeric_cols, use = "complete.obs")

# Set the correlation threshold
threshold <- 0.3

# Find indices where correlation with 'charges' is above the threshold
high_corr_indices <- which(abs(corr_matrix['charges',]) > threshold, arr.ind = TRUE)

# Extract the names of the relevant features, excluding 'charges' itself
relevant_features <- names(high_corr_indices)[names(high_corr_indices) != 'charges']

# Print the relevant features
print(relevant_features)
## [1] "age"            "smoker_encoded"

Training a Linear Regression Model

set.seed(2) # For reproducibility

# Determine the Features & Target Variables
X <- insurance %>% select(-charges) # Exclude target variable from features
y <- insurance$charges # Target variable

# Step 2: Split the Dataset to Train & Test (80/20)
list <- createDataPartition(y, p = .8, list = TRUE, times = 1)
X_train <- X[list[[1]], ]
y_train <- y[list[[1]]]
X_test <- X[-list[[1]], ]
y_test <- y[-list[[1]]]

# Step 3: Train the Model using X_train and y_train
model <- lm(y_train ~ ., data = as.data.frame(cbind(X_train, y_train)))

# Step 4: Coefficient Matrix
coefficients <- as.data.frame(coef(model))
colnames(coefficients) <- 'Coefficient'

# Display the coefficients
print(coefficients)
##                  Coefficient
## (Intercept)      -4190.47505
## age                243.37215
## bmi                 48.68423
## children           466.28330
## region_northeast   915.30136
## region_northwest   385.97623
## region_southeast    99.74915
## region_southwest          NA
## smoker_encoded   13182.71086
## sex_encoded       -395.23951

Predicting Test Data

# Step 1: Predicting Test Data
y_pred <- predict(model, newdata = X_test)

# Step 2: Create a Data Frame for Actual vs. Predicted Values
comparison_df <- data.frame(Y_Test = y_test, Y_Pred = y_pred)

# Step 3: Display the First 5 Rows of the Comparison
head(comparison_df)
##       Y_Test    Y_Pred
## 1  16884.924 14974.597
## 6   8240.590  9198.677
## 8   6406.411  7719.174
## 9   2721.321  3690.391
## 10  1826.843  2686.582
## 11 11090.718 11476.721

Evaluating the Model (Finding the Error ‘MSE’)

# Calculate R-squared
rss <- sum((y_pred - y_test)^2)
tss <- sum((y_test - mean(y_test))^2)
r_squared <- 1 - (rss/tss)
r_squared_rounded <- round(r_squared, 3)
r_squared_rounded
## [1] 0.702
## (~ 70.2%) Accuracy.. Perhaps Polynomial Regression model 
# Mean of the target variable (8159)
mean_charges <- mean(insurance$charges)

# Mean Absolute Error (MAE)
mae <- mean(abs(y_pred - y_test))

# Mean Squared Error (MSE)
mse <- mean((y_pred - y_test)^2)

# Root Mean Squared Error (RMSE)
rmse <- sqrt(mse)

# Create a data frame to display the metrics
metrics_df <- data.frame(Metrics = c(MEAN = mean_charges, MAE = mae, MSE = mse, RMSE = rmse))
metrics_df
##          Metrics
## MEAN    8159.150
## MAE     1295.889
## MSE  7768856.554
## RMSE    2787.267
# The MAE and RMSE values suggest the linear regression model has a reasonable level of accuracy, however there is room for improvement.
# MAE   1295.889      : the model's average predictions are off by approximately ~1,300 units   
# MSE   7768856.554 :           
# RMSE  2787.267  : the variability in prediction errors up to roughly 2,800 units

Residuals (Difference between y_test and y_pred)

# Calculate Residuals
test_residuals <- y_test - y_pred

# Scatter Plot of Actual vs. Predicted Values
ggplot(data.frame(Y_Test = y_test, Y_Pred = y_pred), aes(x = Y_Test, y = Y_Pred)) +
  geom_point() +
  geom_smooth(method = 'lm', color = 'blue', se = FALSE) +
  labs(x = 'Y-Test', y = 'Y-Pred') +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter Plot of Actual Values vs. Residuals
ggplot(data.frame(Y_Test = y_test, Residuals = test_residuals), aes(x = Y_Test, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = 'red', linetype = 'dashed') +
  labs(x = 'Y-Test', y = 'Residuals') +
  theme_minimal()

# Histogram (Density Plot) of Residuals
ggplot(data.frame(Residuals = test_residuals), aes(x = Residuals)) +
  geom_histogram(aes(y = ..density..), bins = 25, fill = 'skyblue', color = 'black') +
  geom_density(color = 'red', alpha = 0.7) +
  labs(x = 'Residuals', y = 'Density') +
  theme_minimal()

Saving & Testing the trained Linear Regression Model

# Saving trained linear regression model 
test_lrm <- lm(charges ~ ., data = insurance)
saveRDS(test_lrm, 'insurance_lrm.rds')
# Loading the model
load_model <- readRDS('insurance_lrm.rds')
head(insurance)
##   age   bmi children   charges region_northeast region_northwest
## 1  19 27.90        0 16884.924                0                0
## 2  18 33.77        1  1725.552                0                0
## 3  28 33.00        3  4449.462                0                0
## 4  32 28.88        0  3866.855                0                1
## 5  31 25.74        0  3756.622                0                0
## 6  46 33.44        1  8240.590                0                0
##   region_southeast region_southwest smoker_encoded sex_encoded
## 1                0                1              1           0
## 2                1                0              0           1
## 3                1                0              0           1
## 4                0                0              0           1
## 5                1                0              0           0
## 6                1                0              0           0
# Predicting sample 3 (age: 28, bmi: 33.00, children: 3, charges: 4449.462, region_southeast: 1, smoker: 0, sex: 1)

insurance_encoded <- data.frame(age = 28, bmi = 33.00, children = 3, region_northeast = 0, region_northwest = 0, region_southeast = 1, region_southwest = 0, smoker_encoded = 0, sex_encoded = 1)
# Note: The names of the variables should exactly match those used in the model.
prediction <- predict(load_model, newdata = insurance_encoded)
print(prediction)
##        1 
## 5392.575
### Charges Prediction: 5392.575
### Actual Charges: (4449.462)

Creating a Polynomial Regression

# Selecting numeric predictors and excluding 'charges'
predictors <- insurance %>% select(-charges)

# Adding polynomial features
predictors_poly <- as.data.frame(model.matrix(~ .^2, data = predictors)[,-1])
set.seed(1) # For reproducibility
indexes <- createDataPartition(insurance$charges, p = .8, list = FALSE)
X_train <- predictors_poly[indexes, ]
y_train <- insurance$charges[indexes]
X_test <- predictors_poly[-indexes, ]
y_test <- insurance$charges[-indexes]
model_poly <- lm(y_train ~ ., data = X_train)
# Predictions on the test set
predictions_poly <- predict(model_poly, newdata = X_test)
## Warning in predict.lm(model_poly, newdata = X_test): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Calculate R-squared value
R2_poly <- cor(y_test, predictions_poly)^2
R2_poly
## [1] 0.8019775

The Polynomial Regression has an R-Squared of : (~ 80.1%)