library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(flexdashboard)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.3     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(ggthemes)
library(ggthemes)
library(lessR)
## 
## lessR 4.2.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter:  browseVignettes("lessR")
## 
## View changes in this and recent versions of lessR
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## 
## Attaching package: 'lessR'
## 
## The following objects are masked from 'package:dplyr':
## 
##     recode, rename
library(dplyr)          
library(ggplot2)        
library(caret)          
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(DataExplorer)    
library(car)            
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following objects are masked from 'package:lessR':
## 
##     bc, recode, sp
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggcorrplot)    
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(dplyr)
library(GGally)
library(ggcorrplot)
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## 
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## 
## The following object is masked from 'package:base':
## 
##     Recall
library(performance)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
## 
## Attaching package: 'highcharter'
## 
## The following object is masked from 'package:lmtest':
## 
##     unemployment
library(RColorBrewer)
library(DT)
library(moments)
## 
## Attaching package: 'moments'
## 
## The following object is masked from 'package:lessR':
## 
##     kurtosis
library(see)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:MLmetrics':
## 
##     AUC
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following objects are masked from 'package:lessR':
## 
##     reflect, rescale, scree, skew
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:lessR':
## 
##     label, Merge
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(corrplot)
## corrplot 0.92 loaded
library(ROCR) 
## 
## Attaching package: 'ROCR'
## 
## The following object is masked from 'package:performance':
## 
##     performance
library(broom)
library(WVPlots)
## Loading required package: wrapr
## 
## Attaching package: 'wrapr'
## 
## The following object is masked from 'package:car':
## 
##     bc
## 
## The following objects are masked from 'package:lessR':
## 
##     bc, to
## 
## The following object is masked from 'package:tibble':
## 
##     view
## 
## The following objects are masked from 'package:tidyr':
## 
##     pack, unpack
## 
## The following object is masked from 'package:dplyr':
## 
##     coalesce
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Problem Statement: Predicting Insurance Charges

Insurance premiums are determined by various factors, impacting the coverage provided by insurance companies. As a data scientist, you have access to historical data concerning an organization’s customers and the corresponding insurance charges incurred. For our analysis we decided to work with data from kaggle.com EDA for Insurance Charges Prediction.

The dataset comprises user information such as age, sex, BMI, hospitalization history, annual income, etc. The target variable, health insurance charges, is quantifies based on the aforementioned characteristics. Our task is to thoroughly analyze this data, extract valuable insights, and develop a robust linear regression model.

The objective is to create a model capable of accurately predicting insurance charges for a new set of data. By leveraging the provided features, the model will help anticipate insurance costs for prospective customers, aiding in pricing strategies and risk assessment.

Key Words

Gender: Indicates the gender of the insured individual, distinguishing between male and female categories.

BMI (Body Mass Index): A numerical measure derived from an individual's weight and height, serving as an indicator of body fatness.

Age: Represents the age of the insured individual, a significant factor influencing health insurance charges.

Number of Children: Indicates the number of children covered under the insurance plan, reflecting family size and potential healthcare needs.

Smoking Status: Categorizes individuals based on their smoking habits, distinguishing between smokers and non-smokers.

Region: Specifies the geographical region where the insured individual resides, capturing regional variations in healthcare costs and accessibility.

Health Insurance Charges: The target variable, quantifying the charges associated with health insurance coverage, influenced by the individual's characteristics.
and some other columns too .

Access Data

Insurance data is available at: https://www.kaggle.com/datasets/zulqarnainalipk/insurace-data-with-10000-records

By employing Linear Regression, we aim to accurately predict insurance charges based on relevant factors. This predictive model can assist insurance companies in setting premiums, assessing risk, and making informed decisions.

Insurance <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA621/main/insurance_data.csv")
str(Insurance)
## 'data.frame':    10008 obs. of  13 variables:
##  $ age                            : num  45 64 19 36 19 34 47 63 19 20 ...
##  $ sex                            : chr  "male" "male" "female" "male" ...
##  $ bmi                            : num  28.7 34.5 32.1 28.9 24.6 ...
##  $ children                       : num  2 0 0 3 1 0 1 0 1 0 ...
##  $ smoker                         : chr  "no" "no" "no" "no" ...
##  $ Claim_Amount                   : num  32994 38448 50778 33741 12198 ...
##  $ past_consultations             : num  16 8 15 10 16 5 23 7 3 10 ...
##  $ num_of_steps                   : num  902022 956604 758688 879560 793026 ...
##  $ Hospital_expenditure           : num  8640895 11022389 1642626 1985637 10009377 ...
##  $ NUmber_of_past_hospitalizations: num  1 1 0 1 1 1 1 1 0 0 ...
##  $ Anual_Salary                   : num  94365914 230021899 46443495 130616936 61133917 ...
##  $ region                         : chr  "southwest" "southwest" "northwest" "northeast" ...
##  $ charges                        : num  8028 13823 2131 6749 2709 ...
summary(Insurance)
##       age         sex                 bmi           children    
##  Min.   :18   Length:10008       Min.   :15.96   Min.   :0.000  
##  1st Qu.:26   Class :character   1st Qu.:26.12   1st Qu.:0.000  
##  Median :39   Mode  :character   Median :30.03   Median :1.000  
##  Mean   :39                      Mean   :30.44   Mean   :1.063  
##  3rd Qu.:51                      3rd Qu.:34.32   3rd Qu.:2.000  
##  Max.   :64                      Max.   :53.13   Max.   :5.000  
##  NA's   :73                      NA's   :24      NA's   :45     
##     smoker           Claim_Amount   past_consultations  num_of_steps    
##  Length:10008       Min.   : 1920   Min.   : 1.00      Min.   : 695430  
##  Class :character   1st Qu.:19541   1st Qu.: 9.00      1st Qu.: 841902  
##  Mode  :character   Median :32800   Median :14.00      Median : 903769  
##                     Mean   :32221   Mean   :14.42      Mean   : 896863  
##                     3rd Qu.:44148   3rd Qu.:19.00      3rd Qu.: 949238  
##                     Max.   :77278   Max.   :40.00      Max.   :1107872  
##                     NA's   :99      NA's   :51         NA's   :24       
##  Hospital_expenditure NUmber_of_past_hospitalizations  Anual_Salary       
##  Min.   :    29453    Min.   :0.0000                  Min.   :   2747072  
##  1st Qu.:  3776053    1st Qu.:1.0000                  1st Qu.:  73542243  
##  Median :  6894887    Median :1.0000                  Median : 125901988  
##  Mean   : 11198901    Mean   :0.9725                  Mean   : 263017112  
##  3rd Qu.:  9721549    3rd Qu.:1.0000                  3rd Qu.: 224741452  
##  Max.   :261631699    Max.   :3.0000                  Max.   :4117196637  
##  NA's   :30           NA's   :10                      NA's   :37          
##     region             charges     
##  Length:10008       Min.   : 1122  
##  Class :character   1st Qu.: 4441  
##  Mode  :character   Median : 8550  
##                     Mean   :11025  
##                     3rd Qu.:13224  
##                     Max.   :63770  
## 
introduce(Insurance)
##    rows columns discrete_columns continuous_columns all_missing_columns
## 1 10008      13                3                 10                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                  393          9618             130104      1044952

DATA OBSERVATION

plot_missing(Insurance)

Several variables: ‘age’, ‘bmi’, ‘children’, ‘Claim_Amount’, ‘past_consultations’, ‘num_of_steps’, ‘Hospital_expenditure’, ‘NUmber_of_past_hospitalizations’, ‘Anual_Salary’ have missing values which will need to be imputed.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Description of categorical variables

categ_cols <- Insurance %>% select_if(~ class(.) == "factor")
for (col in names(categ_cols)) {
  t <- Insurance %>%
    group_by_(col) %>%
    summarise(count = n()) %>%
    mutate(frequency = paste0(round(100 * count / sum(count), 0), "%")) %>% 
    knitr::kable("html", align = "lcc") %>%
    kableExtra::kable_styling(full_width = F, position = "left") %>% 
    print()
}
categ_cols
## data frame with 0 columns and 10008 rows

2. DATA PREPARATION

Change data types of the following variables: sex, smoker and region

Insurance[ , c("sex", "smoker", "region")] <- 
  lapply(Insurance[ , c("sex", "smoker", "region")], as.factor)
str(Insurance)
## 'data.frame':    10008 obs. of  13 variables:
##  $ age                            : num  45 64 19 36 19 34 47 63 19 20 ...
##  $ sex                            : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 1 1 ...
##  $ bmi                            : num  28.7 34.5 32.1 28.9 24.6 ...
##  $ children                       : num  2 0 0 3 1 0 1 0 1 0 ...
##  $ smoker                         : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Claim_Amount                   : num  32994 38448 50778 33741 12198 ...
##  $ past_consultations             : num  16 8 15 10 16 5 23 7 3 10 ...
##  $ num_of_steps                   : num  902022 956604 758688 879560 793026 ...
##  $ Hospital_expenditure           : num  8640895 11022389 1642626 1985637 10009377 ...
##  $ NUmber_of_past_hospitalizations: num  1 1 0 1 1 1 1 1 0 0 ...
##  $ Anual_Salary                   : num  94365914 230021899 46443495 130616936 61133917 ...
##  $ region                         : Factor w/ 4 levels "northeast","northwest",..: 4 4 2 1 2 3 4 2 4 3 ...
##  $ charges                        : num  8028 13823 2131 6749 2709 ...

Data Imputation

Replace NA values with median of the column;

## [1] "Count of total missing values in after imputation "
## [1] 0

Data Visualization

How many insured individuals belong to each age group (e.g., 10-20, 21-30, 31-40, etc.)?

age_group <- cut(clean$age, 
                  breaks = seq(10, 100, by = 10), 
                  include.lowest = TRUE)

InsuranceData <- cbind(clean, age_group)
group<-InsuranceData%>% 
  count(age_group)
group %>% 
  hchart('column', hcaes(x = 'age_group', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))%>%
    hc_plotOptions(series = list(
        borderWidth= 0,
        dataLabels = list(
            enabled = TRUE,
            color = "black",
            format = '{point.y}')))
sm<-Insurance %>% 
  count(smoker) 
sm %>% 
  hchart('pie', hcaes(x = 'smoker', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))

Observation: there are more people who don’t smoke than who smokes

Put BMI into categories and explore how many individuals in each group.

BMI<-clean  %>% mutate(bmi_cat = cut(bmi,
  breaks = c(0, 18.5, 25, 30, 60),
  labels = c("Under Weight", "Normal Weight", "Overweight", "Obese")
))
BMI %>% 
  count(bmi_cat) %>% 
  hchart('bar', hcaes(x = 'bmi_cat', y = 'n'))%>% hc_chart(type = "bar", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>%
    hc_plotOptions(series = list(
        borderWidth= 0,
        dataLabels = list(
            enabled = TRUE,
            color = "black",
            format = '{point.y}')))

Looking into relationship between Target and Independent variables

#Correlation table
library(corrr)
cor<-clean %>% 
  correlate() %>% 
  focus(charges) %>%arrange(desc(charges))
## Non-numeric variables removed from input: `sex`, `smoker`, and `region`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
knitr::kable(
 cor, caption = "Correlation Table")%>%
  kable_styling("striped", full_width = F)
Correlation Table
term charges
Anual_Salary 0.9284001
num_of_steps 0.8758246
Hospital_expenditure 0.8315979
NUmber_of_past_hospitalizations 0.7621982
past_consultations 0.5392697
Claim_Amount 0.3502274
age 0.3216199
bmi 0.1274681
children 0.0772186

From the correlation table above we can see strong and weak influences on insurance charges decisions.

Strong Influences:

Weaker Influences:

Many insurance companies can factor in tobacco use in order to increase health insurance rates for smokers. The practice of charging tobacco users more is called tobacco rating. The ACA allows for insurance companies to charge smokers up to 50% more (or premiums that are 1.5 times higher) than non-smokers through a tobacco surcharge. Below is the relationship between the cost of health insurance for smokers vs. nonsmokers.

clean%>%
  hchart(type = "column", hcaes(x = smoker, y = charges))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='The cost of health insurance for smokers vs. nonsmokers')

Insurance companies use age to figure out how likely you are to need medical care. Older people are more likely to have health problems, which means the insurance company has to pay more for their medical bills. This increases the cost for the insurance company, so it charges a higher rate for health insurance to compensate. Below is the relationship of Health Insurance Charges by Age.

clean%>%
  hchart(type = "scatter", hcaes(x = age, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Health Insurance Charges by Age')

In the context of insurance charges, BMI can be a relevant factor used by insurance companies to assess risk. Individuals with higher BMIs may be more prone to certain health conditions, leading to potentially higher healthcare costs and, consequently, higher insurance charges. Below is the relationship between the cost of health insurance and BMI.

clean%>%
  hchart(type = "scatter", hcaes(x = bmi, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='BMI and Health Insurance Charges')
clean%>%
  hchart(type = "scatter", hcaes(x = children, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Amount of children and Health Insurance Charges')

Larger family size does not necessarily mean higher medical expenses. A family of six people may incur lesser medical costs than a family of two, depending on the individual’s current health condition. It means a healthy family of six people may utilize its health insurance only once during a year, whereas, a family of two people with existing health issues may need to use their health insurance multiple times.

Building Model

Splitting the data set

Split data into 80% training data and 20% testing data

RNGkind(sample.kind = "Rounding")
set.seed(100) 
index_ins <- sample(x = nrow(clean) , size = nrow(clean)*0.8) 
data_train <- clean[index_ins , ]
data_test <- clean[-index_ins, ]
cat(" Size of data train:",dim(data_train),"\n","Size of data test :",dim(data_test))
##  Size of data train: 8006 13 
##  Size of data test : 2002 13

APPENDIX

All code for this report

library(dplyr)
library(tidyr)
library(ggplot2)
library(flexdashboard)
library(tidyverse)
library(gapminder)
library(ggthemes)
library(ggthemes)
library(lessR)
library(dplyr)          
library(ggplot2)        
library(caret)          
library(DataExplorer)    
library(car)            
library(ggcorrplot)    
library(GGally)
library(kableExtra)
library(dplyr)
library(GGally)
library(ggcorrplot)
library(MLmetrics)
library(performance)
library(lmtest)
library(highcharter)
library(RColorBrewer)
library(DT)
library(moments)
library(see)
library(MASS)
library(psych)
library(Hmisc)
library(corrplot)
library(ROCR) 
library(broom)
library(WVPlots)
library(randomForest)
library(pROC)
Insurance <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA621/main/insurance_data.csv")

str(Insurance)
summary(Insurance)
introduce(Insurance)
plot_intro(Insurance, 
           ggtheme = theme_minimal(),
  title = "Missing Data",
  theme_config = theme(plot.title = element_text(color = "orange")),
  geom_label_args = c(hjust = "inward")
)
plot_missing(Insurance)
DataExplorer::plot_bar(
  data = Insurance,
         order_bar = T,
         ggtheme=theme_bw())
         
DataExplorer::plot_histogram(
  geom_histogram_args = list(alpha = 0.5),
   data = Insurance,
         ggtheme=theme_bw())


categ_cols <- Insurance %>% select_if(~ class(.) == "factor")
for (col in names(categ_cols)) {
  t <- Insurance %>%
    group_by_(col) %>%
    summarise(count = n()) %>%
    mutate(frequency = paste0(round(100 * count / sum(count), 0), "%")) %>% 
    knitr::kable("html", align = "lcc") %>%
    kableExtra::kable_styling(full_width = F, position = "left") %>% 
    print()
}
categ_cols
Insurance[ , c("sex", "smoker", "region")] <- 
  lapply(Insurance[ , c("sex", "smoker", "region")], as.factor)
str(Insurance)
clean<-Insurance %>% 
   mutate_at(vars(c('age', 'bmi', 'children', 'Claim_Amount', 'past_consultations', 
                   'num_of_steps', 'Hospital_expenditure', 'NUmber_of_past_hospitalizations', 'Anual_Salary')), ~ifelse(is.na(.), median(., na.rm = TRUE), .))
print("Count of total missing values in after imputation ")
sum(is.na(clean))

age_group <- cut(clean$age, 
                  breaks = seq(10, 100, by = 10), 
                  include.lowest = TRUE)

InsuranceData <- cbind(clean, age_group)
group<-InsuranceData%>% 
  count(age_group)
group %>% 
  hchart('column', hcaes(x = 'age_group', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))%>%
    hc_plotOptions(series = list(
        borderWidth= 0,
        dataLabels = list(
            enabled = TRUE,
            color = "black",
            format = '{point.y}')))
sm<-Insurance %>% 
  count(smoker) 

sm %>% 
  hchart('pie', hcaes(x = 'smoker', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))
BMI<-clean  %>% mutate(bmi_cat = cut(bmi,
  breaks = c(0, 18.5, 25, 30, 60),
  labels = c("Under Weight", "Normal Weight", "Overweight", "Obese")
))
BMI %>% 
  count(bmi_cat) %>% 
  hchart('bar', hcaes(x = 'bmi_cat', y = 'n'))%>% hc_chart(type = "bar", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>%
    hc_plotOptions(series = list(
        borderWidth= 0,
        dataLabels = list(
            enabled = TRUE,
            color = "black",
            format = '{point.y}')))
#Correlation table
library(corrr)
cor<-clean %>% 
  correlate() %>% 
  focus(charges) %>%arrange(desc(charges))

knitr::kable(
 cor, caption = "Correlation Table")%>%
  kable_styling("striped", full_width = F)
clean%>%
  hchart(type = "column", hcaes(x = smoker, y = charges))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='The cost of health insurance for smokers vs. nonsmokers')
clean%>%
  hchart(type = "scatter", hcaes(x = age, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Health Insurance Charges by Age')
clean%>%
  hchart(type = "scatter", hcaes(x = bmi, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='BMI and Health Insurance Charges')

clean%>%
  hchart(type = "scatter", hcaes(x = children, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Amount of children and Health Insurance Charges')
RNGkind(sample.kind = "Rounding")
set.seed(100) 
index_ins <- sample(x = nrow(clean) , size = nrow(clean)*0.8) 
data_train <- clean[index_ins , ]
data_test <- clean[-index_ins, ]
cat(" Size of data train:",dim(data_train),"\n","Size of data test :",dim(data_test))

Sources

https://www.healthmarkets.com/resources/health-insurance/smoking-and-health-insurance/ https://www.valuepenguin.com/how-age-affects-health-insurance-costs