HEALTH INSURANCE COST PREDICTION

#Set working directory
setwd("/Users/kiteomoru/Downloads")

#
library(readxl)

## Warning: package 'readxl' was built under R version 4.1.2

library(ggplot2)
library(reshape)

## Warning: package 'reshape' was built under R version 4.1.2

library(psych)

## Warning: package 'psych' was built under R version 4.1.2

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(car)

## Warning: package 'car' was built under R version 4.1.2

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.1.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4

## Warning: package 'tidyr' was built under R version 4.1.2

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()    masks ggplot2::%+%()
## ✖ psych::alpha()  masks ggplot2::alpha()
## ✖ tidyr::expand() masks reshape::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::rename() masks reshape::rename()
## ✖ purrr::some()   masks car::some()

library(skimr)

## Warning: package 'skimr' was built under R version 4.1.2

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(ggpubr)
theme_set(theme_pubr())

#load data #data obtained from Kaggle

insurance_data= read_excel("/Users/kiteomoru/Downloads/insurance.xlsx")
data= as.data.frame(insurance_data)
head(data)

##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622

#data exploration

skim(data)

Data summary
Name	data
Number of rows	1338
Number of columns	7
_______________________
Column type frequency:
character	3
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
sex	1	4	6	2
smoker	1	2	3	2
region	1	9	9	4

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	1	39.21	14.05	18.00	27.00	39.00	51.00	64.00	▇▅▅▆▆
bmi	1	30.66	6.10	15.96	26.30	30.40	34.69	53.13	▂▇▇▂▁
children	1	1.09	1.21	0.00	0.00	1.00	2.00	5.00	▇▂▂▁▁
charges	1	13270.42	12110.01	1121.87	4740.29	9382.03	16639.91	63770.43	▇▂▁▁▁

#data distribution
data %>% tabyl(region, sex, smoker)

## $no
##     region female male
##  northeast    132  125
##  northwest    135  132
##  southeast    139  134
##  southwest    141  126
## 
## $yes
##     region female male
##  northeast     29   38
##  northwest     29   29
##  southeast     36   55
##  southwest     21   37

#bmi to binary variable
data$bmi_cat= ifelse(data$bmi>= 30, 1, 0)
#Gender to binary variable
data$sex= ifelse(data$sex == "male", 1, 0) #male 1, female 0
#correlation analysis
cor_data_m=cor(data[c('age', 'bmi_cat', 'children', 'charges', 'sex')])
#visualize

cor_data_m_melt= melt(cor_data_m)

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

cor_data_m_melt$value= round(cor_data_m_melt$value, digits = 2)
heatmap= ggplot(cor_data_m_melt, aes(X1, X2, fill= value))+
  geom_tile(color = "white", lwd= 1.5, linetype= 1) +
  scale_fill_gradient2(low = "#2C7BB6",
                       mid = "white",
                       high = "#D7191C", breaks=seq(-1,1,0.2), limits= c(-1,1)) +
  coord_fixed() + 
  theme_minimal(base_family="Helvetica")+
  guides(fill = guide_colourbar(barwidth = 0.5,barheight = 20))

heatmap2=heatmap + 
  geom_text(aes(X1, X2, label = value), color = "black", size = 5) +
  theme(axis.ticks=element_blank())+
  theme(axis.text.x=element_text(angle = 45, hjust = 1))
heatmap2

#Age and BMI have the closest correlation with insurance charges

#scatterplot to find relationship among variables

pairs.panels(data[c('age', 'bmi_cat','bmi', 'children', 'charges', 'sex')])

#Statistics #Normality testing

# Two rows, two columns
par(mfrow = c(1, 4))
hist(data$charges, main = "Charges") #data is skewed 
hist(data$bmi,main = "BMI" ) #data is skewed 
hist(data$age, main = "Age") #data is skewed 
hist(data$children, main = "Children") #data is skewed

#visualize
par(mfrow = c(1, 4))
qqPlot(data$charges)

## [1]  544 1301

qqPlot(data$age)

## [1] 63 95

qqPlot(data$bmi)

## [1] 1318 1048

qqPlot(data$children)

## [1] 33 72

#Shapiro-Wilk
shapiro.test(data$age)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$age
## W = 0.9447, p-value < 2.2e-16

shapiro.test(data$charges)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$charges
## W = 0.81469, p-value < 2.2e-16

shapiro.test(data$bmi)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$bmi
## W = 0.99389, p-value = 2.605e-05

shapiro.test(data$children)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$children
## W = 0.82318, p-value < 2.2e-16

#Linear regression model to predict lm() ##prepare data for modelling # create testing and training data –30%test 70%train

traindata=round(0.7 * nrow(data))
train_indices=sample(1:nrow(data), traindata)
Data_train =data[train_indices, ]
Data_test =data[-train_indices, ]

#generate linear model-1

lmformula1=as.formula("charges ~ age + sex + bmi_cat + children + smoker + region")
ins_datamodel1= lm(lmformula1, data = Data_train)
#summarise
summary(ins_datamodel1)

## 
## Call:
## lm(formula = lmformula1, data = Data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13152.4  -3752.5    261.4   1692.9  27438.3 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4338.18     742.32  -5.844 7.05e-09 ***
## age               264.43      14.14  18.703  < 2e-16 ***
## sex              -292.09     398.49  -0.733  0.46374    
## bmi_cat          4747.62     406.56  11.677  < 2e-16 ***
## children          534.78     165.02   3.241  0.00123 ** 
## smokeryes       24001.58     488.75  49.108  < 2e-16 ***
## regionnorthwest  -280.57     563.00  -0.498  0.61836    
## regionsoutheast  -863.55     558.46  -1.546  0.12237    
## regionsouthwest -1371.54     570.98  -2.402  0.01650 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6063 on 928 degrees of freedom
## Multiple R-squared:  0.7601, Adjusted R-squared:  0.7581 
## F-statistic: 367.6 on 8 and 928 DF,  p-value: < 2.2e-16

### pull the regression points and observed data in to one dataset
points <- broom::augment(ins_datamodel1)

#predict data
rsq1 <- summary(ins_datamodel1)$r.squared #r-squared
#predict data on test set
predict1 <- predict(ins_datamodel1, newdata = Data_test)
#calculating the residuals
residuals1 <- Data_test$charges - predict1
#calculating Root Mean Squared Error
rmse1 <- sqrt(mean(residuals1^2))

#generate linear model-2
#eliminating low correlating / insignificant variables, eg, sex and children
lmformula2=as.formula("charges ~ age  + smoker + region")
ins_datamodel2= lm(lmformula2, data = Data_train)
#summarise
summary(ins_datamodel2)

## 
## Call:
## lm(formula = lmformula2, data = Data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15762.4  -2087.1  -1295.0   -204.1  28392.9 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2461.1      742.2  -3.316 0.000949 ***
## age                281.8       15.1  18.664  < 2e-16 ***
## smokeryes        23903.5      523.5  45.662  < 2e-16 ***
## regionnorthwest   -326.3      604.4  -0.540 0.589382    
## regionsoutheast    227.5      591.6   0.385 0.700681    
## regionsouthwest   -751.1      610.8  -1.230 0.219103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6515 on 931 degrees of freedom
## Multiple R-squared:  0.7221, Adjusted R-squared:  0.7207 
## F-statistic: 483.9 on 5 and 931 DF,  p-value: < 2.2e-16

#predict data on test set
#predict data
rsq2 <- summary(ins_datamodel2)$r.squared #r-squared
predict2 <- predict(ins_datamodel2, newdata = Data_test)
#calculating the residuals
residuals2 <- Data_test$charges - predict2
#calculating Root Mean Squared Error
rmse2 <- sqrt(mean(residuals2^2))


#compare both linear models
print(paste0("R-squared for first model:", round(rsq1, 4)))

## [1] "R-squared for first model:0.7601"

print(paste0("R-squared for second model:", round(rsq2, 4)))

## [1] "R-squared for second model:0.7221"

print(paste0("RMSE for first model: ", round(rmse1, 2)))

## [1] "RMSE for first model: 5969.05"

print(paste0("RMSE for second model: ", round(rmse2, 2)))

## [1] "RMSE for second model: 6116.08"

#both models are quite similar. we will proceed with the second model

#visualize the distribution of predicted vs real charges values

Data_test$prediction <- predict(ins_datamodel2, newdata = Data_test)
A=ggplot(Data_test, aes(x = prediction, y = charges)) + 
  geom_point(color = "blue", alpha = 0.7) + 
  geom_abline(color = "red") +
  ggtitle("Predicted values vs. Real values")

#Residuals histogram
Data_test$residuals <- Data_test$charges - Data_test$prediction
B=ggplot(Data_test, aes(x = residuals)) + 
  geom_histogram(bins = 15, fill = "blue") +
  ggtitle("Histogram of residuals")

C= ggplot(data = Data_test, aes(x = prediction, y = residuals)) +
  geom_pointrange(aes(ymin = 0, ymax = residuals), color = "blue", alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = 3, color = "red") +
  ggtitle("Residuals vs. Linear model prediction")

figure <- ggarrange(A,  B, C,
                    labels = c("A", "B", "C"),
                    ncol = 3, nrow = 1)
figure

#Test model

Kite <- data.frame(age = 28,
                   bmi = 22,
                   children = 0,
                   smoker = "no",
                   region = "northwest",
                   bmi_cat=0)
print(paste0("Health care charges for Kite: ", round(predict(ins_datamodel2, Kite), 2)))

## [1] "Health care charges for Kite: 5102.94"

#view data with predictionand residual
head(Data_test)

##    age sex    bmi children smoker    region   charges bmi_cat prediction
## 3   28   1 33.000        3     no southeast  4449.462       1   5656.757
## 20  30   1 35.300        0    yes southwest 36837.467       1  29145.348
## 24  34   0 31.920        1    yes northeast 37701.877       1  31023.601
## 27  63   0 23.085        0     no northeast 14451.835       0  15292.171
## 31  22   1 35.600        0    yes southwest 35585.576       1  26890.969
## 37  62   0 32.965        3     no northwest 15612.193       1  14684.054
##     residuals
## 3  -1207.2954
## 20  7692.1188
## 24  6678.2757
## 27  -840.3358
## 31  8694.6068
## 37   928.1393

Data_train$sex[Data_train$sex  == 0]<- 'F'
Data_train$sex[Data_train$sex  == 1]<- 'M'


# Save model to RDS file
saveRDS(ins_datamodel2, "model.rds")

#Reference:
#https://rpubs.com/SamShariful/health-insurance
#data- Kaggle

HEALTH INSURANCE COST PREDICTION

Okiemute Omoru

2022-08-01