#Set working directory
setwd("/Users/kiteomoru/Downloads")
#
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
library(ggplot2)
library(reshape)
## Warning: package 'reshape' was built under R version 4.1.2
library(psych)
## Warning: package 'psych' was built under R version 4.1.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.2
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::expand() masks reshape::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::rename() masks reshape::rename()
## ✖ purrr::some() masks car::some()
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.2
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggpubr)
theme_set(theme_pubr())
#load data #data obtained from Kaggle
insurance_data= read_excel("/Users/kiteomoru/Downloads/insurance.xlsx")
data= as.data.frame(insurance_data)
head(data)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
#data exploration
skim(data)
| Name | data |
| Number of rows | 1338 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| smoker | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| region | 0 | 1 | 9 | 9 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 39.21 | 14.05 | 18.00 | 27.00 | 39.00 | 51.00 | 64.00 | ▇▅▅▆▆ |
| bmi | 0 | 1 | 30.66 | 6.10 | 15.96 | 26.30 | 30.40 | 34.69 | 53.13 | ▂▇▇▂▁ |
| children | 0 | 1 | 1.09 | 1.21 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | ▇▂▂▁▁ |
| charges | 0 | 1 | 13270.42 | 12110.01 | 1121.87 | 4740.29 | 9382.03 | 16639.91 | 63770.43 | ▇▂▁▁▁ |
#data distribution
data %>% tabyl(region, sex, smoker)
## $no
## region female male
## northeast 132 125
## northwest 135 132
## southeast 139 134
## southwest 141 126
##
## $yes
## region female male
## northeast 29 38
## northwest 29 29
## southeast 36 55
## southwest 21 37
#bmi to binary variable
data$bmi_cat= ifelse(data$bmi>= 30, 1, 0)
#Gender to binary variable
data$sex= ifelse(data$sex == "male", 1, 0) #male 1, female 0
#correlation analysis
cor_data_m=cor(data[c('age', 'bmi_cat', 'children', 'charges', 'sex')])
#visualize
cor_data_m_melt= melt(cor_data_m)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
cor_data_m_melt$value= round(cor_data_m_melt$value, digits = 2)
heatmap= ggplot(cor_data_m_melt, aes(X1, X2, fill= value))+
geom_tile(color = "white", lwd= 1.5, linetype= 1) +
scale_fill_gradient2(low = "#2C7BB6",
mid = "white",
high = "#D7191C", breaks=seq(-1,1,0.2), limits= c(-1,1)) +
coord_fixed() +
theme_minimal(base_family="Helvetica")+
guides(fill = guide_colourbar(barwidth = 0.5,barheight = 20))
heatmap2=heatmap +
geom_text(aes(X1, X2, label = value), color = "black", size = 5) +
theme(axis.ticks=element_blank())+
theme(axis.text.x=element_text(angle = 45, hjust = 1))
heatmap2
#Age and BMI have the closest correlation with insurance charges
#scatterplot to find relationship among variables
pairs.panels(data[c('age', 'bmi_cat','bmi', 'children', 'charges', 'sex')])
#Statistics #Normality testing
# Two rows, two columns
par(mfrow = c(1, 4))
hist(data$charges, main = "Charges") #data is skewed
hist(data$bmi,main = "BMI" ) #data is skewed
hist(data$age, main = "Age") #data is skewed
hist(data$children, main = "Children") #data is skewed
#visualize
par(mfrow = c(1, 4))
qqPlot(data$charges)
## [1] 544 1301
qqPlot(data$age)
## [1] 63 95
qqPlot(data$bmi)
## [1] 1318 1048
qqPlot(data$children)
## [1] 33 72
#Shapiro-Wilk
shapiro.test(data$age)
##
## Shapiro-Wilk normality test
##
## data: data$age
## W = 0.9447, p-value < 2.2e-16
shapiro.test(data$charges)
##
## Shapiro-Wilk normality test
##
## data: data$charges
## W = 0.81469, p-value < 2.2e-16
shapiro.test(data$bmi)
##
## Shapiro-Wilk normality test
##
## data: data$bmi
## W = 0.99389, p-value = 2.605e-05
shapiro.test(data$children)
##
## Shapiro-Wilk normality test
##
## data: data$children
## W = 0.82318, p-value < 2.2e-16
#Linear regression model to predict lm() ##prepare data for modelling # create testing and training data –30%test 70%train
traindata=round(0.7 * nrow(data))
train_indices=sample(1:nrow(data), traindata)
Data_train =data[train_indices, ]
Data_test =data[-train_indices, ]
#generate linear model-1
lmformula1=as.formula("charges ~ age + sex + bmi_cat + children + smoker + region")
ins_datamodel1= lm(lmformula1, data = Data_train)
#summarise
summary(ins_datamodel1)
##
## Call:
## lm(formula = lmformula1, data = Data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13152.4 -3752.5 261.4 1692.9 27438.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4338.18 742.32 -5.844 7.05e-09 ***
## age 264.43 14.14 18.703 < 2e-16 ***
## sex -292.09 398.49 -0.733 0.46374
## bmi_cat 4747.62 406.56 11.677 < 2e-16 ***
## children 534.78 165.02 3.241 0.00123 **
## smokeryes 24001.58 488.75 49.108 < 2e-16 ***
## regionnorthwest -280.57 563.00 -0.498 0.61836
## regionsoutheast -863.55 558.46 -1.546 0.12237
## regionsouthwest -1371.54 570.98 -2.402 0.01650 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6063 on 928 degrees of freedom
## Multiple R-squared: 0.7601, Adjusted R-squared: 0.7581
## F-statistic: 367.6 on 8 and 928 DF, p-value: < 2.2e-16
### pull the regression points and observed data in to one dataset
points <- broom::augment(ins_datamodel1)
#predict data
rsq1 <- summary(ins_datamodel1)$r.squared #r-squared
#predict data on test set
predict1 <- predict(ins_datamodel1, newdata = Data_test)
#calculating the residuals
residuals1 <- Data_test$charges - predict1
#calculating Root Mean Squared Error
rmse1 <- sqrt(mean(residuals1^2))
#generate linear model-2
#eliminating low correlating / insignificant variables, eg, sex and children
lmformula2=as.formula("charges ~ age + smoker + region")
ins_datamodel2= lm(lmformula2, data = Data_train)
#summarise
summary(ins_datamodel2)
##
## Call:
## lm(formula = lmformula2, data = Data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15762.4 -2087.1 -1295.0 -204.1 28392.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2461.1 742.2 -3.316 0.000949 ***
## age 281.8 15.1 18.664 < 2e-16 ***
## smokeryes 23903.5 523.5 45.662 < 2e-16 ***
## regionnorthwest -326.3 604.4 -0.540 0.589382
## regionsoutheast 227.5 591.6 0.385 0.700681
## regionsouthwest -751.1 610.8 -1.230 0.219103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6515 on 931 degrees of freedom
## Multiple R-squared: 0.7221, Adjusted R-squared: 0.7207
## F-statistic: 483.9 on 5 and 931 DF, p-value: < 2.2e-16
#predict data on test set
#predict data
rsq2 <- summary(ins_datamodel2)$r.squared #r-squared
predict2 <- predict(ins_datamodel2, newdata = Data_test)
#calculating the residuals
residuals2 <- Data_test$charges - predict2
#calculating Root Mean Squared Error
rmse2 <- sqrt(mean(residuals2^2))
#compare both linear models
print(paste0("R-squared for first model:", round(rsq1, 4)))
## [1] "R-squared for first model:0.7601"
print(paste0("R-squared for second model:", round(rsq2, 4)))
## [1] "R-squared for second model:0.7221"
print(paste0("RMSE for first model: ", round(rmse1, 2)))
## [1] "RMSE for first model: 5969.05"
print(paste0("RMSE for second model: ", round(rmse2, 2)))
## [1] "RMSE for second model: 6116.08"
#both models are quite similar. we will proceed with the second model
#visualize the distribution of predicted vs real charges values
Data_test$prediction <- predict(ins_datamodel2, newdata = Data_test)
A=ggplot(Data_test, aes(x = prediction, y = charges)) +
geom_point(color = "blue", alpha = 0.7) +
geom_abline(color = "red") +
ggtitle("Predicted values vs. Real values")
#Residuals histogram
Data_test$residuals <- Data_test$charges - Data_test$prediction
B=ggplot(Data_test, aes(x = residuals)) +
geom_histogram(bins = 15, fill = "blue") +
ggtitle("Histogram of residuals")
C= ggplot(data = Data_test, aes(x = prediction, y = residuals)) +
geom_pointrange(aes(ymin = 0, ymax = residuals), color = "blue", alpha = 0.7) +
geom_hline(yintercept = 0, linetype = 3, color = "red") +
ggtitle("Residuals vs. Linear model prediction")
figure <- ggarrange(A, B, C,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1)
figure
#Test model
Kite <- data.frame(age = 28,
bmi = 22,
children = 0,
smoker = "no",
region = "northwest",
bmi_cat=0)
print(paste0("Health care charges for Kite: ", round(predict(ins_datamodel2, Kite), 2)))
## [1] "Health care charges for Kite: 5102.94"
#view data with predictionand residual
head(Data_test)
## age sex bmi children smoker region charges bmi_cat prediction
## 3 28 1 33.000 3 no southeast 4449.462 1 5656.757
## 20 30 1 35.300 0 yes southwest 36837.467 1 29145.348
## 24 34 0 31.920 1 yes northeast 37701.877 1 31023.601
## 27 63 0 23.085 0 no northeast 14451.835 0 15292.171
## 31 22 1 35.600 0 yes southwest 35585.576 1 26890.969
## 37 62 0 32.965 3 no northwest 15612.193 1 14684.054
## residuals
## 3 -1207.2954
## 20 7692.1188
## 24 6678.2757
## 27 -840.3358
## 31 8694.6068
## 37 928.1393
Data_train$sex[Data_train$sex == 0]<- 'F'
Data_train$sex[Data_train$sex == 1]<- 'M'
# Save model to RDS file
saveRDS(ins_datamodel2, "model.rds")
#Reference:
#https://rpubs.com/SamShariful/health-insurance
#data- Kaggle