library(tidyverse)
library(plotly)
library(knitr)
library(stargazer)
library(kableExtra)In this report, we are analyzing insurance charges dataset to discover the relationship between insurance charges and different factors. The data set is from GitHub repository of Machine Learning with R datasets.
insurance <- read.csv("insurance.csv")
tbl <- matrix(dim(insurance))
rownames(tbl) <- c("Observations", "Columns")
kbl(tbl,caption = "Data dimension") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Observations | 1338 |
| Columns | 7 |
The dataset has 1338 observations and 7 variables. It includes 5
numeric/integer variables which are age, bmi,
children, charges, and 3 categorical variables
which are sex, smoker, and
region.
stargazer(insurance,title = "Statistical summary",type = "html")| Statistic | N | Mean | St. Dev. | Min | Max |
| age | 1,338 | 39.207 | 14.050 | 18 | 64 |
| bmi | 1,338 | 30.663 | 6.098 | 15.960 | 53.130 |
| children | 1,338 | 1.095 | 1.205 | 0 | 5 |
| charges | 1,338 | 13,270.420 | 12,110.010 | 1,121.874 | 63,770.430 |
In this section, we are exploring, using charts and tables, the distribution of values of each feature from the data set in order to understand the data sample we are working with.
plot_sex <- insurance %>%
group_by(sex) %>%
summarise(percent = round(n()/nrow(insurance),2)) %>%
ggplot(aes(x = sex, y = percent)) +
geom_col(aes(fill = sex), width = 0.5) +
ggtitle("Proportion of female and male in the dataset") +
xlab("") +
ylab("Percentage (%)") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood")) +
scale_y_continuous(labels = scales::percent)
ggplotly(plot_sex, tooltip = c("x","y"))The percentage of females in the data is 49% and the percentage of males is 51%, which indicates that the data is balanced in terms of sex.
tab <- insurance %>%
filter(children == 0 | children == 1) %>%
group_by(children) %>%
summarise(Count = n())
t1 <- insurance %>%
filter(children <= 2) %>%
summarise(n = n())
t2 <- insurance %>%
filter(children >= 3) %>%
summarise(n = n())
t3 <- insurance %>%
filter(children == 1 | children == 3) %>%
summarise(n = n())
t4 <- insurance %>%
filter(children >= 2 & children <= 4) %>%
summarise(n = n())
tab <- rbind(tab, data.frame("children"=c("less than 2","more than 3","1 or 3",
"between 2 and 4"), "Count" = c(t1$n,t2$n,t3$n, t4$n)))
plot_children <- ggplot(tab,aes(x = children, y = Count)) +
geom_col(fill = "brown3") +
ggtitle("Count of individuals by number of children") +
xlab("Children") +
ylab("Count") +
theme_classic()
ggplotly(plot_children)# a. individuals that have no children
# b. Individuals that have 1 child
tab2 <- insurance %>%
filter(children == 0 | children == 1) %>%
group_by(children) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
# c. Individuals that have less than 2 children
p1 <- insurance %>%
filter(children <= 2) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
# d. Individuals that have more than 3 children
p2 <- insurance %>%
filter(children > 3) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
# e. Individuals that have 1 or 3 children
p3 <- insurance %>%
filter(children == 1 | children ==3) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
# f. Individuals that have between 2 and 4 children
p4 <- insurance %>%
filter(children >= 2 & children <= 4) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
tab2 <- rbind(tab2,data.frame("children" = c("less than 2 children",
"more than 3 children","1 or 3 children","between 2 and 4 children"),
"n" = c(p1$n,p2$n,p3$n,p4$n)))
kbl(tab2,caption =
"Proportion of individuals depending on number of children",
col.names = c("Number of children","Proportion"))%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Number of children | Proportion |
|---|---|
| 0 | 42.9 % |
| 1 | 24.22 % |
| less than 2 children | 85.05 % |
| more than 3 children | 3.21 % |
| 1 or 3 children | 35.95 % |
| between 2 and 4 children | 31.54 % |
The majority of individuals in the dataset have less than 2 children (85.05%). Also, 42.9% of individuals don’t have any children while only 3.21% have more than 3 children.
plot_smoker <- insurance %>%
group_by(smoker) %>%
summarise(Count = n()) %>%
ggplot(aes(x = smoker, y = Count)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = smoker)) +
ggtitle("Count of smokers and non smokers") +
xlab("Smoker") +
ylab("Count") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(plot_smoker,tooltip = c("y","x"))tab3 <- insurance %>%
group_by(smoker) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"))
kbl(tab3,caption = "Proportion of smokers and non smokers",
col.names = c("Smoker","Proportion")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Smoker | Proportion |
|---|---|
| no | 79.52 % |
| yes | 20.48 % |
The majority of individuals in the data are non smokers (79.52%).
tab4 <- insurance %>%
group_by(region) %>%
summarise(Count = n())
plot_region <- ggplot(tab4, aes(x = region, y = Count)) +
geom_bar(stat = "identity", width = 0.5, fill = "brown3") +
ggtitle("Count of individuals by region") +
xlab("Region") +
ylab("Count") +
theme_classic()
ggplotly(plot_region,tooltip = c("y","x"))tab7 <- insurance %>%
group_by(region) %>%
summarise(Count = paste0(round(n()/nrow(insurance)*100,2)," %"))
kbl(tab7, caption = "Proportion of individuals in each region",
col.names = c("Region","Proportion")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Region | Proportion |
|---|---|
| northeast | 24.22 % |
| northwest | 24.29 % |
| southeast | 27.2 % |
| southwest | 24.29 % |
Individuals in the data sample are distributed almost evenly among the four regions, with southeast being the most common region in the data (27.2%)
plot_age <- insurance %>%
mutate(Age = ifelse(age < 20,"less than 20",ifelse(age > 40 ,"over 40",
ifelse(age >= 20 & age <= 40,"between 20 and 40","0")))) %>%
filter(Age != "0") %>%
group_by(Age) %>%
summarise(Count = n()) %>%
arrange(-Count) %>%
ggplot(aes(x = reorder(Age, -Count), y = Count, text = paste0("Age: ",Age))) +
geom_bar(stat = "identity", width = 0.5, fill = "brown3") +
ggtitle("Count of individuals by age") +
xlab("Age") +
ylab("Count") +
theme_classic()
ggplotly(plot_age,tooltip = c("y","text"))insurance %>%
mutate(countage1 = ifelse(age < 20,"less than 20",ifelse(age > 40,"over 40",
ifelse(age >= 20 & age <= 40,"between 20 and 40","0")))) %>%
filter(countage1 != "0") %>%
group_by(countage1) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %")) %>%
kbl(caption = "Proportion of individuals in each age range",
col.names = c("Age range","Proportion")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Age range | Proportion |
|---|---|
| between 20 and 40 | 42.15 % |
| less than 20 | 10.24 % |
| over 40 | 47.61 % |
Individuals aged less than 20 years old make only 10.24% of the data, as the majority is aged over 40 years old (47.61%).
insurance %>%
filter(charges > mean(charges)) %>%
summarise(n = n(),proportion =
paste0(round(n()/nrow(insurance)*100,2)," %")) %>%
kbl(caption =
"Count and proportion of individuals with charges over the average",
col.names = c("Count","Proportion")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Count | Proportion |
|---|---|
| 420 | 31.39 % |
We find that 31.39% of individuals in the data have charges over the average, which means that these individuals have some extreme charges values that pulled the average.
plot_bmi <- insurance%>%
ggplot(aes(x = bmi)) +
geom_histogram(fill = "brown3",color = "white") +
ggtitle("Histogram of distribution of BMI") +
xlab("BMI") +
ylab("Count") +
theme_classic()
ggplotly(plot_bmi)The distribution of BMI looks symmetrical and centered around 30.
In this section, we analyze the relationship between insurance charges and each one of the factors from the data using tables and visualizations.
chargesVsAge <- insurance %>%
ggplot(aes(x = charges, y = age)) +
geom_point(color = "brown3")+
geom_smooth(method = "lm") +
ggtitle("Relationship between charges and age") +
theme_classic() +
xlab("Charges") +
ylab("Age")
ggplotly(chargesVsAge)There is a moderate linear relationship between insurance charges and age. the relationship is positive, which means that the higher the age the higher the charges.
chargesVsChildren <- insurance %>%
group_by(children) %>%
summarise(`Mean charges` = mean(charges)) %>%
mutate(children = as.character(children)) %>%
ggplot(aes(x = children , y = `Mean charges`)) +
geom_col(fill = "brown3") +
ggtitle("Average insurance charges by number of children") +
xlab("Children") +
ylab("Mean charges") +
theme_classic() +
theme(legend.position = "none")
ggplotly(chargesVsChildren)Insurance charges are lower on average for individuals with 5 children compared to the other groups, and higher for individuals with 2 to 3 children.
chargesVsBMI <- insurance %>%
ggplot(aes(x = bmi, y = charges)) +
geom_point(color = "brown3") +
geom_smooth(method = "lm") +
ggtitle("Relationship between insurance charges and BMI") +
xlab("BMI") +
ylab("Insurance charges") +
theme_classic() +
theme(legend.position = "none")
ggplotly(chargesVsBMI)There is a moderate positive linear relationship between insurance charges and BMI, indicating that the higher the BMI value the higher are the charges.
chargesVsSex <- insurance %>%
ggplot( aes(x = sex, y = charges, fill = sex)) +
geom_boxplot() +
scale_fill_manual(values = c("brown3", "burlywood")) +
theme_classic() +
theme(
legend.position="none",
) +
ggtitle("Boxplot of distribution of charges for female and male") +
xlab("") +
ylab("Charges") +
geom_hline(aes(yintercept = mean(charges)))
ggplotly(chargesVsSex)The distribution of insurance charges for female is less variant than the distribution of charges for male. The average (median) insurance charges is similar for female and male individuals.
chargesVsSmoker <- insurance %>%
group_by(smoker) %>%
summarise(`Mean charges` = mean(charges)) %>%
ggplot(aes(x = smoker, y = `Mean charges`)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = smoker)) +
ggtitle("Average insurance charges by smoking status") +
xlab("Smoker") +
ylab("Mean charges") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(chargesVsSmoker,tooltip = c("y","x"))Insurance charges are meaningfully higher on average for smokers compared to non-smokers.
chargesVsSmoker_F1 <- insurance %>%
filter(bmi >= 30 & bmi <= 40) %>%
group_by(smoker) %>%
summarise(`Mean charges` = mean(charges)) %>%
ggplot(aes(x = smoker, y = `Mean charges`)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = smoker)) +
ggtitle("Average insurance charges by smoking status (30<BMI<40)") +
xlab("Smoker") +
ylab("Mean charges") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(chargesVsSmoker_F1,tooltip = c("y","x"))We notice that also for the group of individuals with BMI between 30 and 40, insurance charges are meaningfully higher on average for smokers compared to non-smokers.
chargesVsSmoker_F2 <- insurance %>%
filter(sex == "female") %>%
group_by(smoker) %>%
summarise(`Average charges` = mean(charges)) %>%
ggplot(aes(x = smoker, y = `Average charges`)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = smoker)) +
ggtitle("Average charges for smoker and non-smoker females") +
xlab("Smoker") +
ylab("Average charges") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(chargesVsSmoker_F2,tooltip = c("y"))For only female individuals, insurance charges are noticeably higher on average for smokers compared to non-smokers.
insurance %>%
filter(charges < mean(charges)) %>%
group_by(smoker) %>%
summarise(n = paste0(round(n()/nrow(insurance)*100,2)," %"),
nn = paste0(round(n()/nrow(insurance[insurance$charges <
mean(insurance$charges),])*100,2)," %")) %>%
kbl(caption =
"Proportion of smokers and non smokers with charges below the global average",
col.names = c("Smoker","Proportion from the entire data",
"Proportion from individuals with charges below the average")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Smoker | Proportion from the entire data | Proportion from individuals with charges below the average |
|---|---|---|
| no | 68.54 % | 99.89 % |
| yes | 0.07 % | 0.11 % |
The proportion of non-smokers with charges below the average is 68.54% of the entire data which is meaningfully higher compared to the proportion of smokers (0.07%), which indicates that insurance charges are mostly higher than average for smokers.
chargesVsSex_F1 <- insurance %>%
filter(smoker == "no", children > 1) %>%
group_by(sex) %>%
summarise(`Average charges` = mean(charges)) %>%
ggplot(aes(x = sex, y = `Average charges`)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = sex)) +
ggtitle(
"Average charges for non-smoker individuals with at least 2 children by sex") +
xlab("") +
ylab("Average charges") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(chargesVsSex_F1,tooltip = c("y"))For non-smokers with less than 2 children, we notice that female have higher insurance charges on average compared to male individuals.
chargesVsSex_F2 <- insurance %>%
filter(bmi > 35 , region == "southwest") %>%
group_by(sex) %>%
summarise( `Average charges` = mean(charges)) %>%
ggplot(aes(x = sex, y = `Average charges`)) +
geom_bar(stat = "identity", width = 0.5, aes(fill = sex)) +
ggtitle(
"Average charges by sex for individuals from the southwest with BMI larger than 35") +
xlab("") +
ylab("Average charges") +
theme_classic() +
theme(legend.position = "none",plot.title = element_text(size = 10)) +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(chargesVsSex_F2,tooltip = c("y"))For individuals from southwest with BMI larger than 35, we notice that male individuals have higher insurance charges on average compared to female individuals.
ChargesVsSmoker.Sex_F1 <- insurance %>%
filter(bmi >= 30 & bmi <= 40) %>%
group_by(smoker,sex) %>%
summarise(`Mean charges` = mean(charges)) %>%
ggplot(aes(x = smoker, y = `Mean charges`)) +
geom_bar(stat = "identity",aes(fill = sex), position = "dodge", width = 0.7) +
ggtitle("Average insurance charges by smoking status and sex (30<BMI<40)") +
xlab("Smoker") +
ylab("Mean charges") +
theme_classic() +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(ChargesVsSmoker.Sex_F1)On average, insurance charges are similar for non-smoker males and females, and for smoker males and females.
ChargesVsSmoker.Sex.Region_F <- insurance %>%
filter(bmi >= 30 & bmi <= 40) %>%
group_by(smoker,sex,region) %>%
summarise(`Mean charges` = mean(charges)) %>%
ggplot(aes(x = smoker, y = `Mean charges`)) +
geom_bar(stat = "identity",aes(fill = sex), position = "dodge") +
facet_wrap(~region) +
ggtitle("Average insurance charges by smoking status and sex and region (30<BMI<40)") +
xlab("Smoker") +
ylab("Mean charges") +
theme_classic() +
theme(plot.title = element_text(size = 10)) +
scale_fill_manual(values = c("brown3", "burlywood"))
ggplotly(ChargesVsSmoker.Sex.Region_F)On average, insurance charges are similar among smokers regardless of their sex and region, and similar among non-smokers regardless of their sex and region.
ChargesVsSmoker.Sex_F2 <- insurance %>%
filter(children > 2 , charges > mean(charges)) %>%
group_by(sex,smoker) %>%
summarise(n = round(n()/nrow(insurance)*100,2),
Percentage = round(n()/nrow(insurance[insurance$children > 2 &
insurance$charges > mean(insurance$charges),])*100,2)) %>%
ggplot(aes(x = smoker, y = Percentage,text =
paste0("Percentage: ", Percentage, "%"))) +
geom_bar(stat = "identity", width = 0.5, aes(fill = smoker)) +
ggtitle("Percentage of smokers and non-smokers with more than 2 children and charges over the average by sex") +
xlab("Smoker") +
ylab("Percentage (%)") +
theme_classic() +
theme(legend.position = "none", plot.title = element_text(size = 10)) +
scale_fill_manual(values = c("brown3", "burlywood")) +
facet_wrap(~sex)
ggplotly(ChargesVsSmoker.Sex_F2,tooltip = c("text"))the majority of individuals with more than 2 children and charges over the average are smoker males (39.44%) followed by non-smoker females (25.35%) and the minority is non-smoker males (14.08%)
ChargesVsSex.Region_F1 <- insurance %>%
filter(smoker == "yes", charges > mean(charges)) %>%
group_by(region,sex) %>%
summarise(n = round(n()/nrow(insurance[insurance$smoker == "yes" &
insurance$charges > mean(insurance$charges),])*100,2)) %>%
ggplot(aes(x = sex, y = n,text = paste0("Percentage: ", n, "%"))) +
geom_bar(stat = "identity", width = 0.5, aes(fill = sex)) +
ggtitle("Percentage of smokers with charges over the average in each region by sex") +
xlab("") +
ylab("Percentage (%)") +
theme_classic() +
theme(legend.position = "none") +
scale_fill_manual(values = c("brown3", "burlywood")) +
facet_wrap(~region)
ggplotly(ChargesVsSex.Region_F1,tooltip = c("text"))The majority of smokers with charges over the average are males from southeast (20.15%), followed by males from northeast and southwest (13.55% each), and the minority are females from southwest (7.69%)
ChargesVsSex.Region_F2 <- insurance %>%
filter(bmi >= 35 & bmi <= 45) %>%
ggplot( aes(x = sex, y = charges, fill = sex)) +
geom_boxplot() +
scale_fill_manual(values = c("brown3", "burlywood")) +
theme_classic() +
theme(
legend.position="none",plot.title = element_text(size = 10)
) +
ggtitle("Boxplot of distribution of charges for individuals with BMI between 35 and 45 by sex and region") +
xlab("") +
ylab("Charges") +
facet_wrap(~region)
ggplotly(ChargesVsSex.Region_F2)The boxplots above illustrate the distribution of charges for individuals with BMI between 35 and 45 by sex and region. We notice that the medians of insurance charges for females and males are very close in each region. The boxes sizes of females and males are different within each region which indicates that the variation of charges is different between them. We also observe that there are some outlier values in the data, like in the group of females from southeast and males from northeast.