The cost of health care is a huge topic in today's media. Many people are struggling to pay their everyday bills, let alone their medical bills. Obesity levels and smoking habits have been seen as causes of health issues. This analysis looks at how insurance costs are affected by people’s BMI levels and by whether or not they smoke. It will determine whether BMI levels and smoking habits actually affect the amount a person pays for insurance.
The hypothesis for this analysis is that people with higher BMI levels and people who smoke will have higher insurance costs. People who are overweight tend to have more health issues, and smoking cigarettes has been proven to cause health issues such as cancer. These two variables will be the main focus of this analysis, which will show whether health insurance costs are higher for people with higher BMIs and for smokers.
This dataset was taken from Kaggle.com and is called “Medical Cost Personal Dataset.” It contains 1338 people, each listed with their age, sex, BMI, number of children, smoking status, region, and the insurance cost charged to them. All the variables in this dataset were used except the number-of-children variable. The variable for insurance cost is called charges. The data will be examined by looking at the entire dataset first and then splitting the analysis by region. The regions are the Southeast, Southwest, Northeast and Northwest of the United States.
library(ggstance)
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggridges)
# Load the insurance data from the local CSV export and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this script has to point it at their own copy of insurance.csv.
health <- read_csv(file = "/Users/paulkim/Downloads/insurance.csv")
head(x = health)
# Recode the raw columns into display-ready labels for the plots below.
#   sex    : "male" -> "Male", every other non-missing value -> "Female".
#   smoker : "yes"  -> "Yes",  every other non-missing value -> "No".
#   region : capitalised region name; unrecognised values become NA.
#   age    : ten-year bands "<10", "10-19", ..., "70-79" (upper bound
#            inclusive); ages above 79 become NA. Kept as a character column.
healths <- health %>%
  mutate(
    sex = ifelse(sex == "male", "Male", "Female"),
    smoker = ifelse(smoker == "yes", "Yes", "No"),
    region = case_when(
      region == "northeast" ~ "Northeast",
      region == "northwest" ~ "Northwest",
      region == "southeast" ~ "Southeast",
      region == "southwest" ~ "Southwest"
    ),
    age = as.character(
      cut(
        age,
        breaks = c(-Inf, 9, 19, 29, 39, 49, 59, 69, 79),
        labels = c(
          "<10", "10-19", "20-29", "30-39",
          "40-49", "50-59", "60-69", "70-79"
        )
      )
    )
  )
# Bar chart: number of records for each sex.
gg1 <- ggplot(data = healths, mapping = aes(x = sex)) +
  geom_bar(fill = "Light Green", color = "Dark Blue")
ggsex <- gg1 +
  labs(title = "Count of Sex", x = "Sex", y = "Count")
plot(ggsex)
# Bar chart: records per sex, with each bar filled by smoking status.
# Note that this overwrites the `ggsex` object created for the previous plot.
gg2 <- ggplot(data = healths, mapping = aes(x = sex)) +
  geom_bar(mapping = aes(fill = smoker), color = "Light Blue")
ggsex <- gg2 +
  labs(title = "Smokers By Sex", x = "Sex", y = "Count")
plot(ggsex)
# Bar chart: records per region, with each bar filled by smoking status.
# The title and x-axis label previously read "Smokers By Sex" / "Sex" (copied
# from the plot above) even though the x aesthetic is `region`; they now
# describe the variable that is actually plotted.
gg3 <- ggplot(data = healths, mapping = aes(x = region)) +
  geom_bar(mapping = aes(fill = smoker), color = "Blue")
ggregionsmoke <- gg3 +
  labs(title = "Smokers By Region", x = "Region", y = "Count")
plot(ggregionsmoke)
# Bar chart: records per age band, with each bar filled by smoking status.
# The result is stored in `ggregionsmoke`, replacing the region plot object
# that previously held that name.
gg4 <- ggplot(data = healths, mapping = aes(x = age)) +
  geom_bar(mapping = aes(fill = smoker), color = "Blue")
ggregionsmoke <- gg4 +
  labs(title = "Smokers by Age Group", x = "Age", y = "Count")
plot(ggregionsmoke)
# Bar chart: records per smoking status, with each bar filled by age band.
# The x-axis tick labels are rotated 70 degrees.
plot.diamonds <- ggplot(data = healths, mapping = aes(x = smoker, fill = age)) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = 70, vjust = 0.5)
  )
plot.diamonds
# Scatter plot of insurance charges against BMI.
ggplot(data = healths, mapping = aes(x = bmi, y = charges)) +
  geom_point(color = "Orange") +
  labs(
    title = "BMI and Insurance Charges",
    x = "BMI",
    y = "Charges"
  )
# Ridgeline density of charges for each smoking status; the individual
# observations are drawn as jittered points using the "raincloud" position.
ggplot(data = healths, mapping = aes(x = charges, y = smoker)) +
  geom_density_ridges(
    scale = 0.9,
    alpha = 0.7,
    position = "raincloud",
    jittered_points = TRUE
  ) +
  labs(
    title = "Insurance Charges and Smokers",
    x = "Charges",
    y = "Smoker"
  )
## Picking joint bandwidth of 2300
# Ridgeline densities of charges, filled by sex, with one facet per region.
ggplot(
  data = healths,
  mapping = aes(x = charges, y = region, fill = sex)
) +
  geom_density_ridges(scale = 1) +
  facet_wrap(~region) +
  labs(
    title = "Charges by Region per Sex",
    x = "Charges",
    y = "Region"
  )
## Picking joint bandwidth of 2830
## Picking joint bandwidth of 2540
## Picking joint bandwidth of 3720
## Picking joint bandwidth of 2450
# Ridgeline densities of charges, filled by smoking status, with one facet per
# region.
ggplot(
  data = healths,
  mapping = aes(x = charges, y = region, fill = smoker)
) +
  geom_density_ridges(scale = 1) +
  facet_wrap(~region) +
  labs(
    title = "Charges by Region per Smoking Status",
    x = "Charges",
    y = "Region"
  )
## Picking joint bandwidth of 3080
## Picking joint bandwidth of 3070
## Picking joint bandwidth of 2890
## Picking joint bandwidth of 3030
# Ridgeline density of BMI per region. Jittered points are overlaid, and their
# colour, fill and shape are all mapped to region; the four point shapes are
# set manually to 21-24.
ggplot(data = healths, mapping = aes(x = bmi, y = region, fill = region)) +
  geom_density_ridges(
    mapping = aes(
      point_color = region,
      point_fill = region,
      point_shape = region
    ),
    jittered_points = TRUE,
    alpha = .2
  ) +
  scale_point_color_hue(l = 40) +
  scale_discrete_manual(
    aesthetics = "point_shape",
    values = c(21, 22, 23, 24)
  ) +
  labs(title = "BMI by Region", x = "BMI", y = "Region")
## Picking joint bandwidth of 1.63
# Horizontal box plots of BMI for each sex, one facet row per region, with the
# boxes filled by region.
horizontal <- ggplot(data = healths, mapping = aes(x = bmi, y = sex)) +
  geom_boxploth(mapping = aes(fill = region)) +
  facet_grid(region ~ ., scales = "free_y") +
  labs(title = "BMI by Region per Sex", x = "BMI", y = "Sex")
horizontal
# Horizontal box plots of BMI for each smoking status, one facet row per
# region, with the boxes filled by region.
horizontals <- ggplot(data = healths, mapping = aes(x = bmi, y = smoker)) +
  geom_boxploth(mapping = aes(fill = region)) +
  facet_grid(region ~ ., scales = "free_y") +
  labs(title = "BMI by Region per Smoking Habit", x = "BMI", y = "Smoker")
horizontals