# Exploratory Data Analysis

# Load the insurance workbook and print its structure.
insurance <- read_xlsx("insurance.xlsx")
str(insurance)
## tibble [1,338 × 7] (S3: tbl_df/tbl/data.frame)
##  $ age     : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr [1:1338] "female" "male" "male" "male" ...
##  $ bmi     : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
##  $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr [1:1338] "yes" "no" "no" "no" ...
##  $ region  : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...

# The three character columns are categorical, so convert them to factors
# in a single pass rather than one assignment per column.
insurance[c("sex", "smoker", "region")] <- lapply(
  insurance[c("sex", "smoker", "region")],
  as.factor
)


# Medical charges against number of children. Points are jittered and
# semi-transparent so observations sharing a child count do not overlap.
ggplot(insurance, aes(children, charges)) +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "# of children and Medical Costs",
    x = "# of children",
    y = "Medical Costs"
  ) +
  theme_bw()

# Medical charges by region. Points are jittered and semi-transparent so
# observations within each region do not overlap.
# The x axis is mapped to `region`, so it is labelled "Region".
ggplot(data = insurance, aes(x = region, y = charges)) +
  geom_jitter(alpha = 0.5) +
  theme_bw() +
  labs(x = "Region", y = "Medical Costs", title = "Region and Medical Costs")

# Scatter plot of medical charges against BMI.
ggplot(insurance, aes(bmi, charges)) +
  geom_point() +
  labs(
    title = "BMI and Medical Costs",
    x = "BMI",
    y = "Medical Costs"
  ) +
  theme_bw()

# Scatter plot of BMI against age.
ggplot(insurance, aes(age, bmi)) +
  geom_point() +
  labs(
    title = "BMI and Age",
    x = "Age",
    y = "BMI"
  ) +
  theme_bw()

# Medical charges against BMI, one panel per level of `sex`.
ggplot(insurance, aes(bmi, charges)) +
  geom_point() +
  facet_wrap(~ sex) +
  labs(
    title = "BMI and Medical Costs by gender",
    x = "BMI",
    y = "Medical Costs"
  ) +
  theme_bw()

#scale_fill_viridis_d(option = "D")


# Medical charges against BMI, one panel per level of `region`.
ggplot(insurance, aes(bmi, charges)) +
  geom_point() +
  facet_wrap(~ region) +
  labs(
    title = "BMI and Medical Costs by regions",
    x = "BMI",
    y = "Medical Costs"
  ) +
  theme_bw()

# Medical charges for each level of `smoker`, drawn as raw points.
ggplot(insurance, aes(smoker, charges)) +
  geom_point() +
  labs(
    title = "Smoking and Medical Costs",
    x = "Smoker",
    y = "Medical Costs"
  ) +
  theme_bw()

# Medical charges by smoking status, one panel per level of `sex`.
# Raw points are drawn first; the semi-transparent boxplot is layered on top.
ggplot(insurance, aes(smoker, charges)) +
  geom_point() +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~ sex) +
  labs(
    title = "Smoking and Medical Costs by gender",
    x = "Smoker",
    y = "Medical Costs"
  ) +
  theme_bw()

# Medical charges by smoking status, one panel per level of `region`.
# Raw points are drawn first; the semi-transparent boxplot is layered on top.
ggplot(insurance, aes(smoker, charges)) +
  geom_point() +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~ region) +
  labs(
    title = "Smoking and Medical Costs by region",
    x = "Smoker",
    y = "Medical Costs"
  ) +
  theme_bw()

# Medical charges against BMI, with point colour mapped to `smoker`.
ggplot(insurance, aes(bmi, charges)) +
  geom_point(aes(color = smoker)) +
  labs(
    title = "BMI and Medical Costs by smoker",
    x = "BMI",
    y = "Medical Costs"
  ) +
  theme_bw()

# Further Analysis

To be continued…