age, sex, bmi, number of children, smoking status, and region of living.
medicalcost <- read.csv("medical_cost.csv")
medicalcost <- medicalcost %>%
mutate(BMI.status = case_when(
bmi < 18.5 ~ "Underweight",
bmi >= 18.5 & bmi < 24.9 ~ "Normal",
bmi >= 25 & bmi < 29.9 ~ "Overweight",
bmi >= 30 ~ "Obese",
TRUE ~ "Unknown")) %>%
mutate(BMI.status = factor(BMI.status, ordered = TRUE,
levels = c("Underweight", "Normal", "Overweight", "Obese", "Unknown"))) %>%
mutate(children = factor(children,
ordered = TRUE,
levels = c("0", "1", "2", "3", "4", "5"))) %>%
mutate(sex = factor(sex, ordered = TRUE, levels = c("male", "female"))) %>%
mutate(smoker = factor(smoker, ordered = TRUE, levels = c("no", "yes"))) %>%
mutate(region = factor(region, ordered = TRUE, levels = c("northeast", "northwest", "southeast", "southwest")))
I added a variable to stratify the BMI into 4 different categories: Underweight, Normal, Overweight, and Obese. I also transformed the sex, bmi status, children, smoking status, region into categorical factors.
age.median <- median(medicalcost$age, na.rm = TRUE)
age.mean <- mean(medicalcost$age, na.rm = TRUE)
age.normality <- shapiro.test(medicalcost$age)
age.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=age)) +
geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
geom_density(color = "#B31312", size = 1, linetype = 2) +
geom_vline(xintercept = age.median, color="#054B71", linetype="dashed", linewidth=1) +
annotate(geom="text", x=30, y=0.03, color="#054B71", size = 3,
label=paste("Median\n", round(age.median, 1))) +
geom_vline(xintercept = age.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
annotate(geom="text", x=50, y=0.03, color="#0086BD", size=3,
label=paste("Mean\n", round(age.mean, 1))) +
annotate(geom = "text", x=30, y=0.035, size=3,
label=paste("Shapiro-Wilk\nNormality Test =", round(age.normality$p.value, 10))) +
labs(x="Age", y="Count", title="Age Distribution") +
karl_theme)
### BMI distribution
bmi.median <- median(medicalcost$bmi, na.rm = TRUE)
bmi.mean <- mean(medicalcost$bmi, na.rm = TRUE)
bmi.normality <- shapiro.test(medicalcost$bmi)
bmi.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=bmi)) +
geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 10, alpha = 0.7) +
geom_density(color = "#B31312", size = 1, linetype = 2) +
geom_vline(xintercept = bmi.median, color="#054B71", linetype="dashed", linewidth=1) +
annotate(geom="text", x=45, y=0.03, color="#054B71", size = 3,
label=paste("Median\n", round(bmi.median, 1))) +
geom_vline(xintercept = bmi.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
annotate(geom="text", x=45, y=0.04, color="#0086BD", size=3,
label=paste("Mean\n", round(bmi.mean, 1))) +
annotate(geom = "text", x=45, y=0.06, size=3,
label=paste("Shapiro-Wilk\nNormality Test =", round(bmi.normality$p.value, 10))) +
labs(x="BMI", y="Count", title="BMI Distribution") +
karl_theme)
### charges distribution
charges.median <- median(medicalcost$charges, na.rm = TRUE)
charges.mean <- mean(medicalcost$charges, na.rm = TRUE)
charges.normality <- shapiro.test(medicalcost$charges)
charges.distribution <- ggplotly(ggplot(data = medicalcost, aes(x=charges)) +
geom_histogram(aes(y = ..density..), fill="navy", color="black", bins = 15, alpha = 0.7) +
geom_density(color = "#B31312", size = 1, linetype = 2) +
geom_vline(xintercept = charges.median, color="#054B71", linetype="dashed", linewidth=1) +
annotate(geom="text", x=30000, y=0.00002, color="#054B71", size = 3,
label=paste("Median\n", round(charges.median, 1))) +
geom_vline(xintercept = charges.mean, color="#0086BD", linetype = "dashed", linewidth=1) +
annotate(geom="text", x=30000, y=0.00003, color="#0086BD", size=3,
label=paste("Mean\n", round(charges.mean, 1))) +
annotate(geom = "text", x=30000, y=0.00004, size=3,
label=paste("Shapiro-Wilk\nNormality Test =", round(charges.normality$p.value, 10))) +
labs(x="Charges ($)", y="Count", title="Charges ($) Distribution") +
scale_x_continuous(breaks = c(10000, 20000, 30000, 40000, 50000, 60000),
labels = c("$10k", "$20k", "$30k", "$40k", "$50k", "$60k")) +
karl_theme)
BMIstatus <- medicalcost %>%
filter(BMI.status != "Unknown")
BMIstatus <- ggplotly(
ggplot(data = BMIstatus, aes(x=BMI.status)) +
geom_bar(fill = "navy", color = "black", alpha = 0.7) +
labs(x = "BMI status",
y = "Count") +
scale_x_discrete(breaks = c("Underweight", "Normal", "Overweight", "Obese"),
labels = c("Under\nWeight", "Normal", "Under\nWeight", "Obese")) +
karl_theme)
distribution <- subplot(
age.distribution, bmi.distribution, charges.distribution, BMIstatus, nrows = 2,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.12
)
distribution <- layout(distribution,
title = list(
text = "Age, BMI, and Medical Costs Distribution",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")),
margin = list(t = 100))
distribution
Figure 2.1: BMI is the only normally distributed variable. Medical charges are skewed to the right.
children <- ggplotly(
ggplot(data = medicalcost, aes(x = children)) +
geom_bar(fill="navy", alpha = 0.7, color = "black") +
labs(x="Number of Children", y="Count",
title=" Number of Children") +
karl_theme)
### Sex
sex <- ggplotly(
ggplot(data = medicalcost, aes(x = sex)) +
geom_bar(fill="navy", alpha = 0.7, color = "black") +
labs(x="Sex", y="Count",
title="Sex") +
karl_theme)
### region
region <- ggplotly(
ggplot(data = medicalcost, aes(x = region)) +
geom_bar(fill="navy", alpha = 0.7, color = "black") +
labs(x="Region", y="Count",
title="Region") +
scale_x_discrete(breaks = c("northeast", "northwest", "southeast", "southwest"),
labels = c("North\nEast", "North\nWest", "South\nEast", "South\nWest")) +
karl_theme)
### smoking status
smoker <- ggplotly(
ggplot(data = medicalcost, aes(x = smoker)) +
geom_bar(fill="navy", alpha = 0.7, color = "black") +
labs(x="Smoker", y="Count",
title="Smoker") +
karl_theme)
categories <- subplot(
children, sex, region, smoker, nrows = 2,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
categories <- layout(categories,
title = list(
text = "Sex, Smoking Status, Number of Children, and Region",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
categories
age.sex.comparison <- list(c("male", "female"))
age.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -age, FUN = median), y=age)) +
geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
labs(x="Sex", y="Age", title = "Age Distribution by Sex") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
#stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 65, label.x = 1.5) +
karl_theme +
theme(legend.position = "none"))
age.smoker.comparison <- list(c("no", "yes"))
age.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -age, FUN = median), y=age)) +
geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
labs(x="Smoker", y="Age", title = "Age Distribution by Smoking Status") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 62.5, label.x = 1.5) +
karl_theme +
theme(legend.position = "none"))
age.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
c("northwest", "southeast"), c("northwest", "southwest"),
c("southeast", "southwest"))
age.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -age, FUN = median), y=age)) +
geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
labs(x="Region", y="Age", title = "Age Distribution by Region") +
scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 60, label.x = 2) +
karl_theme +
theme(legend.position = "none"))
age.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
c("2", "3"), c("2", "4"), c("2", "5"),
c("3", "4"), c("3", "5"), c("4", "5"))
age.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -age, FUN = median), y=age)) +
geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
labs(x="Children", y="Age", title = "Age Distribution by Number of Children") +
scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 65, label.x = 2) +
karl_theme +
theme(legend.position = "none"))
age.categories <- subplot(
age.sex.plot, age.smoker.plot, age.region.plot, age.children.plot, nrows = 2,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
age.categories <- layout(age.categories,
title = list(
text = "Age Distribution among Categories",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
age.categories
bmi.sex.comparison <- list(c("male", "female"))
bmi.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -bmi, FUN = median), y=bmi)) +
geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
labs(x="Sex", y="BMI", title = "BMI Distribution by Sex") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
karl_theme +
theme(legend.position = "none"))
bmi.smoker.comparison <- list(c("no", "yes"))
bmi.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -bmi, FUN = median), y=bmi)) +
geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
labs(x="Smoker", y="BMI", title = "BMI Distribution by Smoking Status") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 50, label.x = 1.5) +
karl_theme +
theme(legend.position = "none"))
bmi.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
c("northwest", "southeast"), c("northwest", "southwest"),
c("southeast", "southwest"))
bmi.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -bmi, FUN = median), y=bmi)) +
geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
labs(x="Region", y="BMI", title = "BMI Distribution by Region") +
scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 50, label.x = 2) +
karl_theme +
theme(legend.position = "none"))
bmi.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
c("2", "3"), c("2", "4"), c("2", "5"),
c("3", "4"), c("3", "5"), c("4", "5"))
bmi.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -bmi, FUN = median), y=bmi)) +
geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
labs(x="Children", y="BMI", title = "BMI Distribution by Number of Children") +
scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 50, label.x = 3) +
karl_theme +
theme(legend.position = "none"))
bmi.categories <- subplot(
bmi.sex.plot, bmi.smoker.plot, bmi.region.plot, bmi.children.plot, nrows = 2,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
bmi.categories <- layout(bmi.categories,
title = list(
text = "BMI Distribution among Categories",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
bmi.categories
charges.sex.comparison <- list(c("male", "female"))
charges.sex.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(sex, -charges, FUN = median), y=charges)) +
geom_violin(aes(fill = sex), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = sex), alpha = 0.75, color = "black") +
geom_sina(aes(fill = sex), alpha = 0.5, size = 3, color = "white") +
labs(x="Sex", y="Charges ($)", title = "Charges Distribution by Sex") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.sex.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
karl_theme +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")))
charges.smoker.comparison <- list(c("no", "yes"))
charges.smoker.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(smoker, -charges, FUN = median), y=charges)) +
geom_violin(aes(fill = smoker), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = smoker), alpha = 0.75, color = "black") +
geom_sina(aes(fill = smoker), alpha = 0.5, size = 3, color = "white") +
labs(x="Smoker", y="Charges ($)", title = "Charges Distribution by Smoking Status") +
scale_fill_manual(values = c("#1D24CA", "#B31312")) +
scale_color_manual(values = c("#1D24CA", "#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.smoker.comparison, method = "t.test") +
stat_compare_means(method = "t.test", label.y = 60000, label.x = 1.5) +
karl_theme +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")))
charges.region.comparison <- list(c("northeast", "northwest"), c("northeast", "southeast"), c("northeast", "southwest"),
c("northwest", "southeast"), c("northwest", "southwest"),
c("southeast", "southwest"))
charges.region.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(region, -charges, FUN = median), y=charges)) +
geom_violin(aes(fill = region), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = region), alpha = 0.75, color = "black") +
geom_sina(aes(fill = region), alpha = 0.5, size = 3, color = "white") +
labs(x="Region", y="Charges ($)", title = "Charges Distribution by Region") +
scale_fill_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
scale_color_manual(values = c("#1D24CA", "#B31312", "#F7C04A", "#539165")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.region.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
karl_theme +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")))
charges.children.comparison <- list(c("0", "1"), c("0", "2"), c("0", "3"), c("0", "4"), c("0", "5"),
c("1", "2"), c("1", "3"), c("1", "4"), c("1", "5"),
c("2", "3"), c("2", "4"), c("2", "5"),
c("3", "4"), c("3", "5"), c("4", "5"))
charges.children.plot <- ggplotly(ggplot(data = medicalcost, aes(x = reorder(children, -charges, FUN = median), y=charges)) +
geom_violin(aes(fill = children), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = children), alpha = 0.75, color = "black") +
geom_sina(aes(fill = children), alpha = 0.5, size = 3, color = "white") +
labs(x="Children", y="Charges ($)", title = "Charges Distribution by Number of Children") +
scale_fill_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
scale_color_manual(values = c("#1D24CA", "#005EF9", "#0083FF", "#00A0F6", "#00B9D0", "#00D0A2")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.children.comparison, method = "t.test") +
stat_compare_means(method = "anova", label.y = 60000, label.x = 2.5) +
karl_theme +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")))
charges.categories <- subplot(
charges.sex.plot, charges.smoker.plot, charges.region.plot, charges.children.plot, nrows = 2,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.1
)
charges.categories <- layout(charges.categories,
title = list(
text = "Charges ($) Distribution among Categories",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
charges.categories
age.BMIstatus.comparison <- list(c("Underweight", "Normal"),
c("Underweight", "Overweight"),
c("Underweight", "Obese"),
c("Normal", "Overweight"),
c("Normal", "Obese"),
c("Overweight", "Obese"))
age.bmi.status <- age.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -age, FUN = median), y=age)) +
geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
labs(x="BMI status", y="Age", title = "Age Distribution by BMI status") +
scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.BMIstatus.comparison) +
stat_compare_means(method = "anova", label.y = 62, label.x = 2) +
karl_theme +
theme(legend.position = "none"))
charge.bmistatus <- charges.BMIstatus.plot <- ggplotly(ggplot(data = BMIstatus, aes(x = reorder(BMI.status, -charges, FUN = median), y=charges)) +
geom_violin(aes(fill = BMI.status), alpha = 0.5, color = "black") +
geom_boxplot(aes(fill = BMI.status), alpha = 0.75, color = "black") +
geom_sina(aes(fill = BMI.status), alpha = 0.5, size = 3, color = "white") +
labs(x="BMI status", y="Charges ($)", title = "Charge Distribution by BMI status") +
scale_fill_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
scale_color_manual(values = c("#1D24CA", "#539165", "#F7C04A","#B31312")) +
stat_summary(fun.data = mean_cl_boot, geom = "point", color = "black", size = 3) +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", color = "black", size = 1, width = 0.1) +
#stat_compare_means(comparisons = age.BMIstatus.comparison) +
stat_compare_means(method = "anova", label.y = 60000, label.x = 1.5) +
karl_theme +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")))
BMIstatus <- subplot(
age.bmi.status, charge.bmistatus, nrows = 1,
shareX = FALSE, shareY = FALSE, titleX = TRUE, titleY = TRUE, margin = 0.075
)
BMIstatus <- layout(BMIstatus,
title = list(
text = "Age and Medical Charges among different BMI classification",
font = list(size = 25, family= "serif", face = "bold", weight = "bold")))
BMIstatus
age.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = age, y = charges)) +
geom_point(alpha = 0.7, size = 5, color = "white", aes(fill = smoker)) +
labs(title = "Smoker among the highest charged medically",
x = "Age", y = "Medical Cost") +
scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")) +
karl_theme +
theme(legend.position.inside = c(0, 0)))
age.vs.charge.by.smoker
BMIstatus <- medicalcost %>%
filter(BMI.status != "Unknown")
ggplotly(ggplot(data = BMIstatus, aes(x = age, y = charges)) +
geom_point(alpha = 0.75, size = 3, color = "white", aes(fill = BMI.status)) +
labs(title = "Obese among the highest charged medically") +
scale_fill_manual(values = c("Underweight" = "#1D24CA",
"Normal" = "#539165",
"Overweight" = "#F7C04A",
"Obese" = "#B31312")) +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")) +
facet_wrap(~ BMI.status) +
karl_theme)
bmi.vs.charge.by.smoker <- ggplotly(ggplot(data = medicalcost, aes(x = bmi, y = charges)) +
geom_point(alpha = 0.75, size = 5, color = "white", aes(fill = smoker)) +
labs(title = "Smoker among the highest medically charged",
x = "BMI", y = "Charges ($)") +
scale_fill_manual(values = c("no" = "#1D24CA","yes" = "#B31312")) +
scale_y_continuous(breaks = c(20000, 40000, 60000),
labels = c("$20k", "$40k", "$60k")) +
karl_theme)
bmi.vs.charge.by.smoker
BMIstatus.naremoved <- medicalcost %>%
filter(BMI.status != "Unknown") %>%
mutate(BMI.status = factor(BMI.status,
levels = c("Underweight", "Normal",
"Overweight", "Obese")))
age.bmi.charge2 <- plot_ly(
data = BMIstatus.naremoved, type = "scatter3d", mode = "markers",
x = ~age, y = ~bmi, z = ~charges,
color = ~smoker, colors = c("no" = "#1D24CA", "yes" = "#B31312"),
symbol = ~BMI.status, symbols = c("Underweight" = "circle",
"Normal" = "square",
"Overweight" = "diamond",
"Obese" = "cross"),
marker = list(opacity = 0.5), width = 900, height = 800) %>%
layout(
title = list(
text = "3D ScatterPlot of Age, BMI, and Charges",
font = list(size = 20)),
scene = list(
xaxis = list(title = "Age", titlefont = list(size = 10)),
yaxis = list(title = "BMI", titlefont = list(size = 10)),
zaxis = list(title = "Charges ($)", titlefont = list(size = 10))),
legend = list(title = list(text = "Smoker Status", font = list(size = 10))),
margin = list(l = 10, r = 10, b = 10, t = 40),
autosize = TRUE)
age.bmi.charge2
Figure 3.1: Double-positive for Smoker and Obese among the highest medically charged.