# Load the necessary library
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the health-insurance CSV into the tibble `df1` used by every plot and
# model below.
# NOTE(review): the path is absolute and machine-specific; the script will only
# run where this file exists at exactly this location.
df1 <- read_csv(
  file = "/Users/chrysspoetry/Downloads/Health-Insurance-Dataset.csv"
)
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Create a scatter plot of age vs charges with a smoothed trend line.
# geom_point() draws the individual observations; without it the figure shows
# only the smoother and is not the scatter plot its title promises.
ggplot(df1, aes(x = age, y = charges)) +
  geom_point(alpha = 0.4) +  # semi-transparent so overlapping points stay readable
  geom_smooth() +            # drawn after the points so the trend sits on top
  labs(x = "Age", y = "Charges", title = "Scatter plot of Age vs Charges") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
For a more detailed analysis, we can fit a linear regression model to the data and examine the coefficient of the `age` variable. This gives a numerical estimate of how much the charges increase, on average, for each additional year of age.
# Simple linear regression: charges explained by age alone.
model <- lm(
  formula = charges ~ age,
  data = df1
)
# Show coefficients, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = charges ~ age, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8059 -6671 -5939 5440 47829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3165.9 937.1 3.378 0.000751 ***
## age 257.7 22.5 11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared: 0.08941, Adjusted R-squared: 0.08872
## F-statistic: 131.2 on 1 and 1336 DF, p-value: < 2.2e-16
# Create a scatter plot of BMI vs charges with a smoothed trend line.
# geom_point() draws the individual observations; without it the figure shows
# only the smoother and is not the scatter plot its title promises.
ggplot(df1, aes(x = bmi, y = charges)) +
  geom_point(alpha = 0.4) +  # semi-transparent so overlapping points stay readable
  geom_smooth() +            # drawn after the points so the trend sits on top
  labs(x = "BMI", y = "Charges", title = "Scatter plot of BMI vs Charges") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Simple linear regression: charges explained by BMI alone.
# This overwrites the `model` object from the previous fit.
model <- lm(
  formula = charges ~ bmi,
  data = df1
)
# Show coefficients, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = charges ~ bmi, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20956 -8118 -3757 4722 49442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1192.94 1664.80 0.717 0.474
## bmi 393.87 53.25 7.397 2.46e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11870 on 1336 degrees of freedom
## Multiple R-squared: 0.03934, Adjusted R-squared: 0.03862
## F-statistic: 54.71 on 1 and 1336 DF, p-value: 2.459e-13
The coefficient of `bmi` in the output tells you how much the charges increase, on average, for each additional unit of BMI (here about 393.87). When the p-value associated with that coefficient is small (typically less than 0.05), as it is here (2.46e-13), this suggests that the association between BMI and charges is statistically significant.
# Boxplot comparing the distribution of charges between the smoker groups.
ggplot(data = df1, mapping = aes(x = smoker, y = charges)) +
  geom_boxplot() +
  labs(
    x = "Smoker",
    y = "Charges",
    title = "Boxplot of Smoker vs Charges"
  ) +
  theme_minimal()
# Encode smoker status as a numeric indicator: 1 where smoker is "yes",
# 0 otherwise (missing values stay NA).
df1[["smoker_num"]] <- as.numeric(df1[["smoker"]] == "yes")
# Simple linear regression: charges explained by the smoker indicator alone.
# This overwrites the `model` object from the previous fit.
model <- lm(
  formula = charges ~ smoker_num,
  data = df1
)
# Show coefficients, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = charges ~ smoker_num, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19221 -5042 -919 3705 31720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8434.3 229.0 36.83 <2e-16 ***
## smoker_num 23616.0 506.1 46.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7470 on 1336 degrees of freedom
## Multiple R-squared: 0.6198, Adjusted R-squared: 0.6195
## F-statistic: 2178 on 1 and 1336 DF, p-value: < 2.2e-16