# Load the necessary library
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the health-insurance CSV into `df1`.
# NOTE(review): the path is absolute and specific to one machine; it must be
# adjusted before the script can run anywhere else.
df1 <- readr::read_csv(
  file = "/Users/chrysspoetry/Downloads/Health-Insurance-Dataset.csv"
)
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Scatter plot of age vs charges with a smoothed trend line drawn on top.
# geom_point() plots the individual observations; without it only the
# smoother is rendered and the figure is not a scatter plot despite its title.
# Points are semi-transparent so overlapping observations remain readable.
ggplot(df1, aes(x = age, y = charges)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  labs(x = "Age", y = "Charges", title = "Scatter plot of Age vs Charges") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

For a more detailed analysis, we can fit a linear regression model to the data and examine the coefficient of the ‘age’ variable. This gives a numerical estimate of how much the charges increase, on average, for each additional year of age.

# Simple linear regression: charges explained by age alone
model <- lm(formula = charges ~ age, data = df1)

# Coefficient table, residual quantiles and goodness-of-fit statistics
summary(object = model)
## 
## Call:
## lm(formula = charges ~ age, data = df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8059  -6671  -5939   5440  47829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared:  0.08941,    Adjusted R-squared:  0.08872 
## F-statistic: 131.2 on 1 and 1336 DF,  p-value: < 2.2e-16
# Create a scatter plot of BMI vs charges
# Scatter plot of BMI vs charges with a smoothed trend line drawn on top.
# geom_point() plots the individual observations; without it only the
# smoother is rendered and the figure is not a scatter plot despite its title.
# Points are semi-transparent so overlapping observations remain readable.
ggplot(df1, aes(x = bmi, y = charges)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  labs(x = "BMI", y = "Charges", title = "Scatter plot of BMI vs Charges") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Simple linear regression: charges explained by BMI alone
model <- lm(formula = charges ~ bmi, data = df1)

# Coefficient table, residual quantiles and goodness-of-fit statistics
summary(object = model)
## 
## Call:
## lm(formula = charges ~ bmi, data = df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20956  -8118  -3757   4722  49442 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1192.94    1664.80   0.717    0.474    
## bmi           393.87      53.25   7.397 2.46e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11870 on 1336 degrees of freedom
## Multiple R-squared:  0.03934,    Adjusted R-squared:  0.03862 
## F-statistic: 54.71 on 1 and 1336 DF,  p-value: 2.459e-13

The coefficient of ‘bmi’ in the output tells you how much the charges increase, on average, for each additional unit of BMI (about 393.87 here). If the p-value associated with the ‘bmi’ coefficient is small (typically less than 0.05), this suggests that the effect of BMI on charges is statistically significant; here the p-value is 2.46e-13, well below that threshold.

# Box plot of charges, one box per value of the `smoker` column
ggplot(data = df1, mapping = aes(x = smoker, y = charges)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Smoker vs Charges",
    x = "Smoker",
    y = "Charges"
  ) +
  theme_minimal()

# Encode smoking status as a 0/1 indicator: 1 where smoker is "yes", else 0
df1$smoker_num <- as.numeric(df1$smoker == "yes")

# Simple linear regression: charges explained by the smoker indicator alone
model <- lm(formula = charges ~ smoker_num, data = df1)

# Coefficient table, residual quantiles and goodness-of-fit statistics
summary(object = model)
## 
## Call:
## lm(formula = charges ~ smoker_num, data = df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19221  -5042   -919   3705  31720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8434.3      229.0   36.83   <2e-16 ***
## smoker_num   23616.0      506.1   46.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7470 on 1336 degrees of freedom
## Multiple R-squared:  0.6198, Adjusted R-squared:  0.6195 
## F-statistic:  2178 on 1 and 1336 DF,  p-value: < 2.2e-16