setwd("C:/Users/kalen/Downloads/sds hackathon")
# Load the data
insurance<- read.csv("insurance.csv")
# Display the observations in the dataframe
head(insurance)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
names(insurance)
## [1] "age" "sex" "bmi" "children" "smoker" "region" "charges"
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
summary(insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
#explore correlation between variables
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_vars <- insurance %>% select(age, bmi, children, charges)
corrplot(cor(numeric_vars), method = "number", type = "upper")
insurance.num <-insurance %>% select(age, bmi, children, charges)
corr.test(insurance.num)
## Call:corr.test(x = insurance.num)
## Correlation matrix
## age bmi children charges
## age 1.00 0.11 0.04 0.30
## bmi 0.11 1.00 0.01 0.20
## children 0.04 0.01 1.00 0.07
## charges 0.30 0.20 0.07 1.00
## Sample Size
## [1] 1338
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## age bmi children charges
## age 0.00 0.00 0.24 0.00
## bmi 0.00 0.00 0.64 0.00
## children 0.12 0.64 0.00 0.04
## charges 0.00 0.00 0.01 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
Age and bmi are rather moderately correlated with charges, and is significant cause p-value< 0.05.
#scatterplot
plot(insurance$age,
insurance$charge)
plot(insurance$bmi,
insurance$charge)
plot(insurance$children,
insurance$charge)
Insurance charges rise with age, though with distinct clusters likely caused by smoking status. This suggests interaction effects between age and smoker behaviour.” “BMI shows a mild positive association with charges, but extreme BMI values (likely among smokers) correspond to significantly higher medical costs. “Number of dependents has minimal effect on insurance charges, suggesting that individual lifestyle and health factors (BMI, smoking) dominate over family size.”
# Charges by smoker
boxplot(charges ~ smoker, data = insurance,
main = "Charges by Smoking Status",
col = c("lightblue", "salmon"))
# Charges by region
boxplot(charges ~ region, data = insurance,
main = "Charges by Region",
col = "lightgreen")
Visual observation:
-Smokers (yes) have dramatically higher median charges than non-smokers (no). -The median for smokers (~$35,000) is well above even the upper whisker for non-smokers (~$15,000). -Smokers also show a wider spread of charges (larger IQR and more outliers).
Interpretation: -Smoking is the single most influential factor driving insurance costs. -The strong gap indicates insurers charge much higher premiums for smokers to offset health risk. -The wider spread suggests smokers’ medical costs are more unpredictable, varying widely by individual health condition.
Insurance charges are relatively consistent across regions, suggesting location plays a minimal role compared to demographic or lifestyle variables.”
lm_base <- lm(charges ~ age + sex + bmi + children + smoker + region, data = insurance)
summary(lm_base)
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## sexmale -131.3 332.9 -0.394 0.693348
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
#Test for multicollinearity
library(car)
vif(lm_base)
## GVIF Df GVIF^(1/(2*Df))
## age 1.016822 1 1.008376
## sex 1.008900 1 1.004440
## bmi 1.106630 1 1.051965
## children 1.004011 1 1.002003
## smoker 1.012074 1 1.006019
## region 1.098893 3 1.015841
#encode as factors
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
insurance$region <- as.factor(insurance$region)
insurance$sex <- relevel(insurance$sex, ref = "male")
insurance$smoker <- relevel(insurance$smoker, ref = "no")
insurance$region <- relevel(insurance$region, ref = "northeast")
lm_base <- lm(charges ~ age + sex + bmi + children + smoker + region, data = insurance)
summary(lm_base)
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12069.9 999.6 -12.074 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## sexfemale 131.3 332.9 0.394 0.693348
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
R² = 0.7509 → the model explains ~75% of variation in insurance charges. Adjusted R² = 0.7494 → still strong after adjusting for number of predictors. F-statistic: 500.8, p < 2.2e-16 → model as a whole is statistically significant.
Age, BMI, Children, Smoker — all strongly positive and highly significant (p < 0.001). Interpretation: As people age, gain BMI, or have more dependents, insurance costs rise. Smoking, however, dwarfs other effects — consistent with health risk patterns.
Smoker status dominates the model (coefficient ~23,800), confirming it’s the main cost driver. Lifestyle and age matter more than geography or gender. The model’s high R² shows these simple features already explain much of the variation — suggesting the dataset is well-structured and relatively clean.
#check model assumptions
par(mfrow=c(2,2))
plot(lm_base)
Regression diagnostics indicate that the model satisfies the main
assumptions of linear regression. Residuals are approximately normal and
centered around zero, with no extreme outliers. A slight curvature in
the residual pattern and mild heteroscedasticity suggest that
smoker-related effects introduce nonlinearity, which could be explored
with interaction or nonlinear models. Overall, the linear model is
robust and interpretable for this dataset.
# Age–Smoker interaction
lm_interact1 <- lm(charges ~ age * smoker + sex + bmi + children + region, data = insurance)
summary(lm_interact1)
##
## Call:
## lm(formula = charges ~ age * smoker + sex + bmi + children +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11296.5 -2832.8 -970.8 1420.9 29775.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11747.29 1021.19 -11.504 < 2e-16 ***
## age 247.73 13.31 18.617 < 2e-16 ***
## smokeryes 22105.00 1213.13 18.222 < 2e-16 ***
## sexfemale 135.16 332.79 0.406 0.684706
## bmi 340.62 28.60 11.910 < 2e-16 ***
## children 471.41 137.76 3.422 0.000641 ***
## regionnorthwest -362.54 476.08 -0.762 0.446480
## regionsoutheast -1060.06 478.73 -2.214 0.026977 *
## regionsouthwest -943.31 477.82 -1.974 0.048566 *
## age:smokeryes 45.13 29.53 1.529 0.126626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6059 on 1328 degrees of freedom
## Multiple R-squared: 0.7514, Adjusted R-squared: 0.7497
## F-statistic: 445.9 on 9 and 1328 DF, p-value: < 2.2e-16
# BMI–Smoker interaction
lm_interact2 <- lm(charges ~ bmi * smoker + age + sex + children + region, data = insurance)
summary(lm_interact2)
##
## Call:
## lm(formula = charges ~ bmi * smoker + age + sex + children +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14580.7 -1857.2 -1360.8 -475.7 30552.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2723.600 868.881 -3.135 0.00176 **
## bmi 23.533 25.601 0.919 0.35814
## smokeryes -20415.611 1648.277 -12.386 < 2e-16 ***
## age 263.620 9.516 27.703 < 2e-16 ***
## sexfemale 500.146 266.518 1.877 0.06079 .
## children 516.403 110.179 4.687 3.06e-06 ***
## regionnorthwest -585.478 380.859 -1.537 0.12447
## regionsoutheast -1210.131 382.750 -3.162 0.00160 **
## regionsouthwest -1231.108 382.218 -3.221 0.00131 **
## bmi:smokeryes 1443.096 52.647 27.411 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4846 on 1328 degrees of freedom
## Multiple R-squared: 0.8409, Adjusted R-squared: 0.8398
## F-statistic: 780 on 9 and 1328 DF, p-value: < 2.2e-16
# (Optional) both together
lm_interact3 <- lm(charges ~ age * smoker + bmi * smoker + sex + children + region, data = insurance)
summary(lm_interact3)
##
## Call:
## lm(formula = charges ~ age * smoker + bmi * smoker + sex + children +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14617.8 -1857.4 -1365.3 -471.3 30564.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2738.035 881.089 -3.108 0.00193 **
## age 264.101 10.664 24.765 < 2e-16 ***
## smokeryes -20335.905 1831.134 -11.106 < 2e-16 ***
## bmi 23.373 25.660 0.911 0.36253
## sexfemale 500.043 266.619 1.875 0.06094 .
## children 516.629 110.244 4.686 3.07e-06 ***
## regionnorthwest -585.037 381.027 -1.535 0.12492
## regionsoutheast -1208.863 383.103 -3.155 0.00164 **
## regionsouthwest -1232.060 382.479 -3.221 0.00131 **
## age:smokeryes -2.371 23.689 -0.100 0.92029
## smokeryes:bmi 1443.484 52.809 27.334 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4848 on 1327 degrees of freedom
## Multiple R-squared: 0.8409, Adjusted R-squared: 0.8397
## F-statistic: 701.5 on 10 and 1327 DF, p-value: < 2.2e-16
“Including an age–smoker interaction does not significantly improve model fit. The effect of age on insurance cost is roughly parallel for smokers and non-smokers.” “BMI’s impact on charges depends heavily on smoking status. Among smokers, higher BMI multiplies healthcare costs dramatically, explaining the nonlinear jump observed in EDA.” “Only the BMI–smoker interaction meaningfully improves predictive power. The age–smoker term does not contribute additional variance once BMI × smoker is included.”
#final model
lm_final <- lm(charges ~ bmi * smoker + age + sex + children + region, data = insurance)
par(mfrow=c(2,2))
plot(lm_final)
Model assumptions are mostly satisfied. The bmi × smoker interaction successfully flattened the residual pattern, confirming it captures the key nonlinearity in the data. The model is statistically sound and interpretable.
step(lm_final, direction = "backward", trace = 1)
## Start: AIC=22718.49
## charges ~ bmi * smoker + age + sex + children + region
##
## Df Sum of Sq RSS AIC
## <none> 3.1192e+10 22719
## - sex 1 8.2715e+07 3.1275e+10 22720
## - region 3 3.2668e+08 3.1519e+10 22726
## - children 1 5.1597e+08 3.1708e+10 22738
## - bmi:smoker 1 1.7648e+10 4.8840e+10 23316
## - age 1 1.8026e+10 4.9218e+10 23327
##
## Call:
## lm(formula = charges ~ bmi * smoker + age + sex + children +
## region, data = insurance)
##
## Coefficients:
## (Intercept) bmi smokeryes age
## -2723.60 23.53 -20415.61 263.62
## sexfemale children regionnorthwest regionsoutheast
## 500.15 516.40 -585.48 -1210.13
## regionsouthwest bmi:smokeryes
## -1231.11 1443.10
#In forward selection, we start with a very simple model and expand
mod_intercept <-lm(charges~1, data = insurance)
#A list of potential predictors need to be specified in scope.
step(mod_intercept,charges ~ bmi * smoker + age + sex + children + region , direction = 'forward', trace = 1 )
## Start: AIC=25160.18
## charges ~ 1
##
## Df Sum of Sq RSS AIC
## + smoker 1 1.2152e+11 7.4554e+10 23868
## + age 1 1.7530e+10 1.7854e+11 25037
## + bmi 1 7.7134e+09 1.8836e+11 25109
## + children 1 9.0660e+08 1.9517e+11 25156
## + region 3 1.3008e+09 1.9477e+11 25157
## + sex 1 6.4359e+08 1.9543e+11 25158
## <none> 1.9607e+11 25160
##
## Step: AIC=23868.38
## charges ~ smoker
##
## Df Sum of Sq RSS AIC
## + age 1 1.9928e+10 5.4626e+10 23454
## + bmi 1 7.4856e+09 6.7069e+10 23729
## + children 1 7.5272e+08 7.3802e+10 23857
## <none> 7.4554e+10 23868
## + sex 1 1.4213e+06 7.4553e+10 23870
## + region 3 1.0752e+08 7.4447e+10 23873
##
## Step: AIC=23454.24
## charges ~ smoker + age
##
## Df Sum of Sq RSS AIC
## + bmi 1 5112896646 4.9513e+10 23325
## + children 1 459283727 5.4167e+10 23445
## <none> 5.4626e+10 23454
## + sex 1 2225509 5.4624e+10 23456
## + region 3 138426748 5.4488e+10 23457
##
## Step: AIC=23324.76
## charges ~ smoker + age + bmi
##
## Df Sum of Sq RSS AIC
## + bmi:smoker 1 1.7411e+10 3.2102e+10 22747
## + children 1 4.3477e+08 4.9078e+10 23315
## + region 3 2.3201e+08 4.9281e+10 23325
## <none> 4.9513e+10 23325
## + sex 1 3.9429e+06 4.9509e+10 23327
##
## Step: AIC=22746.97
## charges ~ smoker + age + bmi + smoker:bmi
##
## Df Sum of Sq RSS AIC
## + children 1 502180555 3.1600e+10 22728
## + region 3 318542385 3.1783e+10 22740
## + sex 1 74160840 3.2028e+10 22746
## <none> 3.2102e+10 22747
##
## Step: AIC=22727.87
## charges ~ smoker + age + bmi + children + smoker:bmi
##
## Df Sum of Sq RSS AIC
## + region 3 325142179 3.1275e+10 22720
## + sex 1 81174650 3.1519e+10 22726
## <none> 3.1600e+10 22728
##
## Step: AIC=22720.03
## charges ~ smoker + age + bmi + children + region + smoker:bmi
##
## Df Sum of Sq RSS AIC
## + sex 1 82715239 3.1192e+10 22719
## <none> 3.1275e+10 22720
##
## Step: AIC=22718.49
## charges ~ smoker + age + bmi + children + region + sex + smoker:bmi
##
## Call:
## lm(formula = charges ~ smoker + age + bmi + children + region +
## sex + smoker:bmi, data = insurance)
##
## Coefficients:
## (Intercept) smokeryes age bmi
## -2723.60 -20415.61 263.62 23.53
## children regionnorthwest regionsoutheast regionsouthwest
## 516.40 -585.48 -1210.13 -1231.11
## sexfemale smokeryes:bmi
## 500.15 1443.10
Stepwise Model Validation Both forward and backward stepwise regression using AIC led to the same final model specification:
charges∼smoker+age+bmi+children+region+sex+(smoker:bmi)
This confirms that the BMI × smoker interaction is the most critical non-additive term, while other predictors contribute moderate additive effects. Stepwise selection improved model parsimony without sacrificing performance (AIC ≈ 22,718, Adj. R² ≈ 0.84).
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(ggplot2)
coef_df <- tidy(lm_final)
ggplot(coef_df[-1, ], aes(x=reorder(term, estimate), y=estimate, fill=estimate > 0)) +
geom_bar(stat="identity", show.legend=FALSE) +
coord_flip() +
labs(title="Feature Effects on Insurance Charges",
x="Predictor",
y="Coefficient Estimate ($)") +
scale_fill_manual(values=c("firebrick3", "steelblue3")) +
theme_minimal()
ggplot(insurance, aes(x=bmi, y=charges, color=smoker)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", se=FALSE, size=1.2) +
labs(title="Interaction between BMI and Smoking on Insurance Charges",
x="BMI",
y="Insurance Charges ($)") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
library(dplyr)
insurance %>%
mutate(predicted = predict(lm_final)) %>%
group_by(smoker, sex, region) %>%
summarise(
mean_actual = mean(charges),
mean_predicted = mean(predicted),
error = mean(predicted - charges),
mae = mean(abs(predicted - charges))
)
## `summarise()` has grouped output by 'smoker', 'sex'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 7
## # Groups: smoker, sex [4]
## smoker sex region mean_actual mean_predicted error mae
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 no male northeast 8664. 8884. 220. 2895.
## 2 no male northwest 8321. 8087. -234. 2762.
## 3 no male southeast 7609. 7499. -110. 2314.
## 4 no male southwest 7779. 7971. 192. 1914.
## 5 no female northeast 9640. 9473. -167. 3188.
## 6 no female northwest 8787. 8974. 187. 2428.
## 7 no female southeast 8440. 8195. -245. 2504.
## 8 no female southwest 8234. 8410. 175. 2032.
## 9 yes male northeast 30926. 30659. -267. 4826.
## 10 yes male northwest 30713. 31640. 927. 4739.
## 11 yes male southeast 36030. 36097. 67.3 4167.
## 12 yes male southwest 32599. 31879. -720. 4646.
## 13 yes female northeast 28032. 28193. 161. 2982.
## 14 yes female northwest 29671. 28940. -731. 5674.
## 15 yes female southeast 33035. 34286. 1252. 4145.
## 16 yes female southwest 31688. 30625. -1063. 4554.