setwd("C:/Users/kalen/Downloads/sds hackathon")
# Load the data
insurance<- read.csv("insurance.csv")

# Display the observations in the dataframe
head(insurance)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622
names(insurance)
## [1] "age"      "sex"      "bmi"      "children" "smoker"   "region"   "charges"
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
summary(insurance)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
#explore correlation between variables
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_vars <- insurance %>% select(age, bmi, children, charges)
corrplot(cor(numeric_vars), method = "number", type = "upper")

insurance.num <-insurance %>% select(age, bmi, children, charges)
corr.test(insurance.num)
## Call:corr.test(x = insurance.num)
## Correlation matrix 
##           age  bmi children charges
## age      1.00 0.11     0.04    0.30
## bmi      0.11 1.00     0.01    0.20
## children 0.04 0.01     1.00    0.07
## charges  0.30 0.20     0.07    1.00
## Sample Size 
## [1] 1338
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           age  bmi children charges
## age      0.00 0.00     0.24    0.00
## bmi      0.00 0.00     0.64    0.00
## children 0.12 0.64     0.00    0.04
## charges  0.00 0.00     0.01    0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Age and bmi are rather moderately correlated with charges, and is significant cause p-value< 0.05.

#scatterplot
plot(insurance$age,
     insurance$charge)

plot(insurance$bmi,
     insurance$charge)

plot(insurance$children,
     insurance$charge)

Insurance charges rise with age, though with distinct clusters likely caused by smoking status. This suggests interaction effects between age and smoker behaviour.” “BMI shows a mild positive association with charges, but extreme BMI values (likely among smokers) correspond to significantly higher medical costs. “Number of dependents has minimal effect on insurance charges, suggesting that individual lifestyle and health factors (BMI, smoking) dominate over family size.”

# Charges by smoker
boxplot(charges ~ smoker, data = insurance,
        main = "Charges by Smoking Status",
        col = c("lightblue", "salmon"))

# Charges by region
boxplot(charges ~ region, data = insurance,
        main = "Charges by Region",
        col = "lightgreen")

Visual observation:

-Smokers (yes) have dramatically higher median charges than non-smokers (no). -The median for smokers (~$35,000) is well above even the upper whisker for non-smokers (~$15,000). -Smokers also show a wider spread of charges (larger IQR and more outliers).

Interpretation: -Smoking is the single most influential factor driving insurance costs. -The strong gap indicates insurers charge much higher premiums for smokers to offset health risk. -The wider spread suggests smokers’ medical costs are more unpredictable, varying widely by individual health condition.

Insurance charges are relatively consistent across regions, suggesting location plays a minimal role compared to demographic or lifestyle variables.”

lm_base <- lm(charges ~ age + sex + bmi + children + smoker + region, data = insurance)
summary(lm_base)
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexmale           -131.3      332.9  -0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
#Test for multicollinearity
library(car)
vif(lm_base)
##              GVIF Df GVIF^(1/(2*Df))
## age      1.016822  1        1.008376
## sex      1.008900  1        1.004440
## bmi      1.106630  1        1.051965
## children 1.004011  1        1.002003
## smoker   1.012074  1        1.006019
## region   1.098893  3        1.015841
#encode as factors
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
insurance$region <- as.factor(insurance$region)
insurance$sex <- relevel(insurance$sex, ref = "male")
insurance$smoker <- relevel(insurance$smoker, ref = "no")
insurance$region <- relevel(insurance$region, ref = "northeast")
lm_base <- lm(charges ~ age + sex + bmi + children + smoker + region, data = insurance)
summary(lm_base)
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -12069.9      999.6 -12.074  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexfemale          131.3      332.9   0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16

R² = 0.7509 → the model explains ~75% of variation in insurance charges. Adjusted R² = 0.7494 → still strong after adjusting for number of predictors. F-statistic: 500.8, p < 2.2e-16 → model as a whole is statistically significant.

Age, BMI, Children, Smoker — all strongly positive and highly significant (p < 0.001). Interpretation: As people age, gain BMI, or have more dependents, insurance costs rise. Smoking, however, dwarfs other effects — consistent with health risk patterns.

Smoker status dominates the model (coefficient ~23,800), confirming it’s the main cost driver. Lifestyle and age matter more than geography or gender. The model’s high R² shows these simple features already explain much of the variation — suggesting the dataset is well-structured and relatively clean.

#check model assumptions
par(mfrow=c(2,2))
plot(lm_base)

Regression diagnostics indicate that the model satisfies the main assumptions of linear regression. Residuals are approximately normal and centered around zero, with no extreme outliers. A slight curvature in the residual pattern and mild heteroscedasticity suggest that smoker-related effects introduce nonlinearity, which could be explored with interaction or nonlinear models. Overall, the linear model is robust and interpretable for this dataset.

# Age–Smoker interaction
lm_interact1 <- lm(charges ~ age * smoker + sex + bmi + children + region, data = insurance)
summary(lm_interact1)
## 
## Call:
## lm(formula = charges ~ age * smoker + sex + bmi + children + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11296.5  -2832.8   -970.8   1420.9  29775.1 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11747.29    1021.19 -11.504  < 2e-16 ***
## age                247.73      13.31  18.617  < 2e-16 ***
## smokeryes        22105.00    1213.13  18.222  < 2e-16 ***
## sexfemale          135.16     332.79   0.406 0.684706    
## bmi                340.62      28.60  11.910  < 2e-16 ***
## children           471.41     137.76   3.422 0.000641 ***
## regionnorthwest   -362.54     476.08  -0.762 0.446480    
## regionsoutheast  -1060.06     478.73  -2.214 0.026977 *  
## regionsouthwest   -943.31     477.82  -1.974 0.048566 *  
## age:smokeryes       45.13      29.53   1.529 0.126626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6059 on 1328 degrees of freedom
## Multiple R-squared:  0.7514, Adjusted R-squared:  0.7497 
## F-statistic: 445.9 on 9 and 1328 DF,  p-value: < 2.2e-16
# BMI–Smoker interaction
lm_interact2 <- lm(charges ~ bmi * smoker + age + sex + children + region, data = insurance)
summary(lm_interact2)
## 
## Call:
## lm(formula = charges ~ bmi * smoker + age + sex + children + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14580.7  -1857.2  -1360.8   -475.7  30552.4 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2723.600    868.881  -3.135  0.00176 ** 
## bmi                 23.533     25.601   0.919  0.35814    
## smokeryes       -20415.611   1648.277 -12.386  < 2e-16 ***
## age                263.620      9.516  27.703  < 2e-16 ***
## sexfemale          500.146    266.518   1.877  0.06079 .  
## children           516.403    110.179   4.687 3.06e-06 ***
## regionnorthwest   -585.478    380.859  -1.537  0.12447    
## regionsoutheast  -1210.131    382.750  -3.162  0.00160 ** 
## regionsouthwest  -1231.108    382.218  -3.221  0.00131 ** 
## bmi:smokeryes     1443.096     52.647  27.411  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4846 on 1328 degrees of freedom
## Multiple R-squared:  0.8409, Adjusted R-squared:  0.8398 
## F-statistic:   780 on 9 and 1328 DF,  p-value: < 2.2e-16
# (Optional) both together
lm_interact3 <- lm(charges ~ age * smoker + bmi * smoker + sex + children + region, data = insurance)
summary(lm_interact3)
## 
## Call:
## lm(formula = charges ~ age * smoker + bmi * smoker + sex + children + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14617.8  -1857.4  -1365.3   -471.3  30564.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2738.035    881.089  -3.108  0.00193 ** 
## age                264.101     10.664  24.765  < 2e-16 ***
## smokeryes       -20335.905   1831.134 -11.106  < 2e-16 ***
## bmi                 23.373     25.660   0.911  0.36253    
## sexfemale          500.043    266.619   1.875  0.06094 .  
## children           516.629    110.244   4.686 3.07e-06 ***
## regionnorthwest   -585.037    381.027  -1.535  0.12492    
## regionsoutheast  -1208.863    383.103  -3.155  0.00164 ** 
## regionsouthwest  -1232.060    382.479  -3.221  0.00131 ** 
## age:smokeryes       -2.371     23.689  -0.100  0.92029    
## smokeryes:bmi     1443.484     52.809  27.334  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4848 on 1327 degrees of freedom
## Multiple R-squared:  0.8409, Adjusted R-squared:  0.8397 
## F-statistic: 701.5 on 10 and 1327 DF,  p-value: < 2.2e-16

“Including an age–smoker interaction does not significantly improve model fit. The effect of age on insurance cost is roughly parallel for smokers and non-smokers.” “BMI’s impact on charges depends heavily on smoking status. Among smokers, higher BMI multiplies healthcare costs dramatically, explaining the nonlinear jump observed in EDA.” “Only the BMI–smoker interaction meaningfully improves predictive power. The age–smoker term does not contribute additional variance once BMI × smoker is included.”

#final model
lm_final <- lm(charges ~ bmi * smoker + age + sex + children + region, data = insurance)

par(mfrow=c(2,2))
plot(lm_final)

Model assumptions are mostly satisfied. The bmi × smoker interaction successfully flattened the residual pattern, confirming it captures the key nonlinearity in the data. The model is statistically sound and interpretable.

step(lm_final, direction = "backward", trace = 1)
## Start:  AIC=22718.49
## charges ~ bmi * smoker + age + sex + children + region
## 
##              Df  Sum of Sq        RSS   AIC
## <none>                     3.1192e+10 22719
## - sex         1 8.2715e+07 3.1275e+10 22720
## - region      3 3.2668e+08 3.1519e+10 22726
## - children    1 5.1597e+08 3.1708e+10 22738
## - bmi:smoker  1 1.7648e+10 4.8840e+10 23316
## - age         1 1.8026e+10 4.9218e+10 23327
## 
## Call:
## lm(formula = charges ~ bmi * smoker + age + sex + children + 
##     region, data = insurance)
## 
## Coefficients:
##     (Intercept)              bmi        smokeryes              age  
##        -2723.60            23.53        -20415.61           263.62  
##       sexfemale         children  regionnorthwest  regionsoutheast  
##          500.15           516.40          -585.48         -1210.13  
## regionsouthwest    bmi:smokeryes  
##        -1231.11          1443.10
#In forward selection, we start with a very simple model and expand
mod_intercept <-lm(charges~1, data = insurance)



#A list of potential predictors need to be specified in scope.

step(mod_intercept,charges ~ bmi * smoker + age + sex + children + region , direction = 'forward', trace = 1  )
## Start:  AIC=25160.18
## charges ~ 1
## 
##            Df  Sum of Sq        RSS   AIC
## + smoker    1 1.2152e+11 7.4554e+10 23868
## + age       1 1.7530e+10 1.7854e+11 25037
## + bmi       1 7.7134e+09 1.8836e+11 25109
## + children  1 9.0660e+08 1.9517e+11 25156
## + region    3 1.3008e+09 1.9477e+11 25157
## + sex       1 6.4359e+08 1.9543e+11 25158
## <none>                   1.9607e+11 25160
## 
## Step:  AIC=23868.38
## charges ~ smoker
## 
##            Df  Sum of Sq        RSS   AIC
## + age       1 1.9928e+10 5.4626e+10 23454
## + bmi       1 7.4856e+09 6.7069e+10 23729
## + children  1 7.5272e+08 7.3802e+10 23857
## <none>                   7.4554e+10 23868
## + sex       1 1.4213e+06 7.4553e+10 23870
## + region    3 1.0752e+08 7.4447e+10 23873
## 
## Step:  AIC=23454.24
## charges ~ smoker + age
## 
##            Df  Sum of Sq        RSS   AIC
## + bmi       1 5112896646 4.9513e+10 23325
## + children  1  459283727 5.4167e+10 23445
## <none>                   5.4626e+10 23454
## + sex       1    2225509 5.4624e+10 23456
## + region    3  138426748 5.4488e+10 23457
## 
## Step:  AIC=23324.76
## charges ~ smoker + age + bmi
## 
##              Df  Sum of Sq        RSS   AIC
## + bmi:smoker  1 1.7411e+10 3.2102e+10 22747
## + children    1 4.3477e+08 4.9078e+10 23315
## + region      3 2.3201e+08 4.9281e+10 23325
## <none>                     4.9513e+10 23325
## + sex         1 3.9429e+06 4.9509e+10 23327
## 
## Step:  AIC=22746.97
## charges ~ smoker + age + bmi + smoker:bmi
## 
##            Df Sum of Sq        RSS   AIC
## + children  1 502180555 3.1600e+10 22728
## + region    3 318542385 3.1783e+10 22740
## + sex       1  74160840 3.2028e+10 22746
## <none>                  3.2102e+10 22747
## 
## Step:  AIC=22727.87
## charges ~ smoker + age + bmi + children + smoker:bmi
## 
##          Df Sum of Sq        RSS   AIC
## + region  3 325142179 3.1275e+10 22720
## + sex     1  81174650 3.1519e+10 22726
## <none>                3.1600e+10 22728
## 
## Step:  AIC=22720.03
## charges ~ smoker + age + bmi + children + region + smoker:bmi
## 
##        Df Sum of Sq        RSS   AIC
## + sex   1  82715239 3.1192e+10 22719
## <none>              3.1275e+10 22720
## 
## Step:  AIC=22718.49
## charges ~ smoker + age + bmi + children + region + sex + smoker:bmi
## 
## Call:
## lm(formula = charges ~ smoker + age + bmi + children + region + 
##     sex + smoker:bmi, data = insurance)
## 
## Coefficients:
##     (Intercept)        smokeryes              age              bmi  
##        -2723.60        -20415.61           263.62            23.53  
##        children  regionnorthwest  regionsoutheast  regionsouthwest  
##          516.40          -585.48         -1210.13         -1231.11  
##       sexfemale    smokeryes:bmi  
##          500.15          1443.10

Stepwise Model Validation Both forward and backward stepwise regression using AIC led to the same final model specification:

charges∼smoker+age+bmi+children+region+sex+(smoker:bmi)

This confirms that the BMI × smoker interaction is the most critical non-additive term, while other predictors contribute moderate additive effects. Stepwise selection improved model parsimony without sacrificing performance (AIC ≈ 22,718, Adj. R² ≈ 0.84).

library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(ggplot2)

coef_df <- tidy(lm_final)


ggplot(coef_df[-1, ], aes(x=reorder(term, estimate), y=estimate, fill=estimate > 0)) +
  geom_bar(stat="identity", show.legend=FALSE) +
  coord_flip() +
  labs(title="Feature Effects on Insurance Charges",
       x="Predictor",
       y="Coefficient Estimate ($)") +
  scale_fill_manual(values=c("firebrick3", "steelblue3")) +
  theme_minimal()

ggplot(insurance, aes(x=bmi, y=charges, color=smoker)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE, size=1.2) +
  labs(title="Interaction between BMI and Smoking on Insurance Charges",
       x="BMI",
       y="Insurance Charges ($)") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

library(dplyr)

insurance %>%
  mutate(predicted = predict(lm_final)) %>%
  group_by(smoker, sex, region) %>%
  summarise(
    mean_actual = mean(charges),
    mean_predicted = mean(predicted),
    error = mean(predicted - charges),
    mae = mean(abs(predicted - charges))
  )
## `summarise()` has grouped output by 'smoker', 'sex'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 7
## # Groups:   smoker, sex [4]
##    smoker sex    region    mean_actual mean_predicted   error   mae
##    <fct>  <fct>  <fct>           <dbl>          <dbl>   <dbl> <dbl>
##  1 no     male   northeast       8664.          8884.   220.  2895.
##  2 no     male   northwest       8321.          8087.  -234.  2762.
##  3 no     male   southeast       7609.          7499.  -110.  2314.
##  4 no     male   southwest       7779.          7971.   192.  1914.
##  5 no     female northeast       9640.          9473.  -167.  3188.
##  6 no     female northwest       8787.          8974.   187.  2428.
##  7 no     female southeast       8440.          8195.  -245.  2504.
##  8 no     female southwest       8234.          8410.   175.  2032.
##  9 yes    male   northeast      30926.         30659.  -267.  4826.
## 10 yes    male   northwest      30713.         31640.   927.  4739.
## 11 yes    male   southeast      36030.         36097.    67.3 4167.
## 12 yes    male   southwest      32599.         31879.  -720.  4646.
## 13 yes    female northeast      28032.         28193.   161.  2982.
## 14 yes    female northwest      29671.         28940.  -731.  5674.
## 15 yes    female southeast      33035.         34286.  1252.  4145.
## 16 yes    female southwest      31688.         30625. -1063.  4554.