set.seed(465)
# Locate the dataset even if its location or uploaded file name differs slightly.
# If the CSV is not available, the code can also read the embedded insurance
# data from datasets.qmd, which was uploaded with the project files.
find_file <- function(possible_names) {
existing <- possible_names[file.exists(possible_names)]
if (length(existing) == 0) {
return(NA_character_)
}
existing[1]
}
insurance_path <- find_file(c(
"data/insurance.csv", "insurance.csv",
"data/insurance(2).csv", "insurance(2).csv",
"data/insurance (2).csv", "insurance (2).csv"
))
datasets_qmd_path <- find_file(c(
"datasets.qmd", "data/datasets.qmd"
))
read_embedded_insurance <- function(qmd_path) {
lines <- readLines(qmd_path, warn = FALSE)
start_line <- grep("^insurance_csv <- paste\\(c\\(", lines)[1]
if (is.na(start_line)) {
stop("Embedded insurance data was not found inside datasets.qmd.")
}
end_candidates <- grep("^\\), collapse = \"\\\\n\"\\)", lines)
end_line <- end_candidates[end_candidates > start_line][1]
if (is.na(end_line)) {
stop("Could not find the end of embedded insurance data inside datasets.qmd.")
}
env <- new.env()
eval(parse(text = paste(lines[start_line:end_line], collapse = "\n")), envir = env)
read.csv(text = env$insurance_csv, stringsAsFactors = FALSE)
}
# Metrics used to compare out-of-sample prediction performance.
rmse <- function(truth, predicted) {
sqrt(mean((truth - predicted)^2))
}
rsq <- function(truth, predicted) {
1 - sum((truth - predicted)^2) / sum((truth - mean(truth))^2)
}
# Base R 80/20 split helper to avoid package-installation problems.
make_split <- function(data, prop = 0.80) {
training_rows <- sample(seq_len(nrow(data)), size = floor(prop * nrow(data)))
list(
train = data[training_rows, , drop = FALSE],
test = data[-training_rows, , drop = FALSE]
)
}ECON 465 - Stage 3 Final Report
Health Risk and Medical Insurance Charges
1 Economic Question and Motivation
This final report combines my Stage 1 probability/distribution analysis and Stage 2 modeling work using one dataset: the medical insurance charges regression dataset.
Economic question: How do observable health-risk and demographic characteristics relate to individual medical insurance charges?
Medical insurance charges represent an individual cost outcome related to expected health-care risk. Studying their relationship with observable characteristics is economically relevant because differences in expected cost can affect household affordability, insurance-pricing decisions, and policy discussions about prevention and access.
Research narrative: I predict individual medical insurance charges from age, BMI (Body Mass Index), smoking status, number of children, sex, and region because expected medical cost may differ according to health-risk and demographic factors.
Hypothesis: Smoking status will be the strongest predictor of charges, while age and BMI will also have positive relationships with charges. I also expect the relationship between BMI and charges to be stronger for smokers than for non-smokers; therefore, a model including a BMI-by-smoking interaction should perform better than a simpler baseline model.
2 Reproducibility Setup
This document uses relative file paths, a fixed random seed, and base R calculations so that it can be rendered from the project folder with minimal package requirements. The dataset may be stored in the data/ folder or beside this .qmd file.
3 Data
3.1 Data Source and Import
The dataset used in this report is the public medical insurance charges teaching dataset commonly distributed with Machine Learning with R. It contains individual observations on age, sex, BMI, children, smoking status, region, and medical insurance charges. The supplied data file does not identify a country or currency; therefore, this report does not claim that the observations represent the United States or interpret charges as a particular currency.
if (!is.na(insurance_path)) {
insurance <- read.csv(insurance_path, stringsAsFactors = FALSE)
} else if (!is.na(datasets_qmd_path)) {
insurance <- read_embedded_insurance(datasets_qmd_path)
} else {
stop(paste(
"Insurance dataset not found. Keep insurance.csv inside the data folder,",
"in the same folder as this QMD file, or keep datasets.qmd beside this QMD file."
))
}
required_vars <- c("age", "sex", "bmi", "children", "smoker", "region", "charges")
# Some downloaded versions contain an extra exported index column.
if (!all(required_vars %in% names(insurance)) && ncol(insurance) == 8) {
insurance <- insurance[, -1]
}
if (ncol(insurance) == 8 && all(required_vars %in% names(insurance))) {
insurance <- insurance[, required_vars]
}
if (!all(required_vars %in% names(insurance))) {
stop("The CSV must contain: age, sex, bmi, children, smoker, region, and charges.")
}
insurance$sex <- factor(insurance$sex)
insurance$smoker <- factor(insurance$smoker, levels = c("no", "yes"))
insurance$region <- factor(insurance$region)
data_check <- data.frame(
Item = c("Observations", "Analysis variables", "Missing values"),
Value = c(nrow(insurance), ncol(insurance), sum(is.na(insurance)))
)
knitr::kable(data_check, caption = "Dataset validation check")| Item | Value |
|---|---|
| Observations | 1338 |
| Analysis variables | 7 |
| Missing values | 0 |
The cleaned analysis dataset contains 1338 observations, 7 variables, and 0 missing values.
3.2 Variable Roles
| Role | Variable | Economic meaning |
|---|---|---|
| Outcome | charges |
Individual medical insurance charges; the measured cost outcome |
| Main predictor | age |
Older individuals may face higher expected medical costs |
| Main predictor | bmi |
A health-risk indicator potentially related to expected cost |
| Main predictor | smoker |
A preventable risk behavior expected to strongly raise charges |
| Control | children |
Household/dependent structure may be associated with costs |
| Control | sex |
Demographic control |
| Control | region |
Geographic control for possible charge differences |
4 Probability and Distribution Analysis
Before estimating regression models, I examine the distributions of the key variables and the conditional pattern of charges by smoking status. This connects the probability analysis from Stage 1 to the modeling decision in Stage 2.
4.1 Descriptive Statistics
summary_stats <- data.frame(
Variable = c("charges", "age", "bmi", "children"),
Mean = c(mean(insurance$charges), mean(insurance$age),
mean(insurance$bmi), mean(insurance$children)),
SD = c(sd(insurance$charges), sd(insurance$age),
sd(insurance$bmi), sd(insurance$children)),
Minimum = c(min(insurance$charges), min(insurance$age),
min(insurance$bmi), min(insurance$children)),
Median = c(median(insurance$charges), median(insurance$age),
median(insurance$bmi), median(insurance$children)),
Maximum = c(max(insurance$charges), max(insurance$age),
max(insurance$bmi), max(insurance$children))
)
knitr::kable(summary_stats, digits = 2, caption = "Summary statistics")| Variable | Mean | SD | Minimum | Median | Maximum |
|---|---|---|---|---|---|
| charges | 13270.42 | 12110.01 | 1121.87 | 9382.03 | 63770.43 |
| age | 39.21 | 14.05 | 18.00 | 39.00 | 64.00 |
| bmi | 30.66 | 6.10 | 15.96 | 30.40 | 53.13 |
| children | 1.09 | 1.21 | 0.00 | 1.00 | 5.00 |
4.2 Smoking Status and Conditional Charges
smoker_share <- data.frame(
Smoking_status = names(prop.table(table(insurance$smoker))),
Probability = as.numeric(prop.table(table(insurance$smoker)))
)
mean_charges <- aggregate(charges ~ smoker, data = insurance, FUN = mean)
names(mean_charges) <- c("Smoking_status", "Mean_charges")
above_median <- insurance$charges > median(insurance$charges)
conditional_probability <- aggregate(above_median ~ smoker, data = insurance, FUN = mean)
names(conditional_probability) <- c("Smoking_status", "Probability_charges_above_median")
knitr::kable(smoker_share, digits = 3,
caption = "Probability distribution of smoking status")| Smoking_status | Probability |
|---|---|
| no | 0.795 |
| yes | 0.205 |
knitr::kable(mean_charges, digits = 2,
caption = "Mean charges conditional on smoking status")| Smoking_status | Mean_charges |
|---|---|
| no | 8434.27 |
| yes | 32050.23 |
knitr::kable(conditional_probability, digits = 3,
caption = "Probability of charges above the median, conditional on smoking status")| Smoking_status | Probability_charges_above_median |
|---|---|
| no | 0.371 |
| yes | 1.000 |
Smokers make up 20.5% of the dataset. Their mean charge is 32,050.23, whereas the mean for non-smokers is 8,434.27. This descriptive difference supports the hypothesis that smoking status is economically important for explaining charges.
hist(insurance$charges,
breaks = 30,
main = "Distribution of Medical Insurance Charges",
xlab = "Charges")The distribution is right-skewed: many individuals have moderate charges, while a smaller group has very high charges. This suggests that observed risk factors may help explain the high-charge tail of the distribution.
boxplot(charges ~ smoker, data = insurance,
main = "Medical Insurance Charges by Smoking Status",
xlab = "Smoking status", ylab = "Charges")The boxplot shows that charges are substantially higher among smokers. This finding motivates including smoking status in both models and testing whether its relationship with charges differs according to BMI.
5 Modeling
5.1 Train-Test Split
I divide the observations into an 80% training set and a 20% test set. The training set is used to estimate the model coefficients, while the test set provides an out-of-sample comparison of model performance.
set.seed(465)
insurance_split <- make_split(insurance, prop = 0.80)
train_reg <- insurance_split$train
test_reg <- insurance_split$test
split_table <- data.frame(
Dataset = c("Training set", "Test set", "Total"),
Observations = c(nrow(train_reg), nrow(test_reg), nrow(insurance))
)
knitr::kable(split_table, caption = "Training and test split")| Dataset | Observations |
|---|---|
| Training set | 1070 |
| Test set | 268 |
| Total | 1338 |
5.2 Model Specifications
Model 1: Baseline model. This model uses the three central health-risk predictors.
reg_model1 <- lm(charges ~ age + bmi + smoker, data = train_reg)Model 2: Final candidate model. This model adds demographic/geographic controls and a BMI-by-smoking interaction.
reg_model2 <- lm(charges ~ age + bmi + smoker + children + sex + region + bmi:smoker,
data = train_reg)The interaction term allows the relationship between BMI and charges to differ between smokers and non-smokers instead of imposing one identical BMI effect on everyone.
6 Results
6.1 Test-Set Model Comparison
pred_reg1 <- predict(reg_model1, newdata = test_reg)
pred_reg2 <- predict(reg_model2, newdata = test_reg)
reg_metrics <- data.frame(
Model = c("Model 1: age + bmi + smoker",
"Model 2: controls + bmi × smoker"),
RMSE = c(rmse(test_reg$charges, pred_reg1),
rmse(test_reg$charges, pred_reg2)),
R_squared = c(rsq(test_reg$charges, pred_reg1),
rsq(test_reg$charges, pred_reg2))
)
knitr::kable(reg_metrics, digits = 3, caption = "Test-set performance comparison")| Model | RMSE | R_squared |
|---|---|---|
| Model 1: age + bmi + smoker | 6253.471 | 0.729 |
| Model 2: controls + bmi × smoker | 4757.981 | 0.843 |
Model 2 is selected as the final model because it has a lower test-set RMSE (4757.98 versus 6253.47) and a higher test-set R-squared (0.843 versus 0.729). A lower RMSE means smaller typical prediction errors in charge units, while a higher R-squared means the model explains more variation in previously unseen observations.
6.2 Final Model Coefficients and P-values
model2_summary <- coef(summary(reg_model2))
coefficient_table <- data.frame(
Variable = rownames(model2_summary),
Estimate = model2_summary[, "Estimate"],
Std_Error = model2_summary[, "Std. Error"],
P_Value = model2_summary[, "Pr(>|t|)"],
row.names = NULL
)
coefficient_table$Significance <- ifelse(
coefficient_table$P_Value < 0.001, "***",
ifelse(coefficient_table$P_Value < 0.01, "**",
ifelse(coefficient_table$P_Value < 0.05, "*", "Not significant"))
)
knitr::kable(coefficient_table, digits = 4,
caption = "Final model coefficient estimates and p-values")| Variable | Estimate | Std_Error | P_Value | Significance |
|---|---|---|---|---|
| (Intercept) | -2289.8796 | 955.9283 | 0.0168 | * |
| age | 262.2917 | 10.7168 | 0.0000 | *** |
| bmi | 23.4490 | 28.5379 | 0.4114 | Not significant |
| smokeryes | -19999.2295 | 1904.6831 | 0.0000 | *** |
| children | 503.4018 | 124.0296 | 0.0001 | *** |
| sexmale | -256.5642 | 299.6526 | 0.3921 | Not significant |
| regionnorthwest | -622.4264 | 425.9419 | 0.1442 | Not significant |
| regionsoutheast | -1041.3637 | 427.3282 | 0.0150 | * |
| regionsouthwest | -1316.0765 | 431.3762 | 0.0023 | ** |
| bmi:smokeryes | 1430.9972 | 60.4523 | 0.0000 | *** |
Note: Significance stars summarize p-value thresholds: *** p < 0.001, ** p < 0.01, and * p < 0.05. “Not significant” means p ≥ 0.05.
b <- coef(reg_model2)
p <- coefficient_table$P_Value
names(p) <- coefficient_table$Variable
bmi_non_smoker <- b["bmi"]
bmi_smoker <- b["bmi"] + b["bmi:smokeryes"]
key_results <- data.frame(
Result = c(
"Age coefficient",
"BMI relationship for non-smokers",
"Smoking indicator at BMI = 0 (not directly meaningful)",
"BMI × smoking interaction",
"BMI relationship for smokers (BMI + interaction)"
),
Estimate = c(
b["age"], bmi_non_smoker, b["smokeryes"],
b["bmi:smokeryes"], bmi_smoker
)
)
knitr::kable(key_results, digits = 2,
caption = "Economically relevant estimates from the final model")| Result | Estimate |
|---|---|
| Age coefficient | 262.29 |
| BMI relationship for non-smokers | 23.45 |
| Smoking indicator at BMI = 0 (not directly meaningful) | -19999.23 |
| BMI × smoking interaction | 1431.00 |
| BMI relationship for smokers (BMI + interaction) | 1454.45 |
6.3 Economic Interpretation
Holding the other included characteristics constant, a one-year increase in age is associated with an estimated increase of 262.29 in charges. The age p-value is 3.17^{-105}, indicating whether this relationship is statistically distinguishable from zero in the sample.
For non-smokers, a one-unit increase in BMI is associated with an estimated change of 23.45 in charges, holding the remaining variables constant. For smokers, the corresponding BMI relationship is 1454.45 because it combines the BMI coefficient with the interaction term. The interaction estimate is 1431 with a p-value of 8.7^{-100}. Economically, this tests whether higher BMI is associated with an additional charge difference specifically among smokers.
Because Model 2 includes a BMI-by-smoking interaction, the separate coefficient on smokeryes should not be interpreted alone as the general cost difference between smokers and non-smokers. The smoker–non-smoker difference depends on BMI and is calculated as:
comparison_bmi <- c(25, 30, 35)
smoker_difference <- b["smokeryes"] + b["bmi:smokeryes"] * comparison_bmi
smoking_effect_table <- data.frame(
BMI = comparison_bmi,
Estimated_smoker_minus_non_smoker_charge_difference = smoker_difference
)
knitr::kable(smoking_effect_table, digits = 2,
caption = "Estimated smoking-related charge difference at selected BMI values")| BMI | Estimated_smoker_minus_non_smoker_charge_difference |
|---|---|
| 25 | 15775.70 |
| 30 | 22930.69 |
| 35 | 30085.67 |
At BMI values of 25, 30, and 35, the predicted smoker–non-smoker charge gap grows when the interaction coefficient is positive. Therefore, the results suggest that smoking is associated with much higher charges and that this gap becomes larger at higher BMI levels.
6.4 Implications for Policy, Business, and Future Research
For insurers, the improved performance of the interaction model suggests that combinations of observable risk characteristics may predict cost more accurately than considering each factor separately. For policymakers and public-health planners, the association between smoking and much higher charges supports the economic value of smoking-prevention and cessation policies. However, these results should be treated as predictive associations rather than proof that changing one characteristic alone will cause an exact change in charges.
7 Limitations and Reproducibility
7.1 Limitations
First, the dataset does not report important factors such as medical history, income, insurance-plan coverage, health-service use, or time period. Omitted factors may be related to both the included predictors and charges, so the model cannot fully explain individual costs.
Second, this is observational data. The estimated relationships are useful for prediction and association, but they do not identify causal effects. For example, the smoking coefficient cannot by itself prove the exact causal impact of smoking on an individual’s charge.
Third, the data source does not document country or currency in the supplied file. This restricts the ability to generalize the numerical charge estimates to a specific real-world insurance market.
7.2 Reproducibility Steps
The analysis is made reproducible in four ways:
- The
.qmdfile uses relative paths and searches forinsurance.csvboth in the project folder and insidedata/. set.seed(465)is used before the train-test split so the same split can be reproduced when the document is rendered again.- The modeling and metric calculations use base R functions (
lm,predict, and explicitly written RMSE/R-squared functions), reducing dependence on external packages. - The report records session and package information below so the computing environment can be checked.
sessionInfo()R version 4.5.3 (2026-03-11)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.3.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Istanbul
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] compiler_4.5.3 fastmap_1.2.0 cli_3.6.5 tools_4.5.3
[5] htmltools_0.5.9 rstudioapi_0.18.0 yaml_2.3.12 rmarkdown_2.31
[9] knitr_1.51 jsonlite_2.0.0 xfun_0.57 digest_0.6.39
[13] rlang_1.1.7 evaluate_1.0.5
8 AI Use Log
ChatGPT was used as a supplementary support tool during the preparation of this report. Its role was limited to troubleshooting coding and rendering issues. I reviewed, edited, and finalized the report myself, and all statistical results are generated directly from the code included in the document.
9 Final Reflections
9.1 Improvement with More Time or Better Data
With more time and better data, I would use a richer dataset containing medical-history indicators, insurance-plan characteristics, income, and repeated observations over time. This would allow me to test whether the observed high charges are explained by existing health conditions, coverage differences, or persistent risk patterns, rather than relying only on demographic and behavioral variables.
9.2 New Economic Question for Future Research
A new question inspired by this analysis is: Would preventive-health interventions targeted at high-risk groups reduce future insurance charges enough to justify their cost? This question would extend the analysis from prediction toward policy evaluation and would require longitudinal data or a credible causal research design.
10 Conclusion
This report asked how observable health-risk and demographic characteristics relate to individual medical insurance charges. The descriptive analysis showed a large charge difference between smokers and non-smokers. In the predictive modeling stage, the model including controls and a BMI-by-smoking interaction performed better on the test set than the simpler baseline model. The final results indicate that age and smoking-related risk patterns are strongly associated with charges, and that the relationship between BMI and charges differs by smoking status.
The analysis provides an economically meaningful prediction framework: health-risk factors are associated with substantial differences in individual medical charges, and allowing risks to interact improves prediction. At the same time, the observational and limited nature of the dataset means these findings should guide further research rather than be interpreted as causal proof.