NCHS - Leading Causes of Death: United States
Load and Glimpse the Data Set
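The setup chunk is not shown in the rendered output; the list below is a minimal sketch of the packages implied by the function calls in this report.
# Packages inferred from the functions used below (setup sketch)
library(tidyverse)     # read_csv, glimpse, dplyr, tidyr, ggplot2
library(caret)         # createDataPartition, confusionMatrix, train
library(MASS)          # lda (masks dplyr::select, hence the explicit dplyr:: calls)
library(splines)       # ns
library(rpart)         # rpart
library(rpart.plot)    # rpart.plot
library(randomForest)  # randomForest, partialPlot
library(rstanarm)      # stan_glm, pp_check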
us_death_data <- read_csv("~/Downloads/NCHS_-_Leading_Causes_of_Death__United_States.csv")
## Rows: 10868 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): 113 Cause Name, Cause Name, State
## dbl (3): Year, Deaths, Age-adjusted Death Rate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(us_death_data)
## Rows: 10,868
## Columns: 6
## $ Year <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 20…
## $ `113 Cause Name` <chr> "Accidents (unintentional injuries) (V01-X59…
## $ `Cause Name` <chr> "Unintentional injuries", "Unintentional inj…
## $ State <chr> "United States", "Alabama", "Alaska", "Arizo…
## $ Deaths <dbl> 169936, 2703, 436, 4184, 1625, 13840, 3037, …
## $ `Age-adjusted Death Rate` <dbl> 49.4, 53.8, 63.7, 56.2, 51.8, 33.2, 53.6, 53…
Clean and Reshape the Data
df_clean <- us_death_data %>%
  filter(State != "United States") %>%                      # drop national aggregate rows
  dplyr::select(Year, State, `Cause Name`, `Age-adjusted Death Rate`) %>%
  pivot_wider(names_from = `Cause Name`,
              values_from = `Age-adjusted Death Rate`) %>%  # one column per cause
  dplyr::select(-`All causes`) %>%                          # remove the all-cause total
  drop_na()
df_clean
## # A tibble: 969 × 12
## Year State Unintentional injuri…¹ `Alzheimer's disease` Stroke CLRD
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2017 Alabama 53.8 45.2 50 57.8
## 2 2017 Alaska 63.7 22.1 35.1 35.9
## 3 2017 Arizona 56.2 35.1 30.8 42.7
## 4 2017 Arkansas 51.8 39.4 43.8 66.7
## 5 2017 California 33.2 37.1 37.6 32.2
## 6 2017 Colorado 53.6 34.2 35.8 45.6
## 7 2017 Connecticut 53.2 20.4 27.8 30.4
## 8 2017 Delaware 61.9 30.6 46.2 40.8
## 9 2017 District of … 61 17.6 35.9 19.6
## 10 2017 Florida 56.1 20.7 38.9 39
## # ℹ 959 more rows
## # ℹ abbreviated name: ¹​`Unintentional injuries`
## # ℹ 6 more variables: Diabetes <dbl>, `Heart disease` <dbl>,
## # `Influenza and pneumonia` <dbl>, Suicide <dbl>, Cancer <dbl>,
## # `Kidney disease` <dbl>
# Make column names syntactically valid (e.g. `Heart disease` becomes Heart.disease)
colnames(df_clean) <- make.names(colnames(df_clean))
# Binary target: is a state-year's heart disease death rate above the overall median?
median_heart <- median(df_clean$Heart.disease, na.rm = TRUE)
df_clean <- df_clean %>%
  mutate(High_Heart_Disease = as.factor(ifelse(Heart.disease > median_heart, "Yes", "No")))
Apply Analytical Methods
Exploratory Data Analysis (EDA)
# Distribution of our continuous target variable
ggplot(df_clean, aes(x = Heart.disease)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
geom_vline(xintercept = median_heart, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Distribution of Age-Adjusted Heart Disease Death Rates",
x = "Heart Disease Death Rate",
y = "Count")

# Relationship between Stroke, Diabetes, and Heart Disease Classification
ggplot(df_clean, aes(x = Diabetes, y = Stroke, color = High_Heart_Disease)) +
geom_point(alpha = 0.7) +
theme_minimal() +
labs(title = "Stroke vs. Diabetes Rates Colored by Heart Disease Risk",
x = "Diabetes Death Rate", y = "Stroke Death Rate")

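One more check worth making before modeling is how strongly the candidate predictors move together. A minimal sketch over the cleaned numeric columns:
# Pairwise correlations among candidate predictors and the target (sketch)
df_clean %>%
  dplyr::select(Diabetes, Stroke, Kidney.disease, Cancer,
                Alzheimer.s.disease, Heart.disease) %>%
  cor() %>%
  round(2)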
Data Splitting
set.seed(123)
# 80/20 Train/Test split to evaluate model performance
trainIndex <- createDataPartition(df_clean$High_Heart_Disease, p = .8,
list = FALSE,
times = 1)
df_train <- df_clean[ trainIndex,]
df_test <- df_clean[-trainIndex,]
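createDataPartition stratifies on the outcome, so the Yes/No balance should carry over to both partitions; a quick sketch to confirm:
# Class proportions should be roughly equal in train and test (sketch)
prop.table(table(df_train$High_Heart_Disease))
prop.table(table(df_test$High_Heart_Disease))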
Classification: Logistic Regression & LDA
# Logistic Regression
logit_model <- glm(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease + Year,
data = df_train, family = "binomial")
summary(logit_model)
##
## Call:
## glm(formula = High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease +
## Year, family = "binomial", data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 373.15573 60.40584 6.177 6.51e-10 ***
## Diabetes 0.09984 0.02626 3.802 0.000143 ***
## Stroke 0.08249 0.01750 4.714 2.43e-06 ***
## Kidney.disease 0.19527 0.02642 7.391 1.45e-13 ***
## Year -0.19023 0.02998 -6.346 2.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1075.76 on 775 degrees of freedom
## Residual deviance: 594.79 on 771 degrees of freedom
## AIC: 604.79
##
## Number of Fisher Scoring iterations: 5
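The coefficients are on the log-odds scale; exponentiating makes them readable as odds ratios (a sketch). For example, each one-point increase in the kidney disease death rate multiplies the odds of a high-heart-disease classification by about exp(0.195) ≈ 1.22, holding the other predictors fixed.
# Convert log-odds coefficients to odds ratios (sketch)
exp(coef(logit_model))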
# Linear Discriminant Analysis (LDA)
lda_model <- lda(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease + Year,
data = df_train)
lda_model
## Call:
## lda(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease +
## Year, data = df_train)
##
## Prior probabilities of groups:
## No Yes
## 0.5 0.5
##
## Group means:
## Diabetes Stroke Kidney.disease Year
## No 21.27397 38.66727 12.35000 2011.031
## Yes 25.56057 52.82552 15.91753 2004.887
##
## Coefficients of linear discriminants:
## LD1
## Diabetes 0.04990431
## Stroke 0.04367589
## Kidney.disease 0.10033414
## Year -0.11873145
# Predictions & Validation
# Predicted probabilities on the held-out test set
logit_preds <- predict(logit_model, df_test, type = "response")
# Classify as "Yes" when the predicted probability exceeds 0.5
logit_class <- ifelse(logit_preds > 0.5, "Yes", "No")
# Confusion Matrix
confusionMatrix(as.factor(logit_class), df_test$High_Heart_Disease)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 83 23
## Yes 14 73
##
## Accuracy : 0.8083
## 95% CI : (0.7456, 0.8613)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6164
##
## Mcnemar's Test P-Value : 0.1884
##
## Sensitivity : 0.8557
## Specificity : 0.7604
## Pos Pred Value : 0.7830
## Neg Pred Value : 0.8391
## Prevalence : 0.5026
## Detection Rate : 0.4301
## Detection Prevalence : 0.5492
## Balanced Accuracy : 0.8080
##
## 'Positive' Class : No
##
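Note that confusionMatrix takes the first factor level, "No", as the positive class by default, so the sensitivity reported above is for identifying low-rate observations. For a head-to-head comparison, the LDA fit can be scored on the same held-out data (a sketch; its confusion matrix is omitted here):
# Score the LDA model on the test set the same way (sketch)
lda_preds <- predict(lda_model, df_test)$class
confusionMatrix(lda_preds, df_test$High_Heart_Disease)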
Nonlinear Regression: Splines
# Fit a model using Natural Splines with 3 degrees of freedom for Year
spline_model <- lm(Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke, data = df_train)
summary(spline_model)
##
## Call:
## lm(formula = Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke,
## data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68.783 -18.124 -1.543 16.615 90.428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.0456 9.4623 12.158 < 2e-16 ***
## ns(Year, df = 3)1 -35.0599 5.1221 -6.845 1.56e-11 ***
## ns(Year, df = 3)2 -69.6150 9.3451 -7.449 2.52e-13 ***
## ns(Year, df = 3)3 -30.9008 4.0585 -7.614 7.79e-14 ***
## Diabetes 2.2791 0.2316 9.841 < 2e-16 ***
## Stroke 1.3152 0.1536 8.560 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.91 on 770 degrees of freedom
## Multiple R-squared: 0.6254, Adjusted R-squared: 0.623
## F-statistic: 257.1 on 5 and 770 DF, p-value: < 2.2e-16
# Partial F-test (ANOVA) to check whether the spline improves on a linear trend in Year
linear_model <- lm(Heart.disease ~ Year + Diabetes + Stroke, data = df_train)
anova(linear_model, spline_model)
## Analysis of Variance Table
##
## Model 1: Heart.disease ~ Year + Diabetes + Stroke
## Model 2: Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 772 568227
## 2 770 557721 2 10507 7.2529 0.0007575 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
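The partial F-test rejects the purely linear trend, so the Year effect is worth inspecting directly. A sketch that plots the fitted spline while holding Diabetes and Stroke at their training-set medians:
# Fitted Year trend from the spline model, other predictors at their medians (sketch)
year_grid <- data.frame(Year = seq(min(df_train$Year), max(df_train$Year)),
                        Diabetes = median(df_train$Diabetes),
                        Stroke = median(df_train$Stroke))
year_grid$pred <- predict(spline_model, newdata = year_grid)
ggplot(year_grid, aes(x = Year, y = pred)) +
  geom_line(color = "steelblue") +
  theme_minimal() +
  labs(title = "Estimated Year Trend (Natural Spline, df = 3)",
       x = "Year", y = "Predicted Heart Disease Death Rate")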
Tree-Based Methods: Decision Trees & Random Forests
# Basic Decision Tree for interpretability
tree_model <- rpart(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease,
data = df_train, method = "anova")
rpart.plot(tree_model, main = "Decision Tree for Heart Disease Death Rate")

# Random Forest for predictive power
set.seed(42)
rf_model <- randomForest(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease + Kidney.disease,
data = df_train,
importance = TRUE,
ntree = 500)
# View Variable Importance
importance(rf_model)
## %IncMSE IncNodePurity
## Year 23.78244 218777.8
## Diabetes 29.34004 137171.5
## Stroke 35.01276 344952.4
## Cancer 48.11044 549948.6
## Alzheimer.s.disease 49.31285 108427.2
## Kidney.disease 35.36466 105739.9
varImpPlot(rf_model, main = "Random Forest Variable Importance")

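Importance scores say which predictors matter but not how. randomForest's partialPlot shows the marginal effect of a single predictor; a sketch for Cancer, the variable with the highest node-purity importance above:
# Partial dependence of the predicted heart disease rate on Cancer (sketch)
partialPlot(rf_model, pred.data = as.data.frame(df_train),
            x.var = "Cancer", main = "Partial Dependence on Cancer")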
Model Selection and Validation
# Set up cross-validation using caret
train_control <- trainControl(method = "cv", number = 5)
# Train the Random Forest using caret
cv_rf <- train(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease,
data = df_train,
method = "rf",
trControl = train_control,
tuneGrid = expand.grid(mtry = c(2, 3, 4)))
print(cv_rf)
## Random Forest
##
## 776 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 620, 621, 622, 620, 621
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 16.62343 0.8599694 12.60881
## 3 16.47857 0.8620793 12.47879
## 4 16.47453 0.8621755 12.47652
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.
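The cross-validated RMSE is an in-training estimate; the 20% hold-out gives an independent check. A sketch using caret's postResample, which reports RMSE, R-squared, and MAE:
# Evaluate the tuned random forest on the untouched test set (sketch)
rf_test_preds <- predict(cv_rf, newdata = df_test)
postResample(pred = rf_test_preds, obs = df_test$Heart.disease)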
Bayesian Analysis
# Bayesian linear regression with weakly informative Normal(0, 5) priors on the coefficients
bayes_model <- stan_glm(Heart.disease ~ Stroke + Diabetes,
data = df_train,
family = gaussian(),
prior = normal(0, 5),
chains = 3,
iter = 2000,
seed = 123)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 4.7e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.47 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.044 seconds (Warm-up)
## Chain 1: 0.074 seconds (Sampling)
## Chain 1: 0.118 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.05 seconds (Warm-up)
## Chain 2: 0.069 seconds (Sampling)
## Chain 2: 0.119 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 7e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.032 seconds (Warm-up)
## Chain 3: 0.069 seconds (Sampling)
## Chain 3: 0.101 seconds (Total)
## Chain 3:
# Summary of the Bayesian Model
summary(bayes_model, digits = 3)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: Heart.disease ~ Stroke + Diabetes
## algorithm: sampling
## sample: 3000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 776
## predictors: 3
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 41.416 5.125 34.890 41.393 47.903
## Stroke 2.400 0.101 2.269 2.401 2.529
## Diabetes 2.000 0.230 1.711 1.999 2.299
## sigma 28.380 0.745 27.446 28.368 29.330
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 198.020 1.449 196.149 198.030 199.880
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.081 0.999 4029
## Stroke 0.002 0.999 2254
## Diabetes 0.005 1.000 2420
## sigma 0.014 1.000 2930
## mean_PPD 0.026 1.000 3056
## log-posterior 0.039 1.001 1364
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior distributions of the coefficients with 95% credible intervals
plot(bayes_model, "areas", prob = 0.95, prob_outer = 1) +
labs(title = "Posterior Distributions of Coefficients (95% Credible Intervals)")

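The plot above summarizes the coefficient posteriors; a true posterior predictive check compares simulated outcomes against the observed data. A sketch using rstanarm's pp_check:
# Overlay posterior predictive draws on the observed outcome distribution (sketch)
pp_check(bayes_model) +
  labs(title = "Posterior Predictive Check: Observed vs. Simulated Outcomes")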