NCHS - Leading Causes of Death: United States
Load and Glimpse the Data Set
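The setup chunk is not shown in the rendered output; the list below is a minimal sketch of the packages implied by the function calls in this report.
# Packages inferred from the functions used below (setup sketch)
library(tidyverse)     # read_csv, glimpse, dplyr, tidyr, ggplot2
library(caret)         # createDataPartition, confusionMatrix, train
library(MASS)          # lda (masks dplyr::select, hence the explicit dplyr:: calls)
library(splines)       # ns
library(rpart)         # rpart
library(rpart.plot)    # rpart.plot
library(randomForest)  # randomForest, partialPlot
library(rstanarm)      # stan_glm, pp_check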
us_death_data <- read_csv("~/Downloads/NCHS_-_Leading_Causes_of_Death__United_States.csv")
## Rows: 10868 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): 113 Cause Name, Cause Name, State
## dbl (3): Year, Deaths, Age-adjusted Death Rate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(us_death_data)
## Rows: 10,868
## Columns: 6
## $ Year <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 20…
## $ `113 Cause Name` <chr> "Accidents (unintentional injuries) (V01-X59…
## $ `Cause Name` <chr> "Unintentional injuries", "Unintentional inj…
## $ State <chr> "United States", "Alabama", "Alaska", "Arizo…
## $ Deaths <dbl> 169936, 2703, 436, 4184, 1625, 13840, 3037, …
## $ `Age-adjusted Death Rate` <dbl> 49.4, 53.8, 63.7, 56.2, 51.8, 33.2, 53.6, 53…
Clean and Reshape the Data
df_clean <- us_death_data %>%
  filter(State != "United States") %>%                      # drop national aggregate rows
  dplyr::select(Year, State, `Cause Name`, `Age-adjusted Death Rate`) %>%
  pivot_wider(names_from = `Cause Name`,
              values_from = `Age-adjusted Death Rate`) %>%  # one column per cause
  dplyr::select(-`All causes`) %>%                          # remove the all-cause total
  drop_na()
df_clean
## # A tibble: 969 × 12
## Year State Unintentional injuri…¹ `Alzheimer's disease` Stroke CLRD
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2017 Alabama 53.8 45.2 50 57.8
## 2 2017 Alaska 63.7 22.1 35.1 35.9
## 3 2017 Arizona 56.2 35.1 30.8 42.7
## 4 2017 Arkansas 51.8 39.4 43.8 66.7
## 5 2017 California 33.2 37.1 37.6 32.2
## 6 2017 Colorado 53.6 34.2 35.8 45.6
## 7 2017 Connecticut 53.2 20.4 27.8 30.4
## 8 2017 Delaware 61.9 30.6 46.2 40.8
## 9 2017 District of … 61 17.6 35.9 19.6
## 10 2017 Florida 56.1 20.7 38.9 39
## # ℹ 959 more rows
## # ℹ abbreviated name: ¹​`Unintentional injuries`
## # ℹ 6 more variables: Diabetes <dbl>, `Heart disease` <dbl>,
## # `Influenza and pneumonia` <dbl>, Suicide <dbl>, Cancer <dbl>,
## # `Kidney disease` <dbl>
# Make column names syntactically valid (e.g. `Heart disease` becomes Heart.disease)
colnames(df_clean) <- make.names(colnames(df_clean))
# Binary target: is a state-year's heart disease death rate above the overall median?
median_heart <- median(df_clean$Heart.disease, na.rm = TRUE)
df_clean <- df_clean %>%
  mutate(High_Heart_Disease = as.factor(ifelse(Heart.disease > median_heart, "Yes", "No")))
Apply Analytical Methods
Exploratory Data Analysis (EDA)
# Distribution of our continuous target variable
ggplot(df_clean, aes(x = Heart.disease)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
geom_vline(xintercept = median_heart, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Distribution of Age-Adjusted Heart Disease Death Rates",
x = "Heart Disease Death Rate",
y = "Count")

# Relationship between Stroke, Diabetes, and Heart Disease Classification
ggplot(df_clean, aes(x = Diabetes, y = Stroke, color = High_Heart_Disease)) +
geom_point(alpha = 0.7) +
theme_minimal() +
labs(title = "Stroke vs. Diabetes Rates Colored by Heart Disease Risk",
x = "Diabetes Death Rate", y = "Stroke Death Rate")

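One more check worth making before modeling is how strongly the candidate predictors move together. A minimal sketch over the cleaned numeric columns:
# Pairwise correlations among candidate predictors and the target (sketch)
df_clean %>%
  dplyr::select(Diabetes, Stroke, Kidney.disease, Cancer,
                Alzheimer.s.disease, Heart.disease) %>%
  cor() %>%
  round(2)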
Data Splitting
set.seed(123)
# 80/20 Train/Test split to evaluate model performance
trainIndex <- createDataPartition(df_clean$High_Heart_Disease, p = .8,
list = FALSE,
times = 1)
df_train <- df_clean[ trainIndex,]
df_test <- df_clean[-trainIndex,]
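createDataPartition stratifies on the outcome, so the Yes/No balance should carry over to both partitions; a quick sketch to confirm:
# Class proportions should be roughly equal in train and test (sketch)
prop.table(table(df_train$High_Heart_Disease))
prop.table(table(df_test$High_Heart_Disease))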
Classification: Logistic Regression & LDA
# Logistic Regression
logit_model <- glm(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease + Year,
data = df_train, family = "binomial")
summary(logit_model)
##
## Call:
## glm(formula = High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease +
## Year, family = "binomial", data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 373.15573 60.40584 6.177 6.51e-10 ***
## Diabetes 0.09984 0.02626 3.802 0.000143 ***
## Stroke 0.08249 0.01750 4.714 2.43e-06 ***
## Kidney.disease 0.19527 0.02642 7.391 1.45e-13 ***
## Year -0.19023 0.02998 -6.346 2.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1075.76 on 775 degrees of freedom
## Residual deviance: 594.79 on 771 degrees of freedom
## AIC: 604.79
##
## Number of Fisher Scoring iterations: 5
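The coefficients are on the log-odds scale; exponentiating makes them readable as odds ratios (a sketch). For example, each one-point increase in the kidney disease death rate multiplies the odds of a high-heart-disease classification by about exp(0.195) ≈ 1.22, holding the other predictors fixed.
# Convert log-odds coefficients to odds ratios (sketch)
exp(coef(logit_model))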
# Linear Discriminant Analysis (LDA)
lda_model <- lda(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease + Year,
data = df_train)
lda_model
## Call:
## lda(High_Heart_Disease ~ Diabetes + Stroke + Kidney.disease +
## Year, data = df_train)
##
## Prior probabilities of groups:
## No Yes
## 0.5 0.5
##
## Group means:
## Diabetes Stroke Kidney.disease Year
## No 21.27397 38.66727 12.35000 2011.031
## Yes 25.56057 52.82552 15.91753 2004.887
##
## Coefficients of linear discriminants:
## LD1
## Diabetes 0.04990431
## Stroke 0.04367589
## Kidney.disease 0.10033414
## Year -0.11873145
# Predictions & Validation
# Predicted probabilities on the held-out test set
logit_preds <- predict(logit_model, df_test, type = "response")
# Classify as "Yes" when the predicted probability exceeds 0.5
logit_class <- ifelse(logit_preds > 0.5, "Yes", "No")
# Confusion Matrix
confusionMatrix(as.factor(logit_class), df_test$High_Heart_Disease)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 83 23
## Yes 14 73
##
## Accuracy : 0.8083
## 95% CI : (0.7456, 0.8613)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6164
##
## Mcnemar's Test P-Value : 0.1884
##
## Sensitivity : 0.8557
## Specificity : 0.7604
## Pos Pred Value : 0.7830
## Neg Pred Value : 0.8391
## Prevalence : 0.5026
## Detection Rate : 0.4301
## Detection Prevalence : 0.5492
## Balanced Accuracy : 0.8080
##
## 'Positive' Class : No
##
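Note that confusionMatrix takes the first factor level, "No", as the positive class by default, so the sensitivity reported above is for identifying low-rate observations. For a head-to-head comparison, the LDA fit can be scored on the same held-out data (a sketch; its confusion matrix is omitted here):
# Score the LDA model on the test set the same way (sketch)
lda_preds <- predict(lda_model, df_test)$class
confusionMatrix(lda_preds, df_test$High_Heart_Disease)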
Nonlinear Regression: Splines
# Fit a model using Natural Splines with 3 degrees of freedom for Year
spline_model <- lm(Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke, data = df_train)
summary(spline_model)
##
## Call:
## lm(formula = Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke,
## data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68.783 -18.124 -1.543 16.615 90.428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.0456 9.4623 12.158 < 2e-16 ***
## ns(Year, df = 3)1 -35.0599 5.1221 -6.845 1.56e-11 ***
## ns(Year, df = 3)2 -69.6150 9.3451 -7.449 2.52e-13 ***
## ns(Year, df = 3)3 -30.9008 4.0585 -7.614 7.79e-14 ***
## Diabetes 2.2791 0.2316 9.841 < 2e-16 ***
## Stroke 1.3152 0.1536 8.560 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.91 on 770 degrees of freedom
## Multiple R-squared: 0.6254, Adjusted R-squared: 0.623
## F-statistic: 257.1 on 5 and 770 DF, p-value: < 2.2e-16
# Partial F-test (ANOVA) to check whether the spline improves on a linear trend in Year
linear_model <- lm(Heart.disease ~ Year + Diabetes + Stroke, data = df_train)
anova(linear_model, spline_model)
## Analysis of Variance Table
##
## Model 1: Heart.disease ~ Year + Diabetes + Stroke
## Model 2: Heart.disease ~ ns(Year, df = 3) + Diabetes + Stroke
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 772 568227
## 2 770 557721 2 10507 7.2529 0.0007575 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
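The partial F-test rejects the purely linear trend, so the Year effect is worth inspecting directly. A sketch that plots the fitted spline while holding Diabetes and Stroke at their training-set medians:
# Fitted Year trend from the spline model, other predictors at their medians (sketch)
year_grid <- data.frame(Year = seq(min(df_train$Year), max(df_train$Year)),
                        Diabetes = median(df_train$Diabetes),
                        Stroke = median(df_train$Stroke))
year_grid$pred <- predict(spline_model, newdata = year_grid)
ggplot(year_grid, aes(x = Year, y = pred)) +
  geom_line(color = "steelblue") +
  theme_minimal() +
  labs(title = "Estimated Year Trend (Natural Spline, df = 3)",
       x = "Year", y = "Predicted Heart Disease Death Rate")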
Tree-Based Methods: Decision Trees & Random Forests
# Basic Decision Tree for interpretability
tree_model <- rpart(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease,
data = df_train, method = "anova")
rpart.plot(tree_model, main = "Decision Tree for Heart Disease Death Rate")

# Random Forest for predictive power
set.seed(42)
rf_model <- randomForest(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease + Kidney.disease,
data = df_train,
importance = TRUE,
ntree = 500)
# View Variable Importance
importance(rf_model)
## %IncMSE IncNodePurity
## Year 23.78244 218777.8
## Diabetes 29.34004 137171.5
## Stroke 35.01276 344952.4
## Cancer 48.11044 549948.6
## Alzheimer.s.disease 49.31285 108427.2
## Kidney.disease 35.36466 105739.9
varImpPlot(rf_model, main = "Random Forest Variable Importance")

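Importance scores say which predictors matter but not how. randomForest's partialPlot shows the marginal effect of a single predictor; a sketch for Cancer, the variable with the highest node-purity importance above:
# Partial dependence of the predicted heart disease rate on Cancer (sketch)
partialPlot(rf_model, pred.data = as.data.frame(df_train),
            x.var = "Cancer", main = "Partial Dependence on Cancer")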
Model Selection and Validation
# Set up cross-validation using caret
train_control <- trainControl(method = "cv", number = 5)
# Train the Random Forest using caret
cv_rf <- train(Heart.disease ~ Year + Diabetes + Stroke + Cancer + Alzheimer.s.disease,
data = df_train,
method = "rf",
trControl = train_control,
tuneGrid = expand.grid(mtry = c(2, 3, 4)))
print(cv_rf)
## Random Forest
##
## 776 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 620, 621, 622, 620, 621
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 16.62343 0.8599694 12.60881
## 3 16.47857 0.8620793 12.47879
## 4 16.47453 0.8621755 12.47652
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.
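The cross-validated RMSE is an in-training estimate; the 20% hold-out gives an independent check. A sketch using caret's postResample, which reports RMSE, R-squared, and MAE:
# Evaluate the tuned random forest on the untouched test set (sketch)
rf_test_preds <- predict(cv_rf, newdata = df_test)
postResample(pred = rf_test_preds, obs = df_test$Heart.disease)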
Bayesian Analysis
# Bayesian linear regression with weakly informative Normal(0, 5) priors on the coefficients
bayes_model <- stan_glm(Heart.disease ~ Stroke + Diabetes,
data = df_train,
family = gaussian(),
prior = normal(0, 5),
chains = 3,
iter = 2000,
seed = 123)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 4.7e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.47 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.044 seconds (Warm-up)
## Chain 1: 0.074 seconds (Sampling)
## Chain 1: 0.118 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.05 seconds (Warm-up)
## Chain 2: 0.069 seconds (Sampling)
## Chain 2: 0.119 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 7e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.032 seconds (Warm-up)
## Chain 3: 0.069 seconds (Sampling)
## Chain 3: 0.101 seconds (Total)
## Chain 3:
# Summary of the Bayesian Model
summary(bayes_model, digits = 3)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: Heart.disease ~ Stroke + Diabetes
## algorithm: sampling
## sample: 3000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 776
## predictors: 3
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 41.416 5.125 34.890 41.393 47.903
## Stroke 2.400 0.101 2.269 2.401 2.529
## Diabetes 2.000 0.230 1.711 1.999 2.299
## sigma 28.380 0.745 27.446 28.368 29.330
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 198.020 1.449 196.149 198.030 199.880
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.081 0.999 4029
## Stroke 0.002 0.999 2254
## Diabetes 0.005 1.000 2420
## sigma 0.014 1.000 2930
## mean_PPD 0.026 1.000 3056
## log-posterior 0.039 1.001 1364
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior distributions of the coefficients with 95% credible intervals
plot(bayes_model, "areas", prob = 0.95, prob_outer = 1) +
labs(title = "Posterior Distributions of Coefficients (95% Credible Intervals)")

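The plot above summarizes the coefficient posteriors; a true posterior predictive check compares simulated outcomes against the observed data. A sketch using rstanarm's pp_check:
# Overlay posterior predictive draws on the observed outcome distribution (sketch)
pp_check(bayes_model) +
  labs(title = "Posterior Predictive Check: Observed vs. Simulated Outcomes")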