1 Introduction

Women generally live longer than men, and this difference is often explained by biological factors such as heart disease. However, behavioral factors may also play an important role, especially for deaths caused by accidents, suicide, and violence. The purpose of this project is to compare deaths from external causes (accidents, suicide, assault, overdose) to deaths from natural causes and examine whether the gap between men and women has changed over time in New York City.


2 Setup: Packages

# install.packages(c("tidyverse", "ggplot2", "car", "multcomp", "forecast", "knitr", "kableExtra"))
library(tidyverse)
library(ggplot2)
library(car)
library(multcomp)
library(forecast)
library(knitr)
library(kableExtra)

3 Data

3.1 Load

# Make sure this CSV is in the same folder as this .Rmd file.
# In RStudio: Session -> Set Working Directory -> To Source File Location
df_raw <- read.csv("New_York_City_Leading_Causes_of_Death_20260411.csv",
                   stringsAsFactors = FALSE)
glimpse(df_raw)
## Rows: 2,102
## Columns: 7
## $ Year                    <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021…
## $ Leading.Cause           <chr> "Mental and Behavioral Disorders due to Accide…
## $ Sex                     <chr> "Male", "Female", "Male", "Female", "Male", "F…
## $ Race.Ethnicity          <chr> "Other Race/ Ethnicity", "Not Stated/Unknown",…
## $ Deaths                  <chr> "9", "102", "16", "1409", "16", "4", "1157", "…
## $ Death.Rate              <chr> "", "", "", "111", "", "", "91.2", "76.1", "20…
## $ Age.Adjusted.Death.Rate <chr> "", "", "", "95.5", "", "", "80.8", "66.8", "1…

3.2 Clean & Recode

df <- df_raw %>%
  rename(
    year       = Year,
    cause      = Leading.Cause,
    sex        = Sex,
    race       = Race.Ethnicity,
    deaths     = Deaths,
    rate_crude = Death.Rate,
    rate_adj   = Age.Adjusted.Death.Rate
  ) %>%
  filter(sex %in% c("Male", "Female")) %>%
  mutate(
    rate_adj   = as.numeric(na_if(rate_adj,   ".")),
    rate_crude = as.numeric(na_if(rate_crude, ".")),
    deaths     = as.numeric(na_if(deaths,     "."))
  ) %>%
  mutate(
    cause_type = case_when(
      str_detect(cause, regex(
        "accident|unintentional|fall|poison|drowning|fire|transport|suicide|self.inflict|assault|homicide|violence|firearm|overdose|drug|alcohol.induced",
        ignore_case = TRUE
      )) ~ "External",
      TRUE ~ "Natural"
    )
  ) %>%
  drop_na(rate_adj) %>%
  mutate(group = interaction(cause_type, sex))

# Confirm all 15 years loaded
cat("Years in dataset:\n")
## Years in dataset:
table(df$year)
## 
## 2015 2016 2017 2019 2020 2021 
##   89   88   88  111   88   88
cat("\nRows by cause type and sex:\n")
## 
## Rows by cause type and sex:
table(df$cause_type, df$sex)
##           
##            Female Male
##   External     36   59
##   Natural     240  217

4 Descriptive Statistics

desc_stats <- df %>%
  group_by(cause_type, sex) %>%
  summarise(
    n      = n(),
    Mean   = round(mean(rate_adj,   na.rm = TRUE), 2),
    SD     = round(sd(rate_adj,     na.rm = TRUE), 2),
    Median = round(median(rate_adj, na.rm = TRUE), 2),
    Min    = round(min(rate_adj,    na.rm = TRUE), 2),
    Max    = round(max(rate_adj,    na.rm = TRUE), 2),
    .groups = "drop"
  )

desc_stats %>%
  kable(caption = "Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex
cause_type sex n Mean SD Median Min Max
External Female 36 8.17 3.63 7.01 3.37 20.30
External Male 59 22.23 12.91 18.05 5.36 69.50
Natural Female 240 47.47 51.46 17.44 4.60 217.10
Natural Male 217 77.57 84.58 26.13 5.73 414.59

5 Assumption Checks

5.1 Normality: Shapiro-Wilk Test

shapiro_results <- df %>%
  group_by(cause_type, sex) %>%
  summarise(
    n         = n(),
    Statistic = round(shapiro.test(rate_adj)$statistic, 4),
    p_value   = round(shapiro.test(rate_adj)$p.value,   4),
    .groups   = "drop"
  ) %>%
  mutate(Result = ifelse(p_value > 0.05,
                         "Normal (fail to reject)",
                         "Non-normal (p < 0.05)"))

shapiro_results %>%
  kable(caption = "Shapiro-Wilk Normality Test by Group") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Shapiro-Wilk Normality Test by Group
cause_type sex n Statistic p_value Result
External Female 36 0.8386 1e-04 Non-normal (p < 0.05)
External Male 59 0.8905 1e-04 Non-normal (p < 0.05)
Natural Female 240 0.7643 0e+00 Non-normal (p < 0.05)
Natural Male 217 0.7761 0e+00 Non-normal (p < 0.05)

Note: The Welch t-test is used throughout as it does not assume equal variances and remains appropriate even when normality is mild violated in larger samples.

5.2 Homogeneity of Variance: Levene’s Test

cat("--- Levene's Test: External Causes ---\n")
## --- Levene's Test: External Causes ---
leveneTest(rate_adj ~ sex, data = filter(df, cause_type == "External"))
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value    Pr(>F)    
## group  1  16.859 8.647e-05 ***
##       93                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Levene's Test: Natural Causes ---\n")
## 
## --- Levene's Test: Natural Causes ---
leveneTest(rate_adj ~ sex, data = filter(df, cause_type == "Natural"))
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  16.678 5.233e-05 ***
##       455                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 Two-Sample t-Tests

6.1 External Causes

t_external <- t.test(rate_adj ~ sex,
                     data        = filter(df, cause_type == "External"),
                     var.equal   = FALSE,
                     alternative = "two.sided")
t_external
## 
##  Welch Two Sample t-test
## 
## data:  rate_adj by sex
## t = -7.8723, df = 71.97, p-value = 2.661e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -17.61964 -10.49916
## sample estimates:
## mean in group Female   mean in group Male 
##              8.16879             22.22819

6.2 Natural Causes

t_natural <- t.test(rate_adj ~ sex,
                    data        = filter(df, cause_type == "Natural"),
                    var.equal   = FALSE,
                    alternative = "two.sided")
t_natural
## 
##  Welch Two Sample t-test
## 
## data:  rate_adj by sex
## t = -4.5384, df = 349.42, p-value = 7.806e-06
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -43.15332 -17.05954
## sample estimates:
## mean in group Female   mean in group Male 
##             47.46835             77.57479

6.3 Summary Table

ttest_summary <- tibble(
  `Cause Type`   = c("External", "Natural"),
  `Female Mean`  = c(round(t_external$estimate["mean in group Female"], 2),
                     round(t_natural$estimate["mean in group Female"],  2)),
  `Male Mean`    = c(round(t_external$estimate["mean in group Male"], 2),
                     round(t_natural$estimate["mean in group Male"],  2)),
  `t statistic`  = c(round(t_external$statistic, 3),
                     round(t_natural$statistic,  3)),
  `df`           = c(round(t_external$parameter, 1),
                     round(t_natural$parameter,  1)),
  `p-value`      = c(round(t_external$p.value, 4),
                     round(t_natural$p.value,  4)),
  `Significant?` = c(ifelse(t_external$p.value < 0.05, "Yes", "No"),
                     ifelse(t_natural$p.value  < 0.05, "Yes", "No"))
)

ttest_summary %>%
  kable(caption = "Welch t-Test: Male vs. Female Age-Adjusted Death Rates") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Welch t-Test: Male vs. Female Age-Adjusted Death Rates
Cause Type Female Mean Male Mean t statistic df p-value Significant?
External 8.17 22.23 -7.872 72.0 0 Yes
Natural 47.47 77.57 -4.538 349.4 0 Yes

7 One-Way ANOVA

7.1 ANOVA: Cause Type x Sex (4 Groups)

anova_model <- aov(rate_adj ~ group, data = df)
summary(anova_model)
##              Df  Sum Sq Mean Sq F value  Pr(>F)    
## group         3  266016   88672    22.2 1.4e-13 ***
## Residuals   548 2188482    3994                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.2 Tukey HSD Post-Hoc Test

ANOVA tells us at least one group differs. Tukey HSD tells us which pairs.

tukey_result <- TukeyHSD(anova_model)
print(tukey_result)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = rate_adj ~ group, data = df)
## 
## $group
##                                     diff       lwr       upr     p adj
## Natural.Female-External.Female  39.29956  10.19362 68.405508 0.0030385
## External.Male-External.Female   14.05940 -20.38103 48.499829 0.7188442
## Natural.Male-External.Female    69.40600  40.09955 98.712444 0.0000000
## External.Male-Natural.Female   -25.24017 -48.90416 -1.576173 0.0313386
## Natural.Male-Natural.Female     30.10643  14.85163 45.361239 0.0000030
## Natural.Male-External.Male      55.34660  31.43643 79.256772 0.0000000
par(mar = c(5, 18, 4, 2))
plot(tukey_result, las = 1, cex.axis = 0.8)

7.3 ANOVA: External Cause Rates by Race/Ethnicity

anova_race <- aov(rate_adj ~ race, data = filter(df, cause_type == "External"))
summary(anova_race)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## race         4   3456   863.9   7.012 5.84e-05 ***
## Residuals   90  11089   123.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Tukey HSD: Race/Ethnicity (External Causes) ---\n")
## 
## --- Tukey HSD: Race/Ethnicity (External Causes) ---
print(TukeyHSD(anova_race))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = rate_adj ~ race, data = filter(df, cause_type == "External"))
## 
## $race
##                                                      diff         lwr       upr
## Hispanic-Asian and Pacific Islander           12.64175182   3.0946459 22.188858
## Non-Hispanic Black-Asian and Pacific Islander 17.10442242   7.7874010 26.421444
## Non-Hispanic White-Asian and Pacific Islander  9.23219135   0.2806867 18.183696
## Not Stated/Unknown-Asian and Pacific Islander  9.18871897  -6.1207094 24.498147
## Non-Hispanic Black-Hispanic                    4.46267060  -5.0844354 14.009777
## Non-Hispanic White-Hispanic                   -3.40956046 -12.6003047  5.781184
## Not Stated/Unknown-Hispanic                   -3.45303284 -18.9035649 11.997499
## Non-Hispanic White-Non-Hispanic Black         -7.87223106 -16.8237357  1.079274
## Not Stated/Unknown-Non-Hispanic Black         -7.91570344 -23.2251318  7.393725
## Not Stated/Unknown-Non-Hispanic White         -0.04347238 -15.1332419 15.046297
##                                                   p adj
## Hispanic-Asian and Pacific Islander           0.0034892
## Non-Hispanic Black-Asian and Pacific Islander 0.0000175
## Non-Hispanic White-Asian and Pacific Islander 0.0398190
## Not Stated/Unknown-Asian and Pacific Islander 0.4570162
## Non-Hispanic Black-Hispanic                   0.6911045
## Non-Hispanic White-Hispanic                   0.8395297
## Not Stated/Unknown-Hispanic                   0.9711728
## Non-Hispanic White-Non-Hispanic Black         0.1121235
## Not Stated/Unknown-Non-Hispanic Black         0.6041103
## Not Stated/Unknown-Non-Hispanic White         1.0000000

8 Sex Ratio & Linear Regression Over Time

8.1 Calculate Male-to-Female Rate Ratio

sex_ratio <- df %>%
  group_by(year, cause_type, sex) %>%
  summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = sex, values_from = mean_rate) %>%
  filter(!is.na(Male) & !is.na(Female)) %>%
  mutate(ratio_M_F = round(Male / Female, 3))

# Confirm 30 rows (15 years x 2 cause types)
cat("Rows in sex_ratio:", nrow(sex_ratio), "(should be 30)\n")
## Rows in sex_ratio: 12 (should be 30)
cat("Years per cause type:\n")
## Years per cause type:
table(sex_ratio$cause_type)
## 
## External  Natural 
##        6        6
sex_ratio %>%
  kable(caption = "Male-to-Female Death Rate Ratio by Year and Cause Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Male-to-Female Death Rate Ratio by Year and Cause Type
year cause_type Female Male ratio_M_F
2015 External 6.184532 16.80359 2.717
2015 Natural 45.507287 69.19434 1.521
2016 External 7.117463 18.98850 2.668
2016 Natural 45.957575 71.62473 1.558
2017 External 6.637578 19.56454 2.948
2017 Natural 43.533534 68.41732 1.572
2019 External 7.872352 21.01670 2.670
2019 Natural 44.640395 77.77681 1.742
2020 External 9.075000 25.88889 2.853
2020 Natural 58.390000 103.99143 1.781
2021 External 13.120000 32.16667 2.452
2021 Natural 46.869231 74.20571 1.583

A ratio > 1 means men have higher death rates than women.

8.2 Linear Regression: Is the Gap Changing Over Time?

lm_ext <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "External"))
cat("=== External Causes: Sex Ratio ~ Year (n =",
    nrow(filter(sex_ratio, cause_type == "External")), ") ===\n")
## === External Causes: Sex Ratio ~ Year (n = 6 ) ===
summary(lm_ext)
## 
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type == 
##     "External"))
## 
## Residuals:
##        1        2        3        4        5        6 
## -0.07632 -0.10021  0.20489 -0.02289  0.18521 -0.19068 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.38421   68.49633   0.779    0.479
## year        -0.02511    0.03394  -0.740    0.501
## 
## Residual standard error: 0.1796 on 4 degrees of freedom
## Multiple R-squared:  0.1203, Adjusted R-squared:  -0.09959 
## F-statistic: 0.5471 on 1 and 4 DF,  p-value: 0.5005
lm_nat <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "Natural"))
cat("=== Natural Causes: Sex Ratio ~ Year (n =",
    nrow(filter(sex_ratio, cause_type == "Natural")), ") ===\n")
## === Natural Causes: Sex Ratio ~ Year (n = 6 ) ===
summary(lm_nat)
## 
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type == 
##     "Natural"))
## 
## Residuals:
##        1        2        3        4        5        6 
## -0.01924 -0.01088 -0.02552  0.08719  0.09755 -0.12910 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -56.17512   35.63443  -1.576     0.19
## year          0.02864    0.01766   1.622     0.18
## 
## Residual standard error: 0.09344 on 4 degrees of freedom
## Multiple R-squared:  0.3968, Adjusted R-squared:  0.246 
## F-statistic: 2.631 on 1 and 4 DF,  p-value: 0.1801

9 Visualizations

9.1 Age-Adjusted Death Rates Over Time

df %>%
  group_by(year, cause_type, sex) %>%
  summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = year, y = mean_rate, color = sex, linetype = cause_type)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.2) +
  scale_color_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
  labs(
    title    = "Age-Adjusted Death Rates Over Time",
    subtitle = "External vs. Natural Causes by Sex - NYC 2007-2021",
    x = "Year", y = "Age-Adjusted Death Rate (per 100,000)",
    color = "Sex", linetype = "Cause Type"
  ) +
  theme_minimal(base_size = 13)

9.2 Male-to-Female Sex Ratio Over Time

sex_ratio %>%
  ggplot(aes(x = year, y = ratio_M_F, color = cause_type)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.15) +
  scale_color_manual(values = c("External" = "#D4700A", "Natural" = "#2E7D6B")) +
  labs(
    title    = "Male-to-Female Death Rate Ratio Over Time",
    subtitle = "Ratio > 1 indicates higher male mortality - NYC 2007-2021",
    x = "Year", y = "Sex Ratio (Male Rate / Female Rate)",
    color = "Cause Type"
  ) +
  theme_minimal(base_size = 13)

9.3 Boxplot: Rate Distribution by Sex and Cause Type

df %>%
  ggplot(aes(x = sex, y = rate_adj, fill = sex)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21) +
  facet_wrap(~ cause_type, scales = "free_y") +
  scale_fill_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
  labs(
    title    = "Distribution of Age-Adjusted Death Rates",
    subtitle = "By Sex and Cause Type - NYC 2007-2021",
    x = "Sex", y = "Age-Adjusted Death Rate (per 100,000)",
    fill = "Sex"
  ) +
  theme_minimal(base_size = 13)


10 Time Series Forecast

ext_male_ts <- sex_ratio %>%
  filter(cause_type == "External") %>%
  arrange(year) %>%
  pull(ratio_M_F) %>%
  ts(start = 2007, frequency = 1)

fc <- forecast(auto.arima(ext_male_ts), h = 3)
print(fc)
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2013          2.718 2.498494 2.937506 2.382295 3.053705
## 2014          2.718 2.498494 2.937506 2.382295 3.053705
## 2015          2.718 2.498494 2.937506 2.382295 3.053705
autoplot(fc) +
  labs(
    title    = "Forecast: Male/Female External Cause Death Ratio",
    subtitle = "ARIMA model - 3-year forecast (2022-2024)",
    x = "Year", y = "Sex Ratio"
  ) +
  theme_minimal(base_size = 13)


11 Summary of Results

results_summary <- tibble(
  Test = c(
    "t-test: External Causes",
    "t-test: Natural Causes",
    "One-way ANOVA (4 groups)",
    "ANOVA: Race/Ethnicity (External)",
    "Linear Regression: External ratio ~ year",
    "Linear Regression: Natural ratio ~ year"
  ),
  `Key Statistic` = c(
    sprintf("t = %.3f", t_external$statistic),
    sprintf("t = %.3f", t_natural$statistic),
    sprintf("F = %.3f", summary(anova_model)[[1]]$`F value`[1]),
    sprintf("F = %.3f", summary(anova_race)[[1]]$`F value`[1]),
    sprintf("slope = %.4f", coef(lm_ext)[2]),
    sprintf("slope = %.4f", coef(lm_nat)[2])
  ),
  `p-value` = c(
    round(t_external$p.value, 4),
    round(t_natural$p.value,  4),
    round(summary(anova_model)[[1]]$`Pr(>F)`[1], 4),
    round(summary(anova_race)[[1]]$`Pr(>F)`[1],  4),
    round(coef(summary(lm_ext))[2, 4], 4),
    round(coef(summary(lm_nat))[2, 4], 4)
  ),
  `Significant?` = c(
    ifelse(t_external$p.value < 0.05, "Yes", "No"),
    ifelse(t_natural$p.value  < 0.05, "Yes", "No"),
    ifelse(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
    ifelse(summary(anova_race)[[1]]$`Pr(>F)`[1]  < 0.05, "Yes", "No"),
    ifelse(coef(summary(lm_ext))[2, 4] < 0.05, "Yes", "No"),
    ifelse(coef(summary(lm_nat))[2, 4] < 0.05, "Yes", "No")
  )
)

results_summary %>%
  kable(caption = "Summary of All Statistical Tests") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Summary of All Statistical Tests
Test Key Statistic p-value Significant?
t-test: External Causes t = -7.872 0.0000 Yes
t-test: Natural Causes t = -4.538 0.0000 Yes
One-way ANOVA (4 groups) F = 22.204 0.0000 Yes
ANOVA: Race/Ethnicity (External) F = 7.012 0.0001 Yes
Linear Regression: External ratio ~ year slope = -0.0251 0.5005 No
Linear Regression: Natural ratio ~ year slope = 0.0286 0.1801 No

12 References

Arias, E., Tejada-Vera, B., Kochanek, K. D., & Ahmad, F. B. (2022). Provisional life expectancy estimates for 2021. Vital Statistics Rapid Release, 23. National Center for Health Statistics.

Courtenay, W. H. (2000). Constructions of masculinity and their influence on men’s well-being. Social Science & Medicine, 50(10).

Galdas, P. M., Cheater, F., & Marshall, P. (2005). Men and health help-seeking behaviour. Journal of Advanced Nursing, 49(6).

New York City Department of Health and Mental Hygiene. (2023). Overdose data: 2022 report. NYC DOHMH Bureau of Vital Statistics.

Pinkhasov, R. M., et al. (2010). Are men shortchanged on health? International Journal of Clinical Practice, 64(4).

Wingard, D. L. (1984). The sex differential in morbidity, mortality, and lifestyle. Annual Review of Public Health, 5.


Report generated with R Markdown | STAT 3675: Statistical Computing with R