1 Introduction

Women generally live longer than men, and this difference is often explained by biological factors such as heart disease. However, behavioral factors may also play an important role, especially for deaths caused by accidents, suicide, and violence. The purpose of this project is to compare deaths from external causes (accidents, suicide, assault, overdose) to deaths from natural causes and examine whether the gap between men and women has changed over time in New York City.

2 Setup: Packages

# install.packages(c("tidyverse", "ggplot2", "car", "multcomp", "forecast", "knitr", "kableExtra"))
library(tidyverse)
library(ggplot2)
library(car)
library(multcomp)
library(forecast)
library(knitr)
library(kableExtra)

3 Data

3.1 Load

# Make sure this CSV is in the same folder as this .Rmd file.
# In RStudio: Session -> Set Working Directory -> To Source File Location
df_raw <- read.csv("New_York_City_Leading_Causes_of_Death_20260411.csv",
                   stringsAsFactors = FALSE)
glimpse(df_raw)

## Rows: 2,102
## Columns: 7
## $ Year                    <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021…
## $ Leading.Cause           <chr> "Mental and Behavioral Disorders due to Accide…
## $ Sex                     <chr> "Male", "Female", "Male", "Female", "Male", "F…
## $ Race.Ethnicity          <chr> "Other Race/ Ethnicity", "Not Stated/Unknown",…
## $ Deaths                  <chr> "9", "102", "16", "1409", "16", "4", "1157", "…
## $ Death.Rate              <chr> "", "", "", "111", "", "", "91.2", "76.1", "20…
## $ Age.Adjusted.Death.Rate <chr> "", "", "", "95.5", "", "", "80.8", "66.8", "1…

3.2 Clean & Recode

df <- df_raw %>%
  rename(
    year       = Year,
    cause      = Leading.Cause,
    sex        = Sex,
    race       = Race.Ethnicity,
    deaths     = Deaths,
    rate_crude = Death.Rate,
    rate_adj   = Age.Adjusted.Death.Rate
  ) %>%
  filter(sex %in% c("Male", "Female")) %>%
  mutate(
    rate_adj   = as.numeric(na_if(rate_adj,   ".")),
    rate_crude = as.numeric(na_if(rate_crude, ".")),
    deaths     = as.numeric(na_if(deaths,     "."))
  ) %>%
  mutate(
    cause_type = case_when(
      str_detect(cause, regex(
        "accident|unintentional|fall|poison|drowning|fire|transport|suicide|self.inflict|assault|homicide|violence|firearm|overdose|drug|alcohol.induced",
        ignore_case = TRUE
      )) ~ "External",
      TRUE ~ "Natural"
    )
  ) %>%
  drop_na(rate_adj) %>%
  mutate(group = interaction(cause_type, sex))

# Confirm all 15 years loaded
cat("Years in dataset:\n")

## Years in dataset:

table(df$year)

## 
## 2015 2016 2017 2019 2020 2021 
##   89   88   88  111   88   88

cat("\nRows by cause type and sex:\n")

## 
## Rows by cause type and sex:

table(df$cause_type, df$sex)

##           
##            Female Male
##   External     36   59
##   Natural     240  217

4 Descriptive Statistics

desc_stats <- df %>%
  group_by(cause_type, sex) %>%
  summarise(
    n      = n(),
    Mean   = round(mean(rate_adj,   na.rm = TRUE), 2),
    SD     = round(sd(rate_adj,     na.rm = TRUE), 2),
    Median = round(median(rate_adj, na.rm = TRUE), 2),
    Min    = round(min(rate_adj,    na.rm = TRUE), 2),
    Max    = round(max(rate_adj,    na.rm = TRUE), 2),
    .groups = "drop"
  )

desc_stats %>%
  kable(caption = "Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex
cause_type	sex	n	Mean	SD	Median	Min	Max
External	Female	36	8.17	3.63	7.01	3.37	20.30
External	Male	59	22.23	12.91	18.05	5.36	69.50
Natural	Female	240	47.47	51.46	17.44	4.60	217.10
Natural	Male	217	77.57	84.58	26.13	5.73	414.59

5 Assumption Checks

5.1 Normality: Shapiro-Wilk Test

shapiro_results <- df %>%
  group_by(cause_type, sex) %>%
  summarise(
    n         = n(),
    Statistic = round(shapiro.test(rate_adj)$statistic, 4),
    p_value   = round(shapiro.test(rate_adj)$p.value,   4),
    .groups   = "drop"
  ) %>%
  mutate(Result = ifelse(p_value > 0.05,
                         "Normal (fail to reject)",
                         "Non-normal (p < 0.05)"))

shapiro_results %>%
  kable(caption = "Shapiro-Wilk Normality Test by Group") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Shapiro-Wilk Normality Test by Group
cause_type	sex	n	Statistic	p_value	Result
External	Female	36	0.8386	1e-04	Non-normal (p < 0.05)
External	Male	59	0.8905	1e-04	Non-normal (p < 0.05)
Natural	Female	240	0.7643	0e+00	Non-normal (p < 0.05)
Natural	Male	217	0.7761	0e+00	Non-normal (p < 0.05)

Note: The Welch t-test is used throughout as it does not assume equal variances and remains appropriate even when normality is mild violated in larger samples.

5.2 Homogeneity of Variance: Levene’s Test

cat("--- Levene's Test: External Causes ---\n")

## --- Levene's Test: External Causes ---

leveneTest(rate_adj ~ sex, data = filter(df, cause_type == "External"))

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value    Pr(>F)    
## group  1  16.859 8.647e-05 ***
##       93                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n--- Levene's Test: Natural Causes ---\n")

## 
## --- Levene's Test: Natural Causes ---

leveneTest(rate_adj ~ sex, data = filter(df, cause_type == "Natural"))

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  16.678 5.233e-05 ***
##       455                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 Two-Sample t-Tests

6.1 External Causes

t_external <- t.test(rate_adj ~ sex,
                     data        = filter(df, cause_type == "External"),
                     var.equal   = FALSE,
                     alternative = "two.sided")
t_external

## 
##  Welch Two Sample t-test
## 
## data:  rate_adj by sex
## t = -7.8723, df = 71.97, p-value = 2.661e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -17.61964 -10.49916
## sample estimates:
## mean in group Female   mean in group Male 
##              8.16879             22.22819

6.2 Natural Causes

t_natural <- t.test(rate_adj ~ sex,
                    data        = filter(df, cause_type == "Natural"),
                    var.equal   = FALSE,
                    alternative = "two.sided")
t_natural

## 
##  Welch Two Sample t-test
## 
## data:  rate_adj by sex
## t = -4.5384, df = 349.42, p-value = 7.806e-06
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -43.15332 -17.05954
## sample estimates:
## mean in group Female   mean in group Male 
##             47.46835             77.57479

6.3 Summary Table

ttest_summary <- tibble(
  `Cause Type`   = c("External", "Natural"),
  `Female Mean`  = c(round(t_external$estimate["mean in group Female"], 2),
                     round(t_natural$estimate["mean in group Female"],  2)),
  `Male Mean`    = c(round(t_external$estimate["mean in group Male"], 2),
                     round(t_natural$estimate["mean in group Male"],  2)),
  `t statistic`  = c(round(t_external$statistic, 3),
                     round(t_natural$statistic,  3)),
  `df`           = c(round(t_external$parameter, 1),
                     round(t_natural$parameter,  1)),
  `p-value`      = c(round(t_external$p.value, 4),
                     round(t_natural$p.value,  4)),
  `Significant?` = c(ifelse(t_external$p.value < 0.05, "Yes", "No"),
                     ifelse(t_natural$p.value  < 0.05, "Yes", "No"))
)

ttest_summary %>%
  kable(caption = "Welch t-Test: Male vs. Female Age-Adjusted Death Rates") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Welch t-Test: Male vs. Female Age-Adjusted Death Rates
Cause Type	Female Mean	Male Mean	t statistic	df	p-value	Significant?
External	8.17	22.23	-7.872	72.0	0	Yes
Natural	47.47	77.57	-4.538	349.4	0	Yes

7 One-Way ANOVA

7.1 ANOVA: Cause Type x Sex (4 Groups)

anova_model <- aov(rate_adj ~ group, data = df)
summary(anova_model)

##              Df  Sum Sq Mean Sq F value  Pr(>F)    
## group         3  266016   88672    22.2 1.4e-13 ***
## Residuals   548 2188482    3994                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.2 Tukey HSD Post-Hoc Test

ANOVA tells us at least one group differs. Tukey HSD tells us which pairs.

tukey_result <- TukeyHSD(anova_model)
print(tukey_result)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = rate_adj ~ group, data = df)
## 
## $group
##                                     diff       lwr       upr     p adj
## Natural.Female-External.Female  39.29956  10.19362 68.405508 0.0030385
## External.Male-External.Female   14.05940 -20.38103 48.499829 0.7188442
## Natural.Male-External.Female    69.40600  40.09955 98.712444 0.0000000
## External.Male-Natural.Female   -25.24017 -48.90416 -1.576173 0.0313386
## Natural.Male-Natural.Female     30.10643  14.85163 45.361239 0.0000030
## Natural.Male-External.Male      55.34660  31.43643 79.256772 0.0000000

par(mar = c(5, 18, 4, 2))
plot(tukey_result, las = 1, cex.axis = 0.8)

7.3 ANOVA: External Cause Rates by Race/Ethnicity

anova_race <- aov(rate_adj ~ race, data = filter(df, cause_type == "External"))
summary(anova_race)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## race         4   3456   863.9   7.012 5.84e-05 ***
## Residuals   90  11089   123.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n--- Tukey HSD: Race/Ethnicity (External Causes) ---\n")

## 
## --- Tukey HSD: Race/Ethnicity (External Causes) ---

print(TukeyHSD(anova_race))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = rate_adj ~ race, data = filter(df, cause_type == "External"))
## 
## $race
##                                                      diff         lwr       upr
## Hispanic-Asian and Pacific Islander           12.64175182   3.0946459 22.188858
## Non-Hispanic Black-Asian and Pacific Islander 17.10442242   7.7874010 26.421444
## Non-Hispanic White-Asian and Pacific Islander  9.23219135   0.2806867 18.183696
## Not Stated/Unknown-Asian and Pacific Islander  9.18871897  -6.1207094 24.498147
## Non-Hispanic Black-Hispanic                    4.46267060  -5.0844354 14.009777
## Non-Hispanic White-Hispanic                   -3.40956046 -12.6003047  5.781184
## Not Stated/Unknown-Hispanic                   -3.45303284 -18.9035649 11.997499
## Non-Hispanic White-Non-Hispanic Black         -7.87223106 -16.8237357  1.079274
## Not Stated/Unknown-Non-Hispanic Black         -7.91570344 -23.2251318  7.393725
## Not Stated/Unknown-Non-Hispanic White         -0.04347238 -15.1332419 15.046297
##                                                   p adj
## Hispanic-Asian and Pacific Islander           0.0034892
## Non-Hispanic Black-Asian and Pacific Islander 0.0000175
## Non-Hispanic White-Asian and Pacific Islander 0.0398190
## Not Stated/Unknown-Asian and Pacific Islander 0.4570162
## Non-Hispanic Black-Hispanic                   0.6911045
## Non-Hispanic White-Hispanic                   0.8395297
## Not Stated/Unknown-Hispanic                   0.9711728
## Non-Hispanic White-Non-Hispanic Black         0.1121235
## Not Stated/Unknown-Non-Hispanic Black         0.6041103
## Not Stated/Unknown-Non-Hispanic White         1.0000000

8 Sex Ratio & Linear Regression Over Time

8.1 Calculate Male-to-Female Rate Ratio

sex_ratio <- df %>%
  group_by(year, cause_type, sex) %>%
  summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = sex, values_from = mean_rate) %>%
  filter(!is.na(Male) & !is.na(Female)) %>%
  mutate(ratio_M_F = round(Male / Female, 3))

# Confirm 30 rows (15 years x 2 cause types)
cat("Rows in sex_ratio:", nrow(sex_ratio), "(should be 30)\n")

## Rows in sex_ratio: 12 (should be 30)

cat("Years per cause type:\n")

## Years per cause type:

table(sex_ratio$cause_type)

## 
## External  Natural 
##        6        6

sex_ratio %>%
  kable(caption = "Male-to-Female Death Rate Ratio by Year and Cause Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Male-to-Female Death Rate Ratio by Year and Cause Type
year	cause_type	Female	Male	ratio_M_F
2015	External	6.184532	16.80359	2.717
2015	Natural	45.507287	69.19434	1.521
2016	External	7.117463	18.98850	2.668
2016	Natural	45.957575	71.62473	1.558
2017	External	6.637578	19.56454	2.948
2017	Natural	43.533534	68.41732	1.572
2019	External	7.872352	21.01670	2.670
2019	Natural	44.640395	77.77681	1.742
2020	External	9.075000	25.88889	2.853
2020	Natural	58.390000	103.99143	1.781
2021	External	13.120000	32.16667	2.452
2021	Natural	46.869231	74.20571	1.583

A ratio > 1 means men have higher death rates than women.

8.2 Linear Regression: Is the Gap Changing Over Time?

lm_ext <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "External"))
cat("=== External Causes: Sex Ratio ~ Year (n =",
    nrow(filter(sex_ratio, cause_type == "External")), ") ===\n")

## === External Causes: Sex Ratio ~ Year (n = 6 ) ===

summary(lm_ext)

## 
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type == 
##     "External"))
## 
## Residuals:
##        1        2        3        4        5        6 
## -0.07632 -0.10021  0.20489 -0.02289  0.18521 -0.19068 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.38421   68.49633   0.779    0.479
## year        -0.02511    0.03394  -0.740    0.501
## 
## Residual standard error: 0.1796 on 4 degrees of freedom
## Multiple R-squared:  0.1203, Adjusted R-squared:  -0.09959 
## F-statistic: 0.5471 on 1 and 4 DF,  p-value: 0.5005

lm_nat <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "Natural"))
cat("=== Natural Causes: Sex Ratio ~ Year (n =",
    nrow(filter(sex_ratio, cause_type == "Natural")), ") ===\n")

## === Natural Causes: Sex Ratio ~ Year (n = 6 ) ===

summary(lm_nat)

## 
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type == 
##     "Natural"))
## 
## Residuals:
##        1        2        3        4        5        6 
## -0.01924 -0.01088 -0.02552  0.08719  0.09755 -0.12910 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -56.17512   35.63443  -1.576     0.19
## year          0.02864    0.01766   1.622     0.18
## 
## Residual standard error: 0.09344 on 4 degrees of freedom
## Multiple R-squared:  0.3968, Adjusted R-squared:  0.246 
## F-statistic: 2.631 on 1 and 4 DF,  p-value: 0.1801

9 Visualizations

9.1 Age-Adjusted Death Rates Over Time

df %>%
  group_by(year, cause_type, sex) %>%
  summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = year, y = mean_rate, color = sex, linetype = cause_type)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.2) +
  scale_color_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
  labs(
    title    = "Age-Adjusted Death Rates Over Time",
    subtitle = "External vs. Natural Causes by Sex - NYC 2007-2021",
    x = "Year", y = "Age-Adjusted Death Rate (per 100,000)",
    color = "Sex", linetype = "Cause Type"
  ) +
  theme_minimal(base_size = 13)

9.2 Male-to-Female Sex Ratio Over Time

sex_ratio %>%
  ggplot(aes(x = year, y = ratio_M_F, color = cause_type)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.15) +
  scale_color_manual(values = c("External" = "#D4700A", "Natural" = "#2E7D6B")) +
  labs(
    title    = "Male-to-Female Death Rate Ratio Over Time",
    subtitle = "Ratio > 1 indicates higher male mortality - NYC 2007-2021",
    x = "Year", y = "Sex Ratio (Male Rate / Female Rate)",
    color = "Cause Type"
  ) +
  theme_minimal(base_size = 13)

9.3 Boxplot: Rate Distribution by Sex and Cause Type

df %>%
  ggplot(aes(x = sex, y = rate_adj, fill = sex)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21) +
  facet_wrap(~ cause_type, scales = "free_y") +
  scale_fill_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
  labs(
    title    = "Distribution of Age-Adjusted Death Rates",
    subtitle = "By Sex and Cause Type - NYC 2007-2021",
    x = "Sex", y = "Age-Adjusted Death Rate (per 100,000)",
    fill = "Sex"
  ) +
  theme_minimal(base_size = 13)

10 Time Series Forecast

ext_male_ts <- sex_ratio %>%
  filter(cause_type == "External") %>%
  arrange(year) %>%
  pull(ratio_M_F) %>%
  ts(start = 2007, frequency = 1)

fc <- forecast(auto.arima(ext_male_ts), h = 3)
print(fc)

##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2013          2.718 2.498494 2.937506 2.382295 3.053705
## 2014          2.718 2.498494 2.937506 2.382295 3.053705
## 2015          2.718 2.498494 2.937506 2.382295 3.053705

autoplot(fc) +
  labs(
    title    = "Forecast: Male/Female External Cause Death Ratio",
    subtitle = "ARIMA model - 3-year forecast (2022-2024)",
    x = "Year", y = "Sex Ratio"
  ) +
  theme_minimal(base_size = 13)

11 Summary of Results

results_summary <- tibble(
  Test = c(
    "t-test: External Causes",
    "t-test: Natural Causes",
    "One-way ANOVA (4 groups)",
    "ANOVA: Race/Ethnicity (External)",
    "Linear Regression: External ratio ~ year",
    "Linear Regression: Natural ratio ~ year"
  ),
  `Key Statistic` = c(
    sprintf("t = %.3f", t_external$statistic),
    sprintf("t = %.3f", t_natural$statistic),
    sprintf("F = %.3f", summary(anova_model)[[1]]$`F value`[1]),
    sprintf("F = %.3f", summary(anova_race)[[1]]$`F value`[1]),
    sprintf("slope = %.4f", coef(lm_ext)[2]),
    sprintf("slope = %.4f", coef(lm_nat)[2])
  ),
  `p-value` = c(
    round(t_external$p.value, 4),
    round(t_natural$p.value,  4),
    round(summary(anova_model)[[1]]$`Pr(>F)`[1], 4),
    round(summary(anova_race)[[1]]$`Pr(>F)`[1],  4),
    round(coef(summary(lm_ext))[2, 4], 4),
    round(coef(summary(lm_nat))[2, 4], 4)
  ),
  `Significant?` = c(
    ifelse(t_external$p.value < 0.05, "Yes", "No"),
    ifelse(t_natural$p.value  < 0.05, "Yes", "No"),
    ifelse(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
    ifelse(summary(anova_race)[[1]]$`Pr(>F)`[1]  < 0.05, "Yes", "No"),
    ifelse(coef(summary(lm_ext))[2, 4] < 0.05, "Yes", "No"),
    ifelse(coef(summary(lm_nat))[2, 4] < 0.05, "Yes", "No")
  )
)

results_summary %>%
  kable(caption = "Summary of All Statistical Tests") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Summary of All Statistical Tests
Test	Key Statistic	p-value	Significant?
t-test: External Causes	t = -7.872	0.0000	Yes
t-test: Natural Causes	t = -4.538	0.0000	Yes
One-way ANOVA (4 groups)	F = 22.204	0.0000	Yes
ANOVA: Race/Ethnicity (External)	F = 7.012	0.0001	Yes
Linear Regression: External ratio ~ year	slope = -0.0251	0.5005	No
Linear Regression: Natural ratio ~ year	slope = 0.0286	0.1801	No

12 References

Arias, E., Tejada-Vera, B., Kochanek, K. D., & Ahmad, F. B. (2022). Provisional life expectancy estimates for 2021. Vital Statistics Rapid Release, 23. National Center for Health Statistics.

Courtenay, W. H. (2000). Constructions of masculinity and their influence on men’s well-being. Social Science & Medicine, 50(10).

Galdas, P. M., Cheater, F., & Marshall, P. (2005). Men and health help-seeking behaviour. Journal of Advanced Nursing, 49(6).

New York City Department of Health and Mental Hygiene. (2023). Overdose data: 2022 report. NYC DOHMH Bureau of Vital Statistics.

Pinkhasov, R. M., et al. (2010). Are men shortchanged on health? International Journal of Clinical Practice, 64(4).

Wingard, D. L. (1984). The sex differential in morbidity, mortality, and lifestyle. Annual Review of Public Health, 5.

Report generated with R Markdown | STAT 3675: Statistical Computing with R

Behavioral vs. Biological Mortality Trends in New York City

Hugo Lin

March 31, 2026