Women generally live longer than men, and this difference is often explained by biological factors such as heart disease. However, behavioral factors may also play an important role, especially for deaths caused by accidents, suicide, and violence. The purpose of this project is to compare deaths from external causes (accidents, suicide, assault, overdose) to deaths from natural causes and examine whether the gap between men and women has changed over time in New York City.
# install.packages(c("tidyverse", "ggplot2", "car", "multcomp", "forecast", "knitr", "kableExtra"))
library(tidyverse)
library(ggplot2)
library(car) # Levene's test
library(multcomp) # Tukey HSD
library(forecast) # ARIMA / time series
library(knitr)
library(kableExtra) # nice tables in the report# Dataset: NYC Leading Causes of Death (2007-2021)
# Make sure this CSV is in the same folder as this .Rmd file.
# In RStudio: Session -> Set Working Directory -> To Source File Location
df_raw <- read.csv("New_York_City_Leading_Causes_of_Death_20260411.csv",
stringsAsFactors = FALSE)
glimpse(df_raw)## Rows: 2,102
## Columns: 7
## $ Year <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021…
## $ Leading.Cause <chr> "Mental and Behavioral Disorders due to Accide…
## $ Sex <chr> "Male", "Female", "Male", "Female", "Male", "F…
## $ Race.Ethnicity <chr> "Other Race/ Ethnicity", "Not Stated/Unknown",…
## $ Deaths <chr> "9", "102", "16", "1409", "16", "4", "1157", "…
## $ Death.Rate <chr> "", "", "", "111", "", "", "91.2", "76.1", "20…
## $ Age.Adjusted.Death.Rate <chr> "", "", "", "95.5", "", "", "80.8", "66.8", "1…
df <- df_raw %>%
rename(
year = Year,
cause = Leading.Cause,
sex = Sex,
race = Race.Ethnicity,
deaths = Deaths,
rate_crude = Death.Rate,
rate_adj = Age.Adjusted.Death.Rate
) %>%
filter(sex %in% c("Male", "Female")) %>%
mutate(
rate_adj = as.numeric(na_if(rate_adj, ".")),
rate_crude = as.numeric(na_if(rate_crude, ".")),
deaths = as.numeric(na_if(deaths, "."))
) %>%
mutate(
cause_type = case_when(
str_detect(cause, regex(
"accident|unintentional|fall|poison|drowning|fire|transport|suicide|self.inflict|assault|homicide|violence|firearm|overdose|drug|alcohol.induced",
ignore_case = TRUE
)) ~ "External",
TRUE ~ "Natural"
)
) %>%
drop_na(rate_adj) %>%
mutate(group = interaction(cause_type, sex))
table(df$cause_type, df$sex)##
## Female Male
## External 36 59
## Natural 240 217
desc_stats <- df %>%
group_by(cause_type, sex) %>%
summarise(
n = n(),
Mean = round(mean(rate_adj, na.rm = TRUE), 2),
SD = round(sd(rate_adj, na.rm = TRUE), 2),
Median = round(median(rate_adj, na.rm = TRUE), 2),
Min = round(min(rate_adj, na.rm = TRUE), 2),
Max = round(max(rate_adj, na.rm = TRUE), 2),
.groups = "drop"
)
desc_stats %>%
kable(caption = "Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| cause_type | sex | n | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|---|
| External | Female | 36 | 8.17 | 3.63 | 7.01 | 3.37 | 20.30 |
| External | Male | 59 | 22.23 | 12.91 | 18.05 | 5.36 | 69.50 |
| Natural | Female | 240 | 47.47 | 51.46 | 17.44 | 4.60 | 217.10 |
| Natural | Male | 217 | 77.57 | 84.58 | 26.13 | 5.73 | 414.59 |
shapiro_results <- df %>%
group_by(cause_type, sex) %>%
summarise(
n = n(),
Statistic = round(shapiro.test(rate_adj)$statistic, 4),
p_value = round(shapiro.test(rate_adj)$p.value, 4),
.groups = "drop"
) %>%
mutate(Result = ifelse(p_value > 0.05,
"Normal (fail to reject)",
"Non-normal (p < 0.05)"))
shapiro_results %>%
kable(caption = "Shapiro-Wilk Normality Test by Group") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| cause_type | sex | n | Statistic | p_value | Result |
|---|---|---|---|---|---|
| External | Female | 36 | 0.8386 | 1e-04 | Non-normal (p < 0.05) |
| External | Male | 59 | 0.8905 | 1e-04 | Non-normal (p < 0.05) |
| Natural | Female | 240 | 0.7643 | 0e+00 | Non-normal (p < 0.05) |
| Natural | Male | 217 | 0.7761 | 0e+00 | Non-normal (p < 0.05) |
## --- Levene's Test: External Causes ---
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 16.859 8.647e-05 ***
## 93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## --- Levene's Test: Natural Causes ---
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 16.678 5.233e-05 ***
## 455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
t_external <- t.test(rate_adj ~ sex,
data = filter(df, cause_type == "External"),
var.equal = FALSE,
alternative = "two.sided")
t_external##
## Welch Two Sample t-test
##
## data: rate_adj by sex
## t = -7.8723, df = 71.97, p-value = 2.661e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -17.61964 -10.49916
## sample estimates:
## mean in group Female mean in group Male
## 8.16879 22.22819
t_natural <- t.test(rate_adj ~ sex,
data = filter(df, cause_type == "Natural"),
var.equal = FALSE,
alternative = "two.sided")
t_natural##
## Welch Two Sample t-test
##
## data: rate_adj by sex
## t = -4.5384, df = 349.42, p-value = 7.806e-06
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -43.15332 -17.05954
## sample estimates:
## mean in group Female mean in group Male
## 47.46835 77.57479
ttest_summary <- tibble(
`Cause Type` = c("External", "Natural"),
`Female Mean` = c(round(t_external$estimate["mean in group Female"], 2),
round(t_natural$estimate["mean in group Female"], 2)),
`Male Mean` = c(round(t_external$estimate["mean in group Male"], 2),
round(t_natural$estimate["mean in group Male"], 2)),
`t statistic` = c(round(t_external$statistic, 3),
round(t_natural$statistic, 3)),
`df` = c(round(t_external$parameter, 1),
round(t_natural$parameter, 1)),
`p-value` = c(round(t_external$p.value, 4),
round(t_natural$p.value, 4)),
`Significant?` = c(ifelse(t_external$p.value < 0.05, "Yes", "No"),
ifelse(t_natural$p.value < 0.05, "Yes", "No"))
)
ttest_summary %>%
kable(caption = "Welch t-Test: Male vs. Female Age-Adjusted Death Rates") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Cause Type | Female Mean | Male Mean | t statistic | df | p-value | Significant? |
|---|---|---|---|---|---|---|
| External | 8.17 | 22.23 | -7.872 | 72.0 | 0 | Yes |
| Natural | 47.47 | 77.57 | -4.538 | 349.4 | 0 | Yes |
## Df Sum Sq Mean Sq F value Pr(>F)
## group 3 266016 88672 22.2 1.4e-13 ***
## Residuals 548 2188482 3994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rate_adj ~ group, data = df)
##
## $group
## diff lwr upr p adj
## Natural.Female-External.Female 39.29956 10.19362 68.405508 0.0030385
## External.Male-External.Female 14.05940 -20.38103 48.499829 0.7188442
## Natural.Male-External.Female 69.40600 40.09955 98.712444 0.0000000
## External.Male-Natural.Female -25.24017 -48.90416 -1.576173 0.0313386
## Natural.Male-Natural.Female 30.10643 14.85163 45.361239 0.0000030
## Natural.Male-External.Male 55.34660 31.43643 79.256772 0.0000000
## Df Sum Sq Mean Sq F value Pr(>F)
## race 4 3456 863.9 7.012 5.84e-05 ***
## Residuals 90 11089 123.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## --- Tukey HSD: Race/Ethnicity (External Causes) ---
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rate_adj ~ race, data = filter(df, cause_type == "External"))
##
## $race
## diff lwr upr
## Hispanic-Asian and Pacific Islander 12.64175182 3.0946459 22.188858
## Non-Hispanic Black-Asian and Pacific Islander 17.10442242 7.7874010 26.421444
## Non-Hispanic White-Asian and Pacific Islander 9.23219135 0.2806867 18.183696
## Not Stated/Unknown-Asian and Pacific Islander 9.18871897 -6.1207094 24.498147
## Non-Hispanic Black-Hispanic 4.46267060 -5.0844354 14.009777
## Non-Hispanic White-Hispanic -3.40956046 -12.6003047 5.781184
## Not Stated/Unknown-Hispanic -3.45303284 -18.9035649 11.997499
## Non-Hispanic White-Non-Hispanic Black -7.87223106 -16.8237357 1.079274
## Not Stated/Unknown-Non-Hispanic Black -7.91570344 -23.2251318 7.393725
## Not Stated/Unknown-Non-Hispanic White -0.04347238 -15.1332419 15.046297
## p adj
## Hispanic-Asian and Pacific Islander 0.0034892
## Non-Hispanic Black-Asian and Pacific Islander 0.0000175
## Non-Hispanic White-Asian and Pacific Islander 0.0398190
## Not Stated/Unknown-Asian and Pacific Islander 0.4570162
## Non-Hispanic Black-Hispanic 0.6911045
## Non-Hispanic White-Hispanic 0.8395297
## Not Stated/Unknown-Hispanic 0.9711728
## Non-Hispanic White-Non-Hispanic Black 0.1121235
## Not Stated/Unknown-Non-Hispanic Black 0.6041103
## Not Stated/Unknown-Non-Hispanic White 1.0000000
sex_ratio <- df %>%
group_by(year, cause_type, sex) %>%
summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = sex, values_from = mean_rate) %>%
mutate(ratio_M_F = round(Male / Female, 3))
sex_ratio %>%
kable(caption = "Male-to-Female Death Rate Ratio by Year and Cause Type") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| year | cause_type | Female | Male | ratio_M_F |
|---|---|---|---|---|
| 2015 | External | 6.184532 | 16.80359 | 2.717 |
| 2015 | Natural | 45.507287 | 69.19434 | 1.521 |
| 2016 | External | 7.117463 | 18.98850 | 2.668 |
| 2016 | Natural | 45.957575 | 71.62473 | 1.558 |
| 2017 | External | 6.637578 | 19.56454 | 2.948 |
| 2017 | Natural | 43.533534 | 68.41732 | 1.572 |
| 2019 | External | 7.872352 | 21.01670 | 2.670 |
| 2019 | Natural | 44.640395 | 77.77681 | 1.742 |
| 2020 | External | 9.075000 | 25.88889 | 2.853 |
| 2020 | Natural | 58.390000 | 103.99143 | 1.781 |
| 2021 | External | 13.120000 | 32.16667 | 2.452 |
| 2021 | Natural | 46.869231 | 74.20571 | 1.583 |
##
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type ==
## "External"))
##
## Residuals:
## 1 2 3 4 5 6
## -0.07632 -0.10021 0.20489 -0.02289 0.18521 -0.19068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.38421 68.49633 0.779 0.479
## year -0.02511 0.03394 -0.740 0.501
##
## Residual standard error: 0.1796 on 4 degrees of freedom
## Multiple R-squared: 0.1203, Adjusted R-squared: -0.09959
## F-statistic: 0.5471 on 1 and 4 DF, p-value: 0.5005
##
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type ==
## "Natural"))
##
## Residuals:
## 1 2 3 4 5 6
## -0.01924 -0.01088 -0.02552 0.08719 0.09755 -0.12910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -56.17512 35.63443 -1.576 0.19
## year 0.02864 0.01766 1.622 0.18
##
## Residual standard error: 0.09344 on 4 degrees of freedom
## Multiple R-squared: 0.3968, Adjusted R-squared: 0.246
## F-statistic: 2.631 on 1 and 4 DF, p-value: 0.1801
df %>%
group_by(year, cause_type, sex) %>%
summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = year, y = mean_rate, color = sex, linetype = cause_type)) +
geom_line(linewidth = 1.1) +
geom_point(size = 2.2) +
scale_color_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
labs(
title = "Age-Adjusted Death Rates Over Time",
subtitle = "External vs. Natural Causes by Sex - NYC 2007-2021",
x = "Year", y = "Age-Adjusted Death Rate (per 100,000)",
color = "Sex", linetype = "Cause Type"
) +
theme_minimal(base_size = 13)sex_ratio %>%
ggplot(aes(x = year, y = ratio_M_F, color = cause_type)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2.5) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_smooth(method = "lm", se = TRUE, alpha = 0.15) +
scale_color_manual(values = c("External" = "#D4700A", "Natural" = "#2E7D6B")) +
labs(
title = "Male-to-Female Death Rate Ratio Over Time",
subtitle = "Ratio > 1 indicates higher male mortality - NYC 2007-2021",
x = "Year", y = "Sex Ratio (Male Rate / Female Rate)",
color = "Cause Type"
) +
theme_minimal(base_size = 13)df %>%
ggplot(aes(x = sex, y = rate_adj, fill = sex)) +
geom_boxplot(alpha = 0.7, outlier.shape = 21) +
facet_wrap(~ cause_type, scales = "free_y") +
scale_fill_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
labs(
title = "Distribution of Age-Adjusted Death Rates",
subtitle = "By Sex and Cause Type - NYC 2007-2021",
x = "Sex", y = "Age-Adjusted Death Rate (per 100,000)",
fill = "Sex"
) +
theme_minimal(base_size = 13)ext_male_ts <- sex_ratio %>%
filter(cause_type == "External") %>%
arrange(year) %>%
pull(ratio_M_F) %>%
ts(start = 2007, frequency = 1)
fc <- forecast(auto.arima(ext_male_ts), h = 3)
print(fc)## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2013 2.718 2.498494 2.937506 2.382295 3.053705
## 2014 2.718 2.498494 2.937506 2.382295 3.053705
## 2015 2.718 2.498494 2.937506 2.382295 3.053705
autoplot(fc) +
labs(
title = "Forecast: Male/Female External Cause Death Ratio",
subtitle = "ARIMA model - 3-year forecast (2022-2024)",
x = "Year", y = "Sex Ratio"
) +
theme_minimal(base_size = 13)results_summary <- tibble(
Test = c(
"t-test: External Causes",
"t-test: Natural Causes",
"One-way ANOVA (4 groups)",
"ANOVA: Race/Ethnicity (External)",
"Linear Regression: External ratio ~ year",
"Linear Regression: Natural ratio ~ year"
),
`Key Statistic` = c(
sprintf("t = %.3f", t_external$statistic),
sprintf("t = %.3f", t_natural$statistic),
sprintf("F = %.3f", summary(anova_model)[[1]]$`F value`[1]),
sprintf("F = %.3f", summary(anova_race)[[1]]$`F value`[1]),
sprintf("slope = %.4f", coef(lm_ext)[2]),
sprintf("slope = %.4f", coef(lm_nat)[2])
),
`p-value` = c(
round(t_external$p.value, 4),
round(t_natural$p.value, 4),
round(summary(anova_model)[[1]]$`Pr(>F)`[1], 4),
round(summary(anova_race)[[1]]$`Pr(>F)`[1], 4),
round(coef(summary(lm_ext))[2, 4], 4),
round(coef(summary(lm_nat))[2, 4], 4)
),
`Significant?` = c(
ifelse(t_external$p.value < 0.05, "Yes", "No"),
ifelse(t_natural$p.value < 0.05, "Yes", "No"),
ifelse(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
ifelse(summary(anova_race)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
ifelse(coef(summary(lm_ext))[2, 4] < 0.05, "Yes", "No"),
ifelse(coef(summary(lm_nat))[2, 4] < 0.05, "Yes", "No")
)
)
results_summary %>%
kable(caption = "Summary of All Statistical Tests") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Test | Key Statistic | p-value | Significant? |
|---|---|---|---|
| t-test: External Causes | t = -7.872 | 0.0000 | Yes |
| t-test: Natural Causes | t = -4.538 | 0.0000 | Yes |
| One-way ANOVA (4 groups) | F = 22.204 | 0.0000 | Yes |
| ANOVA: Race/Ethnicity (External) | F = 7.012 | 0.0001 | Yes |
| Linear Regression: External ratio ~ year | slope = -0.0251 | 0.5005 | No |
| Linear Regression: Natural ratio ~ year | slope = 0.0286 | 0.1801 | No |
Rice, S. M., et al. (2026). Masculinities and suicide: A systematic review and meta-analysis. PMC - National Institutes of Health.
New York City Department of Health and Mental Hygiene. (2024). Suicide Deaths in New York City, 2013 to 2022. Epi Data Brief.
Wilson, K., & Hardy, I. C. W. (2025). Statistical analysis of sex ratios: An introduction. Cambridge University Press.
Report generated with R Markdown | STAT 3675: Statistical Computing with R