Women generally live longer than men, and this difference is often explained by biological factors such as heart disease. However, behavioral factors may also play an important role, especially for deaths caused by accidents, suicide, and violence. The purpose of this project is to compare deaths from external causes (accidents, suicide, assault, overdose) to deaths from natural causes and examine whether the gap between men and women has changed over time in New York City.
# install.packages(c("tidyverse", "ggplot2", "car", "multcomp", "forecast", "knitr", "kableExtra"))
library(tidyverse)
library(ggplot2)
library(car)
library(multcomp)
library(forecast)
library(knitr)
library(kableExtra)# Make sure this CSV is in the same folder as this .Rmd file.
# In RStudio: Session -> Set Working Directory -> To Source File Location
df_raw <- read.csv("New_York_City_Leading_Causes_of_Death_20260411.csv",
stringsAsFactors = FALSE)
glimpse(df_raw)## Rows: 2,102
## Columns: 7
## $ Year <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021…
## $ Leading.Cause <chr> "Mental and Behavioral Disorders due to Accide…
## $ Sex <chr> "Male", "Female", "Male", "Female", "Male", "F…
## $ Race.Ethnicity <chr> "Other Race/ Ethnicity", "Not Stated/Unknown",…
## $ Deaths <chr> "9", "102", "16", "1409", "16", "4", "1157", "…
## $ Death.Rate <chr> "", "", "", "111", "", "", "91.2", "76.1", "20…
## $ Age.Adjusted.Death.Rate <chr> "", "", "", "95.5", "", "", "80.8", "66.8", "1…
df <- df_raw %>%
rename(
year = Year,
cause = Leading.Cause,
sex = Sex,
race = Race.Ethnicity,
deaths = Deaths,
rate_crude = Death.Rate,
rate_adj = Age.Adjusted.Death.Rate
) %>%
filter(sex %in% c("Male", "Female")) %>%
mutate(
rate_adj = as.numeric(na_if(rate_adj, ".")),
rate_crude = as.numeric(na_if(rate_crude, ".")),
deaths = as.numeric(na_if(deaths, "."))
) %>%
mutate(
cause_type = case_when(
str_detect(cause, regex(
"accident|unintentional|fall|poison|drowning|fire|transport|suicide|self.inflict|assault|homicide|violence|firearm|overdose|drug|alcohol.induced",
ignore_case = TRUE
)) ~ "External",
TRUE ~ "Natural"
)
) %>%
drop_na(rate_adj) %>%
mutate(group = interaction(cause_type, sex))
# Confirm all 15 years loaded
cat("Years in dataset:\n")## Years in dataset:
##
## 2015 2016 2017 2019 2020 2021
## 89 88 88 111 88 88
##
## Rows by cause type and sex:
##
## Female Male
## External 36 59
## Natural 240 217
desc_stats <- df %>%
group_by(cause_type, sex) %>%
summarise(
n = n(),
Mean = round(mean(rate_adj, na.rm = TRUE), 2),
SD = round(sd(rate_adj, na.rm = TRUE), 2),
Median = round(median(rate_adj, na.rm = TRUE), 2),
Min = round(min(rate_adj, na.rm = TRUE), 2),
Max = round(max(rate_adj, na.rm = TRUE), 2),
.groups = "drop"
)
desc_stats %>%
kable(caption = "Descriptive Statistics: Age-Adjusted Death Rates by Cause Type and Sex") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| cause_type | sex | n | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|---|
| External | Female | 36 | 8.17 | 3.63 | 7.01 | 3.37 | 20.30 |
| External | Male | 59 | 22.23 | 12.91 | 18.05 | 5.36 | 69.50 |
| Natural | Female | 240 | 47.47 | 51.46 | 17.44 | 4.60 | 217.10 |
| Natural | Male | 217 | 77.57 | 84.58 | 26.13 | 5.73 | 414.59 |
shapiro_results <- df %>%
group_by(cause_type, sex) %>%
summarise(
n = n(),
Statistic = round(shapiro.test(rate_adj)$statistic, 4),
p_value = round(shapiro.test(rate_adj)$p.value, 4),
.groups = "drop"
) %>%
mutate(Result = ifelse(p_value > 0.05,
"Normal (fail to reject)",
"Non-normal (p < 0.05)"))
shapiro_results %>%
kable(caption = "Shapiro-Wilk Normality Test by Group") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| cause_type | sex | n | Statistic | p_value | Result |
|---|---|---|---|---|---|
| External | Female | 36 | 0.8386 | 1e-04 | Non-normal (p < 0.05) |
| External | Male | 59 | 0.8905 | 1e-04 | Non-normal (p < 0.05) |
| Natural | Female | 240 | 0.7643 | 0e+00 | Non-normal (p < 0.05) |
| Natural | Male | 217 | 0.7761 | 0e+00 | Non-normal (p < 0.05) |
Note: The Welch t-test is used throughout as it does not assume equal variances and remains appropriate even when normality is mild violated in larger samples.
## --- Levene's Test: External Causes ---
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 16.859 8.647e-05 ***
## 93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## --- Levene's Test: Natural Causes ---
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 16.678 5.233e-05 ***
## 455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
t_external <- t.test(rate_adj ~ sex,
data = filter(df, cause_type == "External"),
var.equal = FALSE,
alternative = "two.sided")
t_external##
## Welch Two Sample t-test
##
## data: rate_adj by sex
## t = -7.8723, df = 71.97, p-value = 2.661e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -17.61964 -10.49916
## sample estimates:
## mean in group Female mean in group Male
## 8.16879 22.22819
t_natural <- t.test(rate_adj ~ sex,
data = filter(df, cause_type == "Natural"),
var.equal = FALSE,
alternative = "two.sided")
t_natural##
## Welch Two Sample t-test
##
## data: rate_adj by sex
## t = -4.5384, df = 349.42, p-value = 7.806e-06
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -43.15332 -17.05954
## sample estimates:
## mean in group Female mean in group Male
## 47.46835 77.57479
ttest_summary <- tibble(
`Cause Type` = c("External", "Natural"),
`Female Mean` = c(round(t_external$estimate["mean in group Female"], 2),
round(t_natural$estimate["mean in group Female"], 2)),
`Male Mean` = c(round(t_external$estimate["mean in group Male"], 2),
round(t_natural$estimate["mean in group Male"], 2)),
`t statistic` = c(round(t_external$statistic, 3),
round(t_natural$statistic, 3)),
`df` = c(round(t_external$parameter, 1),
round(t_natural$parameter, 1)),
`p-value` = c(round(t_external$p.value, 4),
round(t_natural$p.value, 4)),
`Significant?` = c(ifelse(t_external$p.value < 0.05, "Yes", "No"),
ifelse(t_natural$p.value < 0.05, "Yes", "No"))
)
ttest_summary %>%
kable(caption = "Welch t-Test: Male vs. Female Age-Adjusted Death Rates") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Cause Type | Female Mean | Male Mean | t statistic | df | p-value | Significant? |
|---|---|---|---|---|---|---|
| External | 8.17 | 22.23 | -7.872 | 72.0 | 0 | Yes |
| Natural | 47.47 | 77.57 | -4.538 | 349.4 | 0 | Yes |
## Df Sum Sq Mean Sq F value Pr(>F)
## group 3 266016 88672 22.2 1.4e-13 ***
## Residuals 548 2188482 3994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA tells us at least one group differs. Tukey HSD tells us which pairs.
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rate_adj ~ group, data = df)
##
## $group
## diff lwr upr p adj
## Natural.Female-External.Female 39.29956 10.19362 68.405508 0.0030385
## External.Male-External.Female 14.05940 -20.38103 48.499829 0.7188442
## Natural.Male-External.Female 69.40600 40.09955 98.712444 0.0000000
## External.Male-Natural.Female -25.24017 -48.90416 -1.576173 0.0313386
## Natural.Male-Natural.Female 30.10643 14.85163 45.361239 0.0000030
## Natural.Male-External.Male 55.34660 31.43643 79.256772 0.0000000
## Df Sum Sq Mean Sq F value Pr(>F)
## race 4 3456 863.9 7.012 5.84e-05 ***
## Residuals 90 11089 123.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## --- Tukey HSD: Race/Ethnicity (External Causes) ---
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rate_adj ~ race, data = filter(df, cause_type == "External"))
##
## $race
## diff lwr upr
## Hispanic-Asian and Pacific Islander 12.64175182 3.0946459 22.188858
## Non-Hispanic Black-Asian and Pacific Islander 17.10442242 7.7874010 26.421444
## Non-Hispanic White-Asian and Pacific Islander 9.23219135 0.2806867 18.183696
## Not Stated/Unknown-Asian and Pacific Islander 9.18871897 -6.1207094 24.498147
## Non-Hispanic Black-Hispanic 4.46267060 -5.0844354 14.009777
## Non-Hispanic White-Hispanic -3.40956046 -12.6003047 5.781184
## Not Stated/Unknown-Hispanic -3.45303284 -18.9035649 11.997499
## Non-Hispanic White-Non-Hispanic Black -7.87223106 -16.8237357 1.079274
## Not Stated/Unknown-Non-Hispanic Black -7.91570344 -23.2251318 7.393725
## Not Stated/Unknown-Non-Hispanic White -0.04347238 -15.1332419 15.046297
## p adj
## Hispanic-Asian and Pacific Islander 0.0034892
## Non-Hispanic Black-Asian and Pacific Islander 0.0000175
## Non-Hispanic White-Asian and Pacific Islander 0.0398190
## Not Stated/Unknown-Asian and Pacific Islander 0.4570162
## Non-Hispanic Black-Hispanic 0.6911045
## Non-Hispanic White-Hispanic 0.8395297
## Not Stated/Unknown-Hispanic 0.9711728
## Non-Hispanic White-Non-Hispanic Black 0.1121235
## Not Stated/Unknown-Non-Hispanic Black 0.6041103
## Not Stated/Unknown-Non-Hispanic White 1.0000000
sex_ratio <- df %>%
group_by(year, cause_type, sex) %>%
summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = sex, values_from = mean_rate) %>%
filter(!is.na(Male) & !is.na(Female)) %>%
mutate(ratio_M_F = round(Male / Female, 3))
# Confirm 30 rows (15 years x 2 cause types)
cat("Rows in sex_ratio:", nrow(sex_ratio), "(should be 30)\n")## Rows in sex_ratio: 12 (should be 30)
## Years per cause type:
##
## External Natural
## 6 6
sex_ratio %>%
kable(caption = "Male-to-Female Death Rate Ratio by Year and Cause Type") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| year | cause_type | Female | Male | ratio_M_F |
|---|---|---|---|---|
| 2015 | External | 6.184532 | 16.80359 | 2.717 |
| 2015 | Natural | 45.507287 | 69.19434 | 1.521 |
| 2016 | External | 7.117463 | 18.98850 | 2.668 |
| 2016 | Natural | 45.957575 | 71.62473 | 1.558 |
| 2017 | External | 6.637578 | 19.56454 | 2.948 |
| 2017 | Natural | 43.533534 | 68.41732 | 1.572 |
| 2019 | External | 7.872352 | 21.01670 | 2.670 |
| 2019 | Natural | 44.640395 | 77.77681 | 1.742 |
| 2020 | External | 9.075000 | 25.88889 | 2.853 |
| 2020 | Natural | 58.390000 | 103.99143 | 1.781 |
| 2021 | External | 13.120000 | 32.16667 | 2.452 |
| 2021 | Natural | 46.869231 | 74.20571 | 1.583 |
A ratio > 1 means men have higher death rates than women.
lm_ext <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "External"))
cat("=== External Causes: Sex Ratio ~ Year (n =",
nrow(filter(sex_ratio, cause_type == "External")), ") ===\n")## === External Causes: Sex Ratio ~ Year (n = 6 ) ===
##
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type ==
## "External"))
##
## Residuals:
## 1 2 3 4 5 6
## -0.07632 -0.10021 0.20489 -0.02289 0.18521 -0.19068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.38421 68.49633 0.779 0.479
## year -0.02511 0.03394 -0.740 0.501
##
## Residual standard error: 0.1796 on 4 degrees of freedom
## Multiple R-squared: 0.1203, Adjusted R-squared: -0.09959
## F-statistic: 0.5471 on 1 and 4 DF, p-value: 0.5005
lm_nat <- lm(ratio_M_F ~ year, data = filter(sex_ratio, cause_type == "Natural"))
cat("=== Natural Causes: Sex Ratio ~ Year (n =",
nrow(filter(sex_ratio, cause_type == "Natural")), ") ===\n")## === Natural Causes: Sex Ratio ~ Year (n = 6 ) ===
##
## Call:
## lm(formula = ratio_M_F ~ year, data = filter(sex_ratio, cause_type ==
## "Natural"))
##
## Residuals:
## 1 2 3 4 5 6
## -0.01924 -0.01088 -0.02552 0.08719 0.09755 -0.12910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -56.17512 35.63443 -1.576 0.19
## year 0.02864 0.01766 1.622 0.18
##
## Residual standard error: 0.09344 on 4 degrees of freedom
## Multiple R-squared: 0.3968, Adjusted R-squared: 0.246
## F-statistic: 2.631 on 1 and 4 DF, p-value: 0.1801
df %>%
group_by(year, cause_type, sex) %>%
summarise(mean_rate = mean(rate_adj, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = year, y = mean_rate, color = sex, linetype = cause_type)) +
geom_line(linewidth = 1.1) +
geom_point(size = 2.2) +
scale_color_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
labs(
title = "Age-Adjusted Death Rates Over Time",
subtitle = "External vs. Natural Causes by Sex - NYC 2007-2021",
x = "Year", y = "Age-Adjusted Death Rate (per 100,000)",
color = "Sex", linetype = "Cause Type"
) +
theme_minimal(base_size = 13)sex_ratio %>%
ggplot(aes(x = year, y = ratio_M_F, color = cause_type)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2.5) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_smooth(method = "lm", se = TRUE, alpha = 0.15) +
scale_color_manual(values = c("External" = "#D4700A", "Natural" = "#2E7D6B")) +
labs(
title = "Male-to-Female Death Rate Ratio Over Time",
subtitle = "Ratio > 1 indicates higher male mortality - NYC 2007-2021",
x = "Year", y = "Sex Ratio (Male Rate / Female Rate)",
color = "Cause Type"
) +
theme_minimal(base_size = 13)df %>%
ggplot(aes(x = sex, y = rate_adj, fill = sex)) +
geom_boxplot(alpha = 0.7, outlier.shape = 21) +
facet_wrap(~ cause_type, scales = "free_y") +
scale_fill_manual(values = c("Female" = "#E07B79", "Male" = "#4A90D9")) +
labs(
title = "Distribution of Age-Adjusted Death Rates",
subtitle = "By Sex and Cause Type - NYC 2007-2021",
x = "Sex", y = "Age-Adjusted Death Rate (per 100,000)",
fill = "Sex"
) +
theme_minimal(base_size = 13)ext_male_ts <- sex_ratio %>%
filter(cause_type == "External") %>%
arrange(year) %>%
pull(ratio_M_F) %>%
ts(start = 2007, frequency = 1)
fc <- forecast(auto.arima(ext_male_ts), h = 3)
print(fc)## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2013 2.718 2.498494 2.937506 2.382295 3.053705
## 2014 2.718 2.498494 2.937506 2.382295 3.053705
## 2015 2.718 2.498494 2.937506 2.382295 3.053705
autoplot(fc) +
labs(
title = "Forecast: Male/Female External Cause Death Ratio",
subtitle = "ARIMA model - 3-year forecast (2022-2024)",
x = "Year", y = "Sex Ratio"
) +
theme_minimal(base_size = 13)results_summary <- tibble(
Test = c(
"t-test: External Causes",
"t-test: Natural Causes",
"One-way ANOVA (4 groups)",
"ANOVA: Race/Ethnicity (External)",
"Linear Regression: External ratio ~ year",
"Linear Regression: Natural ratio ~ year"
),
`Key Statistic` = c(
sprintf("t = %.3f", t_external$statistic),
sprintf("t = %.3f", t_natural$statistic),
sprintf("F = %.3f", summary(anova_model)[[1]]$`F value`[1]),
sprintf("F = %.3f", summary(anova_race)[[1]]$`F value`[1]),
sprintf("slope = %.4f", coef(lm_ext)[2]),
sprintf("slope = %.4f", coef(lm_nat)[2])
),
`p-value` = c(
round(t_external$p.value, 4),
round(t_natural$p.value, 4),
round(summary(anova_model)[[1]]$`Pr(>F)`[1], 4),
round(summary(anova_race)[[1]]$`Pr(>F)`[1], 4),
round(coef(summary(lm_ext))[2, 4], 4),
round(coef(summary(lm_nat))[2, 4], 4)
),
`Significant?` = c(
ifelse(t_external$p.value < 0.05, "Yes", "No"),
ifelse(t_natural$p.value < 0.05, "Yes", "No"),
ifelse(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
ifelse(summary(anova_race)[[1]]$`Pr(>F)`[1] < 0.05, "Yes", "No"),
ifelse(coef(summary(lm_ext))[2, 4] < 0.05, "Yes", "No"),
ifelse(coef(summary(lm_nat))[2, 4] < 0.05, "Yes", "No")
)
)
results_summary %>%
kable(caption = "Summary of All Statistical Tests") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Test | Key Statistic | p-value | Significant? |
|---|---|---|---|
| t-test: External Causes | t = -7.872 | 0.0000 | Yes |
| t-test: Natural Causes | t = -4.538 | 0.0000 | Yes |
| One-way ANOVA (4 groups) | F = 22.204 | 0.0000 | Yes |
| ANOVA: Race/Ethnicity (External) | F = 7.012 | 0.0001 | Yes |
| Linear Regression: External ratio ~ year | slope = -0.0251 | 0.5005 | No |
| Linear Regression: Natural ratio ~ year | slope = 0.0286 | 0.1801 | No |
Arias, E., Tejada-Vera, B., Kochanek, K. D., & Ahmad, F. B. (2022). Provisional life expectancy estimates for 2021. Vital Statistics Rapid Release, 23. National Center for Health Statistics.
Courtenay, W. H. (2000). Constructions of masculinity and their influence on men’s well-being. Social Science & Medicine, 50(10).
Galdas, P. M., Cheater, F., & Marshall, P. (2005). Men and health help-seeking behaviour. Journal of Advanced Nursing, 49(6).
New York City Department of Health and Mental Hygiene. (2023). Overdose data: 2022 report. NYC DOHMH Bureau of Vital Statistics.
Pinkhasov, R. M., et al. (2010). Are men shortchanged on health? International Journal of Clinical Practice, 64(4).
Wingard, D. L. (1984). The sex differential in morbidity, mortality, and lifestyle. Annual Review of Public Health, 5.
Report generated with R Markdown | STAT 3675: Statistical Computing with R