# Load libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
library(car) # For Levene's Test
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ggpubr) # For visual Q-Q and histograms
## Warning: package 'ggpubr' was built under R version 4.4.3
library(effsize) # For effect size (Cohen's d)
## Warning: package 'effsize' was built under R version 4.4.3
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
data <- read.csv( "female_veteran_suicide (2).csv")
# Clean column names
colnames(data) <- c("Year", "Deaths", "Population", "VHA_Use")
# Clean and prepare the data
data <- data %>%
mutate(
Population = as.numeric(gsub(",", "", Population)), # Remove commas and convert to numeric
Deaths = as.numeric(Deaths),
Year = as.numeric(Year),
VHA_Use = factor(VHA_Use, levels = c("Other Veteran", "Recent Veteran VHA User")),
Suicide_Rate = (Deaths / Population) * 100000 # Suicide rate per 100,000
)
library(dplyr)
# Convert Population from character to numeric, calculate Suicide_Rate, and remove NAs
data <- data %>%
mutate(Population = as.numeric(gsub(",", "", Population))) %>% # Remove commas and convert to numeric
mutate(Suicide_Rate = (Deaths / Population) * 100000) %>%
drop_na(Deaths, Population, Suicide_Rate) # Drop any rows with NAs in key columns
#EDA # ======================
data %>%
group_by(VHA_Use) %>%
summarize(
min = min(Suicide_Rate, na.rm = TRUE),
q1 = quantile(Suicide_Rate, 0.25, na.rm = TRUE),
median = median(Suicide_Rate, na.rm = TRUE),
mean = mean(Suicide_Rate, na.rm = TRUE),
q3 = quantile(Suicide_Rate, 0.75, na.rm = TRUE),
max = max(Suicide_Rate, na.rm = TRUE),
sd = sd(Suicide_Rate, na.rm = TRUE),
n = n()
)
## # A tibble: 2 × 9
## VHA_Use min q1 median mean q3 max sd n
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Other Veteran 6.92 9.51 12.3 12.0 13.6 16.4 2.73 22
## 2 Recent Veteran VHA User 9.77 13.7 15.3 15.4 17.0 19.9 2.39 22
# Line plot
ggplot(data, aes(x = Year, y = Suicide_Rate, color = VHA_Use)) +
geom_line(linewidth = 1.2) +
geom_point() +
theme_minimal() +
labs(title = "Female Suicide Rates by VHA Use Over Time",
y = "Suicide Rate per 100,000")
# Boxplot
ggplot(data, aes(x = VHA_Use, y = Suicide_Rate, fill = VHA_Use)) +
geom_boxplot() +
labs(title = "Distribution of Suicide Rates by VHA Use",
y = "Suicide Rate per 100,000") +
theme_minimal()
# Normality assumption: Shapiro-Wilk test
shapiro.test(data$Suicide_Rate[data$VHA_Use == "Recent Veteran VHA User"])
##
## Shapiro-Wilk normality test
##
## data: data$Suicide_Rate[data$VHA_Use == "Recent Veteran VHA User"]
## W = 0.98375, p-value = 0.9641
shapiro.test(data$Suicide_Rate[data$VHA_Use == "Other Veteran"])
##
## Shapiro-Wilk normality test
##
## data: data$Suicide_Rate[data$VHA_Use == "Other Veteran"]
## W = 0.95189, p-value = 0.3442
# Visual check - Q-Q plots
ggqqplot(data, "Suicide_Rate", facet.by = "VHA_Use", title = "Q-Q Plots by VHA Use")
# Equality of variances: Levene's test
leveneTest(Suicide_Rate ~ VHA_Use, data = data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.5032 0.482
## 42
# t-test (Assuming equal variance)
t_test_result <- t.test(Suicide_Rate ~ VHA_Use, data = data, var.equal = TRUE)
print(t_test_result)
##
## Two Sample t-test
##
## data: Suicide_Rate by VHA_Use
## t = -4.4759, df = 42, p-value = 5.722e-05
## alternative hypothesis: true difference in means between group Other Veteran and group Recent Veteran VHA User is not equal to 0
## 95 percent confidence interval:
## -5.015122 -1.898087
## sample estimates:
## mean in group Other Veteran mean in group Recent Veteran VHA User
## 11.95014 15.40675
# Effect size: Cohen's d
cohen.d(Suicide_Rate ~ VHA_Use, data = data)
##
## Cohen's d
##
## d estimate: -1.349524 (large)
## 95 percent confidence interval:
## lower upper
## -2.0237099 -0.6753371
# Fit the model
lm_model <- lm(Suicide_Rate ~ Year + VHA_Use, data = data)
summary(lm_model)
##
## Call:
## lm(formula = Suicide_Rate ~ Year + VHA_Use, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0224 -1.0779 0.1208 1.2975 3.0933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -577.48039 82.94827 -6.962 1.86e-08 ***
## Year 0.29303 0.04124 7.106 1.17e-08 ***
## VHA_UseRecent Veteran VHA User 3.45660 0.52323 6.606 5.95e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.735 on 41 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6818
## F-statistic: 47.07 on 2 and 41 DF, p-value: 2.405e-11
# Tidy output
tidy(lm_model)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -577. 82.9 -6.96 0.0000000186
## 2 Year 0.293 0.0412 7.11 0.0000000117
## 3 VHA_UseRecent Veteran VHA User 3.46 0.523 6.61 0.0000000595
# Diagnostic plots
par(mfrow = c(2, 2))
plot(lm_model)
par(mfrow = c(1, 1))
par(mfrow = c(1, 1))
Test for Heteroskedasticity
bptest(lm_model)
##
## studentized Breusch-Pagan test
##
## data: lm_model
## BP = 2.3938, df = 2, p-value = 0.3021
vif(lm_model)
## Year VHA_Use
## 1 1