# Load libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
library(car)       # For Levene's Test
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(ggpubr)    # For visual Q-Q and histograms
## Warning: package 'ggpubr' was built under R version 4.4.3
library(effsize)   # For effect size (Cohen's d)
## Warning: package 'effsize' was built under R version 4.4.3
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)

Import CSV

data <- read.csv( "female_veteran_suicide (2).csv")
# Clean column names
colnames(data) <- c("Year", "Deaths", "Population", "VHA_Use")
# Clean and prepare the data
data <- data %>%
  mutate(
    Population = as.numeric(gsub(",", "", Population)),  # Remove commas and convert to numeric
    Deaths = as.numeric(Deaths),
    Year = as.numeric(Year),
    VHA_Use = factor(VHA_Use, levels = c("Other Veteran", "Recent Veteran VHA User")),
    Suicide_Rate = (Deaths / Population) * 100000  # Suicide rate per 100,000
  )
library(dplyr)

# Convert Population from character to numeric, calculate Suicide_Rate, and remove NAs
data <- data %>%
  mutate(Population = as.numeric(gsub(",", "", Population))) %>%  # Remove commas and convert to numeric
  mutate(Suicide_Rate = (Deaths / Population) * 100000) %>%
  drop_na(Deaths, Population, Suicide_Rate)  # Drop any rows with NAs in key columns

======================

#EDA # ======================

data %>%
  group_by(VHA_Use) %>%
  summarize(
    min = min(Suicide_Rate, na.rm = TRUE),
    q1 = quantile(Suicide_Rate, 0.25, na.rm = TRUE),
    median = median(Suicide_Rate, na.rm = TRUE),
    mean = mean(Suicide_Rate, na.rm = TRUE),
    q3 = quantile(Suicide_Rate, 0.75, na.rm = TRUE),
    max = max(Suicide_Rate, na.rm = TRUE),
    sd = sd(Suicide_Rate, na.rm = TRUE),
    n = n()
  )
## # A tibble: 2 × 9
##   VHA_Use                   min    q1 median  mean    q3   max    sd     n
##   <fct>                   <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Other Veteran            6.92  9.51   12.3  12.0  13.6  16.4  2.73    22
## 2 Recent Veteran VHA User  9.77 13.7    15.3  15.4  17.0  19.9  2.39    22
# Line plot
ggplot(data, aes(x = Year, y = Suicide_Rate, color = VHA_Use)) +
  geom_line(linewidth = 1.2) +
  geom_point() +
  theme_minimal() +
  labs(title = "Female Suicide Rates by VHA Use Over Time",
       y = "Suicide Rate per 100,000")

# Boxplot
ggplot(data, aes(x = VHA_Use, y = Suicide_Rate, fill = VHA_Use)) +
  geom_boxplot() +
  labs(title = "Distribution of Suicide Rates by VHA Use",
       y = "Suicide Rate per 100,000") +
  theme_minimal()

======================

T-TEST: Assumptions

======================

# Normality assumption: Shapiro-Wilk test
shapiro.test(data$Suicide_Rate[data$VHA_Use == "Recent Veteran VHA User"])
## 
##  Shapiro-Wilk normality test
## 
## data:  data$Suicide_Rate[data$VHA_Use == "Recent Veteran VHA User"]
## W = 0.98375, p-value = 0.9641
shapiro.test(data$Suicide_Rate[data$VHA_Use == "Other Veteran"])
## 
##  Shapiro-Wilk normality test
## 
## data:  data$Suicide_Rate[data$VHA_Use == "Other Veteran"]
## W = 0.95189, p-value = 0.3442
# Visual check - Q-Q plots
ggqqplot(data, "Suicide_Rate", facet.by = "VHA_Use", title = "Q-Q Plots by VHA Use")

# Equality of variances: Levene's test
leveneTest(Suicide_Rate ~ VHA_Use, data = data)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.5032  0.482
##       42

======================

Independent Samples T-Test

======================

# t-test (Assuming equal variance)
t_test_result <- t.test(Suicide_Rate ~ VHA_Use, data = data, var.equal = TRUE)
print(t_test_result)
## 
##  Two Sample t-test
## 
## data:  Suicide_Rate by VHA_Use
## t = -4.4759, df = 42, p-value = 5.722e-05
## alternative hypothesis: true difference in means between group Other Veteran and group Recent Veteran VHA User is not equal to 0
## 95 percent confidence interval:
##  -5.015122 -1.898087
## sample estimates:
##           mean in group Other Veteran mean in group Recent Veteran VHA User 
##                              11.95014                              15.40675
# Effect size: Cohen's d
cohen.d(Suicide_Rate ~ VHA_Use, data = data)
## 
## Cohen's d
## 
## d estimate: -1.349524 (large)
## 95 percent confidence interval:
##      lower      upper 
## -2.0237099 -0.6753371

======================

Linear Regression

======================

# Fit the model
lm_model <- lm(Suicide_Rate ~ Year + VHA_Use, data = data)
summary(lm_model)
## 
## Call:
## lm(formula = Suicide_Rate ~ Year + VHA_Use, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0224 -1.0779  0.1208  1.2975  3.0933 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -577.48039   82.94827  -6.962 1.86e-08 ***
## Year                              0.29303    0.04124   7.106 1.17e-08 ***
## VHA_UseRecent Veteran VHA User    3.45660    0.52323   6.606 5.95e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.735 on 41 degrees of freedom
## Multiple R-squared:  0.6966, Adjusted R-squared:  0.6818 
## F-statistic: 47.07 on 2 and 41 DF,  p-value: 2.405e-11
# Tidy output
tidy(lm_model)
## # A tibble: 3 × 5
##   term                           estimate std.error statistic      p.value
##   <chr>                             <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept)                    -577.      82.9        -6.96 0.0000000186
## 2 Year                              0.293    0.0412      7.11 0.0000000117
## 3 VHA_UseRecent Veteran VHA User    3.46     0.523       6.61 0.0000000595
# Diagnostic plots
par(mfrow = c(2, 2))
plot(lm_model)

par(mfrow = c(1, 1))
par(mfrow = c(1, 1))

Test for Heteroskedasticity

bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 2.3938, df = 2, p-value = 0.3021
vif(lm_model)
##    Year VHA_Use 
##       1       1