SECTION 1: LOAD LIBRARIES

library(tidyverse)   # Data wrangling and ggplot2 visualization
library(readxl)      # Read Excel files
library(psych)       # Descriptive statistics
library(knitr)       # Table formatting
library(broom)       # Tidy model outputs
library(gtsummary)   # Summary statistics tables
library(corrplot)    # Correlation matrix visualization
library(kableExtra)  # Enhanced table styling

SECTION 2: LOAD AND PREPARE DATA

# Load data from Excel
countyjail_data <- read_excel(
  "~/Desktop/capstone bexar county jail/excel work sheets and r code/TXJail_DeathsCounties2015_2025 Capstone analysis.xlsx",
  sheet = "All_Data")

# Create Bexar binary variable and set county factor levels
countyjail_data <- countyjail_data %>%
  mutate(
    # Binary indicator: 1 = Bexar County, 0 = all other counties
    Bexar = ifelse(agency_county == "BEXAR", 1, 0),
    
    # Set Bexar as reference group for all county comparisons
    agency_county = factor(agency_county,
                           levels = c("BEXAR", "DALLAS", "HARRIS", "TRAVIS"))
  )

# Confirm data loaded correctly
nrow(countyjail_data)  # Should return 390

## [1] 390

names(countyjail_data) # View all column names

##  [1] "record_id"                                
##  [2] "agency_county"                            
##  [3] "death_year"                               
##  [4] "death_date"                               
##  [5] "preventable_death"                        
##  [6] "age_at_time_of_death"                     
##  [7] "age_group"                                
##  [8] "sex"                                      
##  [9] "race"                                     
## [10] "sex_male"                                 
## [11] "race_white"                               
## [12] "race_black"                               
## [13] "race_hispanic"                            
## [14] "age_26_35"                                
## [15] "age_36_45"                                
## [16] "age_46_55"                                
## [17] "manner_of_death"                          
## [18] "specific_type_of_custody_facility"        
## [19] "housing_type"                             
## [20] "days_from_custody_to_death"               
## [21] "timing_category"                          
## [22] "exhibit_any_mental_health_problems"       
## [23] "make_suicidal_statements"                 
## [24] "housing_single_cell"                      
## [25] "housing_multiple"                         
## [26] "mh_unknown"                               
## [27] "mh_yes"                                   
## [28] "suicidal_yes"                             
## [29] "county_dallas"                            
## [30] "county_harris"                            
## [31] "county_travis"                            
## [32] "death_from_pre_existing_medical_condition"
## [33] "type_of_offense"                          
## [34] "offense_1"                                
## [35] "were_the_charges"                         
## [36] "Bexar"

SECTION 3: CREATE SUMMARY DATASETS

# Four county preventable death summary
four_county_summary <- countyjail_data %>%
  filter(agency_county %in% c("BEXAR", "DALLAS", "HARRIS", "TRAVIS")) %>%
  group_by(agency_county) %>%
  summarise(
    total_deaths       = n(),
    preventable_deaths = sum(preventable_death == 1),
    preventable_rate   = mean(preventable_death) * 100,
    .groups = 'drop'
  )

print(four_county_summary)

## # A tibble: 4 × 4
##   agency_county total_deaths preventable_deaths preventable_rate
##   <fct>                <int>              <int>            <dbl>
## 1 BEXAR                  119                 33            27.7 
## 2 DALLAS                  77                  6             7.79
## 3 HARRIS                 151                 13             8.61
## 4 TRAVIS                  43                 12            27.9

# Characteristics by county
county_characteristics <- countyjail_data %>%
  filter(agency_county %in% c("BEXAR", "DALLAS", "HARRIS", "TRAVIS")) %>%
  group_by(agency_county) %>%
  summarise(
    n                  = n(),
    preventable_rate   = mean(preventable_death) * 100,
    avg_age            = mean(age_at_time_of_death, na.rm = TRUE),
    male_pct           = mean(sex_male) * 100,
    black_pct          = mean(race_black) * 100,
    hispanic_pct       = mean(race_hispanic) * 100,
    mh_yes_pct         = mean(mh_yes) * 100,
    suicidal_pct       = mean(suicidal_yes) * 100,
    single_cell_pct    = mean(housing_single_cell) * 100,
    avg_days_custody   = mean(days_from_custody_to_death, na.rm = TRUE),
    .groups = 'drop'
  )

print(county_characteristics)

## # A tibble: 4 × 11
##   agency_county     n preventable_rate avg_age male_pct black_pct hispanic_pct
##   <fct>         <int>            <dbl>   <dbl>    <dbl>     <dbl>        <dbl>
## 1 BEXAR           119            27.7     45.7     85.7      22.7        38.7 
## 2 DALLAS           77             7.79    49.5     90.9      46.8        20.8 
## 3 HARRIS          151             8.61    48.2     91.4      54.3         5.96
## 4 TRAVIS           43            27.9     41.5     83.7      23.3        32.6 
## # ℹ 4 more variables: mh_yes_pct <dbl>, suicidal_pct <dbl>,
## #   single_cell_pct <dbl>, avg_days_custody <dbl>

# Preventable vs non-preventable comparison
preventable_comparison <- countyjail_data %>%
  group_by(preventable_death) %>%
  summarise(
    n                = n(),
    avg_age          = mean(age_at_time_of_death, na.rm = TRUE),
    male_pct         = mean(sex_male) * 100,
    black_pct        = mean(race_black) * 100,
    hispanic_pct     = mean(race_hispanic) * 100,
    mh_yes_pct       = mean(mh_yes) * 100,
    suicidal_pct     = mean(suicidal_yes) * 100,
    avg_days_custody = mean(days_from_custody_to_death, na.rm = TRUE),
    .groups = 'drop'
  )

print(preventable_comparison)

## # A tibble: 2 × 9
##   preventable_death     n avg_age male_pct black_pct hispanic_pct mh_yes_pct
##               <dbl> <int>   <dbl>    <dbl>     <dbl>        <dbl>      <dbl>
## 1                 0   326    48.9     89.3      43.9         20.2       17.5
## 2                 1    64    36.6     85.9      18.8         29.7       20.3
## # ℹ 2 more variables: suicidal_pct <dbl>, avg_days_custody <dbl>

SECTION 4: DESCRIPTIVE STATISTICS

# Overall sample descriptive statistics table
countyjail_data %>%
  select(preventable_death, housing_single_cell, mh_yes, suicidal_yes,
         sex_male, race_black, race_hispanic, age_at_time_of_death, Bexar) %>%
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 2,
    label = list(
      preventable_death    ~ "Preventable Death",
      housing_single_cell  ~ "Single Cell Housing",
      mh_yes               ~ "Mental Health Flag",
      suicidal_yes         ~ "Suicidal Ideation",
      sex_male             ~ "Male",
      race_black           ~ "Black",
      race_hispanic        ~ "Hispanic",
      age_at_time_of_death ~ "Age",
      Bexar                ~ "Bexar County"
    )
  ) %>%
  bold_labels()

Characteristic	N = 390¹
Preventable Death	64 (16%)
Single Cell Housing	99 (25%)
Mental Health Flag	70 (18%)
Suicidal Ideation	28 (7.2%)
Male	346 (89%)
Black	155 (40%)
Hispanic	85 (22%)
Age	46.92 (14.89)
Bexar County	119 (31%)
¹ n (%); Mean (SD)

SECTION 5: BIVARIATE TESTS

# T-test: Age by preventable death status
t_test_age <- t.test(age_at_time_of_death ~ preventable_death,
                     data = countyjail_data)
print(t_test_age)

## 
##  Welch Two Sample t-test
## 
## data:  age_at_time_of_death by preventable_death
## t = 8.0366, df = 120.46, p-value = 7.136e-13
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##   9.266044 15.323872
## sample estimates:
## mean in group 0 mean in group 1 
##        48.93558        36.64062

# Age summary by group
age_summary <- countyjail_data %>%
  group_by(preventable_death) %>%
  summarise(
    n        = n(),
    mean_age = mean(age_at_time_of_death, na.rm = TRUE),
    sd_age   = sd(age_at_time_of_death, na.rm = TRUE)
  )

knitr::kable(age_summary, digits = 2,
             caption = "Age by Preventable Death Status")

Age by Preventable Death Status
preventable_death	n	mean_age	sd_age
0	326	48.94	14.82
1	64	36.64	10.33

# T-test: Days in custody by preventable death status
t_test_days <- t.test(days_from_custody_to_death ~ preventable_death,
                      data = countyjail_data)
print(t_test_days)

## 
##  Welch Two Sample t-test
## 
## data:  days_from_custody_to_death by preventable_death
## t = 0.68259, df = 80.006, p-value = 0.4968
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -40.96204  83.73179
## sample estimates:
## mean in group 0 mean in group 1 
##        123.8067        102.4219

# Chi-square: Mental health by preventable death
mh_table   <- table(countyjail_data$exhibit_any_mental_health_problems,
                    countyjail_data$preventable_death)
chisq_mh   <- chisq.test(mh_table)
print(chisq_mh)

## 
##  Pearson's Chi-squared test
## 
## data:  mh_table
## X-squared = 5.631, df = 2, p-value = 0.05987

prop.table(mh_table, 1) * 100

##          
##                  0        1
##   NO      77.90698 22.09302
##   UNKNOWN 88.75740 11.24260
##   YES     81.42857 18.57143

# Chi-square: County by preventable death
county_table <- table(countyjail_data$agency_county,
                      countyjail_data$preventable_death)
chisq_county <- chisq.test(county_table)
print(chisq_county)

## 
##  Pearson's Chi-squared test
## 
## data:  county_table
## X-squared = 26.13, df = 3, p-value = 8.96e-06

# Chi-square: Race by preventable death
race_table <- table(countyjail_data$race,
                    countyjail_data$preventable_death)
chisq_race <- chisq.test(race_table)
print(chisq_race)

## 
##  Pearson's Chi-squared test
## 
## data:  race_table
## X-squared = 18.128, df = 3, p-value = 0.0004139

# ANOVA: Age across counties
anova_age <- aov(age_at_time_of_death ~ agency_county,
                 data = countyjail_data %>%
                   filter(agency_county %in% c("BEXAR", "DALLAS", "HARRIS", "TRAVIS")))
summary(anova_age)

##                Df Sum Sq Mean Sq F value Pr(>F)  
## agency_county   3   2193   731.0   3.358 0.0189 *
## Residuals     386  84038   217.7                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc Tukey test
TukeyHSD(anova_age)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = age_at_time_of_death ~ agency_county, data = countyjail_data %>% filter(agency_county %in% c("BEXAR", "DALLAS", "HARRIS", "TRAVIS")))
## 
## $agency_county
##                    diff        lwr         upr     p adj
## DALLAS-BEXAR   3.825057  -1.743174  9.39328810 0.2882031
## HARRIS-BEXAR   2.496856  -2.170039  7.16374999 0.5123687
## TRAVIS-BEXAR  -4.167090 -10.941287  2.60710711 0.3871057
## HARRIS-DALLAS -1.328202  -6.659606  4.00320277 0.9180113
## TRAVIS-DALLAS -7.992147 -15.240162 -0.74413263 0.0240764
## TRAVIS-HARRIS -6.663946 -13.244860 -0.08303202 0.0458887

SECTION 6: CORRELATION MATRIX

# Build correlation matrix for all model variables
cor_vars <- countyjail_data %>%
  select(preventable_death, housing_single_cell, mh_yes, suicidal_yes,
         sex_male, race_black, race_hispanic, age_at_time_of_death, Bexar) %>%
  na.omit()

cor_matrix <- cor(cor_vars, method = "pearson")

# Print rounded correlation values
print(round(cor_matrix, 2))

##                      preventable_death housing_single_cell mh_yes suicidal_yes
## preventable_death                 1.00                0.19   0.03         0.12
## housing_single_cell               0.19                1.00   0.14         0.02
## mh_yes                            0.03                0.14   1.00         0.47
## suicidal_yes                      0.12                0.02   0.47         1.00
## sex_male                         -0.04               -0.02  -0.02         0.04
## race_black                       -0.19               -0.06   0.10         0.04
## race_hispanic                     0.08                0.01  -0.09        -0.03
## age_at_time_of_death             -0.31               -0.17  -0.01         0.02
## Bexar                             0.20               -0.05  -0.21        -0.10
##                      sex_male race_black race_hispanic age_at_time_of_death
## preventable_death       -0.04      -0.19          0.08                -0.31
## housing_single_cell     -0.02      -0.06          0.01                -0.17
## mh_yes                  -0.02       0.10         -0.09                -0.01
## suicidal_yes             0.04       0.04         -0.03                 0.02
## sex_male                 1.00       0.02         -0.01                 0.06
## race_black               0.02       1.00         -0.43                -0.03
## race_hispanic           -0.01      -0.43          1.00                -0.07
## age_at_time_of_death     0.06      -0.03         -0.07                 1.00
## Bexar                   -0.06      -0.23          0.27                -0.06
##                      Bexar
## preventable_death     0.20
## housing_single_cell  -0.05
## mh_yes               -0.21
## suicidal_yes         -0.10
## sex_male             -0.06
## race_black           -0.23
## race_hispanic         0.27
## age_at_time_of_death -0.06
## Bexar                 1.00

# Visual correlation plot
corrplot(cor_matrix,
         method        = "color",
         type          = "upper",
         tl.cex        = 0.8,
         addCoef.col   = "purple",
         number.cex    = 0.7,
         title         = "Correlation Matrix - Custodial Death Variables",
         mar           = c(0, 0, 1, 0))

SECTION 7: LOGISTIC REGRESSION MODELS

Model 1 — Demographics Only

model1 <- glm(preventable_death ~ age_at_time_of_death + sex_male +
                race_black + race_hispanic,
              data   = countyjail_data,
              family = binomial)

summary(model1)

## 
## Call:
## glm(formula = preventable_death ~ age_at_time_of_death + sex_male + 
##     race_black + race_hispanic, family = binomial, data = countyjail_data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.21355    0.67731   3.268  0.00108 ** 
## age_at_time_of_death -0.07489    0.01256  -5.961 2.51e-09 ***
## sex_male             -0.15009    0.44079  -0.341  0.73347    
## race_black           -1.59295    0.39056  -4.079 4.53e-05 ***
## race_hispanic        -0.22262    0.35661  -0.624  0.53245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 348.20  on 389  degrees of freedom
## Residual deviance: 286.98  on 385  degrees of freedom
## AIC: 296.98
## 
## Number of Fisher Scoring iterations: 5

# Odds ratios
tidy(model1, exponentiate = TRUE, conf.int = TRUE) %>%
  select(term, estimate, conf.low, conf.high, p.value) %>%
  rename(Variable       = term,
         `Odds Ratio`   = estimate,
         `95% CI Lower` = conf.low,
         `95% CI Upper` = conf.high,
         `P-Value`      = p.value) %>%
  kable(digits = 3,
        caption = "Model 1: Odds Ratios — Demographics Only") %>%
  kable_styling()

Model 1: Odds Ratios — Demographics Only
Variable	Odds Ratio	95% CI Lower	95% CI Upper	P-Value
(Intercept)	9.148	2.456	35.526	0.001
age_at_time_of_death	0.928	0.904	0.950	0.000
sex_male	0.861	0.374	2.140	0.733
race_black	0.203	0.091	0.425	0.000
race_hispanic	0.800	0.392	1.595	0.532

Model 2 — Demographics + Mental Health + Custody Time

model2 <- glm(preventable_death ~ age_at_time_of_death + sex_male +
                race_black + race_hispanic + mh_yes + suicidal_yes +
                days_from_custody_to_death,
              data   = countyjail_data,
              family = binomial)

summary(model2)

## 
## Call:
## glm(formula = preventable_death ~ age_at_time_of_death + sex_male + 
##     race_black + race_hispanic + mh_yes + suicidal_yes + days_from_custody_to_death, 
##     family = binomial, data = countyjail_data)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 2.335e+00  6.960e-01   3.355 0.000793 ***
## age_at_time_of_death       -7.801e-02  1.288e-02  -6.058 1.38e-09 ***
## sex_male                   -2.530e-01  4.485e-01  -0.564 0.572647    
## race_black                 -1.661e+00  4.011e-01  -4.142 3.44e-05 ***
## race_hispanic              -2.050e-01  3.627e-01  -0.565 0.571903    
## mh_yes                     -2.854e-01  4.698e-01  -0.607 0.543523    
## suicidal_yes                1.648e+00  6.054e-01   2.722 0.006493 ** 
## days_from_custody_to_death  7.565e-05  8.169e-04   0.093 0.926215    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 348.20  on 389  degrees of freedom
## Residual deviance: 278.95  on 382  degrees of freedom
## AIC: 294.95
## 
## Number of Fisher Scoring iterations: 5

# Odds ratios
tidy(model2, exponentiate = TRUE, conf.int = TRUE) %>%
  select(term, estimate, conf.low, conf.high, p.value) %>%
  rename(Variable       = term,
         `Odds Ratio`   = estimate,
         `95% CI Lower` = conf.low,
         `95% CI Upper` = conf.high,
         `P-Value`      = p.value) %>%
  kable(digits = 3,
        caption = "Model 2: Odds Ratios — Demographics + Mental Health") %>%
  kable_styling()

Model 2: Odds Ratios — Demographics + Mental Health
Variable	Odds Ratio	95% CI Lower	95% CI Upper	P-Value
(Intercept)	10.330	2.683	41.805	0.001
age_at_time_of_death	0.925	0.901	0.948	0.000
sex_male	0.776	0.332	1.954	0.573
race_black	0.190	0.083	0.405	0.000
race_hispanic	0.815	0.394	1.644	0.572
mh_yes	0.752	0.282	1.811	0.544
suicidal_yes	5.195	1.592	17.464	0.006
days_from_custody_to_death	1.000	0.998	1.002	0.926

Model 3 — Primary Model (Professor’s Specification): Bexar Binary

# This is the main model as specified by the capstone advisor
model <- glm(preventable_death ~ housing_single_cell + mh_yes + suicidal_yes +
               sex_male + race_black + race_hispanic + age_at_time_of_death + Bexar,
             data   = countyjail_data,
             family = "binomial")

summary(model)

## 
## Call:
## glm(formula = preventable_death ~ housing_single_cell + mh_yes + 
##     suicidal_yes + sex_male + race_black + race_hispanic + age_at_time_of_death + 
##     Bexar, family = "binomial", data = countyjail_data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.23862    0.76230   1.625 0.104198    
## housing_single_cell   0.94777    0.33268   2.849 0.004388 ** 
## mh_yes               -0.04381    0.48404  -0.091 0.927887    
## suicidal_yes          1.70008    0.62316   2.728 0.006369 ** 
## sex_male             -0.09354    0.47275  -0.198 0.843157    
## race_black           -1.46962    0.40916  -3.592 0.000328 ***
## race_hispanic        -0.40401    0.38346  -1.054 0.292063    
## age_at_time_of_death -0.07522    0.01332  -5.645 1.65e-08 ***
## Bexar                 1.19484    0.35211   3.393 0.000690 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 348.20  on 389  degrees of freedom
## Residual deviance: 261.14  on 381  degrees of freedom
## AIC: 279.14
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios table
tidy(model, exponentiate = TRUE, conf.int = TRUE) %>%
  select(term, estimate, conf.low, conf.high, p.value) %>%
  rename(Variable       = term,
         `Odds Ratio`   = estimate,
         `95% CI Lower` = conf.low,
         `95% CI Upper` = conf.high,
         `P-Value`      = p.value) %>%
  kable(digits = 3,
        caption = "Table 1: Logit Model Odds Ratios for Preventable Death (Bexar Binary)") %>%
  kable_styling()

Table 1: Logit Model Odds Ratios for Preventable Death (Bexar Binary)
Variable	Odds Ratio	95% CI Lower	95% CI Upper	P-Value
(Intercept)	3.451	0.775	15.652	0.104
housing_single_cell	2.580	1.344	4.980	0.004
mh_yes	0.957	0.352	2.389	0.928
suicidal_yes	5.474	1.637	19.245	0.006
sex_male	0.911	0.372	2.407	0.843
race_black	0.230	0.099	0.499	0.000
race_hispanic	0.668	0.309	1.400	0.292
age_at_time_of_death	0.928	0.902	0.951	0.000
Bexar	3.303	1.669	6.676	0.001

Model 4 — All County Comparison (Reference = Bexar)

# Supplemental model comparing all counties to Bexar as reference group
model_allcounties <- glm(preventable_death ~ housing_single_cell + mh_yes +
                           suicidal_yes + sex_male + race_black + race_hispanic +
                           age_at_time_of_death + agency_county,
                         data   = countyjail_data,
                         family = "binomial")

summary(model_allcounties)

## 
## Call:
## glm(formula = preventable_death ~ housing_single_cell + mh_yes + 
##     suicidal_yes + sex_male + race_black + race_hispanic + age_at_time_of_death + 
##     agency_county, family = "binomial", data = countyjail_data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.44867    0.75414   3.247 0.001166 ** 
## housing_single_cell   0.84278    0.34729   2.427 0.015236 *  
## mh_yes               -0.09167    0.50668  -0.181 0.856432    
## suicidal_yes          1.73780    0.63252   2.747 0.006006 ** 
## sex_male             -0.11607    0.47236  -0.246 0.805900    
## race_black           -1.43054    0.41272  -3.466 0.000528 ***
## race_hispanic        -0.40438    0.39111  -1.034 0.301175    
## age_at_time_of_death -0.07453    0.01344  -5.546 2.92e-08 ***
## agency_countyDALLAS  -1.56923    0.55455  -2.830 0.004659 ** 
## agency_countyHARRIS  -1.19106    0.41760  -2.852 0.004343 ** 
## agency_countyTRAVIS  -0.80992    0.51963  -1.559 0.119077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 348.20  on 389  degrees of freedom
## Residual deviance: 259.63  on 379  degrees of freedom
## AIC: 281.63
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios table
tidy(model_allcounties, exponentiate = TRUE, conf.int = TRUE) %>%
  select(term, estimate, conf.low, conf.high, p.value) %>%
  rename(Variable       = term,
         `Odds Ratio`   = estimate,
         `95% CI Lower` = conf.low,
         `95% CI Upper` = conf.high,
         `P-Value`      = p.value) %>%
  knitr::kable(digits = 3,
               caption = "Table 2: Logit Model — All County Comparisons (Reference = Bexar)") %>%
  kable_styling()

Table 2: Logit Model — All County Comparisons (Reference = Bexar)
Variable	Odds Ratio	95% CI Lower	95% CI Upper	P-Value
(Intercept)	11.573	2.696	52.853	0.001
housing_single_cell	2.323	1.173	4.604	0.015
mh_yes	0.912	0.321	2.383	0.856
suicidal_yes	5.685	1.673	20.372	0.006
sex_male	0.890	0.364	2.350	0.806
race_black	0.239	0.103	0.524	0.001
race_hispanic	0.667	0.305	1.424	0.301
age_at_time_of_death	0.928	0.903	0.952	0.000
agency_countyDALLAS	0.208	0.064	0.579	0.005
agency_countyHARRIS	0.304	0.131	0.678	0.004
agency_countyTRAVIS	0.445	0.155	1.200	0.119

Model Fit Comparison

# AIC comparison — lower AIC = better fit
cat("Model 1 AIC (Demographics Only):", AIC(model1), "\n")

## Model 1 AIC (Demographics Only): 296.981

cat("Model 2 AIC (+ Mental Health):", AIC(model2), "\n")

## Model 2 AIC (+ Mental Health): 294.9452

cat("Model 3 AIC (Primary - Bexar Binary):", AIC(model), "\n")

## Model 3 AIC (Primary - Bexar Binary): 279.144

cat("Model 4 AIC (All Counties):", AIC(model_allcounties), "\n")

## Model 4 AIC (All Counties): 281.6312

# McFadden's Pseudo R-squared
cat("\nMcFadden's Pseudo R-squared:\n")

## 
## McFadden's Pseudo R-squared:

cat("Model 1:", round(1 - (model1$deviance / model1$null.deviance), 4), "\n")

## Model 1: 0.1758

cat("Model 2:", round(1 - (model2$deviance / model2$null.deviance), 4), "\n")

## Model 2: 0.1989

cat("Model 3:", round(1 - (model$deviance / model$null.deviance), 4), "\n")

## Model 3: 0.25

cat("Model 4:", round(1 - (model_allcounties$deviance / model_allcounties$null.deviance), 4), "\n")

## Model 4: 0.2544

SECTION 8: VISUALIZATIONS

Figure 1 — Preventable Death Rates by County

ggplot(four_county_summary,
       aes(x = reorder(agency_county, -preventable_rate),
           y = preventable_rate,
           fill = agency_county)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(label = paste0(round(preventable_rate, 1), "%")),
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("BEXAR"  = "#d73027",
                                "TRAVIS" = "#f46d43",
                                "HARRIS" = "#4575b4",
                                "DALLAS" = "#74add1")) +
  labs(title    = "Figure 1: Preventable Death Rates by County",
       subtitle = "Texas County Jails, 2015–2025",
       x        = "County",
       y        = "Preventable Death Rate (%)") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Figure 2 — Odds Ratio Plot: Primary Model (Bexar Binary)

or_plot_data <- tidy(model, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = recode(term,
      "housing_single_cell"  = "Single Cell Housing",
      "mh_yes"               = "Mental Health Flag",
      "suicidal_yes"         = "Suicidal Ideation",
      "sex_male"             = "Male",
      "race_black"           = "Black",
      "race_hispanic"        = "Hispanic",
      "age_at_time_of_death" = "Age",
      "Bexar"                = "Bexar County"
    ),
    significant = ifelse(p.value < 0.05, "Significant", "Not Significant")
  )

ggplot(or_plot_data,
       aes(x = estimate, y = reorder(term, estimate), color = significant)) +
  geom_point(size = 4) +
  geom_errorbar(aes(xmin = conf.low, xmax = conf.high),
                height = 0.2, orientation = "y") +
  geom_vline(xintercept = 1, linetype = "dashed", color = "gray40") +
  scale_x_log10() +
  scale_color_manual(values = c("Significant"     = "#d73027",
                                 "Not Significant" = "gray60")) +
  labs(title    = "Figure 2: Odds Ratios — Predictors of Preventable Death",
       subtitle = "Reference = Non-Bexar Counties | Log Scale",
       x        = "Odds Ratio (95% CI)",
       y        = NULL,
       color    = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

Figure 3 — Odds Ratio Plot: All County Comparisons

or_county_data <- tidy(model_allcounties, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = recode(term,
      "housing_single_cell"  = "Single Cell Housing",
      "mh_yes"               = "Mental Health Flag",
      "suicidal_yes"         = "Suicidal Ideation",
      "sex_male"             = "Male",
      "race_black"           = "Black",
      "race_hispanic"        = "Hispanic",
      "age_at_time_of_death" = "Age",
      "agency_countyDALLAS"  = "Dallas vs Bexar",
      "agency_countyHARRIS"  = "Harris vs Bexar",
      "agency_countyTRAVIS"  = "Travis vs Bexar"
    ),
    significant = ifelse(p.value < 0.05, "Significant", "Not Significant")
  )

ggplot(or_county_data,
       aes(x = estimate, y = reorder(term, estimate), color = significant)) +
  geom_point(size = 4) +
  geom_errorbar(aes(xmin = conf.low, xmax = conf.high),
                height = 0.2, orientation = "y") +
  geom_vline(xintercept = 1, linetype = "dashed", color = "gray40") +
  scale_x_log10() +
  scale_color_manual(values = c("Significant"     = "#d73027",
                                 "Not Significant" = "gray60")) +
  labs(title    = "Figure 3: Odds Ratios — All County Comparisons",
       subtitle = "Reference Group = Bexar County | Log Scale",
       x        = "Odds Ratio (95% CI)",
       y        = NULL,
       color    = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

Figure 4 — Preventable Death Rate by Suicidal Ideation

suicidal_summary <- countyjail_data %>%
  group_by(suicidal_yes) %>%
  summarise(
    preventable_rate = mean(preventable_death) * 100,
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(label = ifelse(suicidal_yes == 1,
                        "Suicidal Ideation\nDocumented",
                        "No Suicidal\nIdeation"))

ggplot(suicidal_summary,
       aes(x = label, y = preventable_rate, fill = label)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(round(preventable_rate, 1), "%")),
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("No Suicidal\nIdeation"       = "#4575b4",
                                "Suicidal Ideation\nDocumented" = "#d73027")) +
  labs(title = "Figure 4: Preventable Death Rate by Suicidal Ideation",
       x     = NULL,
       y     = "Preventable Death Rate (%)") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Figure 5 — Age Distribution by Preventable Death Status

countyjail_data %>%
  mutate(Death_Type = ifelse(preventable_death == 1,
                             "Preventable", "Non-Preventable")) %>%
  ggplot(aes(x = Death_Type, y = age_at_time_of_death, fill = Death_Type)) +
  geom_boxplot(alpha = 0.8, width = 0.5) +
  geom_jitter(width = 0.1, alpha = 0.2, size = 1.5) +
  scale_fill_manual(values = c("Non-Preventable" = "#4575b4",
                                "Preventable"     = "#d73027")) +
  labs(title    = "Figure 5: Age Distribution by Preventable Death Status",
       subtitle = "Mean age: Non-Preventable = 48.9 | Preventable = 36.6",
       x        = NULL,
       y        = "Age at Time of Death") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Figure 6 — Preventable Death Rate by Housing Type

housing_summary <- countyjail_data %>%
  group_by(housing_single_cell) %>%
  summarise(
    preventable_rate = mean(preventable_death) * 100,
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(label = ifelse(housing_single_cell == 1,
                        "Single Cell\nHousing",
                        "Other\nHousing"))

ggplot(housing_summary,
       aes(x = label, y = preventable_rate, fill = label)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(round(preventable_rate, 1), "%")),
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("Other\nHousing"      = "#4575b4",
                                "Single Cell\nHousing" = "#d73027")) +
  labs(title = "Figure 6: Preventable Death Rate by Housing Type",
       x     = NULL,
       y     = "Preventable Death Rate (%)") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

SECTION 9: KEY RESULTS REFERENCE

cat("=== KEY FINDINGS REFERENCE ===\n\n")

## === KEY FINDINGS REFERENCE ===

cat("SAMPLE:\n")

## SAMPLE:

cat("Total deaths:", nrow(countyjail_data), "\n")

## Total deaths: 390

cat("Preventable deaths: 64 (16.4%)\n\n")

## Preventable deaths: 64 (16.4%)

cat("COUNTY PREVENTABLE DEATH RATES:\n")

## COUNTY PREVENTABLE DEATH RATES:

print(four_county_summary)

## # A tibble: 4 × 4
##   agency_county total_deaths preventable_deaths preventable_rate
##   <fct>                <int>              <int>            <dbl>
## 1 BEXAR                  119                 33            27.7 
## 2 DALLAS                  77                  6             7.79
## 3 HARRIS                 151                 13             8.61
## 4 TRAVIS                  43                 12            27.9

cat("\nPRIMARY MODEL — SIGNIFICANT PREDICTORS:\n")

## 
## PRIMARY MODEL — SIGNIFICANT PREDICTORS:

cat("Bexar County:       OR = 3.303, 95% CI [1.669, 6.676], p = .001\n")

## Bexar County:       OR = 3.303, 95% CI [1.669, 6.676], p = .001

cat("Suicidal Ideation:  OR = 5.474, 95% CI [1.637, 19.245], p = .006\n")

## Suicidal Ideation:  OR = 5.474, 95% CI [1.637, 19.245], p = .006

cat("Single Cell Housing: OR = 2.580, 95% CI [1.344, 4.980], p = .004\n")

## Single Cell Housing: OR = 2.580, 95% CI [1.344, 4.980], p = .004

cat("Age (per year):     OR = 0.928, 95% CI [0.902, 0.951], p < .001\n")

## Age (per year):     OR = 0.928, 95% CI [0.902, 0.951], p < .001

cat("Race (Black):       OR = 0.230, 95% CI [0.099, 0.499], p < .001\n\n")

## Race (Black):       OR = 0.230, 95% CI [0.099, 0.499], p < .001

cat("ALL COUNTY MODEL — COUNTY COMPARISONS VS BEXAR:\n")

## ALL COUNTY MODEL — COUNTY COMPARISONS VS BEXAR:

cat("Dallas vs Bexar: OR = 0.208, 95% CI [0.064, 0.579], p = .005\n")

## Dallas vs Bexar: OR = 0.208, 95% CI [0.064, 0.579], p = .005

cat("Harris vs Bexar: OR = 0.304, 95% CI [0.131, 0.678], p = .004\n")

## Harris vs Bexar: OR = 0.304, 95% CI [0.131, 0.678], p = .004

cat("Travis vs Bexar: OR = 0.445, 95% CI [0.155, 1.200], p = .119 (ns)\n\n")

## Travis vs Bexar: OR = 0.445, 95% CI [0.155, 1.200], p = .119 (ns)

cat("MODEL FIT:\n")

## MODEL FIT:

cat("Primary Model AIC:", round(AIC(model), 2), "\n")

## Primary Model AIC: 279.14

cat("All County Model AIC:", round(AIC(model_allcounties), 2), "\n")

## All County Model AIC: 281.63

cat("Primary Model McFadden R2:",
    round(1 - (model$deviance / model$null.deviance), 4), "\n")

## Primary Model McFadden R2: 0.25

Predictors of Preventable Death in Texas County Jails: Complete R Codebook

Janice Palma

2026

SECTION 1: LOAD LIBRARIES

SECTION 2: LOAD AND PREPARE DATA

SECTION 3: CREATE SUMMARY DATASETS

SECTION 4: DESCRIPTIVE STATISTICS

SECTION 5: BIVARIATE TESTS

SECTION 6: CORRELATION MATRIX

SECTION 7: LOGISTIC REGRESSION MODELS

Model 1 — Demographics Only

Model 2 — Demographics + Mental Health + Custody Time

Model 3 — Primary Model (Professor’s Specification): Bexar Binary

Model 4 — All County Comparison (Reference = Bexar)

Model Fit Comparison

SECTION 8: VISUALIZATIONS

Figure 1 — Preventable Death Rates by County

Figure 2 — Odds Ratio Plot: Primary Model (Bexar Binary)

Figure 3 — Odds Ratio Plot: All County Comparisons

Figure 4 — Preventable Death Rate by Suicidal Ideation

Figure 5 — Age Distribution by Preventable Death Status

Figure 6 — Preventable Death Rate by Housing Type

SECTION 9: KEY RESULTS REFERENCE