Made with R Studio version 2024.12.1

Load the data set from 2023 National Survey on Drug Use and Health (NSDUH) into R Studio.

Data Table (Quantitative):

library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(pastecs)

## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(lmtest)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(stargazer)

## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(janitor)

## Warning: package 'janitor' was built under R version 4.4.3

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(broom)

## Warning: package 'broom' was built under R version 4.4.3

load('NSDUH_2023.Rdata')

This dataset is too large. Let’s create smaller dataset.

Select just the 11 variables: SUTNEEDPY, SUTRTPY, CAIDCHIP, IRINSUR4, COUTYP4, IRSEX, AGE3, NEWRACE2, EDUHIGHCAT, INCOME, IRWRKSTAT18

select <- dplyr::select

Treatment<-select(puf2023_102124, SUTNEEDPY, SUTRTPY, CAIDCHIP, IRINSUR4, COUTYP4, IRSEX, AGE3, NEWRACE2, EDUHIGHCAT, INCOME, IRWRKSTAT18)

Respondents who are aged 12-17 (AGE3=1, 2, 3) as well as only respondents who do not need SUD treatment (SUTNEEDPY = 0) will be removed from the data set, as well as responses to Medicaid coverage (CAIDCHIP= 85, 94, 97, 98)

TreatmentClean <- Treatment %>% 
  filter(AGE3 >=4 & SUTNEEDPY >=1 & CAIDCHIP <=3)

TreatmentClean <- TreatmentClean %>%
  mutate(across(everything(), as.factor))

summary(TreatmentClean)

##  SUTNEEDPY SUTRTPY  CAIDCHIP IRINSUR4 COUTYP4  IRSEX         AGE3      NEWRACE2
##  1:10156   0:7991   1:3189   1:8913   1:4537   1:5097   9      :2765   1:5942  
##            1:2165   2:6967   2:1243   2:3955   2:5059   5      :1600   2:1227  
##                                       3:1664            8      :1341   3: 243  
##                                                         4      :1123   4:  54  
##                                                         7      :1060   5: 255  
##                                                         6      :1022   6: 564  
##                                                         (Other):1245   7:1871  
##  EDUHIGHCAT INCOME   IRWRKSTAT18
##  1:1265     1:2172   1:5163     
##  2:2984     2:2910   2:1621     
##  3:3152     3:1497   3: 874     
##  4:2755     4:3577   4:2498     
##                                 
##                                 
##

str(TreatmentClean)

## tibble [10,156 × 11] (S3: tbl_df/tbl/data.frame)
##  $ SUTNEEDPY  : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SUTRTPY    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 1 2 1 ...
##  $ CAIDCHIP   : Factor w/ 2 levels "1","2": 2 1 2 1 2 2 2 1 2 1 ...
##  $ IRINSUR4   : Factor w/ 2 levels "1","2": 2 1 2 1 1 1 2 1 1 1 ...
##  $ COUTYP4    : Factor w/ 3 levels "1","2","3": 3 2 1 3 1 2 2 3 2 1 ...
##  $ IRSEX      : Factor w/ 2 levels "1","2": 1 2 1 1 1 1 1 1 1 1 ...
##  $ AGE3       : Factor w/ 8 levels "4","5","6","7",..: 6 6 5 7 7 4 6 5 1 2 ...
##  $ NEWRACE2   : Factor w/ 7 levels "1","2","3","4",..: 1 4 1 3 1 7 1 1 1 3 ...
##  $ EDUHIGHCAT : Factor w/ 4 levels "1","2","3","4": 2 2 4 2 4 4 3 1 2 3 ...
##  $ INCOME     : Factor w/ 4 levels "1","2","3","4": 4 1 2 2 4 4 2 2 4 4 ...
##  $ IRWRKSTAT18: Factor w/ 4 levels "1","2","3","4": 1 4 1 4 1 1 3 1 1 3 ...

There are no N/As to remove from this data set. All 11 variables have values for all the remaining observations. Let’s recode them to give them meaningful labels.

TreatmentLabeled <- TreatmentClean %>%
  mutate(
    ReceivedTreatment = factor(SUTRTPY, levels = c(0, 1), labels = c("No", "Yes")),
    MedicaidCoverage = factor(CAIDCHIP, 
                      levels = c(1, 2),
                      labels = c("Medicaid", "No Medicaid")),
     Insurance = factor(IRINSUR4,
                      levels = c(1, 2),
                      labels = c("Insurance", "No Insurance")),
    Geography = factor(COUTYP4, 
                      levels = c(1, 2, 3),
                      labels = c("Large Metro", "Small Metro", "Nonmetro")),
    Sex = factor(IRSEX,
                levels = c(1, 2),
                labels = c("Male", "Female")),
    Age = factor(AGE3, 
                 levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), 
                 labels = c("12-13 years", "14-15 years", "16-17 years", "18-20 years", "21-23 years", "24-25 years", "26-29 years", "30-34 years", "35-49 years", "50-64 years", "65 or older")),
    Race = factor(NEWRACE2, 
                  levels = c(1, 2, 3, 4, 5, 6, 7), 
                  labels = c("White", "Black", "Native American", "Native Hawaiian/Pacific Islander", "Asian", "More than one", "Hispanic")),
    EducationLevel = factor(EDUHIGHCAT,
                levels = c(1, 2, 3, 4, 5),
                labels = c("less than H.S", "H.S Grad", "Some College", "College Grad", "12-17 years old")),
    Income = factor(INCOME,
                levels = c(1, 2, 3, 4),
                labels = c("<$20,000", "$20,000-$49,000", "$50,000-$74,000", "$75,000 or more")),
    Employment = factor(IRWRKSTAT18,
                levels = c(1, 2, 3, 4),
                labels = c("Full-time", "Part-time", "Unemployed", "Other")),
   
  ) %>%
  
  select(-SUTNEEDPY,-SUTRTPY,-CAIDCHIP,-IRINSUR4,-COUTYP4, -IRSEX, -AGE3, -NEWRACE2, -EDUHIGHCAT, -INCOME, -IRWRKSTAT18)

Let’s explore this data now…

table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Insurance, TreatmentLabeled$MedicaidCoverage)

## , ,  = Medicaid
## 
##      
##       Insurance No Insurance
##   No       2155            0
##   Yes      1034            0
## 
## , ,  = No Medicaid
## 
##      
##       Insurance No Insurance
##   No       4806         1030
##   Yes       918          213

tabyl(TreatmentLabeled$ReceivedTreatment)

##  TreatmentLabeled$ReceivedTreatment    n   percent
##                                  No 7991 0.7868255
##                                 Yes 2165 0.2131745

tabyl(TreatmentLabeled$MedicaidCoverage)

##  TreatmentLabeled$MedicaidCoverage    n   percent
##                           Medicaid 3189 0.3140016
##                        No Medicaid 6967 0.6859984

tabyl(TreatmentLabeled$Insurance)

##  TreatmentLabeled$Insurance    n   percent
##                   Insurance 8913 0.8776093
##                No Insurance 1243 0.1223907

tabyl(TreatmentLabeled$Geography)

##  TreatmentLabeled$Geography    n  percent
##                 Large Metro 4537 0.446731
##                 Small Metro 3955 0.389425
##                    Nonmetro 1664 0.163844

tabyl(TreatmentLabeled$Sex)

##  TreatmentLabeled$Sex    n   percent
##                  Male 5097 0.5018708
##                Female 5059 0.4981292

tabyl(TreatmentLabeled$Age)

##  TreatmentLabeled$Age    n    percent
##           12-13 years    0 0.00000000
##           14-15 years    0 0.00000000
##           16-17 years    0 0.00000000
##           18-20 years 1123 0.11057503
##           21-23 years 1600 0.15754234
##           24-25 years 1022 0.10063017
##           26-29 years 1060 0.10437180
##           30-34 years 1341 0.13204017
##           35-49 years 2765 0.27225286
##           50-64 years  791 0.07788499
##           65 or older  454 0.04470264

tabyl(TreatmentLabeled$Race)

##             TreatmentLabeled$Race    n     percent
##                             White 5942 0.585072863
##                             Black 1227 0.120815282
##                   Native American  243 0.023926743
##  Native Hawaiian/Pacific Islander   54 0.005317054
##                             Asian  255 0.025108310
##                     More than one  564 0.055533675
##                          Hispanic 1871 0.184226073

tabyl(TreatmentLabeled$EducationLevel)

##  TreatmentLabeled$EducationLevel    n   percent
##                    less than H.S 1265 0.1245569
##                         H.S Grad 2984 0.2938165
##                     Some College 3152 0.3103584
##                     College Grad 2755 0.2712682
##                  12-17 years old    0 0.0000000

tabyl(TreatmentLabeled$Income)

##  TreatmentLabeled$Income    n   percent
##                 <$20,000 2172 0.2138637
##          $20,000-$49,000 2910 0.2865301
##          $50,000-$74,000 1497 0.1474006
##          $75,000 or more 3577 0.3522056

tabyl(TreatmentLabeled$Employment)

##  TreatmentLabeled$Employment    n   percent
##                    Full-time 5163 0.5083694
##                    Part-time 1621 0.1596101
##                   Unemployed  874 0.0860575
##                        Other 2498 0.2459630

freq_tables <- table(TreatmentLabeled)

variable_order <- c("ReceivedTreatment", "MedicaidCoverage", "Insurance", "Geography", "Sex", "Age", "Race", "EducationLevel", "Income", "Employment")

pivot_detailed <- TreatmentLabeled %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
  group_by(Variable, Value) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Variable) %>%
  mutate(
    Percentage = round(Count/sum(Count)*100, 1),
    Total = sum(Count)
  ) %>%
  ungroup() %>%  
  mutate(Variable = factor(Variable, levels = variable_order)) %>%  # Set custom order
  arrange(Variable, Value)


# View
print(pivot_detailed, n = 200)

## # A tibble: 38 × 5
##    Variable          Value                            Count Percentage Total
##    <fct>             <fct>                            <int>      <dbl> <int>
##  1 ReceivedTreatment No                                7991       78.7 10156
##  2 ReceivedTreatment Yes                               2165       21.3 10156
##  3 MedicaidCoverage  Medicaid                          3189       31.4 10156
##  4 MedicaidCoverage  No Medicaid                       6967       68.6 10156
##  5 Insurance         Insurance                         8913       87.8 10156
##  6 Insurance         No Insurance                      1243       12.2 10156
##  7 Geography         Large Metro                       4537       44.7 10156
##  8 Geography         Small Metro                       3955       38.9 10156
##  9 Geography         Nonmetro                          1664       16.4 10156
## 10 Sex               Male                              5097       50.2 10156
## 11 Sex               Female                            5059       49.8 10156
## 12 Age               18-20 years                       1123       11.1 10156
## 13 Age               21-23 years                       1600       15.8 10156
## 14 Age               24-25 years                       1022       10.1 10156
## 15 Age               26-29 years                       1060       10.4 10156
## 16 Age               30-34 years                       1341       13.2 10156
## 17 Age               35-49 years                       2765       27.2 10156
## 18 Age               50-64 years                        791        7.8 10156
## 19 Age               65 or older                        454        4.5 10156
## 20 Race              White                             5942       58.5 10156
## 21 Race              Black                             1227       12.1 10156
## 22 Race              Native American                    243        2.4 10156
## 23 Race              Native Hawaiian/Pacific Islander    54        0.5 10156
## 24 Race              Asian                              255        2.5 10156
## 25 Race              More than one                      564        5.6 10156
## 26 Race              Hispanic                          1871       18.4 10156
## 27 EducationLevel    less than H.S                     1265       12.5 10156
## 28 EducationLevel    H.S Grad                          2984       29.4 10156
## 29 EducationLevel    Some College                      3152       31   10156
## 30 EducationLevel    College Grad                      2755       27.1 10156
## 31 Income            <$20,000                          2172       21.4 10156
## 32 Income            $20,000-$49,000                   2910       28.7 10156
## 33 Income            $50,000-$74,000                   1497       14.7 10156
## 34 Income            $75,000 or more                   3577       35.2 10156
## 35 Employment        Full-time                         5163       50.8 10156
## 36 Employment        Part-time                         1621       16   10156
## 37 Employment        Unemployed                         874        8.6 10156
## 38 Employment        Other                             2498       24.6 10156

class(pivot_detailed$Value)

## [1] "factor"

pivot_detailed %>% filter(Variable == "Income") %>% select(Value)

## # A tibble: 4 × 1
##   Value          
##   <fct>          
## 1 <$20,000       
## 2 $20,000-$49,000
## 3 $50,000-$74,000
## 4 $75,000 or more

pivot_detailed %>% 
  filter(Variable == "Income") %>% 
  print(n = Inf)

## # A tibble: 4 × 5
##   Variable Value           Count Percentage Total
##   <fct>    <fct>           <int>      <dbl> <int>
## 1 Income   <$20,000         2172       21.4 10156
## 2 Income   $20,000-$49,000  2910       28.7 10156
## 3 Income   $50,000-$74,000  1497       14.7 10156
## 4 Income   $75,000 or more  3577       35.2 10156

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.4.3

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

apa_frequency_table <- pivot_detailed %>%
  mutate(
    Variable_Value = paste(as.character(Variable), as.character(Value), sep = ": "),
    Percent = paste0(Percentage, "%")
  ) %>%
  select(Variable_Value, Count, Percent) %>%
  as.data.frame()

colnames(apa_frequency_table) <- c("Variable", "n", "%")

apa_frequency_table %>%
  kbl(format = "html", caption = "<b>Table 1</b><br><i>Frequency Distribution of Sample Characteristics</i>",
      align = c("l", "r", "r")) %>%  
  kable_classic(full_width = FALSE, html_font = "Arial") %>%
  column_spec(1, width = "17em") %>%
  column_spec(2, width = "5em") %>%
  column_spec(3, width = "5em") %>%
  row_spec(0, bold = TRUE) %>%  
  save_kable("Frequency_APA2.html")

Let’s plot these variables on bar chart

ggplot(TreatmentLabeled, aes(x =factor(MedicaidCoverage), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Medicaid Status", x="Medicaid Status", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Insurance), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Insurance Status", x="Insurance Status", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Geography), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Geography", x="Metro Status", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Sex), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Sex", x="Sex assigned at birth", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Age), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Age", x="Age Range", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment")) +
    theme(axis.text.x = element_text(size = 6))

ggplot(TreatmentLabeled, aes(x =factor(Race), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Race", x="Race", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment")) +
        scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(axis.text.x = element_text(size = 7))

ggplot(TreatmentLabeled, aes(x =factor(EducationLevel), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Highest Level of Education", x="Education", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Income), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Income", x="Income", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

ggplot(TreatmentLabeled, aes(x =factor(Employment), fill = (ReceivedTreatment))) + 
  geom_bar() +
  labs(title = "Employement Status", x="Employment Status", y="Respondents", fill = "Received Treatment") +
  scale_fill_manual(values = c("No" = "coral", "Yes" = "steelblue"),
                    labels = c("No" = "No Treatment", "Yes" = "Received Treatment"))

RELATIONSHIPS BETWEEN VARIABLES

CAN’T RUN BECAUSE ALL VARIABLES ARE FACTORS: >hist(TreatmentClean$INCOME) >cor(TreatmentClean$SUTRTPY, TreatmentClean$CAIDCHIP) >Treatmentmodel1 <- lm(formula = SUTRTPY~CAIDCHIP+IRINSUR4,data=TreatmentClean) >TreatmentModel_2<-lm(SUTRTPY~CAIDCHIP+COUTYP4+AGE3,data=TreatmentClean) >library(lsr) for cramersV(table(TreatmentClean$SUTRTPY, TreatmentClean$CAIDCHIP))

CAN RUN THIS

result <- chisq.test(table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Insurance))
print(result)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Insurance)
## X-squared = 14.482, df = 1, p-value = 0.0001415

p-value is less than 0.05, demonstrating that receiving treatment is significantly associated with Insurance

result <- chisq.test(table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$MedicaidCoverage))
print(result)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$MedicaidCoverage)
## X-squared = 340.91, df = 1, p-value < 2.2e-16

p-value is MUCH MUCH MUCH less than 0.05, demonstrating that receiving treatment is significantly associated with Medicaid Coverage

    results_tidy <- TreatmentLabeled %>%
      pivot_longer(cols = -ReceivedTreatment, names_to = "variable", values_to = "value") %>%
      group_by(variable) %>%
      nest() %>%
      mutate(stats = map(data, ~broom::tidy(chisq.test(.$ReceivedTreatment, .$value)))) %>%
      select(-data) %>%
      unnest(cols = stats)

print(results_tidy)

## # A tibble: 9 × 5
## # Groups:   variable [9]
##   variable         statistic  p.value parameter method                          
##   <chr>                <dbl>    <dbl>     <int> <chr>                           
## 1 MedicaidCoverage     341.  4.03e-76         1 Pearson's Chi-squared test with…
## 2 Insurance             14.5 1.42e- 4         1 Pearson's Chi-squared test with…
## 3 Geography             30.7 2.18e- 7         2 Pearson's Chi-squared test      
## 4 Sex                   24.0 9.75e- 7         1 Pearson's Chi-squared test with…
## 5 Age                  133.  1.71e-25         7 Pearson's Chi-squared test      
## 6 Race                  20.5 2.23e- 3         6 Pearson's Chi-squared test      
## 7 EducationLevel       161.  1.22e-34         3 Pearson's Chi-squared test      
## 8 Income               180.  8.71e-39         3 Pearson's Chi-squared test      
## 9 Employment           205.  3.23e-44         3 Pearson's Chi-squared test

results_df <- as.data.frame(results_tidy)

stargazer(results_df,
          type = "html",
          out = "ChiSquare.htm",
          summary = FALSE,
          title = "Chi-Square Test Results by Variable",
          digits = 2,
          rownames = FALSE)

## 
## <table style="text-align:center"><caption><strong>Chi-Square Test Results by Variable</strong></caption>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">variable</td><td>statistic</td><td>p.value</td><td>parameter</td><td>method</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">MedicaidCoverage</td><td>340.91</td><td>0</td><td>1</td><td>Pearson's Chi-squared test with Yates' continuity correction</td></tr>
## <tr><td style="text-align:left">Insurance</td><td>14.48</td><td>0.0001</td><td>1</td><td>Pearson's Chi-squared test with Yates' continuity correction</td></tr>
## <tr><td style="text-align:left">Geography</td><td>30.67</td><td>0.0000</td><td>2</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td style="text-align:left">Sex</td><td>23.98</td><td>0.0000</td><td>1</td><td>Pearson's Chi-squared test with Yates' continuity correction</td></tr>
## <tr><td style="text-align:left">Age</td><td>132.71</td><td>0</td><td>7</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td style="text-align:left">Race</td><td>20.53</td><td>0.002</td><td>6</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td style="text-align:left">EducationLevel</td><td>160.82</td><td>0</td><td>3</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td style="text-align:left">Income</td><td>180.03</td><td>0</td><td>3</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td style="text-align:left">Employment</td><td>205.17</td><td>0</td><td>3</td><td>Pearson's Chi-squared test</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr></table>

Let’s try another one. This one is for Cramer’s V Cramer’s V – an effect size (ES) metric that quantifies the overall associations within a contingency table’s rows and columns.

cramer_v <- function(x, y) {
  cm <- table(x, y)
  n <- sum(cm)
  r <- nrow(cm)
  k <- ncol(cm)
  
  chi2 <- chisq.test(cm)$statistic
  chi2corr <- max(0, chi2 - (k - 1) * (r - 1) / (n - 1))
  
  kcorr <- k - (k - 1)^2 / (n - 1)
  rcorr <- r - (r - 1)^2 / (n - 1)
  
  return(sqrt(chi2corr / n) / min(kcorr - 1, rcorr - 1))
}

a1 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$ReceivedTreatment)
a2 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$MedicaidCoverage)
a3 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Insurance)
a4 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Geography)
a5 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Sex)
a6 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a7 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Race)
a8 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a9 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Income)
a10 <- cramer_v(TreatmentLabeled$ReceivedTreatment,TreatmentLabeled$Employment)

a11 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$ReceivedTreatment)
a12 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$MedicaidCoverage)
a13 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Insurance)
a14 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Geography)
a15 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Sex)
a16 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a17 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Race)
a18 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a19 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Income)
a20 <- cramer_v(TreatmentLabeled$MedicaidCoverage,TreatmentLabeled$Employment)

a21 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$ReceivedTreatment)
a22 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$MedicaidCoverage)
a23 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Insurance)
a24 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Geography)
a25 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Sex)
a26 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a27 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Race)
a28 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a29 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Income)
a30 <- cramer_v(TreatmentLabeled$Insurance,TreatmentLabeled$Employment)

a31 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$ReceivedTreatment)
a32 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$MedicaidCoverage)
a33 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Insurance)
a34 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Geography)
a35 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Sex)
a36 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a37 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Race)
a38 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a39 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Income)
a40 <- cramer_v(TreatmentLabeled$Geography,TreatmentLabeled$Employment)

a41 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$ReceivedTreatment)
a42 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$MedicaidCoverage)
a43 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Insurance)
a44 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Geography)
a45 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Sex)
a46 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a47 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Race)
a48 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a49 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Income)
a50 <- cramer_v(TreatmentLabeled$Sex,TreatmentLabeled$Employment)

a51 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$ReceivedTreatment)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a52 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$MedicaidCoverage)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a53 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Insurance)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a54 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Geography)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a55 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Sex)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a56 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a57 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Race)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a58 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a59 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Income)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a60 <- cramer_v(TreatmentLabeled$Age,TreatmentLabeled$Employment)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a61 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$ReceivedTreatment)
a62 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$MedicaidCoverage)
a63 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Insurance)
a64 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Geography)
a65 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Sex)
a66 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a67 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Race)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a68 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a69 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Income)
a70 <- cramer_v(TreatmentLabeled$Race,TreatmentLabeled$Employment)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a71 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$ReceivedTreatment)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a72 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$MedicaidCoverage)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a73 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Insurance)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a74 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Geography)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a75 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Sex)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a76 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a77 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Race)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a78 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a79 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Income)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a80 <- cramer_v(TreatmentLabeled$EducationLevel,TreatmentLabeled$Employment)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a81 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$ReceivedTreatment)
a82 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$MedicaidCoverage)
a83 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Insurance)
a84 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Geography)
a85 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Sex)
a86 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a87 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Race)
a88 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a89 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Income)
a90 <- cramer_v(TreatmentLabeled$Income,TreatmentLabeled$Employment)

a91 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$ReceivedTreatment)
a92 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$MedicaidCoverage)
a93 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Insurance)
a94 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Geography)
a95 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Sex)
a96 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a97 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Race)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a98 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$EducationLevel)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

a99 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Income)
a100 <- cramer_v(TreatmentLabeled$Employment,TreatmentLabeled$Employment)

d <- data.frame(
  ReceivedTreatment=c(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10),
  Medicaid=c(a11,a12,a13,a14,a15,a16,a17,a18,a19,a20),
  Insurance=c(a21,a22,a23,a24,a25,a26,a27,a28,a29,a30),
  Geography=c(a31,a32,a33,a34,a35,a36,a37,a38,a39,a40),
  Sex=c(a41,a42,a43,a44,a45,a46,a47,a48,a49,a50),
  Age=c(a51,a52,a53,a54,a55,a56,a57,a58,a59,a60),
  Race=c(a61,a62,a63,a64,a65,a66,a67,a68,a69,a70),
  Education=c(a71,a72,a73,a74,a75,a76,a77,a78,a79,a80),
  Income=c(a81,a82,a83,a84,a85,a86,a87,a88,a89,a90),
  Employment=c(a91,a92,a93,a94,a95,a96,a97,a98,a99,a100))

print(d)

##    ReceivedTreatment   Medicaid   Insurance   Geography        Sex Age
## 1         0.99980493 0.18323326 0.037765105 0.054963045 0.04859405 NaN
## 2         0.18323326 0.99986990 0.252356206 0.073318435 0.12092569 NaN
## 3         0.03776511 0.25235621 0.999640083 0.009212179 0.08093075 NaN
## 4         0.05496304 0.07331843 0.009212179 0.707246065 0.03383537 NaN
## 5         0.04859405 0.12092569 0.080930754 0.033835372 0.99990153 NaN
## 6                NaN        NaN         NaN         NaN        NaN NaN
## 7         0.04495983 0.20405379 0.128678007 0.112291302 0.03837589 NaN
## 8                NaN        NaN         NaN         NaN        NaN NaN
## 9         0.13315200 0.39782468 0.147929159 0.075893564 0.10895653 NaN
## 10        0.14214606 0.29537716 0.078979593 0.032510736 0.12217652 NaN
##          Race Education     Income Employment
## 1  0.04495983       NaN 0.13315200 0.14214606
## 2  0.20405379       NaN 0.39782468 0.29537716
## 3  0.12867801       NaN 0.14792916 0.07897959
## 4  0.11229130       NaN 0.07589356 0.03251074
## 5  0.03837589       NaN 0.10895653 0.12217652
## 6         NaN       NaN        NaN        NaN
## 7  0.40848963       NaN 0.07014788 0.04736897
## 8         NaN       NaN        NaN        NaN
## 9  0.07014788       NaN 0.57752087 0.11464757
## 10 0.04736897       NaN 0.11464757 0.57752087

rownames(d) <- colnames(d)
variables <- colnames(d)

cor_TreatmentReceived <- data.frame()

for (i in 1:length(variables)) {
  for (j in 1:length(variables)) {
    row <- data.frame(
      Variable1 = variables[i],
      Variable2 = variables[j],
      Cramers_V = round(d[i, j],2)
    )
    cor_TreatmentReceived <- rbind(cor_TreatmentReceived, row)
  }
}

cor_TreatmentReceived <- cor_TreatmentReceived[cor_TreatmentReceived$Variable1 != cor_TreatmentReceived$Variable2, ]
rownames(cor_TreatmentReceived) <- NULL
colnames(cor_TreatmentReceived) <- c("Variable 1", "Variable 2", "Cramér's V")


cor_TreatmentReceived %>%
  kbl(format = "html", caption = "<b>Table 2</b><br><i>Cramér's V Correlations Between Variables</i>",
      align = c("l", "l", "r")) %>%  
  kable_classic(full_width = FALSE, html_font = "Arial") %>%
  column_spec(1, width = "15em") %>%
  column_spec(2, width = "15em") %>%
  column_spec(3, width = "8em") %>%
  row_spec(0, bold = TRUE) %>%  
  save_kable("Correlation_APA.html")

cramers_matrix <- d
cramers_matrix[upper.tri(cramers_matrix)] <- NA  #
diag(cramers_matrix) <- NA  

cramers_matrix <- round(cramers_matrix, 2)

var_names <- rownames(cramers_matrix)
cramers_matrix <- as.data.frame(cramers_matrix)
cramers_matrix <- cbind(Variable = var_names, cramers_matrix)

cramers_matrix %>%
  kbl(format = "html", 
      caption = "<b>Table 2</b><br><i>Cramér's V Associations Between Categorical Variables</i>",
      align = c("l", rep("r", ncol(cramers_matrix)-1))) %>%
  kable_classic(full_width = FALSE, html_font = "Arial") %>%
  row_spec(0, bold = TRUE) %>%
  footnote(general = "Note. Values represent Cramér's V coefficients. Interpretation: < .20 = weak association, .20-.40 = moderate association, > .40 = strong association.",
           general_title = "",
           footnote_as_chunk = TRUE) %>%
  save_kable("Cramers_V_APA.html")

library(ggplot2)


ggplot(data = cor_TreatmentReceived, aes(x = `Variable 1`, y = `Variable 2`, fill = `Cramér's V`)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") + 
  labs(title = "Cramer's V Heatmap",
       x = "Variable 1",
       y = "Variable 2",
       fill = "Cramer's V") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(vcd)

## Warning: package 'vcd' was built under R version 4.4.3

## Loading required package: grid

tbl <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$MedicaidCoverage)
chisq.test(tbl)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl
## X-squared = 340.91, df = 1, p-value < 2.2e-16

assocstats(tbl)

##                     X^2 df P(> X^2)
## Likelihood Ratio 325.94  1        0
## Pearson          341.88  1        0
## 
## Phi-Coefficient   : 0.183 
## Contingency Coeff.: 0.18 
## Cramer's V        : 0.183

tbl2 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Geography)
chisq.test(tbl2)

## 
##  Pearson's Chi-squared test
## 
## data:  tbl2
## X-squared = 30.675, df = 2, p-value = 2.183e-07

assocstats(tbl2)

##                     X^2 df   P(> X^2)
## Likelihood Ratio 30.325  2 2.5999e-07
## Pearson          30.675  2 2.1830e-07
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.055 
## Cramer's V        : 0.055

tbl3 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Insurance)
chisq.test(tbl3)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl3
## X-squared = 14.482, df = 1, p-value = 0.0001415

assocstats(tbl3)

##                     X^2 df   P(> X^2)
## Likelihood Ratio 15.446  1 8.4905e-05
## Pearson          14.764  1 1.2181e-04
## 
## Phi-Coefficient   : 0.038 
## Contingency Coeff.: 0.038 
## Cramer's V        : 0.038

tbl4 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Income)
chisq.test(tbl4)

## 
##  Pearson's Chi-squared test
## 
## data:  tbl4
## X-squared = 180.03, df = 3, p-value < 2.2e-16

assocstats(tbl4)

##                     X^2 df P(> X^2)
## Likelihood Ratio 179.76  3        0
## Pearson          180.03  3        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.132 
## Cramer's V        : 0.133

tbl5 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Employment)
chisq.test(tbl5)

## 
##  Pearson's Chi-squared test
## 
## data:  tbl5
## X-squared = 205.17, df = 3, p-value < 2.2e-16

assocstats(tbl5)

##                     X^2 df P(> X^2)
## Likelihood Ratio 198.71  3        0
## Pearson          205.17  3        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.141 
## Cramer's V        : 0.142

tbl6 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Race)
chisq.test(tbl6)

## 
##  Pearson's Chi-squared test
## 
## data:  tbl6
## X-squared = 20.526, df = 6, p-value = 0.002231

assocstats(tbl6)

##                     X^2 df  P(> X^2)
## Likelihood Ratio 19.330  6 0.0036413
## Pearson          20.526  6 0.0022314
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.045 
## Cramer's V        : 0.045

tbl7 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Age)
chisq.test(tbl7)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  tbl7
## X-squared = NaN, df = 10, p-value = NA

assocstats(tbl7)

##                     X^2 df P(> X^2)
## Likelihood Ratio 133.96 10        0
## Pearson             NaN 10      NaN
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: NaN 
## Cramer's V        : NaN

tbl8 <- table(TreatmentLabeled$ReceivedTreatment, TreatmentLabeled$Sex)
chisq.test(tbl8)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl8
## X-squared = 23.978, df = 1, p-value = 9.746e-07

assocstats(tbl8)

##                     X^2 df   P(> X^2)
## Likelihood Ratio 24.240  1 8.5053e-07
## Pearson          24.215  1 8.6139e-07
## 
## Phi-Coefficient   : 0.049 
## Contingency Coeff.: 0.049 
## Cramer's V        : 0.049

REGRESSION TIME

model <- glm(ReceivedTreatment ~ MedicaidCoverage + Insurance + Geography + Sex + Age + Race + EducationLevel + Income + Employment, data = TreatmentLabeled, family = binomial)

summary(model)

## 
## Call:
## glm(formula = ReceivedTreatment ~ MedicaidCoverage + Insurance + 
##     Geography + Sex + Age + Race + EducationLevel + Income + 
##     Employment, family = binomial, data = TreatmentLabeled)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          -1.07337    0.13025  -8.241  < 2e-16 ***
## MedicaidCoverageNo Medicaid          -0.56621    0.06507  -8.701  < 2e-16 ***
## InsuranceNo Insurance                -0.14410    0.09137  -1.577 0.114768    
## GeographySmall Metro                  0.06521    0.05722   1.140 0.254406    
## GeographyNonmetro                     0.08389    0.07365   1.139 0.254691    
## SexFemale                             0.13857    0.05198   2.666 0.007679 ** 
## Age21-23 years                       -0.07353    0.10718  -0.686 0.492689    
## Age24-25 years                        0.15230    0.11911   1.279 0.201035    
## Age26-29 years                        0.30924    0.11578   2.671 0.007566 ** 
## Age30-34 years                        0.59049    0.10609   5.566 2.61e-08 ***
## Age35-49 years                        0.84378    0.09392   8.984  < 2e-16 ***
## Age50-64 years                        0.55826    0.11907   4.689 2.75e-06 ***
## Age65 or older                        0.41634    0.14690   2.834 0.004595 ** 
## RaceBlack                            -0.42412    0.08491  -4.995 5.88e-07 ***
## RaceNative American                   0.04368    0.14975   0.292 0.770544    
## RaceNative Hawaiian/Pacific Islander -0.09098    0.33499  -0.272 0.785932    
## RaceAsian                             0.12870    0.16666   0.772 0.439948    
## RaceMore than one                    -0.25566    0.11669  -2.191 0.028457 *  
## RaceHispanic                         -0.17375    0.07145  -2.432 0.015020 *  
## EducationLevelH.S Grad               -0.17515    0.07880  -2.223 0.026242 *  
## EducationLevelSome College           -0.28560    0.08182  -3.490 0.000482 ***
## EducationLevelCollege Grad           -0.71499    0.09751  -7.333 2.25e-13 ***
## Income$20,000-$49,000                -0.13209    0.06920  -1.909 0.056290 .  
## Income$50,000-$74,000                -0.13328    0.08744  -1.524 0.127430    
## Income$75,000 or more                -0.36181    0.08186  -4.420 9.87e-06 ***
## EmploymentPart-time                   0.19331    0.07669   2.521 0.011710 *  
## EmploymentUnemployed                  0.23320    0.09424   2.475 0.013337 *  
## EmploymentOther                       0.43932    0.06705   6.552 5.67e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10524.3  on 10155  degrees of freedom
## Residual deviance:  9819.2  on 10128  degrees of freedom
## AIC: 9875.2
## 
## Number of Fisher Scoring iterations: 4

print(model)

## 
## Call:  glm(formula = ReceivedTreatment ~ MedicaidCoverage + Insurance + 
##     Geography + Sex + Age + Race + EducationLevel + Income + 
##     Employment, family = binomial, data = TreatmentLabeled)
## 
## Coefficients:
##                          (Intercept)           MedicaidCoverageNo Medicaid  
##                             -1.07337                              -0.56621  
##                InsuranceNo Insurance                  GeographySmall Metro  
##                             -0.14410                               0.06521  
##                    GeographyNonmetro                             SexFemale  
##                              0.08389                               0.13857  
##                       Age21-23 years                        Age24-25 years  
##                             -0.07353                               0.15230  
##                       Age26-29 years                        Age30-34 years  
##                              0.30924                               0.59049  
##                       Age35-49 years                        Age50-64 years  
##                              0.84378                               0.55826  
##                       Age65 or older                             RaceBlack  
##                              0.41634                              -0.42412  
##                  RaceNative American  RaceNative Hawaiian/Pacific Islander  
##                              0.04368                              -0.09098  
##                            RaceAsian                     RaceMore than one  
##                              0.12870                              -0.25566  
##                         RaceHispanic                EducationLevelH.S Grad  
##                             -0.17375                              -0.17515  
##           EducationLevelSome College            EducationLevelCollege Grad  
##                             -0.28560                              -0.71499  
##                Income$20,000-$49,000                 Income$50,000-$74,000  
##                             -0.13209                              -0.13328  
##                Income$75,000 or more                   EmploymentPart-time  
##                             -0.36181                               0.19331  
##                 EmploymentUnemployed                       EmploymentOther  
##                              0.23320                               0.43932  
## 
## Degrees of Freedom: 10155 Total (i.e. Null);  10128 Residual
## Null Deviance:       10520 
## Residual Deviance: 9819  AIC: 9875

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(lmtest)     
library(pROC)

## Warning: package 'pROC' was built under R version 4.4.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(ggplot2)

#ASSUMPTIONS IN GLM

vif_values <- vif(model)
print("Variance Inflation Factors (VIF):")

## [1] "Variance Inflation Factors (VIF):"

print(vif_values)

##                      GVIF Df GVIF^(1/(2*Df))
## MedicaidCoverage 1.626504  1        1.275345
## Insurance        1.272239  1        1.127936
## Geography        1.111872  2        1.026866
## Sex              1.063332  1        1.031180
## Age              1.336171  7        1.020916
## Race             1.252100  6        1.018912
## EducationLevel   1.473183  3        1.066701
## Income           1.552718  3        1.076090
## Employment       1.415917  3        1.059676

Deviance: The summary output provides the Null Deviance (model with only an intercept) and Residual Deviance (full model). A smaller residual deviance relative to the null deviance suggests a better fit.

null_deviance <- model$null.deviance
residual_deviance <- model$deviance
df_null <- model$df.null
df_residual <- model$df.residual

cat("\n--- Deviance Analysis ---\n")

## 
## --- Deviance Analysis ---

cat("Null Deviance:", null_deviance, "on", df_null, "degrees of freedom\n")

## Null Deviance: 10524.3 on 10155 degrees of freedom

cat("Residual Deviance:", residual_deviance, "on", df_residual, "degrees of freedom\n")

## Residual Deviance: 9819.162 on 10128 degrees of freedom

cat("Deviance Reduction:", null_deviance - residual_deviance, "\n")

## Deviance Reduction: 705.1426

chi_sq <- null_deviance - residual_deviance
df_diff <- df_null - df_residual
p_value <- pchisq(chi_sq, df_diff, lower.tail = FALSE)
cat("Chi-square:", chi_sq, "df:", df_diff, "p-value:", p_value, "\n")

## Chi-square: 705.1426 df: 27 p-value: 3.186422e-131

3. GOODNESS OF FIT - Hosmer-Lemeshow Test

Rule: p-value > 0.05 indicates good fit

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.4.3

## ResourceSelection 0.3-6   2023-06-27

hoslem_test <- hoslem.test(model$y, fitted(model), g = 10)
print("\n--- Hosmer-Lemeshow Goodness of Fit Test ---")

## [1] "\n--- Hosmer-Lemeshow Goodness of Fit Test ---"

print(hoslem_test)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model$y, fitted(model)
## X-squared = 6.2384, df = 8, p-value = 0.6205

Residual Analysis: Plot standardized residuals versus fitted values to check for patterns, which can indicate issues with the assumed mean-variance relationship or link function.

TreatmentLabeled$fitted_probs <- fitted(model)
TreatmentLabeled$deviance_resid <- residuals(model, type = "deviance")
TreatmentLabeled$pearson_resid <- residuals(model, type = "pearson")
TreatmentLabeled$std_resid <- rstandard(model)


ggplot(TreatmentLabeled, aes(x = fitted_probs, y = deviance_resid)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(se = FALSE) +
  labs(title = "Deviance Residuals vs Fitted Probabilities",
       x = "Fitted Probabilities", y = "Deviance Residuals") +
  theme_minimal()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(TreatmentLabeled, aes(x = seq_along(std_resid), y = std_resid)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = c(-3, 0, 3), color = c("red", "blue", "red"), 
             linetype = c("dashed", "solid", "dashed")) +
  labs(title = "Standardized Residuals",
       x = "Observation Index", y = "Standardized Residuals") +
  theme_minimal()

influential <- which(abs(TreatmentLabeled$std_resid) > 3)
cat("\n--- Influential Observations (|std residual| > 3) ---\n")

## 
## --- Influential Observations (|std residual| > 3) ---

cat("Number of influential observations:", length(influential), "\n")

## Number of influential observations: 0

cat("Percentage:", round(length(influential)/nrow(TreatmentLabeled)*100, 2), "%\n")

## Percentage: 0 %

cooks_d <- cooks.distance(model)
plot(cooks_d, type = "h", main = "Cook's Distance", ylab = "Cook's Distance")
abline(h = 4/nrow(TreatmentLabeled), col = "red", lty = 2)

Cross-validation: Use a hold-out or validation sample to compare the model’s predictions with actual results, assessing its generalization performance. # Tests model stability (10-fold CV)

set.seed(123)  

train_control <- trainControl(method = "cv", 
                              number = 10,
                              savePredictions = TRUE,
                              classProbs = TRUE,
                              summaryFunction = twoClassSummary)

TreatmentLabeled$ReceivedTreatment_factor <- factor(
  TreatmentLabeled$ReceivedTreatment,
  levels = c("No", "Yes")
)

cv_model <- train(ReceivedTreatment_factor ~ MedicaidCoverage + Insurance + 
                  Geography + Sex + Age + Race + EducationLevel + 
                  Income + Employment,
                  data = TreatmentLabeled,
                  method = "glm",
                  family = "binomial",
                  trControl = train_control,
                  metric = "ROC")

print("\n--- 10-Fold Cross-Validation Results ---")

## [1] "\n--- 10-Fold Cross-Validation Results ---"

print(cv_model)

## Generalized Linear Model 
## 
## 10156 samples
##     9 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9139, 9141, 9140, 9141, 9141, 9141, ... 
## Resampling results:
## 
##   ROC        Sens       Spec      
##   0.6682197  0.9876109  0.05635134

print(cv_model$results)

##   parameter       ROC      Sens       Spec      ROCSD      SensSD     SpecSD
## 1      none 0.6682197 0.9876109 0.05635134 0.01766354 0.003658918 0.02089676

stargazer(model, 
          type = "html", 
          out = "model.htm", 
          title = "Logistic Regression Results",
          single.row = TRUE,
          column.sep.width = "20pt")

## 
## <table style="text-align:center"><caption><strong>Logistic Regression Results</strong></caption>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>ReceivedTreatment</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">MedicaidCoverageNo Medicaid</td><td>-0.566<sup>***</sup> (0.065)</td></tr>
## <tr><td style="text-align:left">InsuranceNo Insurance</td><td>-0.144 (0.091)</td></tr>
## <tr><td style="text-align:left">GeographySmall Metro</td><td>0.065 (0.057)</td></tr>
## <tr><td style="text-align:left">GeographyNonmetro</td><td>0.084 (0.074)</td></tr>
## <tr><td style="text-align:left">SexFemale</td><td>0.139<sup>***</sup> (0.052)</td></tr>
## <tr><td style="text-align:left">Age21-23 years</td><td>-0.074 (0.107)</td></tr>
## <tr><td style="text-align:left">Age24-25 years</td><td>0.152 (0.119)</td></tr>
## <tr><td style="text-align:left">Age26-29 years</td><td>0.309<sup>***</sup> (0.116)</td></tr>
## <tr><td style="text-align:left">Age30-34 years</td><td>0.590<sup>***</sup> (0.106)</td></tr>
## <tr><td style="text-align:left">Age35-49 years</td><td>0.844<sup>***</sup> (0.094)</td></tr>
## <tr><td style="text-align:left">Age50-64 years</td><td>0.558<sup>***</sup> (0.119)</td></tr>
## <tr><td style="text-align:left">Age65 or older</td><td>0.416<sup>***</sup> (0.147)</td></tr>
## <tr><td style="text-align:left">RaceBlack</td><td>-0.424<sup>***</sup> (0.085)</td></tr>
## <tr><td style="text-align:left">RaceNative American</td><td>0.044 (0.150)</td></tr>
## <tr><td style="text-align:left">RaceNative Hawaiian/Pacific Islander</td><td>-0.091 (0.335)</td></tr>
## <tr><td style="text-align:left">RaceAsian</td><td>0.129 (0.167)</td></tr>
## <tr><td style="text-align:left">RaceMore than one</td><td>-0.256<sup>**</sup> (0.117)</td></tr>
## <tr><td style="text-align:left">RaceHispanic</td><td>-0.174<sup>**</sup> (0.071)</td></tr>
## <tr><td style="text-align:left">EducationLevelH.S Grad</td><td>-0.175<sup>**</sup> (0.079)</td></tr>
## <tr><td style="text-align:left">EducationLevelSome College</td><td>-0.286<sup>***</sup> (0.082)</td></tr>
## <tr><td style="text-align:left">EducationLevelCollege Grad</td><td>-0.715<sup>***</sup> (0.098)</td></tr>
## <tr><td style="text-align:left">49,000</td><td>-0.132<sup>*</sup> (0.069)</td></tr>
## <tr><td style="text-align:left">74,000</td><td>-0.133 (0.087)</td></tr>
## <tr><td style="text-align:left">75,000 or more</td><td>-0.362<sup>***</sup> (0.082)</td></tr>
## <tr><td style="text-align:left">EmploymentPart-time</td><td>0.193<sup>**</sup> (0.077)</td></tr>
## <tr><td style="text-align:left">EmploymentUnemployed</td><td>0.233<sup>**</sup> (0.094)</td></tr>
## <tr><td style="text-align:left">EmploymentOther</td><td>0.439<sup>***</sup> (0.067)</td></tr>
## <tr><td style="text-align:left">Constant</td><td>-1.073<sup>***</sup> (0.130)</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>10,156</td></tr>
## <tr><td style="text-align:left">Log Likelihood</td><td>-4,909.581</td></tr>
## <tr><td style="text-align:left">Akaike Inf. Crit.</td><td>9,875.162</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>

exp(coef(model))

##                          (Intercept)          MedicaidCoverageNo Medicaid 
##                            0.3418562                            0.5676755 
##                InsuranceNo Insurance                 GeographySmall Metro 
##                            0.8658024                            1.0673850 
##                    GeographyNonmetro                            SexFemale 
##                            1.0875090                            1.1486342 
##                       Age21-23 years                       Age24-25 years 
##                            0.9291095                            1.1645067 
##                       Age26-29 years                       Age30-34 years 
##                            1.3623894                            1.8048679 
##                       Age35-49 years                       Age50-64 years 
##                            2.3251492                            1.7476276 
##                       Age65 or older                            RaceBlack 
##                            1.5164014                            0.6543478 
##                  RaceNative American RaceNative Hawaiian/Pacific Islander 
##                            1.0446446                            0.9130337 
##                            RaceAsian                    RaceMore than one 
##                            1.1373545                            0.7744062 
##                         RaceHispanic               EducationLevelH.S Grad 
##                            0.8405034                            0.8393319 
##           EducationLevelSome College           EducationLevelCollege Grad 
##                            0.7515650                            0.4891987 
##                Income$20,000-$49,000                Income$50,000-$74,000 
##                            0.8762633                            0.8752204 
##                Income$75,000 or more                  EmploymentPart-time 
##                            0.6964167                            1.2132609 
##                 EmploymentUnemployed                      EmploymentOther 
##                            1.2626395                            1.5516527

stargazer(model, 
          align = TRUE, 
          apply.coef=exp, 
          type = "html", 
          out = "model2.htm", 
          title = "Odds Ratio",
          single.row = TRUE,
          column.sep.width = "20pt"
          )

## 
## <table style="text-align:center"><caption><strong>Odds Ratio</strong></caption>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>ReceivedTreatment</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">MedicaidCoverageNo Medicaid</td><td>0.568<sup>***</sup> (0.065)</td></tr>
## <tr><td style="text-align:left">InsuranceNo Insurance</td><td>0.866<sup>***</sup> (0.091)</td></tr>
## <tr><td style="text-align:left">GeographySmall Metro</td><td>1.067<sup>***</sup> (0.057)</td></tr>
## <tr><td style="text-align:left">GeographyNonmetro</td><td>1.088<sup>***</sup> (0.074)</td></tr>
## <tr><td style="text-align:left">SexFemale</td><td>1.149<sup>***</sup> (0.052)</td></tr>
## <tr><td style="text-align:left">Age21-23 years</td><td>0.929<sup>***</sup> (0.107)</td></tr>
## <tr><td style="text-align:left">Age24-25 years</td><td>1.165<sup>***</sup> (0.119)</td></tr>
## <tr><td style="text-align:left">Age26-29 years</td><td>1.362<sup>***</sup> (0.116)</td></tr>
## <tr><td style="text-align:left">Age30-34 years</td><td>1.805<sup>***</sup> (0.106)</td></tr>
## <tr><td style="text-align:left">Age35-49 years</td><td>2.325<sup>***</sup> (0.094)</td></tr>
## <tr><td style="text-align:left">Age50-64 years</td><td>1.748<sup>***</sup> (0.119)</td></tr>
## <tr><td style="text-align:left">Age65 or older</td><td>1.516<sup>***</sup> (0.147)</td></tr>
## <tr><td style="text-align:left">RaceBlack</td><td>0.654<sup>***</sup> (0.085)</td></tr>
## <tr><td style="text-align:left">RaceNative American</td><td>1.045<sup>***</sup> (0.150)</td></tr>
## <tr><td style="text-align:left">RaceNative Hawaiian/Pacific Islander</td><td>0.913<sup>***</sup> (0.335)</td></tr>
## <tr><td style="text-align:left">RaceAsian</td><td>1.137<sup>***</sup> (0.167)</td></tr>
## <tr><td style="text-align:left">RaceMore than one</td><td>0.774<sup>***</sup> (0.117)</td></tr>
## <tr><td style="text-align:left">RaceHispanic</td><td>0.841<sup>***</sup> (0.071)</td></tr>
## <tr><td style="text-align:left">EducationLevelH.S Grad</td><td>0.839<sup>***</sup> (0.079)</td></tr>
## <tr><td style="text-align:left">EducationLevelSome College</td><td>0.752<sup>***</sup> (0.082)</td></tr>
## <tr><td style="text-align:left">EducationLevelCollege Grad</td><td>0.489<sup>***</sup> (0.098)</td></tr>
## <tr><td style="text-align:left">49,000</td><td>0.876<sup>***</sup> (0.069)</td></tr>
## <tr><td style="text-align:left">74,000</td><td>0.875<sup>***</sup> (0.087)</td></tr>
## <tr><td style="text-align:left">75,000 or more</td><td>0.696<sup>***</sup> (0.082)</td></tr>
## <tr><td style="text-align:left">EmploymentPart-time</td><td>1.213<sup>***</sup> (0.077)</td></tr>
## <tr><td style="text-align:left">EmploymentUnemployed</td><td>1.263<sup>***</sup> (0.094)</td></tr>
## <tr><td style="text-align:left">EmploymentOther</td><td>1.552<sup>***</sup> (0.067)</td></tr>
## <tr><td style="text-align:left">Constant</td><td>0.342<sup>***</sup> (0.130)</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>10,156</td></tr>
## <tr><td style="text-align:left">Log Likelihood</td><td>-4,909.581</td></tr>
## <tr><td style="text-align:left">Akaike Inf. Crit.</td><td>9,875.162</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>

All variables are statistically significant. p value is P<0.01 #No Medicaid coverage decreases the odds of treatment by 43%. #Small metro (7% increase) and nonmetro (9% increase) have slight increase of odds to receive treatment. #Being female increases the odds of treatment by almost 15%. #Being Aged 35-49 increases the odds of treatment by 132%. Largest increase in Odds Ratio. #Being Black decreases the odds of treatment by 34%. Races black, hispanic, more than one are all less likley to receive treatment than white individuals. This is a concerning disparity. #Employment status other than Full time (baseline) have higher odds of receiving treatment. #Both income and education have decreased odds of receiving treatment the higher up the range you go. Perhaps this is due to less need in higher income or higher achieved education, or a lack of motivation to seek out treatment due to social factors.

exp(confint.default(model))

##                                          2.5 %    97.5 %
## (Intercept)                          0.2648340 0.4412790
## MedicaidCoverageNo Medicaid          0.4996991 0.6448990
## InsuranceNo Insurance                0.7238456 1.0355988
## GeographySmall Metro                 0.9541515 1.1940563
## GeographyNonmetro                    0.9413296 1.2563886
## SexFemale                            1.0373750 1.2718261
## Age21-23 years                       0.7530716 1.1462981
## Age24-25 years                       0.9220485 1.4707207
## Age26-29 years                       1.0857922 1.7094477
## Age30-34 years                       1.4660249 2.2220277
## Age35-49 years                       1.9342125 2.7951008
## Age50-64 years                       1.3838833 2.2069795
## Age65 or older                       1.1370298 2.0223510
## RaceBlack                            0.5540311 0.7728284
## RaceNative American                  0.7789342 1.4009944
## RaceNative Hawaiian/Pacific Islander 0.4735243 1.7604810
## RaceAsian                            0.8204238 1.5767158
## RaceMore than one                    0.6160871 0.9734094
## RaceHispanic                         0.7306721 0.9668441
## EducationLevelH.S Grad               0.7192112 0.9795148
## EducationLevelSome College           0.6402048 0.8822957
## EducationLevelCollege Grad           0.4040992 0.5922192
## Income$20,000-$49,000                0.7651228 1.0035480
## Income$50,000-$74,000                0.7373812 1.0388260
## Income$75,000 or more                0.5931874 0.8176105
## EmploymentPart-time                  1.0439450 1.4100377
## EmploymentUnemployed                 1.0496971 1.5187795
## EmploymentOther                      1.3605715 1.7695698

or_table <- data.frame(
  Variable = names(coef(model))[-1],  
  Odds_Ratio = exp(coef(model))[-1],
  CI_Lower = exp(confint.default(model))[-1, 1],
  CI_Upper = exp(confint.default(model))[-1, 2],
  P_Value = summary(model)$coefficients[-1, 4]
)

or_table$OR_95CI <- sprintf("%.2f (%.2f-%.2f)", 
                             or_table$Odds_Ratio, 
                             or_table$CI_Lower, 
                             or_table$CI_Upper)

or_table$P_Value_Formatted <- ifelse(or_table$P_Value < 0.001, 
                                      "<0.001", 
                                      sprintf("%.3f", or_table$P_Value))

final_table <- or_table[, c("Variable", "OR_95CI", "P_Value_Formatted")]
colnames(final_table) <- c("Variable", "Odds Ratio (95% CI)", "P-value")


final_table$Variable <- gsub("MedicaidCoverageNo Medicaid", "No Medicaid (vs. Medicaid)", final_table$Variable)
final_table$Variable <- gsub("InsuranceNo Insurance", "No Insurance (vs. Insurance)", final_table$Variable)
final_table$Variable <- gsub("GeographySmall Metro", "Small Metro (vs. Large Metro)", final_table$Variable)
final_table$Variable <- gsub("GeographyNonmetro", "Nonmetro (vs. Large Metro)", final_table$Variable)
final_table$Variable <- gsub("SexFemale", "Female (vs. Male)", final_table$Variable)
final_table$Variable <- gsub("Race", "Race: ", final_table$Variable)
final_table$Variable <- gsub("EducationLevel", "Education: ", final_table$Variable)
final_table$Variable <- gsub("Employment", "Employment: ", final_table$Variable)
final_table$Variable <- gsub("Income\\$", "Income: $", final_table$Variable)
final_table$Variable <- gsub("Age", "Age: ", final_table$Variable)

rownames(final_table) <- NULL


final_table %>%
  kbl(format = "html", 
      caption = "<b><i>Adjusted Odds Ratios for Receipt of SUD Treatment with 95% Confidence Intervals</i>",
      align = c("l", "r", "r")) %>%
  kable_classic(full_width = FALSE, html_font = "Arial") %>%
  column_spec(1, width = "20em") %>%
  column_spec(2, width = "12em") %>%
  column_spec(3, width = "8em") %>%
  row_spec(0, bold = TRUE) %>%
  footnote(general = "Note: Reference categories are shown in parentheses. All models control for all variables simultaneously.",
           general_title = "",
           footnote_as_chunk = TRUE) %>%
  save_kable("OddsRatios_CI.html")

print(final_table)

##                                  Variable Odds Ratio (95% CI) P-value
## 1              No Medicaid (vs. Medicaid)    0.57 (0.50-0.64)  <0.001
## 2            No Insurance (vs. Insurance)    0.87 (0.72-1.04)   0.115
## 3           Small Metro (vs. Large Metro)    1.07 (0.95-1.19)   0.254
## 4              Nonmetro (vs. Large Metro)    1.09 (0.94-1.26)   0.255
## 5                       Female (vs. Male)    1.15 (1.04-1.27)   0.008
## 6                        Age: 21-23 years    0.93 (0.75-1.15)   0.493
## 7                        Age: 24-25 years    1.16 (0.92-1.47)   0.201
## 8                        Age: 26-29 years    1.36 (1.09-1.71)   0.008
## 9                        Age: 30-34 years    1.80 (1.47-2.22)  <0.001
## 10                       Age: 35-49 years    2.33 (1.93-2.80)  <0.001
## 11                       Age: 50-64 years    1.75 (1.38-2.21)  <0.001
## 12                       Age: 65 or older    1.52 (1.14-2.02)   0.005
## 13                            Race: Black    0.65 (0.55-0.77)  <0.001
## 14                  Race: Native American    1.04 (0.78-1.40)   0.771
## 15 Race: Native Hawaiian/Pacific Islander    0.91 (0.47-1.76)   0.786
## 16                            Race: Asian    1.14 (0.82-1.58)   0.440
## 17                    Race: More than one    0.77 (0.62-0.97)   0.028
## 18                         Race: Hispanic    0.84 (0.73-0.97)   0.015
## 19                    Education: H.S Grad    0.84 (0.72-0.98)   0.026
## 20                Education: Some College    0.75 (0.64-0.88)  <0.001
## 21                Education: College Grad    0.49 (0.40-0.59)  <0.001
## 22                Income: $20,000-$49,000    0.88 (0.77-1.00)   0.056
## 23                Income: $50,000-$74,000    0.88 (0.74-1.04)   0.127
## 24                Income: $75,000 or more    0.70 (0.59-0.82)  <0.001
## 25                  Employment: Part-time    1.21 (1.04-1.41)   0.012
## 26                 Employment: Unemployed    1.26 (1.05-1.52)   0.013
## 27                      Employment: Other    1.55 (1.36-1.77)  <0.001

DRAFT Capstone

Megan Janzen

Updated 2025-11-23

3. GOODNESS OF FIT - Hosmer-Lemeshow Test

Rule: p-value > 0.05 indicates good fit