DEM 7223 - Event History Analysis - Comparing Survival Times Between Groups

Author

Samson A. Olowolaju, MPH

Published

September 15, 2022

Load the Pharmacy Dataset

Code
# Import the data for active pharmacies in Texas 
TxRX0122 <- import("TxRX0122.xlsx") %>% 
     filter(LIC_STATUS %in% c("Active","Probation")) %>% 
  mutate(LIC_STATUS =ifelse(LIC_STATUS=="Probation","Active",LIC_STATUS)) %>% 
  select(LIC_NBR,Latitude,Longitude,fullAdd,PHARMACY_NAME,CLASS,
         LIC_STATUS,LIC_EXPR_DATE,LIC_ORIG_DATE) %>% 
  clean_names() %>% 
  rename(status=lic_status,
         full_address=full_add) %>% 
  mutate(status_date="2021-12-31", 
         status_date=as.Date(status_date,"%Y-%m-%d"),  
         lic_orig_date=as.Date(lic_orig_date,"%Y-%m-%d"), 
         lic_expr_date=as.Date( lic_expr_date,"%Y-%m-%d")) 
  
# Import the data for closed pharmacies in Texas
 TxRX1622 <- import("ClosedRX.csv") %>% 
      select(status,status_date,lic_nbr,pharmacy_name,lic_orig_date,
             lic_expr_date,class,full_address,latitude,longitude) %>% 
      mutate(status_date=as.Date(status_date,"%m/%d/%y"),  
         lic_orig_date=as.Date(lic_orig_date,"%m/%d/%y"), 
         lic_expr_date=as.Date( lic_expr_date,"%m/%d/%y")) 

 # Merge concatenate the two dataset     
Txrxdata <- rbind(TxRX0122,TxRX1622)
head(as.data.frame(Txrxdata))

Make the Pharmacy Dataset Into a Spatial Point Geometery

Code
Txrxdata<- st_as_sf(Txrxdata, 
                        coords = c("longitude", "latitude"),
                        crs = 4326) %>% 
st_transform(crs= 3081) 
head(as.data.frame(Txrxdata))

Get the 2019 Demographic dataset from ACS Profile Tables (Percent Estimates)

Code
TxtracShp<-get_acs(geography = "tract",
                state="TX",
                year = 2019,
                variables=c("DP05_0001P","DP05_0002P","DP05_0003P","DP05_0004P","DP05_0005P","DP05_0006P",
                            "DP05_0007P","DP05_0008P","DP05_0009P","DP05_0010P","DP05_0011P","DP05_0012P",
                            "DP05_0013P","DP05_0014P","DP05_0015P","DP05_0016P","DP05_0017P","DP02_0067P",
                            "DP02_0068P","DP03_0099P","DP05_0024P","DP03_0009P","DP02_0072P" ),
                geometry = T,
                output = "wide",
               progress_bar=F ) %>%  
  # Rename the variable names for ease of use
  rename(totpop=DP05_0001PE,pctmalepop=DP05_0002PE,pctfemalepop=DP05_0003PE,pctpopu5=DP05_0004PE,pctpop5to9=DP05_0005PE,pctpop10to14=DP05_0006PE,
                            pctpop15to19=DP05_0007PE,pctpop20to24=DP05_0008PE,pctpop25to34=DP05_0009PE,pctpop35to44=DP05_0010PE,pctpop45to54=DP05_0011PE,pctpop55to59=DP05_0012PE,
                            pctpop60to64=DP05_0013PE,pctpop65over=DP05_0024PE,pcthighsch=DP02_0067PE,pctbscdegree=DP02_0068PE, noinsurancecov=DP03_0099PE, unemployment_rate=DP03_0009PE,pct_disability=DP02_0072PE) %>%
 #select only variables needed for the analysis 
   select(GEOID,NAME, totpop,pctmalepop,pctfemalepop,pctpop5to9,pctpop10to14,pctpop15to19,pctpop20to24,pctpop25to34,pctpop35to44,pctpop45to54,pctpop55to59,pctpop60to64,pctpop65over,pcthighsch,pctbscdegree,unemployment_rate,pct_disability)

#load the RUCA dataset to determine Rurality
Ruca <- import("Ruca_tract.csv") %>% 
  clean_names() %>% 
  filter(select_state=="TX", !primary_ruca==99) %>% 
  mutate(GEOID=as.character(geoid), rurality=ifelse(primary_ruca%in%c(1:3),"Metro","Non-Metro")) 



# Merge the demographic data to the RUCA dataset
TxtracShp1 <- left_join(TxtracShp,Ruca, by="GEOID") 
head(as.data.frame(TxtracShp1))

Get the 2019 Remaining Demographic dataset from ACS Tables (Percent Estimates Generated by Self)

Code
TxtracShp2 <- get_acs(geography = 'tract', 
                      variables = c(perCapInc = "B19301_001",totPop19 = "B03003_001",poverty19="B17001_002",                                          insurance19="B27001_001", Nocar="B25044_001", whitepop="B03002_003",
                                     blackpop="B03002_004", Hispop="B03003_003",medianinc="B06011_001",
                                     mNoins6= "B27001_005", mNoins618= "B27001_008",mNoins1925="B27001_011",                                            mNoins2634="B27001_014", mNoins3544="B27001_017",mNoins4554="B27001_020",
                                     mNoins5564="B27001_023",mNoins6574= "B27001_026",mNoins75="B27001_029",
                                     fNoins6= "B27001_033",fNoins618= "B27001_036",fNoins1925= "B27001_039",
                                     fNoins2634= "B27001_042",fNoins3544= "B27001_045",fNoins4554="B27001_048" ,
                                     fNoins5564= "B27001_051",fNoins6574= "B27001_054",fNoins75 = "B27001_057",
                                medicaid_mu19="C27007_004",medicaid_m1964="C27007_007",medicaid_m65a="C27007_010",
                              medicaid_fu19= "C27007_014", medicaid_f1964= "C27007_017",medicaid_f65a= "C27007_020"
                             ), 
                       state = "TX" , year = 2019, geometry = FALSE) %>%
                 dplyr::select(GEOID, NAME, variable, estimate) %>% 
                  spread(variable, estimate) %>% 
  replace(is.na(.),0) %>% 
  mutate( funinsured=rowSums(.[4:12]),muninsured= rowSums(.[22:30]) , uninsuredpop=muninsured+funinsured,medicaid= rowSums(.[16:21]),
          pctuninsured=round((uninsuredpop/totPop19)*100,1) , pctwhitepop= round((whitepop/totPop19)*100,1),pctblackpop=round((blackpop/totPop19)*100,1), pcthisppop=round((Hispop/totPop19)*100,1), pctnovehicle=round((Nocar/totPop19)*100,1), Pctmedicaidins=round((medicaid/totPop19)*100,1)) %>% 
                  dplyr::select(GEOID, totPop19,whitepop, blackpop,Hispop, Nocar, uninsuredpop,medianinc,perCapInc,pctuninsured,pctwhitepop,pctblackpop,pcthisppop,pctnovehicle,Pctmedicaidins)

# Merge the above merged file (TxtracShp1) to the other demographic data (TxtracShp2) that was created.
TxtracShp_2019 <- merge(TxtracShp1,TxtracShp2, by="GEOID") %>%
  select(-geoid) %>% 
  st_transform(crs= 3081) %>% 
  na.omit()
head(as.data.frame(TxtracShp_2019))

Get the Geoid for Phamarcy Locations within the Tracts

Code
tracts.texas.pharmacy <-Txrxdata%>% 
        st_join(TxtracShp_2019) %>% 
  mutate(age_years=round(as.numeric(status_date - lic_orig_date)/365,1), age_months=round(age_years*12,0), status_year= lubridate::year(status_date)) %>% 
  filter(!is.na(lic_orig_date)) %>% 
  st_drop_geometry()

head(tracts.texas.pharmacy)

Age at Pharmacy Closure in Texas

This analysis uses pharmacies’ age-at-closure as the outcome variable. From the data manipulation above, I calculated age at pharmacy closure as the difference between the Pharmacy date of closure (Status_date) and their first date of operation(lic_orig_date). However, in the case of active pharmacies, I calculated their age using the censoring date of pharmacy closure follow-up. In this case, the latest follow-up date was December 31, 2021

Code
tracts.texas.pharmacy <- tracts.texas.pharmacy %>%
  mutate(death_age = age_months,
         d.event = ifelse(status == "Closed", 1, 0))
head(tracts.texas.pharmacy)

Estimating and Plotting the Survival Function - Age at Pharmacy Closure in Texas

Code
# Estimating Age at pharmacy closure survival Function
age_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                   data = tracts.texas.pharmacy)

#summary(age_fit)

# Plotting the survival function Graph
age_fit %>%
  ggsurvfit() +
  xlim(0, 30) +
  ylim(.95, 1)+
  add_confidence_interval(type = "ribbon") +
  add_quantile() 

Time to Pharmacy Closure - Kaplan-Meier Survival Analysis

Here, instead of just examining the age at pharmacy closure, I estimated the survival risk of Pharmacy closure within 12 months of operation (Time to closure). If a pharmacy closed down within 12 months of operation, they are coded as experiencing the event; otherwise they are censored.

Code
tracts.texas.pharmacy <- tracts.texas.pharmacy %>%
  mutate(death_time = age_months,
         d3.event = ifelse(status == "Closed" & death_time <= 12, 1, 0))
head(tracts.texas.pharmacy)

Estimating and Plotting the Survival Function - Time to Pharmacy Closure in Texas

Code
# Estimating Age at pharmacy closure survival Function
time_fit <- survfit(Surv(death_time, d.event) ~ 1, 
                   data = tracts.texas.pharmacy)
 
#summary(time_fit)
# Plotting the survival function Graph
time_fit %>%
  ggsurvfit() +
  xlim(0, 30) +
  ylim(.95, 1)+
  add_confidence_interval(type = "ribbon") +
  add_quantile() 

Comparing Two Groups

Next, I estimate survival function for two groups(Pharmacies in Metro and Non-Metro tracts)and then test for differences in Pharmacy survival by residential characteristics of the Pharmacy location (Metro or Non-Metro). Consequently, I hypothesize that there is a significant difference in the survival chances of Pharmacies located in Metro tracts compared to those in Non-metro tracts in Texas.

Code
# Estimating the survival function for the groups
group_fit <- survfit2(Surv(death_time, d.event) ~ rurality, 
                   data = tracts.texas.pharmacy)

#summary(group_fit)
# Plotting the Survival functions for the two groups
group_fit%>%
  ggsurvfit() +
   add_confidence_interval() +
   add_risktable() +
  labs(title = "Pharmacy Closure Survival Plot by Rurality Status",
       caption = "Source: Texas Pharmacy Board \n Calculations by author", x= "Follow-up Time, Months",
      y = "Survival Probability")+  theme_minimal() +
  theme(legend.position = "bottom",axis.line = element_line(linetype = "solid"),
    axis.ticks = element_line(linetype = "blank"),
    panel.grid.major = element_line(colour = "gray80"),
    panel.grid.minor = element_line(colour = "gray94",
        linetype = "blank"), axis.title = element_text(family = "serif",
        face = "bold"), axis.text = element_text(face = "bold"),
    plot.title = element_text(family = "serif",
        size = 14, face = "bold") , legend.text = element_text(face = "bold",
        family = "serif"), legend.title = element_text(face = "bold",
        family = "serif"), panel.background = element_rect(fill = "gray97"),
    legend.key = element_rect(fill = NA,
        linetype = "solid"), legend.background = element_rect(linetype = "solid")) +
  guides(color = guide_legend(ncol = 1)) +
    coord_cartesian(xlim = c(0, 30)) +
  scale_y_continuous(
    limits = c(.95, 1),
    labels = scales::percent, 
    expand = c(0.01, 0))+
  scale_color_manual(values = c('#54738E', '#82AC7C')) +
  scale_fill_manual(values = c('#54738E', '#82AC7C')) 

Code
# Testing the hypothesis to see if there is a significant difference in the Pharmacy survival chances between the two groups 

survdiff(Surv(death_time, d.event) ~ rurality, 
                   data = tracts.texas.pharmacy)
Call:
survdiff(formula = Surv(death_time, d.event) ~ rurality, data = tracts.texas.pharmacy)

n=9017, 8 observations deleted due to missingness.

                      N Observed Expected (O-E)^2/E (O-E)^2/V
rurality=Metro     7943     1274     1179      7.72      55.6
rurality=Non-Metro 1074       96      191     47.54      55.6

 Chisq= 55.6  on 1 degrees of freedom, p= 9e-14 

Given the high chi-square value of 54 and a very small p-value of 9e-14, I fail to reject the null hypothesis. There is no significant difference in the survival chances between Pharmacies located in Metro tracts and Non-Metro tracts of Texas within the first 12 months of opening the pharmacy stores.