DEM 7223 - Event History Analysis - Comparing Survival Times Between Groups
Load the Pharmacy Dataset
Code
# Import the data for active pharmacies in Texas
TxRX0122 <- import("TxRX0122.xlsx") %>%
filter(LIC_STATUS %in% c("Active","Probation")) %>%
mutate(LIC_STATUS =ifelse(LIC_STATUS=="Probation","Active",LIC_STATUS)) %>%
select(LIC_NBR,Latitude,Longitude,fullAdd,PHARMACY_NAME,CLASS,
LIC_STATUS,LIC_EXPR_DATE,LIC_ORIG_DATE) %>%
clean_names() %>%
rename(status=lic_status,
full_address=full_add) %>%
mutate(status_date="2021-12-31",
status_date=as.Date(status_date,"%Y-%m-%d"),
lic_orig_date=as.Date(lic_orig_date,"%Y-%m-%d"),
lic_expr_date=as.Date( lic_expr_date,"%Y-%m-%d"))
# Import the data for closed pharmacies in Texas
TxRX1622 <- import("ClosedRX.csv") %>%
select(status,status_date,lic_nbr,pharmacy_name,lic_orig_date,
lic_expr_date,class,full_address,latitude,longitude) %>%
mutate(status_date=as.Date(status_date,"%m/%d/%y"),
lic_orig_date=as.Date(lic_orig_date,"%m/%d/%y"),
lic_expr_date=as.Date( lic_expr_date,"%m/%d/%y"))
# Merge concatenate the two dataset
Txrxdata <- rbind(TxRX0122,TxRX1622)
head(as.data.frame(Txrxdata))
Make the Pharmacy Dataset Into a Spatial Point Geometery
Code
Txrxdata<- st_as_sf(Txrxdata,
coords = c("longitude", "latitude"),
crs = 4326) %>%
st_transform(crs= 3081)
head(as.data.frame(Txrxdata))
Get the 2019 Demographic dataset from ACS Profile Tables (Percent Estimates)
Code
TxtracShp<-get_acs(geography = "tract",
state="TX",
year = 2019,
variables=c("DP05_0001P","DP05_0002P","DP05_0003P","DP05_0004P","DP05_0005P","DP05_0006P",
"DP05_0007P","DP05_0008P","DP05_0009P","DP05_0010P","DP05_0011P","DP05_0012P",
"DP05_0013P","DP05_0014P","DP05_0015P","DP05_0016P","DP05_0017P","DP02_0067P",
"DP02_0068P","DP03_0099P","DP05_0024P","DP03_0009P","DP02_0072P" ),
geometry = T,
output = "wide",
progress_bar=F ) %>%
# Rename the variable names for ease of use
rename(totpop=DP05_0001PE,pctmalepop=DP05_0002PE,pctfemalepop=DP05_0003PE,pctpopu5=DP05_0004PE,pctpop5to9=DP05_0005PE,pctpop10to14=DP05_0006PE,
pctpop15to19=DP05_0007PE,pctpop20to24=DP05_0008PE,pctpop25to34=DP05_0009PE,pctpop35to44=DP05_0010PE,pctpop45to54=DP05_0011PE,pctpop55to59=DP05_0012PE,
pctpop60to64=DP05_0013PE,pctpop65over=DP05_0024PE,pcthighsch=DP02_0067PE,pctbscdegree=DP02_0068PE, noinsurancecov=DP03_0099PE, unemployment_rate=DP03_0009PE,pct_disability=DP02_0072PE) %>%
#select only variables needed for the analysis
select(GEOID,NAME, totpop,pctmalepop,pctfemalepop,pctpop5to9,pctpop10to14,pctpop15to19,pctpop20to24,pctpop25to34,pctpop35to44,pctpop45to54,pctpop55to59,pctpop60to64,pctpop65over,pcthighsch,pctbscdegree,unemployment_rate,pct_disability)
#load the RUCA dataset to determine Rurality
Ruca <- import("Ruca_tract.csv") %>%
clean_names() %>%
filter(select_state=="TX", !primary_ruca==99) %>%
mutate(GEOID=as.character(geoid), rurality=ifelse(primary_ruca%in%c(1:3),"Metro","Non-Metro"))
# Merge the demographic data to the RUCA dataset
TxtracShp1 <- left_join(TxtracShp,Ruca, by="GEOID")
head(as.data.frame(TxtracShp1))
Get the 2019 Remaining Demographic dataset from ACS Tables (Percent Estimates Generated by Self)
Code
TxtracShp2 <- get_acs(geography = 'tract',
variables = c(perCapInc = "B19301_001",totPop19 = "B03003_001",poverty19="B17001_002", insurance19="B27001_001", Nocar="B25044_001", whitepop="B03002_003",
blackpop="B03002_004", Hispop="B03003_003",medianinc="B06011_001",
mNoins6= "B27001_005", mNoins618= "B27001_008",mNoins1925="B27001_011", mNoins2634="B27001_014", mNoins3544="B27001_017",mNoins4554="B27001_020",
mNoins5564="B27001_023",mNoins6574= "B27001_026",mNoins75="B27001_029",
fNoins6= "B27001_033",fNoins618= "B27001_036",fNoins1925= "B27001_039",
fNoins2634= "B27001_042",fNoins3544= "B27001_045",fNoins4554="B27001_048" ,
fNoins5564= "B27001_051",fNoins6574= "B27001_054",fNoins75 = "B27001_057",
medicaid_mu19="C27007_004",medicaid_m1964="C27007_007",medicaid_m65a="C27007_010",
medicaid_fu19= "C27007_014", medicaid_f1964= "C27007_017",medicaid_f65a= "C27007_020"
),
state = "TX" , year = 2019, geometry = FALSE) %>%
dplyr::select(GEOID, NAME, variable, estimate) %>%
spread(variable, estimate) %>%
replace(is.na(.),0) %>%
mutate( funinsured=rowSums(.[4:12]),muninsured= rowSums(.[22:30]) , uninsuredpop=muninsured+funinsured,medicaid= rowSums(.[16:21]),
pctuninsured=round((uninsuredpop/totPop19)*100,1) , pctwhitepop= round((whitepop/totPop19)*100,1),pctblackpop=round((blackpop/totPop19)*100,1), pcthisppop=round((Hispop/totPop19)*100,1), pctnovehicle=round((Nocar/totPop19)*100,1), Pctmedicaidins=round((medicaid/totPop19)*100,1)) %>%
dplyr::select(GEOID, totPop19,whitepop, blackpop,Hispop, Nocar, uninsuredpop,medianinc,perCapInc,pctuninsured,pctwhitepop,pctblackpop,pcthisppop,pctnovehicle,Pctmedicaidins)
# Merge the above merged file (TxtracShp1) to the other demographic data (TxtracShp2) that was created.
TxtracShp_2019 <- merge(TxtracShp1,TxtracShp2, by="GEOID") %>%
select(-geoid) %>%
st_transform(crs= 3081) %>%
na.omit()
head(as.data.frame(TxtracShp_2019))
Get the Geoid for Phamarcy Locations within the Tracts
Code
Age at Pharmacy Closure in Texas
This analysis uses pharmacies’ age-at-closure as the outcome variable. From the data manipulation above, I calculated age at pharmacy closure as the difference between the Pharmacy date of closure (Status_date) and their first date of operation(lic_orig_date). However, in the case of active pharmacies, I calculated their age using the censoring date of pharmacy closure follow-up. In this case, the latest follow-up date was December 31, 2021
Estimating and Plotting the Survival Function - Age at Pharmacy Closure in Texas
Code
# Estimating Age at pharmacy closure survival Function
age_fit <- survfit(Surv(death_age, d.event) ~ 1,
data = tracts.texas.pharmacy)
#summary(age_fit)
# Plotting the survival function Graph
age_fit %>%
ggsurvfit() +
xlim(0, 30) +
ylim(.95, 1)+
add_confidence_interval(type = "ribbon") +
add_quantile()
Time to Pharmacy Closure - Kaplan-Meier Survival Analysis
Here, instead of just examining the age at pharmacy closure, I estimated the survival risk of Pharmacy closure within 12 months of operation (Time to closure). If a pharmacy closed down within 12 months of operation, they are coded as experiencing the event; otherwise they are censored.
Estimating and Plotting the Survival Function - Time to Pharmacy Closure in Texas
Code
# Estimating Age at pharmacy closure survival Function
time_fit <- survfit(Surv(death_time, d.event) ~ 1,
data = tracts.texas.pharmacy)
#summary(time_fit)
# Plotting the survival function Graph
time_fit %>%
ggsurvfit() +
xlim(0, 30) +
ylim(.95, 1)+
add_confidence_interval(type = "ribbon") +
add_quantile()
Comparing Two Groups
Next, I estimate survival function for two groups(Pharmacies in Metro and Non-Metro tracts)and then test for differences in Pharmacy survival by residential characteristics of the Pharmacy location (Metro or Non-Metro). Consequently, I hypothesize that there is a significant difference in the survival chances of Pharmacies located in Metro tracts compared to those in Non-metro tracts in Texas.
Code
# Estimating the survival function for the groups
group_fit <- survfit2(Surv(death_time, d.event) ~ rurality,
data = tracts.texas.pharmacy)
#summary(group_fit)
# Plotting the Survival functions for the two groups
group_fit%>%
ggsurvfit() +
add_confidence_interval() +
add_risktable() +
labs(title = "Pharmacy Closure Survival Plot by Rurality Status",
caption = "Source: Texas Pharmacy Board \n Calculations by author", x= "Follow-up Time, Months",
y = "Survival Probability")+ theme_minimal() +
theme(legend.position = "bottom",axis.line = element_line(linetype = "solid"),
axis.ticks = element_line(linetype = "blank"),
panel.grid.major = element_line(colour = "gray80"),
panel.grid.minor = element_line(colour = "gray94",
linetype = "blank"), axis.title = element_text(family = "serif",
face = "bold"), axis.text = element_text(face = "bold"),
plot.title = element_text(family = "serif",
size = 14, face = "bold") , legend.text = element_text(face = "bold",
family = "serif"), legend.title = element_text(face = "bold",
family = "serif"), panel.background = element_rect(fill = "gray97"),
legend.key = element_rect(fill = NA,
linetype = "solid"), legend.background = element_rect(linetype = "solid")) +
guides(color = guide_legend(ncol = 1)) +
coord_cartesian(xlim = c(0, 30)) +
scale_y_continuous(
limits = c(.95, 1),
labels = scales::percent,
expand = c(0.01, 0))+
scale_color_manual(values = c('#54738E', '#82AC7C')) +
scale_fill_manual(values = c('#54738E', '#82AC7C'))
Code
# Testing the hypothesis to see if there is a significant difference in the Pharmacy survival chances between the two groups
survdiff(Surv(death_time, d.event) ~ rurality,
data = tracts.texas.pharmacy)
Call:
survdiff(formula = Surv(death_time, d.event) ~ rurality, data = tracts.texas.pharmacy)
n=9017, 8 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
rurality=Metro 7943 1274 1179 7.72 55.6
rurality=Non-Metro 1074 96 191 47.54 55.6
Chisq= 55.6 on 1 degrees of freedom, p= 9e-14
Given the high chi-square value of 54 and a very small p-value of 9e-14, I fail to reject the null hypothesis. There is no significant difference in the survival chances between Pharmacies located in Metro tracts and Non-Metro tracts of Texas within the first 12 months of opening the pharmacy stores.