DEM 7223 Homework 1 Introduction to Event History Analysis and Functions of Survival Time Add Health Suicide Ideation Ricardo Lowe

library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#ADD HEALTH 1995 (WI), 96 (WII), 01 (WIII) and 08 (WIV)
add95 <- read_sav("~/Desktop/ICPSR_21600/DS0001/21600-0001-Data.sav")
add96 <- read_sav("~/Desktop/ICPSR_21600-2/DS0005/21600-0005-Data.sav")
add01 <- read_sav("~/Desktop/ICPSR_21600 - 3/DS0008/21600-0008-Data.sav")
add08 <- read_sav("~/Desktop/ICPSR_21600-040/DS0022/21600-0022-Data.sav")
#Join 96 and 95 since the former does not have Race variable 

add96_2 <- left_join(add95, add96, by = c('AID'))
#Year in 95 is only two-digit, so making a 4 digit variable
add95$DIG <- ifelse(add95$AID>0,19," ")
add95$AGE_ <- as.character(add95$H1GI1Y)
add95$AGE1 <- as.numeric(paste(add95$DIG,add95$AGE_,sep = ""))
## Warning: NAs introduced by coercion
#TEST
a9N5 <- add95 %>% select(AID,DIG,AGE1)
#Rename Variables in Datasets to prep for Stack
add95 <- rename(add95, YEAR = IYEAR, BIRTHYR = AGE1, RACE = H1GI9, SEX = BIO_SEX, SUI = H1SU1, SUA = H1SU2)
add96 <- rename(add96_2, YEAR = IYEAR2, BIRTHYR = H2GI1Y, RACE = H1GI9, SEX = BIO_SEX2, SUI = H2SU1, SUA = H2SU2)
add01 <- rename(add01, YEAR = IYEAR3, BIRTHYR = H3OD1Y,  RACE = H3IR4, SEX = BIO_SEX3, SUI = H3TO130, SUA = H3TO131)
add08 <- rename(add08, YEAR = IYEAR4, BIRTHYR = H4OD1Y, RACE = H4IR4, SEX = BIO_SEX4, SUI = H4SE1, SUA = H4SE2)
add95$YEAR <- ifelse(add95$YEAR == 95,1995,"")
add96$YEAR <- ifelse(add96$YEAR == 96,1996,"")
#-----------------id---year---race-sex-ideation-------attempt
a95 <- add95 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a96 <- add96 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a01 <- add01 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a08 <- add08 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
sui <- rbind(a95,a01,a08)

X <- sui %>% 
  arrange(desc(AID))

X$AGE <- as.numeric(X$YEAR) - as.numeric(X$BIRTHYR)

X$SUA <- NULL

EV <- X[, c(1, 2, 3, 7, 4,5,6)]



EV$PERIOD <- 2008 - as.numeric(EV$YEAR)

EV
## # A tibble: 16,500 x 8
##    AID     YEAR  BIRTHYR   AGE              RACE       SEX            SUI PERIOD
##    <chr>   <chr>   <dbl> <dbl>         <dbl+lbl> <dbl+lbl>      <dbl+lbl>  <dbl>
##  1 997199… 1995     1981    14 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…     13
##  2 997199… 2008     1981    27 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…      0
##  3 997199… 1995     1980    15 1 [(1) White]     2 [(2) F… 0 [(0) No (sk…     13
##  4 997199… 2001     1980    21 1 [(1) White]     2 [(2) F… 1 [(1) Yes]         7
##  5 997199… 2008     1980    28 1 [(1) White]     2 [(2) F… 0 [(0) No (sk…      0
##  6 997199… 1995     1981    14 1 [(1) White]     1 [(1) M… 1 [(1) Yes]        13
##  7 997199… 2008     1981    27 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…      0
##  8 997199… 1995     1981    14 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…     13
##  9 997199… 2001     1981    20 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…      7
## 10 997199… 2008     1981    27 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…      0
## # … with 16,490 more rows
EV2 <- EV %>%
  arrange(desc(AID))

EV2
## # A tibble: 16,500 x 8
##    AID     YEAR  BIRTHYR   AGE              RACE       SEX            SUI PERIOD
##    <chr>   <chr>   <dbl> <dbl>         <dbl+lbl> <dbl+lbl>      <dbl+lbl>  <dbl>
##  1 997199… 1995     1981    14 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…     13
##  2 997199… 2008     1981    27 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…      0
##  3 997199… 1995     1980    15 1 [(1) White]     2 [(2) F… 0 [(0) No (sk…     13
##  4 997199… 2001     1980    21 1 [(1) White]     2 [(2) F… 1 [(1) Yes]         7
##  5 997199… 2008     1980    28 1 [(1) White]     2 [(2) F… 0 [(0) No (sk…      0
##  6 997199… 1995     1981    14 1 [(1) White]     1 [(1) M… 1 [(1) Yes]        13
##  7 997199… 2008     1981    27 1 [(1) White]     1 [(1) M… 0 [(0) No (sk…      0
##  8 997199… 1995     1981    14 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…     13
##  9 997199… 2001     1981    20 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…      7
## 10 997199… 2008     1981    27 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk…      0
## # … with 16,490 more rows

In the Add Health, suicide ideation is rcorded as yes or no (I excluded other options like DK or Refusal). I created a variable that provides the age of the respondent at time of ideation.

If the respondent did not contemplate suicide by age 33, then the variable is SUI == 0 and the age at ideation is censored.

If the age at death is censored, then the age at ideation (censored age at ideation) is the year of the interview - year of birth.

If the respondent contemplated suicide by age of 33, then the age at ideation in years is the variable sui.age2. Here I code this:

EV2$sui.age2<-ifelse(EV2$SUI==1,EV2$AGE,"")
EV2$sui.age<-ifelse(EV2$SUI==0,
                         EV2$AGE,
                         EV2$sui.age2)

EV2$sui.age2<-as.numeric(EV2$sui.age2,na.rm=TRUE)
EV2$sui.age<-as.numeric(EV2$sui.age,na.rm=TRUE)



#censoring indicator for death by age 1, in months (12 months)
EV2$d.event<-ifelse(is.na(EV2$sui.age2)==T|EV2$sui.age2>33,0,1)
EV2$d.eventfac<-factor(EV2$d.event); levels(EV2$d.eventfac)<-c("No Suicide Ideation", "Suicide Ideation")
table(EV2$d.eventfac)
## 
## No Suicide Ideation    Suicide Ideation 
##               15056                1444

We see 1444 suicide ideators among the 15056 respondents across all 4 waves.

summary(EV2$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   17.00   21.00   21.89   27.00   34.00       4
library(survival)
head(EV2[,c("sui.age","d.event")], n=20)
## # A tibble: 20 x 2
##    sui.age d.event
##      <dbl>   <dbl>
##  1      14       0
##  2      27       0
##  3      15       0
##  4      21       1
##  5      28       0
##  6      14       1
##  7      27       0
##  8      14       0
##  9      20       0
## 10      27       0
## 11      13       0
## 12      19       0
## 13      26       0
## 14      14       0
## 15      20       0
## 16      27       0
## 17      14       0
## 18      20       0
## 19      27       0
## 20      15       0
head(Surv(EV2$sui.age, EV2$d.event), n=20)
##  [1] 14+ 27+ 15+ 21  28+ 14  27+ 14+ 20+ 27+ 13+ 19+ 26+ 14+ 20+ 27+ 14+ 20+ 27+
## [20] 15+
mort<-survfit(Surv(sui.age, d.event)~1, data=EV2,conf.type="none")
plot(mort, ylim=c(.7,1), xlim=c(14,32), main="Survival Function for Suicide Ideation Among Respondents")

1000*(1-summary(mort)$surv[16])
## [1] 123.6626
summary(mort)
## Call: survfit(formula = Surv(sui.age, d.event) ~ 1, data = EV2, conf.type = "none")
## 
## 241 observations deleted due to missingness 
##  time n.risk n.event survival  std.err
##    13  16251      49    0.997 0.000430
##    14  15677      95    0.991 0.000751
##    15  14783     142    0.981 0.001089
##    16  13712     150    0.971 0.001386
##    17  12565     185    0.956 0.001718
##    18  11402     144    0.944 0.001969
##    19  10259      85    0.936 0.002128
##    20   9513      62    0.930 0.002251
##    21   8840      51    0.925 0.002360
##    22   8046      67    0.917 0.002521
##    23   7232      34    0.913 0.002615
##    24   6401      30    0.909 0.002717
##    25   5636      22    0.905 0.002810
##    26   5211      40    0.898 0.002995
##    27   4633      47    0.889 0.003247
##    28   3895      56    0.876 0.003622
##    29   3007      56    0.860 0.004159
##    30   2112      62    0.835 0.005126
##    31   1227      48    0.802 0.006753
##    32    356      16    0.766 0.010917
##    33     53       3    0.723 0.026407

This Kaplan-Meier estimate of the survival function shows/ the number of respondents at risk of ideation and the number actually contemplating suivide. We see the highest number of suicide ideations occurred between 14 and 18 years of age.

EV2 <- EV2 %>% filter(!is.na(sui.age))
library(muhaz)
hazard<-kphaz.fit(time=EV2$sui.age, status=EV2$d.event, method = "product-limit")
kphaz.plot(hazard, main="Hazard function plot")

data.frame(hazard)
##    time         haz          var
## 1  13.5 0.006250236 4.113146e-07
## 2  14.5 0.009981525 7.019656e-07
## 3  15.5 0.011469560 8.775582e-07
## 4  16.5 0.015430919 1.288091e-06
## 5  17.5 0.013346521 1.238277e-06
## 6  18.5 0.008581702 8.667703e-07
## 7  19.5 0.006750514 7.352982e-07
## 8  20.5 0.006078859 7.251578e-07
## 9  21.5 0.008724834 1.137240e-06
## 10 22.5 0.005046990 7.504069e-07
## 11 23.5 0.004968199 8.238959e-07
## 12 24.5 0.004074120 7.547632e-07
## 13 25.5 0.008163162 1.667444e-06
## 14 26.5 0.011098821 2.627641e-06
## 15 27.5 0.016484103 4.879146e-06
## 16 28.5 0.022318668 8.990932e-06
## 17 29.5 0.039678076 2.601705e-05
## 18 30.5 0.073649752 1.297388e-04
## 19 31.5 0.117409476 1.180331e-03
## 20 32.5 0.062944351 1.325387e-03

The hazard is highest among respondents aged between 15 and 18. There is also a large jump in risk between ages 31 and 32.

Now we have our S(t) and h(t) functions. We can derive the other functions of survival time from these but integrating (summing) and differentiating these functions.

#cumulative hazard
plot(cumsum(hazard$haz)~hazard$time, 
     main = "Cumulative Hazard function",
     ylab="H(t)",xlab="Time in Years", 
     type="l",xlim=c(14,31), lwd=2,col=3)

#Survival function, I just store this in an object so I can use it
surv<-mort

#here is a cheap version of the pdf
ft<- -diff(mort$surv)
plot(ft, xlim=c(13.0,33.0), 
     type="s",
     ylab="f(t)",xlab="Time in Years",
     main="Probability Density Function")

#here is the cumulative distribution function
Ft<-cumsum(ft)
plot(Ft, xlim=c(13.0,33.0), type="s", ylab="F(t)",xlab="Time in Years", main="Cumulative Distribution Function")