DEM 7223 Homework 1 Introduction to Event History Analysis and Functions of Survival Time Add Health Suicide Ideation Ricardo Lowe
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#ADD HEALTH 1995 (WI), 96 (WII), 01 (WIII) and 08 (WIV)
add95 <- read_sav("~/Desktop/ICPSR_21600/DS0001/21600-0001-Data.sav")
add96 <- read_sav("~/Desktop/ICPSR_21600-2/DS0005/21600-0005-Data.sav")
add01 <- read_sav("~/Desktop/ICPSR_21600 - 3/DS0008/21600-0008-Data.sav")
add08 <- read_sav("~/Desktop/ICPSR_21600-040/DS0022/21600-0022-Data.sav")
#Join 96 and 95 since the former does not have Race variable
add96_2 <- left_join(add95, add96, by = c('AID'))
#Year in 95 is only two-digit, so making a 4 digit variable
add95$DIG <- ifelse(add95$AID>0,19," ")
add95$AGE_ <- as.character(add95$H1GI1Y)
add95$AGE1 <- as.numeric(paste(add95$DIG,add95$AGE_,sep = ""))
## Warning: NAs introduced by coercion
#TEST
a9N5 <- add95 %>% select(AID,DIG,AGE1)
#Rename Variables in Datasets to prep for Stack
add95 <- rename(add95, YEAR = IYEAR, BIRTHYR = AGE1, RACE = H1GI9, SEX = BIO_SEX, SUI = H1SU1, SUA = H1SU2)
add96 <- rename(add96_2, YEAR = IYEAR2, BIRTHYR = H2GI1Y, RACE = H1GI9, SEX = BIO_SEX2, SUI = H2SU1, SUA = H2SU2)
add01 <- rename(add01, YEAR = IYEAR3, BIRTHYR = H3OD1Y, RACE = H3IR4, SEX = BIO_SEX3, SUI = H3TO130, SUA = H3TO131)
add08 <- rename(add08, YEAR = IYEAR4, BIRTHYR = H4OD1Y, RACE = H4IR4, SEX = BIO_SEX4, SUI = H4SE1, SUA = H4SE2)
add95$YEAR <- ifelse(add95$YEAR == 95,1995,"")
add96$YEAR <- ifelse(add96$YEAR == 96,1996,"")
#-----------------id---year---race-sex-ideation-------attempt
a95 <- add95 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a96 <- add96 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a01 <- add01 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
a08 <- add08 %>% select(AID,YEAR,BIRTHYR,RACE,SEX,SUA,SUI)
sui <- rbind(a95,a01,a08)
X <- sui %>%
arrange(desc(AID))
X$AGE <- as.numeric(X$YEAR) - as.numeric(X$BIRTHYR)
X$SUA <- NULL
EV <- X[, c(1, 2, 3, 7, 4,5,6)]
EV$PERIOD <- 2008 - as.numeric(EV$YEAR)
EV
## # A tibble: 16,500 x 8
## AID YEAR BIRTHYR AGE RACE SEX SUI PERIOD
## <chr> <chr> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl>
## 1 997199… 1995 1981 14 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 13
## 2 997199… 2008 1981 27 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 0
## 3 997199… 1995 1980 15 1 [(1) White] 2 [(2) F… 0 [(0) No (sk… 13
## 4 997199… 2001 1980 21 1 [(1) White] 2 [(2) F… 1 [(1) Yes] 7
## 5 997199… 2008 1980 28 1 [(1) White] 2 [(2) F… 0 [(0) No (sk… 0
## 6 997199… 1995 1981 14 1 [(1) White] 1 [(1) M… 1 [(1) Yes] 13
## 7 997199… 2008 1981 27 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 0
## 8 997199… 1995 1981 14 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 13
## 9 997199… 2001 1981 20 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 7
## 10 997199… 2008 1981 27 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 0
## # … with 16,490 more rows
EV2 <- EV %>%
arrange(desc(AID))
EV2
## # A tibble: 16,500 x 8
## AID YEAR BIRTHYR AGE RACE SEX SUI PERIOD
## <chr> <chr> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl>
## 1 997199… 1995 1981 14 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 13
## 2 997199… 2008 1981 27 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 0
## 3 997199… 1995 1980 15 1 [(1) White] 2 [(2) F… 0 [(0) No (sk… 13
## 4 997199… 2001 1980 21 1 [(1) White] 2 [(2) F… 1 [(1) Yes] 7
## 5 997199… 2008 1980 28 1 [(1) White] 2 [(2) F… 0 [(0) No (sk… 0
## 6 997199… 1995 1981 14 1 [(1) White] 1 [(1) M… 1 [(1) Yes] 13
## 7 997199… 2008 1981 27 1 [(1) White] 1 [(1) M… 0 [(0) No (sk… 0
## 8 997199… 1995 1981 14 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 13
## 9 997199… 2001 1981 20 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 7
## 10 997199… 2008 1981 27 2 [(2) Black/Afr… 1 [(1) M… 0 [(0) No (sk… 0
## # … with 16,490 more rows
In the Add Health, suicide ideation is rcorded as yes or no (I excluded other options like DK or Refusal). I created a variable that provides the age of the respondent at time of ideation.
If the respondent did not contemplate suicide by age 33, then the variable is SUI == 0 and the age at ideation is censored.
If the age at death is censored, then the age at ideation (censored age at ideation) is the year of the interview - year of birth.
If the respondent contemplated suicide by age of 33, then the age at ideation in years is the variable sui.age2. Here I code this:
EV2$sui.age2<-ifelse(EV2$SUI==1,EV2$AGE,"")
EV2$sui.age<-ifelse(EV2$SUI==0,
EV2$AGE,
EV2$sui.age2)
EV2$sui.age2<-as.numeric(EV2$sui.age2,na.rm=TRUE)
EV2$sui.age<-as.numeric(EV2$sui.age,na.rm=TRUE)
#censoring indicator for death by age 1, in months (12 months)
EV2$d.event<-ifelse(is.na(EV2$sui.age2)==T|EV2$sui.age2>33,0,1)
EV2$d.eventfac<-factor(EV2$d.event); levels(EV2$d.eventfac)<-c("No Suicide Ideation", "Suicide Ideation")
table(EV2$d.eventfac)
##
## No Suicide Ideation Suicide Ideation
## 15056 1444
We see 1444 suicide ideators among the 15056 respondents across all 4 waves.
summary(EV2$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 17.00 21.00 21.89 27.00 34.00 4
library(survival)
head(EV2[,c("sui.age","d.event")], n=20)
## # A tibble: 20 x 2
## sui.age d.event
## <dbl> <dbl>
## 1 14 0
## 2 27 0
## 3 15 0
## 4 21 1
## 5 28 0
## 6 14 1
## 7 27 0
## 8 14 0
## 9 20 0
## 10 27 0
## 11 13 0
## 12 19 0
## 13 26 0
## 14 14 0
## 15 20 0
## 16 27 0
## 17 14 0
## 18 20 0
## 19 27 0
## 20 15 0
head(Surv(EV2$sui.age, EV2$d.event), n=20)
## [1] 14+ 27+ 15+ 21 28+ 14 27+ 14+ 20+ 27+ 13+ 19+ 26+ 14+ 20+ 27+ 14+ 20+ 27+
## [20] 15+
mort<-survfit(Surv(sui.age, d.event)~1, data=EV2,conf.type="none")
plot(mort, ylim=c(.7,1), xlim=c(14,32), main="Survival Function for Suicide Ideation Among Respondents")
1000*(1-summary(mort)$surv[16])
## [1] 123.6626
summary(mort)
## Call: survfit(formula = Surv(sui.age, d.event) ~ 1, data = EV2, conf.type = "none")
##
## 241 observations deleted due to missingness
## time n.risk n.event survival std.err
## 13 16251 49 0.997 0.000430
## 14 15677 95 0.991 0.000751
## 15 14783 142 0.981 0.001089
## 16 13712 150 0.971 0.001386
## 17 12565 185 0.956 0.001718
## 18 11402 144 0.944 0.001969
## 19 10259 85 0.936 0.002128
## 20 9513 62 0.930 0.002251
## 21 8840 51 0.925 0.002360
## 22 8046 67 0.917 0.002521
## 23 7232 34 0.913 0.002615
## 24 6401 30 0.909 0.002717
## 25 5636 22 0.905 0.002810
## 26 5211 40 0.898 0.002995
## 27 4633 47 0.889 0.003247
## 28 3895 56 0.876 0.003622
## 29 3007 56 0.860 0.004159
## 30 2112 62 0.835 0.005126
## 31 1227 48 0.802 0.006753
## 32 356 16 0.766 0.010917
## 33 53 3 0.723 0.026407
This Kaplan-Meier estimate of the survival function shows/ the number of respondents at risk of ideation and the number actually contemplating suivide. We see the highest number of suicide ideations occurred between 14 and 18 years of age.
EV2 <- EV2 %>% filter(!is.na(sui.age))
library(muhaz)
hazard<-kphaz.fit(time=EV2$sui.age, status=EV2$d.event, method = "product-limit")
kphaz.plot(hazard, main="Hazard function plot")
data.frame(hazard)
## time haz var
## 1 13.5 0.006250236 4.113146e-07
## 2 14.5 0.009981525 7.019656e-07
## 3 15.5 0.011469560 8.775582e-07
## 4 16.5 0.015430919 1.288091e-06
## 5 17.5 0.013346521 1.238277e-06
## 6 18.5 0.008581702 8.667703e-07
## 7 19.5 0.006750514 7.352982e-07
## 8 20.5 0.006078859 7.251578e-07
## 9 21.5 0.008724834 1.137240e-06
## 10 22.5 0.005046990 7.504069e-07
## 11 23.5 0.004968199 8.238959e-07
## 12 24.5 0.004074120 7.547632e-07
## 13 25.5 0.008163162 1.667444e-06
## 14 26.5 0.011098821 2.627641e-06
## 15 27.5 0.016484103 4.879146e-06
## 16 28.5 0.022318668 8.990932e-06
## 17 29.5 0.039678076 2.601705e-05
## 18 30.5 0.073649752 1.297388e-04
## 19 31.5 0.117409476 1.180331e-03
## 20 32.5 0.062944351 1.325387e-03
The hazard is highest among respondents aged between 15 and 18. There is also a large jump in risk between ages 31 and 32.
Now we have our S(t) and h(t) functions. We can derive the other functions of survival time from these but integrating (summing) and differentiating these functions.
#cumulative hazard
plot(cumsum(hazard$haz)~hazard$time,
main = "Cumulative Hazard function",
ylab="H(t)",xlab="Time in Years",
type="l",xlim=c(14,31), lwd=2,col=3)
#Survival function, I just store this in an object so I can use it
surv<-mort
#here is a cheap version of the pdf
ft<- -diff(mort$surv)
plot(ft, xlim=c(13.0,33.0),
type="s",
ylab="f(t)",xlab="Time in Years",
main="Probability Density Function")
#here is the cumulative distribution function
Ft<-cumsum(ft)
plot(Ft, xlim=c(13.0,33.0), type="s", ylab="F(t)",xlab="Time in Years", main="Cumulative Distribution Function")