library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ipumsr)
library(haven)
library(survival)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggplot2)
library(ggpubr)
library(muhaz)
library(carData)
library(dplyr)

** Read IPUMS NHIS data ** # I select data from 1990 to 2017

#setwd("C:\\Users\\shahi\\OneDrive\\Desktop\\Fall'22\\Dem 7223 (Event history)\\Assignments\\Assignment 2") 
nhis <- read_ipums_ddi("nhis_00006.xml") 
nhis_dat <- read_ipums_micro(nhis)
## Use of data from IPUMS NHIS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
nhis_dat <- haven::zap_labels(nhis_dat)

** Recode variable **

nhis_dat$MARST<- as.numeric(nhis_dat$MARST)

nhis_dat$MARST <- car:: Recode(nhis_dat$MARST, recodes= "11:40= 'married/ever married';  50= 'unmarried'; else= NA", as.factor= T)
nhis_dat <- nhis_dat %>%
  filter(MORTELIG == 1)

nhis_dat <- nhis_dat %>%
  filter(SEX == 2)



nhis_dat <- nhis_dat%>%
filter(complete.cases(.))

#Event - All Cause Mortality for Single Woman. #Time Variable - The current year of mortality follow up is 2017, that is the censoring year for mortality #Censoring Indicator - If the respondent died within 5 years of the survey, they are coded as experiencing the event, they are otherwise censored.

nhis_dat <- nhis_dat %>%
  mutate(death_age = ifelse( MORTSTAT ==1, 
                             MORTDODY - (YEAR - AGE), 
                             2017 - (YEAR - AGE)), 
         d.event = ifelse(MORTSTAT == 1, 1, 0))

library(survival)

age_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                   data = nhis_dat)

library(ggsurvfit)

age_fit %>%
  ggsurvfit() +
  add_confidence_interval(type = "ribbon") +
  add_quantile() 

** Estimating survival time functions **

head(nhis_dat[,c("death_age","d.event")], n=20)
## # A tibble: 20 × 2
##    death_age d.event
##        <dbl>   <dbl>
##  1        53       0
##  2        72       0
##  3        54       0
##  4        56       0
##  5        64       0
##  6        41       0
##  7        75       1
##  8        36       1
##  9        73       0
## 10        43       0
## 11        47       0
## 12        58       0
## 13        46       0
## 14        50       0
## 15        45       0
## 16        38       0
## 17        65       1
## 18        46       0
## 19        47       0
## 20        98       0
head(Surv(nhis_dat$death_age, nhis_dat$d.event), n=20)
##  [1] 53+ 72+ 54+ 56+ 64+ 41+ 75  36  73+ 43+ 47+ 58+ 46+ 50+ 45+ 38+ 65  46+ 47+
## [20] 98+

In the first 20 cases from the data, we can see all the deaths are censored except for 3. censored cases are shown with a ‘+’ sign.

** 5a. Kaplan-Meier survival analysis of the outcome **

nhis_dat <- nhis_dat %>%
  mutate(death_time = ifelse( MORTSTAT ==1, 
                             MORTDODY - YEAR , 
                             2017 - YEAR ), 
         d5.event = ifelse(MORTSTAT == 1 & death_time <= 5, 1, 0))

library(survival)

time_fit <- survfit(Surv(death_time, d5.event) ~ 1, 
                  conf.type = "log",
                   data = nhis_dat)

library(ggsurvfit)

time_fit %>%
  ggsurvfit()+
  xlim(-1, 6) +
  add_censor_mark() +
  add_confidence_interval()+
  add_quantile(y_value = .975) 
## Warning: Removed 16 row(s) containing missing values (geom_path).

summary(age_fit)
## Call: survfit(formula = Surv(death_age, d.event) ~ 1, data = nhis_dat)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##    18 653351       4  0.99999 3.06e-06      0.99999       1.0000
##    19 653196       8  0.99998 5.30e-06      0.99997       1.0000
##    20 652805      10  0.99997 7.18e-06      0.99995       1.0000
##    21 652225      18  0.99994 9.69e-06      0.99992       1.0000
##    22 650998      23  0.99990 1.22e-05      0.99988       0.9999
##    23 649215      29  0.99986 1.47e-05      0.99983       0.9999
##    24 646693      29  0.99981 1.69e-05      0.99978       0.9998
##    25 643534      41  0.99975 1.96e-05      0.99971       0.9998
##    26 639494      42  0.99968 2.21e-05      0.99964       0.9997
##    27 634852      45  0.99961 2.45e-05      0.99957       0.9997
##    28 629673      62  0.99952 2.75e-05      0.99946       0.9996
##    29 624078      71  0.99940 3.06e-05      0.99934       0.9995
##    30 617907     100  0.99924 3.46e-05      0.99917       0.9993
##    31 611053      99  0.99908 3.82e-05      0.99900       0.9992
##    32 603573      80  0.99895 4.10e-05      0.99887       0.9990
##    33 595624     125  0.99874 4.51e-05      0.99865       0.9988
##    34 587100     118  0.99854 4.87e-05      0.99844       0.9986
##    35 577952     127  0.99832 5.25e-05      0.99821       0.9984
##    36 568168     164  0.99803 5.71e-05      0.99792       0.9981
##    37 557573     181  0.99770 6.19e-05      0.99758       0.9978
##    38 546431     178  0.99738 6.65e-05      0.99725       0.9975
##    39 535084     212  0.99698 7.18e-05      0.99684       0.9971
##    40 523798     217  0.99657 7.71e-05      0.99642       0.9967
##    41 512431     199  0.99618 8.18e-05      0.99602       0.9963
##    42 501178     240  0.99571 8.73e-05      0.99554       0.9959
##    43 489782     266  0.99517 9.34e-05      0.99498       0.9953
##    44 478463     309  0.99452 1.00e-04      0.99433       0.9947
##    45 466871     326  0.99383 1.07e-04      0.99362       0.9940
##    46 454913     388  0.99298 1.15e-04      0.99275       0.9932
##    47 442406     360  0.99217 1.23e-04      0.99193       0.9924
##    48 429824     429  0.99118 1.32e-04      0.99092       0.9914
##    49 417727     485  0.99003 1.42e-04      0.98975       0.9903
##    50 405742     477  0.98887 1.51e-04      0.98857       0.9892
##    51 393562     565  0.98745 1.62e-04      0.98713       0.9878
##    52 381104     532  0.98607 1.73e-04      0.98573       0.9864
##    53 368311     637  0.98436 1.85e-04      0.98400       0.9847
##    54 355087     684  0.98247 1.99e-04      0.98208       0.9829
##    55 341602     705  0.98044 2.12e-04      0.98002       0.9809
##    56 328309     788  0.97809 2.28e-04      0.97764       0.9785
##    57 314784     807  0.97558 2.44e-04      0.97510       0.9761
##    58 301512     817  0.97294 2.60e-04      0.97243       0.9734
##    59 288446     900  0.96990 2.78e-04      0.96936       0.9704
##    60 275289     882  0.96679 2.96e-04      0.96621       0.9674
##    61 261919     966  0.96323 3.17e-04      0.96261       0.9638
##    62 249150    1022  0.95928 3.39e-04      0.95861       0.9599
##    63 236758    1078  0.95491 3.62e-04      0.95420       0.9556
##    64 224734    1110  0.95019 3.87e-04      0.94943       0.9510
##    65 212845    1089  0.94533 4.12e-04      0.94452       0.9461
##    66 201392    1217  0.93962 4.41e-04      0.93875       0.9405
##    67 190240    1252  0.93343 4.72e-04      0.93251       0.9344
##    68 179272    1370  0.92630 5.06e-04      0.92531       0.9273
##    69 168549    1303  0.91914 5.39e-04      0.91808       0.9202
##    70 158077    1418  0.91089 5.77e-04      0.90976       0.9120
##    71 147484    1491  0.90169 6.19e-04      0.90047       0.9029
##    72 138719    1589  0.89136 6.64e-04      0.89006       0.8927
##    73 130348    1636  0.88017 7.11e-04      0.87878       0.8816
##    74 122135    1680  0.86806 7.60e-04      0.86658       0.8696
##    75 113914    1781  0.85449 8.13e-04      0.85290       0.8561
##    76 106296    1855  0.83958 8.70e-04      0.83788       0.8413
##    77  99085    1922  0.82329 9.29e-04      0.82148       0.8251
##    78  92176    2016  0.80529 9.91e-04      0.80335       0.8072
##    79  85578    2067  0.78584 1.06e-03      0.78377       0.7879
##    80  79131    2197  0.76402 1.12e-03      0.76182       0.7662
##    81  72914    2351  0.73938 1.20e-03      0.73704       0.7417
##    82  66909    2474  0.71204 1.27e-03      0.70955       0.7145
##    83  60943    2507  0.68275 1.35e-03      0.68012       0.6854
##    84  55271    2634  0.65022 1.43e-03      0.64743       0.6530
##    85  49822    3252  0.60778 1.51e-03      0.60481       0.6108
##    86  43426    3891  0.55332 1.61e-03      0.55017       0.5565
##    87  36808    3855  0.49537 1.69e-03      0.49206       0.4987
##    88  30482    3486  0.43872 1.75e-03      0.43530       0.4422
##    89  24700    3222  0.38149 1.79e-03      0.37800       0.3850
##    90  19569    2843  0.32606 1.81e-03      0.32255       0.3296
##    91  15249    2426  0.27419 1.80e-03      0.27069       0.2777
##    92  11680    2028  0.22658 1.77e-03      0.22314       0.2301
##    93   8778    1643  0.18417 1.72e-03      0.18083       0.1876
##    94   6380    1234  0.14855 1.66e-03      0.14533       0.1518
##    95   4591     938  0.11820 1.59e-03      0.11513       0.1214
##    96   3216     629  0.09508 1.52e-03      0.09214       0.0981
##    97   2264     502  0.07400 1.45e-03      0.07122       0.0769
##    98   1534     342  0.05750 1.37e-03      0.05487       0.0603
##    99   1006     189  0.04670 1.32e-03      0.04418       0.0494
##   100    661     119  0.03829 1.29e-03      0.03585       0.0409
##   101    437      67  0.03242 1.27e-03      0.03002       0.0350
##   102    293      37  0.02833 1.28e-03      0.02593       0.0309
##   103    183      21  0.02508 1.31e-03      0.02263       0.0278
##   104    101       5  0.02383 1.36e-03      0.02131       0.0267
##   105     45       1  0.02330 1.43e-03      0.02066       0.0263
##   106      4       3  0.00583 5.06e-03      0.00106       0.0319
##   107      1       1  0.00000      NaN           NA           NA

This is the Kaplan-Meier estimate of the survival function. We can see the number of deaths increased after age of 33 and continued to increase till 89.

** Define a grouping variable ** # Marital Status is my grouping variable. My research hypothesis is that married women has higher mortality comapred to single women.

** Comparison of Kaplan-Meier survival across grouping variables in data.**

survdiff(Surv(death_time, d.event) ~ MARST,
         data = nhis_dat)
## Call:
## survdiff(formula = Surv(death_time, d.event) ~ MARST, data = nhis_dat)
## 
##                                 N Observed Expected (O-E)^2/E (O-E)^2/V
## MARST=married/ever married 508999    76984    65474      2023      9547
## MARST=unmarried            144352     6554    18064      7334      9547
## 
##  Chisq= 9547  on 1 degrees of freedom, p= <2e-16

The result shows that there is a significant difference between single and married women in terms of mortality. Married women has higher mortality.

** Plot the survival function **

group_fit <- survfit2(Surv(death_time, d.event) ~ MARST,
                   data = nhis_dat)

group_fit%>%
  ggsurvfit() +
   add_confidence_interval() +
   add_risktable() +
  labs(title = "Female mortality Survival Plot by Marital Status",
       caption = "Source: NHIS 1990-2017 \n Calculations by Mahmuda Sultana", x= "time",
      y = "Survival Probability")+  theme_minimal() +
  theme(legend.position = "bottom",axis.line = element_line(linetype = "solid"),
    axis.ticks = element_line(linetype = "blank"),
    panel.grid.major = element_line(colour = "gray80"),
    panel.grid.minor = element_line(colour = "gray94",
        linetype = "blank"), axis.title = element_text(family = "serif",
        face = "bold"), axis.text = element_text(face = "bold"),
    plot.title = element_text(family = "serif",
        size = 14, face = "bold") , legend.text = element_text(face = "bold",
        family = "serif"), legend.title = element_text(face = "bold",
        family = "serif"), panel.background = element_rect(fill = "gray97"),
    legend.key = element_rect(fill = NA,
        linetype = "solid"), legend.background = element_rect(linetype = "solid")) +
  guides(color = guide_legend(ncol = 1)) +
    coord_cartesian(xlim = c(0, 1)) +
  scale_y_continuous(
    limits = c(.99, 1),
    labels = scales::percent, 
    expand = c(0.01, 0))+
  scale_color_manual(values = c('#54738E', '#82AC7C')) +
  scale_fill_manual(values = c('#54738E', '#82AC7C')) 
## Warning: Removed 42 row(s) containing missing values (geom_path).