library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ipumsr)
library(haven)
library(survival)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggplot2)
library(ggpubr)
library(muhaz)
library(carData)
library(dplyr)
library(eha)

** Read IPUMS NHIS data ** # I select data from 1990 to 2017

nhis <- read_ipums_ddi("nhis_00008.xml")
nhis_dat <- read_ipums_micro(nhis)
## Use of data from IPUMS NHIS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
nhis_dat <- haven::zap_labels(nhis_dat)

** Recode variable **

nhis_dat$MARST<- as.numeric(nhis_dat$MARST)

nhis_dat$MARST <- car:: Recode(nhis_dat$MARST, recodes= "11:40= 'married/ever married';  50= 'unmarried'; else= NA", as.factor= T)
nhis_dat$EDUC <- car:: Recode(nhis_dat$EDUC, recodes= "100:116= 'LHS';  201= 'High School'; 400='Bachelor'; 500:503='Graduate'; else= NA", as.factor= T)
nhis_dat <- nhis_dat %>%
  filter(MORTELIG == 1)

nhis_dat <- nhis_dat %>%
  filter(SEX == 2)

nhis_dat <- nhis_dat%>%
filter(complete.cases(.))

1) Carry out the following analysis:

#Define your outcome as in HW 1.Also consider what covariates are hypothesized to affect the outcome variable:

The outcome is adult female mortality based on Education Attainment and marital status. Marital Status and Education Attaintment are my grouping variable. The variables are categorical.

#The covariates hypothesis are Marital Status and Educational Attaintment.

#My research hypothesis is that married women has higher mortality comapred to single women. The higher the education attaintment, the lower the mortality rate.

 nhis_dat<-nhis_dat %>%
    mutate(death_age = ifelse( MORTSTAT ==1, 
                             MORTDODY - YEAR , 
                             2017 - YEAR ), 
         d.event = ifelse(MORTSTAT == 1 & death_age <= 1, 1, 0))

library(survival)

time_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                  conf.type = "log",
                   data = nhis_dat)

library(ggsurvfit)

time_fit %>%
  ggsurvfit()+
  xlim(-1, 6) +
  add_censor_mark() +
  add_confidence_interval(type = "ribbon")+
  add_quantile()
## Warning: Removed 16 row(s) containing missing values (geom_path).

#Fit the parametric model of your choosing to the data.

a.Did you choose an AFT or PH model and why?

#Proportional Hazards Model (PH) because I want to see if the effect of the covariates increase or decrease the hazard by a proportionate amount of time.

#b.Justify what parametric distribution you choose

#Weibull because it is a flexible distribution. #c.Carry out model fit diagnostics for the model:

day<- 1/365
fit.wei<-phreg(Surv(death_age+day, d.event)~MARST+EDUC,
                   data=nhis_dat,

dist="weibull")

summary(fit.wei)
## Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
## MARST                                                       0.0000 
## married/ever mar      0.798     0         1 (reference)
##        unmarried      0.202    -1.176     0.309     0.051
## EDUC                                                        0.0000 
##         Bachelor      0.223     0         1 (reference)
##         Graduate      0.104    -0.148     0.862     0.076
##      High School      0.396     0.840     2.317     0.047
##              LHS      0.277     1.286     3.618     0.046
## 
## Events                    5582 
## Total time at risk        4296898 
## Max. log. likelihood      -32347 
## LR test statistic         2015.52 
## Degrees of freedom        4 
## Overall p-value           0

d. Include all main effects in the model:

plot(fit.wei)

** Test for an interaction between at least two of the predictors **

survdiff(Surv(death_age, d.event)~MARST+EDUC, data=nhis_dat)
## Call:
## survdiff(formula = Surv(death_age, d.event) ~ MARST + EDUC, data = nhis_dat)
## 
##                                                   N Observed Expected (O-E)^2/E
## MARST=married/ever married, EDUC=Bachelor     79073      514     1009     242.6
## MARST=married/ever married, EDUC=Graduate     41160      224      523     171.0
## MARST=married/ever married, EDUC=High School 133856     2122     1718      95.0
## MARST=married/ever married, EDUC=LHS          93468     2309     1201    1022.6
## MARST=unmarried, EDUC=Bachelor                21490       56      273     172.6
## MARST=unmarried, EDUC=Graduate                 8461       22      107      67.4
## MARST=unmarried, EDUC=High School             33399      158      429     171.0
## MARST=unmarried, EDUC=LHS                     25048      177      323      65.8
##                                              (O-E)^2/V
## MARST=married/ever married, EDUC=Bachelor        298.4
## MARST=married/ever married, EDUC=Graduate        190.1
## MARST=married/ever married, EDUC=High School     138.3
## MARST=married/ever married, EDUC=LHS            1313.0
## MARST=unmarried, EDUC=Bachelor                   182.9
## MARST=unmarried, EDUC=Graduate                    69.2
## MARST=unmarried, EDUC=High School                186.7
## MARST=unmarried, EDUC=LHS                         70.4
## 
##  Chisq= 2024  on 7 degrees of freedom, p= <2e-16

** Interpretation ** # The p-value is less than .05, indicating there is significant relationship between Marital Status, Education Attaintment, and Female adult mortality. So, we can say that there is significant interactions between them.

** Tabular and Graphical Output **

fit.gom<-phreg(Surv(death_age, d.event)~MARST + EDUC,
                   data=nhis_dat,
             
dist="gompertz")
summary(fit.gom)
## Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
## MARST                                                       0.0000 
## married/ever mar      0.798     0         1 (reference)
##        unmarried      0.202    -1.177     0.308     0.051
## EDUC                                                        0.0000 
##         Bachelor      0.223     0         1 (reference)
##         Graduate      0.104    -0.114     0.892     0.076
##      High School      0.396     0.798     2.221     0.047
##              LHS      0.277     1.260     3.526     0.046
## 
## Events                    5582 
## Total time at risk        4295704 
## Max. log. likelihood      -41716 
## LR test statistic         1926.21 
## Degrees of freedom        4 
## Overall p-value           0
plot(fit.gom)