Censored Data

Kaplan-Meier

library(survival)
library(survminer)

## Loading required package: ggpubr

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(dplyr)
data(ovarian)

##futime is when the participant exits
##death being 1 indicates that the exit was though death, 0 if lost to dropout
##trt = treatment
glimpse(myeloid)

## Observations: 646
## Variables: 7
## $ id     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ trt    <chr> "B", "A", "A", "B", "B", "B", "A", "A", "B", "A", "A", ...
## $ futime <int> 235, 286, 1983, 2137, 326, 2041, 63, 446, 1695, 1669, 6...
## $ death  <int> 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1...
## $ txtime <int> NA, 200, NA, 245, 112, 102, NA, 205, NA, 106, NA, 187, ...
## $ crtime <int> 44, NA, 38, 25, 56, NA, NA, 34, 28, NA, NA, NA, NA, 36,...
## $ rltime <int> 113, NA, NA, NA, 200, NA, NA, 382, NA, NA, NA, NA, NA, ...

surv_object <- Surv(time = myeloid$futime, event = myeloid$death)
surv_object

##   [1]  235   286  1983+ 2137+  326  2041+   63   446  1695+ 1669+   66+
##  [12] 1364+   17+  209   261  2380+  431   372   838  2086+ 1480+  188 
##  [23]  963   318   373  1657+  213  1421+  337   565   286  1823+   17+
##  [34]  201   321+  529   266  1305  1460+ 1832  1859+ 1864+ 1880+  368 
##  [45] 1391+  762   332   376+ 1740+ 2241+  582   284   823  2186+ 2198+
##  [56] 1737+ 2315+ 2006+  185  2367+ 1893+ 1734+ 2042+ 2162+ 1176+ 2148+
##  [67] 2247+ 1991+  180  1956+ 1976+   13  1913+  110   116   245  2194+
##  [78]  531  1782+  548    46  2052+  411  1034   293  2250+  235   267 
##  [89] 2136+ 1465+  439   295   286  1251+ 1360+ 1783+ 1866+  892  1525+
## [100]  461   322  1323+  651   241+ 2300+ 2192+  191    18+ 1621    50 
## [111]   15+   27   574   917   609  1774+  206   297    77    52  1429+
## [122]  278  1393+  232   106   397   140  2038+ 1911+   13  1990+  424 
## [133] 2063+ 1721+  955+ 1731+ 1499+  281   310  1999+  408  1361+  337 
## [144]   15+ 1530+  234  1681+  518   800  1806+ 1192+ 2304+ 1687+ 1785+
## [155] 1912+ 2335+   31+  485   432+ 1409  1464+   18   462   408  1423+
## [166] 1431+  248  1459+ 2148+ 2188+ 1725+  218  1999+ 1254  1420+ 1521+
## [177] 1921+ 1648+ 1555+  614    50    86   588+  874    33   156   563 
## [188]  231  1020  1390+  572  1877+ 2122+  322   511   288   368   480 
## [199] 2224+ 1708+  561  1578+  834  1583+    7+  513  1426+ 2040+ 1796+
## [210] 1627+  360  1280+ 2371+ 1479+  260  2219+   20   269   381   163 
## [221] 1287+ 1092+  154   771  1972+ 1658+ 2283  1827+    4+ 2210+ 1379 
## [232]  516  1007  1757+ 2134+   99  1369+  121  2361+  423   322  1423+
## [243]  369   603    25  1809+  131   602   692  1444+  601+  305   518 
## [254] 2118+  972   524  1881+   74  2345+  403    14+ 2053+ 2254+   76 
## [265]   22+ 2154+  779   167    16+  191   707   103   283   293  1442+
## [276] 1350+ 1710+   19+ 1384+  263  1947+  230   248   216  1717+ 1718+
## [287]  707   303    89  1545+ 1283   388  2023+ 1990+  140   759  2145+
## [298]  168+ 1843+ 2188+ 1960+  874    34   438   396   867  1885+  148 
## [309] 1522+ 2233+ 1795+   13  1467+ 1682+ 2063+ 1302+   56  1666+ 1708+
## [320] 1384+  644   239  2095+ 2065+  588   432   308  1779+  197  2183+
## [331] 1368+  193  2239+ 1479+ 1414+  599  1834+ 1290+ 1968+ 2123+  385 
## [342]  698+ 2255+ 1319+ 1465+ 1552  1883+  409   144   157  1370+  193+
## [353]  128   671   670  1960+ 1320+  679  1815+ 2149+  748  1901+ 2229+
## [364]  964  1379+  146   179     6+ 1755+ 2043+  258   460   575   364 
## [375] 2344+ 1953+  386  1164   374  1683+ 1704+   86+  262   187    65 
## [386]  355    36+ 1727+ 1855+  215   319  1861+   21+  497  1385+    9 
## [397]  418  2028+  476  1744+ 1952+  344   464  1742+  355  2222+  226 
## [408]  356  1405+  210   907  2276+  104   565   727   385    20   333 
## [419]  578  1988+ 1349+ 2007+  255   522   539    23  1714+ 2056+  341 
## [430]  258  2237+ 1102  1729+ 2318+    9+  411    59+  805  2058+  697 
## [441] 2048+ 2034+  696   411  2083+  388   342  1544+   44   326  2200+
## [452]   28  1360+  658   621   307   248  1652+ 1623+  540   166   246 
## [463]  458   329  2017+ 1129  1720+ 1703+  167  1584+    9+ 1021   465 
## [474] 1308+ 1681+ 1846+ 1241  1602+ 1813+ 1742+ 1459+   95  1559+   24 
## [485]  465  1333+  419  1924+   85  1537   246  1774+ 1999+ 2233+ 1367+
## [496]  267  1085  2052+  742   516+  499  1907+  400  1792+ 1698+  774 
## [507]  833  2253+ 1383+ 1039+  148   447   476   736  1485+  567+   27 
## [518] 1745+  313  1739+ 2043+ 1760+   45   145  1989+  242  1099+  506 
## [529] 2044+ 1497+  156   456  1695+ 2299+ 1281+ 1990+   24   495   743 
## [540] 2419+ 1764+  170  1331+  112   295   359    32    27  1374+  120+
## [551]  365   144   168  1901+  275   826  1704+  432+ 1780+   49+ 1313+
## [562]   24+ 2126+  609   203  1245   436+ 1825+ 1915  2150+ 1710+ 2125+
## [573] 1594+ 1414+ 1455+ 1799+ 1728+ 1688+ 1113+  185    10+  811  1480+
## [584]  515   122  2179+  420   248    24   906+  486   273  1220    40 
## [595]  443   383   829   160+  275   400   768  1347+ 1374+  375  1456+
## [606]   27    93+ 1943+ 2041+  517   252   293  1417+  752  1322+ 2070+
## [617]   16  2191+  334  1370  2369+ 1401+  664   259  2308+ 2335+  120 
## [628] 2055+ 1730+ 1373+  563  1480+  884  1386+ 1845+ 1313+  410   697 
## [639]  253    21    22+  237  2394+   17+  181    25+

fit1 <- survfit(surv_object ~ trt, data = myeloid, type="kaplan-meier")
ggsurvplot(fit1, data = myeloid, pval = TRUE)

fit1 <- survfit(surv_object ~ trt, data = myeloid, type="fleming-harrington")
ggsurvplot(fit1, data = myeloid, pval = TRUE)

#install.packages("survival")
library(survival)
library(survminer)
data("myeloid") 
data("leukemia")

Log-Rank Test

#load the data from the survival package
data("myeloid") 
data("leukemia")

#1. Construct our two survival objects
surv_object <- Surv(time = myeloid$futime, event = myeloid$death)
surv_object2 <- Surv(time = leukemia$time, event = leukemia$status)

# As before, Fit survival curves for each data set
fit1<-survfit(surv_object ~ trt, data = myeloid, type="kaplan-meier") #uses the Kaplan-Meier Estimator
fit2<-survfit(surv_object2 ~ x, data = leukemia, type="kaplan-meier")

#3. Plot Them, Graphical Analysis
ggsurvplot(fit1, data = myeloid)

ggsurvplot(fit2, data=leukemia)

#Log-Rank Test for Significant Difference
test1<-survdiff(Surv(futime,death)~trt, data = myeloid, rho=0)
#observe p-value, its significant
test1

## Call:
## survdiff(formula = Surv(futime, death) ~ trt, data = myeloid, 
##     rho = 0)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=A 317      171      143      5.28      9.59
## trt=B 329      149      177      4.29      9.59
## 
##  Chisq= 9.6  on 1 degrees of freedom, p= 0.00196

test2<-survdiff(Surv(time,status)~x, data = leukemia, rho=0)
#observe p-value, its not significant
test2

## Call:
## survdiff(formula = Surv(time, status) ~ x, data = leukemia, rho = 0)
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained    11        7    10.69      1.27       3.4
## x=Nonmaintained 12       11     7.31      1.86       3.4
## 
##  Chisq= 3.4  on 1 degrees of freedom, p= 0.0653

#Peto-Peto Modification of the Gehan-Wilcoxon Test, an easy alternate
test3<-survdiff(Surv(futime,death)~trt, data = myeloid, rho=1)
#observe p-value, its significant
test3

## Call:
## survdiff(formula = Surv(futime, death) ~ trt, data = myeloid, 
##     rho = 1)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=A 317      129      107      4.26      10.1
## trt=B 329      108      130      3.52      10.1
## 
##  Chisq= 10.1  on 1 degrees of freedom, p= 0.00149

test4<-survdiff(Surv(time,status)~x, data = leukemia, rho=1)
#observe p-value, its not significant
test4

## Call:
## survdiff(formula = Surv(time, status) ~ x, data = leukemia, rho = 1)
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained    11     3.85     6.14     0.859      2.78
## x=Nonmaintained 12     7.18     4.88     1.081      2.78
## 
##  Chisq= 2.8  on 1 degrees of freedom, p= 0.0955

Nelson-Aalen Estimator and Cumulative Hazard

#First, create a survival curve using the Flemington-Harrington Method. 

surv_object3 <- Surv(time = myeloid$futime, event = myeloid$death)
fit1 <- survfit(surv_object3 ~ 1, data = myeloid, type="fleming-harrington")

#At each time t, we apply our theoretical transformation to get the hazard function
sum_fit1 <- summary(fit1)
hazard_fit <- -log(sum_fit1$surv)

#At each time, compute the Nelson-Aalen estimator
haz_frac <- sum_fit1$n.event/sum_fit1$n.risk
nelson_aaelon <- cumsum(haz_frac)

#Plot the hazard function versus the survival function
plot(sum_fit1$time, hazard_fit, lty=1, xlab="time", ylab = "", main="Comparing Cumulative Hazard to Survival Function", ylim=range(c(hazard_fit, sum_fit1$surv)), type="s")
points(sum_fit1$time, sum_fit1$surv, lty=2, type="s")

#Plot the hazard function versus the nelson-aalen estimator
plot(sum_fit1$time, hazard_fit, lty=1, ylab = "", xlab="time", main="Nelson Aaelon Estimator vs. -log(S(t))", ylim=range(c(hazard_fit, nelson_aaelon)), type="s")
points(sum_fit1$time, nelson_aaelon, lty=2, col="red", type="s")

Tobit: Quick Example with Generated Data

library(AER)

## Warning: package 'car' was built under R version 3.4.4

## Warning: package 'carData' was built under R version 3.4.4

## Warning: package 'lmtest' was built under R version 3.4.4

## Warning: package 'zoo' was built under R version 3.4.3

library(tidyverse)

In order to run tobit regressions, we will be using the package AER. Other R packages include commands for tobit, such as survival::survreg() and VGAM::vglm(), but generally have more complicated formats.

set.seed(1)
x=runif(100)
y=5*x+rnorm(100)
df <- matrix(c(y,rep(3,100),x), ncol=3, byrow=F) %>%
  as.data.frame() %>%
  rename(y=V1, c=V2, x=V3) %>%
  mutate(yc = pmin(y,c)) %>%
  dplyr::select(y,yc,x)

## Warning: package 'bindrcpp' was built under R version 3.4.4

dfCen <- df[,2:3]
dfUncen <- df[,c(1,3)]

First, we generate two datasets: one with the censored response variable yc, and one with the uncensored response variable y. Both have the same realtionship: y=5x+e, but for dfCen, any y>3 is set equal to 3.

m1 <- lm(y~x,data=dfUncen)
m1

## 
## Call:
## lm(formula = y ~ x, data = dfUncen)
## 
## Coefficients:
## (Intercept)            x  
##     -0.1793       5.3123

plot(dfUncen$x, dfUncen$y)
abline(m1,col="red")

cm1 <- lm(yc~x,data=dfCen)
cm1

## 
## Call:
## lm(formula = yc ~ x, data = dfCen)
## 
## Coefficients:
## (Intercept)            x  
##       0.316        3.384

plot(dfCen$x, dfCen$yc)
abline(cm1,col="blue")
abline(m1, col="red")

First, we run a SLR. When we use dfUncen, it captures the relationship well, but when we use dfCen, we see our estimated coefficient underestimates the relationship between our variables.

tm1 <- tobit(yc~x, left = -Inf, right = 3, data = dfCen)
tm1

## 
## Call:
## tobit(formula = yc ~ x, left = -Inf, right = 3, data = dfCen)
## 
## Coefficients:
## (Intercept)            x  
##     -0.2134       5.3869  
## 
## Scale: 0.9083

plot(dfCen$x, dfCen$yc)
abline(cm1,col="blue")
abline(m1, col="red")
abline(tm1, col="green")

This is resolved by tobit: now, we can use dfCen and still have essentially the same coefficient estimate as we did with our SLR on dfUncen.

Tobit for hours worked

library(carData)
data("Mroz")#NOTE: Really weird, but you get different Mroz if you enter carData::Mroz

head(Mroz)

## # A tibble: 6 x 18
##   work  hoursw child6 child618  agew educw hearnw wagew hoursh  ageh educh
##   <chr>  <int>  <int>    <int> <int> <int>  <dbl> <dbl>  <int> <int> <int>
## 1 no      1610      1        0    32    12   3.35  2.65   2708    34    12
## 2 no      1656      0        2    30    12   1.39  2.65   2310    30     9
## 3 no      1980      1        3    35    12   4.55  4.04   3072    40    12
## 4 no       456      0        3    34    12   1.10  3.25   1920    53    10
## 5 no      1568      1        2    31    14   4.59  3.60   2000    32    12
## 6 no      2032      0        0    54    12   4.74  4.70   1040    57    11
## # ... with 7 more variables: wageh <dbl>, income <int>, educwm <int>,
## #   educwf <int>, unemprate <dbl>, city <chr>, experience <int>

summary(Mroz$hoursw)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0   288.0   740.6  1516.0  4950.0

Here, the variables are defined as follows:

work: participation in 1975 (NOTE: yes and no seem to be swapped)

hoursw: wife’s hours of work in 1975

child6: number of children less than 6 years old in household

child618: number of children between ages 6 and 18 in household

agew: wife’s age

educw: wife’s educational attainment, in years

hearnw: wife’s average hourly earnings, in 1975 dollars

wagew: wife’s wage reported at the time of the 1976 interview (note= 1975 estimated wage)

hoursh: husband’s hours worked in 1975

ageh: husband’s age

educh: husband’s educational attainment, in years

wageh: husband’s wage, in 1975 dollars

income: family income, in 1975 dollars

educwm: wife’s mother’s educational attainment, in years

educwf: wife’s father’s educational attainment, in years

unemprate: unemployment rate in county of residence, in percentage points

city: lives in large city (SMSA) ?

experience: actual years of wife’s previous labor market experience

Our response variable of interest is hoursw.

tml <- tobit(hoursw ~ wagew + experience + I(experience^2) + agew + child6, left = 0, right = Inf, data=Mroz)
tobitMroz <- summary(tml)
tobitMroz

## 
## Call:
## tobit(formula = hoursw ~ wagew + experience + I(experience^2) + 
##     agew + child6, left = 0, right = Inf, data = Mroz)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##            753            325            428              0 
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      802.45311  273.13028   2.938   0.0033 ** 
## wagew            260.96336   17.13829  15.227  < 2e-16 ***
## experience        81.65108   14.95239   5.461 4.74e-08 ***
## I(experience^2)   -1.17380    0.46334  -2.533   0.0113 *  
## agew             -34.57801    6.02626  -5.738 9.59e-09 ***
## child6          -620.26541   96.29675  -6.441 1.19e-10 ***
## Log(scale)         6.85771    0.03641 188.323  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 951.2 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 5 
## Log-likelihood: -3715 on 7 Df
## Wald-statistic: 488.3 on 5 Df, p-value: < 2.22e-16

First, we run another regression with tobit(). However, all the coefficients we estimate explain the relationship between explanatory variables and the latent response variable. In order to calculate other expected values and marginal effects we will need to write our own code.

XB <- tml$linear.predictors 
sigma <- tml$scale
lambda <- dnorm(XB/sigma)/pnorm(XB/sigma)
#pnorm: prob distribution func (normal)
#dnorm: prob density func

eystar <- XB
head(eystar) # E[y*|x] matrix

## [1]  680.2941  835.5760  986.9099  922.5813  563.7787 1577.9829

ey0 <- (XB + sigma*lambda)
head(ey0) # E[y|y>0] matrix

## [1] 1065.516 1154.023 1247.439 1206.861 1003.890 1678.715

eyx <- pnorm(XB/sigma)*(XB + sigma*lambda)
head(eyx) # E[y|x] matrix

## [1]  812.7327  934.9348 1060.6504 1006.4726  726.1269 1597.1943

First, we create matrices for the three different types of fitted values shown on the slides.

Below are the marginal effects for wage. Since our marginal effects are dependent on our fitted values, we will look at the marginal effects when all of our explanatory variables are at the mean.

mWage <- mean(Mroz$wagew)
mExper <- mean(Mroz$experience)
mAgew <- mean(Mroz$agew)
mKids6 <- mean(Mroz$child6)

bInt <- coef(tobitMroz)[[1]]
bWage <- coef(tobitMroz)[[2]]
bExper <- coef(tobitMroz)[[3]]
bExper2 <- coef(tobitMroz)[[4]]
bAgew <- coef(tobitMroz)[[5]]
bKids6 <- coef(tobitMroz)[[6]]

mXB <- (bInt+bWage*mWage+bExper*mExper+bExper2*I(mExper^2)+ bAgew*mAgew+bKids6*mKids6)/(sigma)
mlambda <- dnorm(mXB)/pnorm(mXB) 

mfxEyx <- pnorm(mXB)*bWage 
mfxEyx

## [1] 173.2266

mfxEy0 <- bWage*(1-mlambda*(mXB + mlambda))
mfxEy0

## [1] 121.4885

mfxEystar <- bWage
mfxEystar

## [1] 260.9634

We can also plot all of our marginal effects in relation to our fitted values

XBs <- XB/sigma

mfxEyx_all <- pnorm(XBs)*bWage
plot(eyx, mfxEyx_all)

mfxEy0_all <- bWage*(1-lambda*((XB/sigma) + lambda))
plot(ey0, mfxEy0_all)

mfxEystar_all <- bWage*(XB/sigma)
plot(eystar, mfxEystar_all)