library(survival)
library(survminer)
## Loading required package: ggpubr
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
data(ovarian)
##futime is when the participant exits
##death being 1 indicates that the exit was though death, 0 if lost to dropout
##trt = treatment
glimpse(myeloid)
## Observations: 646
## Variables: 7
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ trt <chr> "B", "A", "A", "B", "B", "B", "A", "A", "B", "A", "A", ...
## $ futime <int> 235, 286, 1983, 2137, 326, 2041, 63, 446, 1695, 1669, 6...
## $ death <int> 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1...
## $ txtime <int> NA, 200, NA, 245, 112, 102, NA, 205, NA, 106, NA, 187, ...
## $ crtime <int> 44, NA, 38, 25, 56, NA, NA, 34, 28, NA, NA, NA, NA, 36,...
## $ rltime <int> 113, NA, NA, NA, 200, NA, NA, 382, NA, NA, NA, NA, NA, ...
surv_object <- Surv(time = myeloid$futime, event = myeloid$death)
surv_object
## [1] 235 286 1983+ 2137+ 326 2041+ 63 446 1695+ 1669+ 66+
## [12] 1364+ 17+ 209 261 2380+ 431 372 838 2086+ 1480+ 188
## [23] 963 318 373 1657+ 213 1421+ 337 565 286 1823+ 17+
## [34] 201 321+ 529 266 1305 1460+ 1832 1859+ 1864+ 1880+ 368
## [45] 1391+ 762 332 376+ 1740+ 2241+ 582 284 823 2186+ 2198+
## [56] 1737+ 2315+ 2006+ 185 2367+ 1893+ 1734+ 2042+ 2162+ 1176+ 2148+
## [67] 2247+ 1991+ 180 1956+ 1976+ 13 1913+ 110 116 245 2194+
## [78] 531 1782+ 548 46 2052+ 411 1034 293 2250+ 235 267
## [89] 2136+ 1465+ 439 295 286 1251+ 1360+ 1783+ 1866+ 892 1525+
## [100] 461 322 1323+ 651 241+ 2300+ 2192+ 191 18+ 1621 50
## [111] 15+ 27 574 917 609 1774+ 206 297 77 52 1429+
## [122] 278 1393+ 232 106 397 140 2038+ 1911+ 13 1990+ 424
## [133] 2063+ 1721+ 955+ 1731+ 1499+ 281 310 1999+ 408 1361+ 337
## [144] 15+ 1530+ 234 1681+ 518 800 1806+ 1192+ 2304+ 1687+ 1785+
## [155] 1912+ 2335+ 31+ 485 432+ 1409 1464+ 18 462 408 1423+
## [166] 1431+ 248 1459+ 2148+ 2188+ 1725+ 218 1999+ 1254 1420+ 1521+
## [177] 1921+ 1648+ 1555+ 614 50 86 588+ 874 33 156 563
## [188] 231 1020 1390+ 572 1877+ 2122+ 322 511 288 368 480
## [199] 2224+ 1708+ 561 1578+ 834 1583+ 7+ 513 1426+ 2040+ 1796+
## [210] 1627+ 360 1280+ 2371+ 1479+ 260 2219+ 20 269 381 163
## [221] 1287+ 1092+ 154 771 1972+ 1658+ 2283 1827+ 4+ 2210+ 1379
## [232] 516 1007 1757+ 2134+ 99 1369+ 121 2361+ 423 322 1423+
## [243] 369 603 25 1809+ 131 602 692 1444+ 601+ 305 518
## [254] 2118+ 972 524 1881+ 74 2345+ 403 14+ 2053+ 2254+ 76
## [265] 22+ 2154+ 779 167 16+ 191 707 103 283 293 1442+
## [276] 1350+ 1710+ 19+ 1384+ 263 1947+ 230 248 216 1717+ 1718+
## [287] 707 303 89 1545+ 1283 388 2023+ 1990+ 140 759 2145+
## [298] 168+ 1843+ 2188+ 1960+ 874 34 438 396 867 1885+ 148
## [309] 1522+ 2233+ 1795+ 13 1467+ 1682+ 2063+ 1302+ 56 1666+ 1708+
## [320] 1384+ 644 239 2095+ 2065+ 588 432 308 1779+ 197 2183+
## [331] 1368+ 193 2239+ 1479+ 1414+ 599 1834+ 1290+ 1968+ 2123+ 385
## [342] 698+ 2255+ 1319+ 1465+ 1552 1883+ 409 144 157 1370+ 193+
## [353] 128 671 670 1960+ 1320+ 679 1815+ 2149+ 748 1901+ 2229+
## [364] 964 1379+ 146 179 6+ 1755+ 2043+ 258 460 575 364
## [375] 2344+ 1953+ 386 1164 374 1683+ 1704+ 86+ 262 187 65
## [386] 355 36+ 1727+ 1855+ 215 319 1861+ 21+ 497 1385+ 9
## [397] 418 2028+ 476 1744+ 1952+ 344 464 1742+ 355 2222+ 226
## [408] 356 1405+ 210 907 2276+ 104 565 727 385 20 333
## [419] 578 1988+ 1349+ 2007+ 255 522 539 23 1714+ 2056+ 341
## [430] 258 2237+ 1102 1729+ 2318+ 9+ 411 59+ 805 2058+ 697
## [441] 2048+ 2034+ 696 411 2083+ 388 342 1544+ 44 326 2200+
## [452] 28 1360+ 658 621 307 248 1652+ 1623+ 540 166 246
## [463] 458 329 2017+ 1129 1720+ 1703+ 167 1584+ 9+ 1021 465
## [474] 1308+ 1681+ 1846+ 1241 1602+ 1813+ 1742+ 1459+ 95 1559+ 24
## [485] 465 1333+ 419 1924+ 85 1537 246 1774+ 1999+ 2233+ 1367+
## [496] 267 1085 2052+ 742 516+ 499 1907+ 400 1792+ 1698+ 774
## [507] 833 2253+ 1383+ 1039+ 148 447 476 736 1485+ 567+ 27
## [518] 1745+ 313 1739+ 2043+ 1760+ 45 145 1989+ 242 1099+ 506
## [529] 2044+ 1497+ 156 456 1695+ 2299+ 1281+ 1990+ 24 495 743
## [540] 2419+ 1764+ 170 1331+ 112 295 359 32 27 1374+ 120+
## [551] 365 144 168 1901+ 275 826 1704+ 432+ 1780+ 49+ 1313+
## [562] 24+ 2126+ 609 203 1245 436+ 1825+ 1915 2150+ 1710+ 2125+
## [573] 1594+ 1414+ 1455+ 1799+ 1728+ 1688+ 1113+ 185 10+ 811 1480+
## [584] 515 122 2179+ 420 248 24 906+ 486 273 1220 40
## [595] 443 383 829 160+ 275 400 768 1347+ 1374+ 375 1456+
## [606] 27 93+ 1943+ 2041+ 517 252 293 1417+ 752 1322+ 2070+
## [617] 16 2191+ 334 1370 2369+ 1401+ 664 259 2308+ 2335+ 120
## [628] 2055+ 1730+ 1373+ 563 1480+ 884 1386+ 1845+ 1313+ 410 697
## [639] 253 21 22+ 237 2394+ 17+ 181 25+
fit1 <- survfit(surv_object ~ trt, data = myeloid, type="kaplan-meier")
ggsurvplot(fit1, data = myeloid, pval = TRUE)
fit1 <- survfit(surv_object ~ trt, data = myeloid, type="fleming-harrington")
ggsurvplot(fit1, data = myeloid, pval = TRUE)
#install.packages("survival")
library(survival)
library(survminer)
data("myeloid")
data("leukemia")
#load the data from the survival package
data("myeloid")
data("leukemia")
#1. Construct our two survival objects
surv_object <- Surv(time = myeloid$futime, event = myeloid$death)
surv_object2 <- Surv(time = leukemia$time, event = leukemia$status)
# As before, Fit survival curves for each data set
fit1<-survfit(surv_object ~ trt, data = myeloid, type="kaplan-meier") #uses the Kaplan-Meier Estimator
fit2<-survfit(surv_object2 ~ x, data = leukemia, type="kaplan-meier")
#3. Plot Them, Graphical Analysis
ggsurvplot(fit1, data = myeloid)
ggsurvplot(fit2, data=leukemia)
#Log-Rank Test for Significant Difference
test1<-survdiff(Surv(futime,death)~trt, data = myeloid, rho=0)
#observe p-value, its significant
test1
## Call:
## survdiff(formula = Surv(futime, death) ~ trt, data = myeloid,
## rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=A 317 171 143 5.28 9.59
## trt=B 329 149 177 4.29 9.59
##
## Chisq= 9.6 on 1 degrees of freedom, p= 0.00196
test2<-survdiff(Surv(time,status)~x, data = leukemia, rho=0)
#observe p-value, its not significant
test2
## Call:
## survdiff(formula = Surv(time, status) ~ x, data = leukemia, rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained 11 7 10.69 1.27 3.4
## x=Nonmaintained 12 11 7.31 1.86 3.4
##
## Chisq= 3.4 on 1 degrees of freedom, p= 0.0653
#Peto-Peto Modification of the Gehan-Wilcoxon Test, an easy alternate
test3<-survdiff(Surv(futime,death)~trt, data = myeloid, rho=1)
#observe p-value, its significant
test3
## Call:
## survdiff(formula = Surv(futime, death) ~ trt, data = myeloid,
## rho = 1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=A 317 129 107 4.26 10.1
## trt=B 329 108 130 3.52 10.1
##
## Chisq= 10.1 on 1 degrees of freedom, p= 0.00149
test4<-survdiff(Surv(time,status)~x, data = leukemia, rho=1)
#observe p-value, its not significant
test4
## Call:
## survdiff(formula = Surv(time, status) ~ x, data = leukemia, rho = 1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained 11 3.85 6.14 0.859 2.78
## x=Nonmaintained 12 7.18 4.88 1.081 2.78
##
## Chisq= 2.8 on 1 degrees of freedom, p= 0.0955
#First, create a survival curve using the Flemington-Harrington Method.
surv_object3 <- Surv(time = myeloid$futime, event = myeloid$death)
fit1 <- survfit(surv_object3 ~ 1, data = myeloid, type="fleming-harrington")
#At each time t, we apply our theoretical transformation to get the hazard function
sum_fit1 <- summary(fit1)
hazard_fit <- -log(sum_fit1$surv)
#At each time, compute the Nelson-Aalen estimator
haz_frac <- sum_fit1$n.event/sum_fit1$n.risk
nelson_aaelon <- cumsum(haz_frac)
#Plot the hazard function versus the survival function
plot(sum_fit1$time, hazard_fit, lty=1, xlab="time", ylab = "", main="Comparing Cumulative Hazard to Survival Function", ylim=range(c(hazard_fit, sum_fit1$surv)), type="s")
points(sum_fit1$time, sum_fit1$surv, lty=2, type="s")
#Plot the hazard function versus the nelson-aalen estimator
plot(sum_fit1$time, hazard_fit, lty=1, ylab = "", xlab="time", main="Nelson Aaelon Estimator vs. -log(S(t))", ylim=range(c(hazard_fit, nelson_aaelon)), type="s")
points(sum_fit1$time, nelson_aaelon, lty=2, col="red", type="s")
library(AER)
## Warning: package 'car' was built under R version 3.4.4
## Warning: package 'carData' was built under R version 3.4.4
## Warning: package 'lmtest' was built under R version 3.4.4
## Warning: package 'zoo' was built under R version 3.4.3
library(tidyverse)
In order to run tobit regressions, we will be using the package AER. Other R packages include commands for tobit, such as survival::survreg() and VGAM::vglm(), but generally have more complicated formats.
set.seed(1)
x=runif(100)
y=5*x+rnorm(100)
df <- matrix(c(y,rep(3,100),x), ncol=3, byrow=F) %>%
as.data.frame() %>%
rename(y=V1, c=V2, x=V3) %>%
mutate(yc = pmin(y,c)) %>%
dplyr::select(y,yc,x)
## Warning: package 'bindrcpp' was built under R version 3.4.4
dfCen <- df[,2:3]
dfUncen <- df[,c(1,3)]
First, we generate two datasets: one with the censored response variable yc, and one with the uncensored response variable y. Both have the same realtionship: y=5x+e, but for dfCen, any y>3 is set equal to 3.
m1 <- lm(y~x,data=dfUncen)
m1
##
## Call:
## lm(formula = y ~ x, data = dfUncen)
##
## Coefficients:
## (Intercept) x
## -0.1793 5.3123
plot(dfUncen$x, dfUncen$y)
abline(m1,col="red")
cm1 <- lm(yc~x,data=dfCen)
cm1
##
## Call:
## lm(formula = yc ~ x, data = dfCen)
##
## Coefficients:
## (Intercept) x
## 0.316 3.384
plot(dfCen$x, dfCen$yc)
abline(cm1,col="blue")
abline(m1, col="red")
First, we run a SLR. When we use dfUncen, it captures the relationship well, but when we use dfCen, we see our estimated coefficient underestimates the relationship between our variables.
tm1 <- tobit(yc~x, left = -Inf, right = 3, data = dfCen)
tm1
##
## Call:
## tobit(formula = yc ~ x, left = -Inf, right = 3, data = dfCen)
##
## Coefficients:
## (Intercept) x
## -0.2134 5.3869
##
## Scale: 0.9083
plot(dfCen$x, dfCen$yc)
abline(cm1,col="blue")
abline(m1, col="red")
abline(tm1, col="green")
This is resolved by tobit: now, we can use dfCen and still have essentially the same coefficient estimate as we did with our SLR on dfUncen.
library(carData)
data("Mroz")#NOTE: Really weird, but you get different Mroz if you enter carData::Mroz
head(Mroz)
## # A tibble: 6 x 18
## work hoursw child6 child618 agew educw hearnw wagew hoursh ageh educh
## <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <int> <int> <int>
## 1 no 1610 1 0 32 12 3.35 2.65 2708 34 12
## 2 no 1656 0 2 30 12 1.39 2.65 2310 30 9
## 3 no 1980 1 3 35 12 4.55 4.04 3072 40 12
## 4 no 456 0 3 34 12 1.10 3.25 1920 53 10
## 5 no 1568 1 2 31 14 4.59 3.60 2000 32 12
## 6 no 2032 0 0 54 12 4.74 4.70 1040 57 11
## # ... with 7 more variables: wageh <dbl>, income <int>, educwm <int>,
## # educwf <int>, unemprate <dbl>, city <chr>, experience <int>
summary(Mroz$hoursw)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 288.0 740.6 1516.0 4950.0
Here, the variables are defined as follows:
work: participation in 1975 (NOTE: yes and no seem to be swapped)
hoursw: wife’s hours of work in 1975
child6: number of children less than 6 years old in household
child618: number of children between ages 6 and 18 in household
agew: wife’s age
educw: wife’s educational attainment, in years
hearnw: wife’s average hourly earnings, in 1975 dollars
wagew: wife’s wage reported at the time of the 1976 interview (note= 1975 estimated wage)
hoursh: husband’s hours worked in 1975
ageh: husband’s age
educh: husband’s educational attainment, in years
wageh: husband’s wage, in 1975 dollars
income: family income, in 1975 dollars
educwm: wife’s mother’s educational attainment, in years
educwf: wife’s father’s educational attainment, in years
unemprate: unemployment rate in county of residence, in percentage points
city: lives in large city (SMSA) ?
experience: actual years of wife’s previous labor market experience
Our response variable of interest is hoursw.
tml <- tobit(hoursw ~ wagew + experience + I(experience^2) + agew + child6, left = 0, right = Inf, data=Mroz)
tobitMroz <- summary(tml)
tobitMroz
##
## Call:
## tobit(formula = hoursw ~ wagew + experience + I(experience^2) +
## agew + child6, left = 0, right = Inf, data = Mroz)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 753 325 428 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 802.45311 273.13028 2.938 0.0033 **
## wagew 260.96336 17.13829 15.227 < 2e-16 ***
## experience 81.65108 14.95239 5.461 4.74e-08 ***
## I(experience^2) -1.17380 0.46334 -2.533 0.0113 *
## agew -34.57801 6.02626 -5.738 9.59e-09 ***
## child6 -620.26541 96.29675 -6.441 1.19e-10 ***
## Log(scale) 6.85771 0.03641 188.323 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 951.2
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 5
## Log-likelihood: -3715 on 7 Df
## Wald-statistic: 488.3 on 5 Df, p-value: < 2.22e-16
First, we run another regression with tobit(). However, all the coefficients we estimate explain the relationship between explanatory variables and the latent response variable. In order to calculate other expected values and marginal effects we will need to write our own code.
XB <- tml$linear.predictors
sigma <- tml$scale
lambda <- dnorm(XB/sigma)/pnorm(XB/sigma)
#pnorm: prob distribution func (normal)
#dnorm: prob density func
eystar <- XB
head(eystar) # E[y*|x] matrix
## [1] 680.2941 835.5760 986.9099 922.5813 563.7787 1577.9829
ey0 <- (XB + sigma*lambda)
head(ey0) # E[y|y>0] matrix
## [1] 1065.516 1154.023 1247.439 1206.861 1003.890 1678.715
eyx <- pnorm(XB/sigma)*(XB + sigma*lambda)
head(eyx) # E[y|x] matrix
## [1] 812.7327 934.9348 1060.6504 1006.4726 726.1269 1597.1943
First, we create matrices for the three different types of fitted values shown on the slides.
Below are the marginal effects for wage. Since our marginal effects are dependent on our fitted values, we will look at the marginal effects when all of our explanatory variables are at the mean.
mWage <- mean(Mroz$wagew)
mExper <- mean(Mroz$experience)
mAgew <- mean(Mroz$agew)
mKids6 <- mean(Mroz$child6)
bInt <- coef(tobitMroz)[[1]]
bWage <- coef(tobitMroz)[[2]]
bExper <- coef(tobitMroz)[[3]]
bExper2 <- coef(tobitMroz)[[4]]
bAgew <- coef(tobitMroz)[[5]]
bKids6 <- coef(tobitMroz)[[6]]
mXB <- (bInt+bWage*mWage+bExper*mExper+bExper2*I(mExper^2)+ bAgew*mAgew+bKids6*mKids6)/(sigma)
mlambda <- dnorm(mXB)/pnorm(mXB)
mfxEyx <- pnorm(mXB)*bWage
mfxEyx
## [1] 173.2266
mfxEy0 <- bWage*(1-mlambda*(mXB + mlambda))
mfxEy0
## [1] 121.4885
mfxEystar <- bWage
mfxEystar
## [1] 260.9634
We can also plot all of our marginal effects in relation to our fitted values
XBs <- XB/sigma
mfxEyx_all <- pnorm(XBs)*bWage
plot(eyx, mfxEyx_all)
mfxEy0_all <- bWage*(1-lambda*((XB/sigma) + lambda))
plot(ey0, mfxEy0_all)
mfxEystar_all <- bWage*(XB/sigma)
plot(eystar, mfxEystar_all)