X Wang - Assignment 9

PSCI 7108: Advanced Data Analysis III

Conduct a survival analysis of government coalitions in parliamentary democracies during the period 1945–1987 (cabinet.dta data set). The dependent variable is durat and the censoring indicator is censor. Choose independent variables that make sense and, for the model formula, do the things listed below.

Independent variables:

1: Parametric Models

1. Estimate an exponential model and a Weibull model. Report the results in a \( \LaTeX \) table.

## Load Data and Libraries ##
library(Zelig)
## Loading required package: MASS
## Loading required package: boot
## ## 
## ##  Zelig (Version 3.5.4, built: 2010-01-20)
## ##  Please refer to http://gking.harvard.edu/zelig for full
## ##  documentation or help.zelig() for help with commands and
## ##  models supported by Zelig.
## ##
## 
## ##  Zelig project citations:
## ##    Kosuke Imai, Gary King, and Olivia Lau. (2009).
## ##    ``Zelig: Everyone's Statistical Software,''
## ##    http://gking.harvard.edu/zelig.
## ##  and
## ##    Kosuke Imai, Gary King, and Olivia Lau. (2008).
## ##    ``Toward A Common Framework for Statistical Analysis
## ##    and Development,'' Journal of Computational and
## ##    Graphical Statistics, Vol. 17, No. 4 (December)
## ##    pp. 892-913. 
## 
## ##  To cite individual Zelig models, please use the citation format printed with
## ##  each model run and in the documentation.
## ##
library(foreign)
cabinet <- read.dta("/Users/squishy/Dropbox/PSCI 7108/Assignment 9/cabinet.dta")
str(cabinet)
## 'data.frame':    314 obs. of  9 variables:
##  $ durat   : num  0.5 1 1 2 0.5 2 0.5 1 2 3 ...
##  $ invest  : int  1 1 0 0 1 1 1 0 0 0 ...
##  $ polar   : int  11 8 5 16 36 3 31 1 0 16 ...
##  $ fract   : int  656 702 749 763 718 599 788 689 513 709 ...
##  $ numst   : int  0 1 0 1 0 1 1 0 1 1 ...
##  $ format  : int  3 3 5 1 4 1 3 1 1 1 ...
##  $ postelec: int  1 1 1 1 1 1 0 0 0 1 ...
##  $ caretakr: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ censor  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr " 7 Nov 2012 14:19"
##  - attr(*, "formats")= chr  "%9.0g" "%8.0g" "%8.0g" "%8.0g" ...
##  - attr(*, "types")= int  254 251 251 252 251 251 251 251 251
##  - attr(*, "val.labels")= chr  "" "" "" "" ...
##  - attr(*, "var.labels")= chr  "Duration of the government" "Investiture dummy variable" "Polarization index" "Fractionalization" ...
##  - attr(*, "expansion.fields")=List of 10
##   ..$ : chr  "_dta" "st_t" "_t"
##   ..$ : chr  "_dta" "st_t0" "_t0"
##   ..$ : chr  "_dta" "st_d" "_d"
##   ..$ : chr  "_dta" "st_bs" "1"
##   ..$ : chr  "_dta" "st_s" "1"
##   ..$ : chr  "_dta" "st_o" "0"
##   ..$ : chr  "_dta" "st_bd" "censor"
##   ..$ : chr  "_dta" "st_bt" "durat"
##   ..$ : chr  "_dta" "st_ver" "2"
##   ..$ : chr  "_dta" "_dta" "st"
##  - attr(*, "version")= int 8

## Create Two Parametric Models ##

# Exponential model
m.exp <- zelig(Surv(durat, censor) ~ invest + polar + fract + numst + format + 
    caretakr, model = "exp", data = cabinet, cite = F)
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:boot':
## 
##     aml

# Weibull model
m.wb <- zelig(Surv(durat, censor) ~ invest + polar + fract + numst + format + 
    caretakr, model = "weibull", data = cabinet, cite = F)

# Results summary
summary(m.exp)
## 
## Call:
## zelig(formula = Surv(durat, censor) ~ invest + polar + fract + 
##     numst + format + caretakr, model = "exp", data = cabinet, 
##     cite = F)
##                Value Std. Error      z        p
## (Intercept)  4.50878   0.606972  7.428 1.10e-13
## invest      -0.46244   0.137209 -3.370 7.51e-04
## polar       -0.02391   0.005861 -4.080 4.50e-05
## fract       -0.00159   0.000889 -1.783 7.45e-02
## numst        0.45673   0.128841  3.545 3.93e-04
## format      -0.04179   0.046018 -0.908 3.64e-01
## caretakr    -1.38342   0.261968 -5.281 1.29e-07
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -1040   Loglik(intercept only)= -1101
##  Chisq= 121.1 on 6 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 4 
## n= 314
summary(m.wb)
## 
## Call:
## zelig(formula = Surv(durat, censor) ~ invest + polar + fract + 
##     numst + format + caretakr, model = "weibull", data = cabinet, 
##     cite = F)
##                Value Std. Error     z        p
## (Intercept)  4.43057   0.504292  8.79 1.55e-18
## invest      -0.42555   0.113178 -3.76 1.70e-04
## polar       -0.02261   0.004818 -4.69 2.68e-06
## fract       -0.00147   0.000738 -1.99 4.67e-02
## numst        0.42314   0.106945  3.96 7.60e-05
## format      -0.04090   0.038260 -1.07 2.85e-01
## caretakr    -1.38397   0.217524 -6.36 1.99e-10
## Log(scale)  -0.19218   0.049607 -3.87 1.07e-04
## 
## Scale= 0.825 
## 
## Weibull distribution
## Loglik(model)= -1033   Loglik(intercept only)= -1101
##  Chisq= 134.6 on 6 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 5 
## n= 314

See \( \LaTeX \) document for results table.

2. Which of the two parametric models seems to fit the data better and why?

## Compute Model BICs ##
n <- length(m.exp$linear.predictors)
bic.exp <- -2 * m.exp$loglik[2] + m.exp$df * log(n)
bic.wb <- -2 * m.wb$loglik[2] + m.wb$df * log(n)
bic.exp
## [1] 2121
bic.wb  # Lower BIC
## [1] 2113

The Weibull parametric model seems to fit the data better because its BIC is lower (by almost 7) than that of the Exponential model. This also makes sense in terms of the scale (0.825) reported by the output of the Weibull model. p = 1/sigma = 1/0.825 = 1.21, indicating the baseline hazard rate increases over time (so an exponential model wouldn't be appropriate).

3. Using the better-fitting of the two models, choose two “interesting” covariate profiles and compute one of the QI (% change in the hazard rate or change in the expected duration time). Report the point estimate and a 95% confidence interval. Remember that R reports results in the AFT paramaterization. Present results in the text, a \( \LaTeX \) table, or in a graph.

## Simulate Expected Duration Time E(T) for polar Profile ##
polar.seq <- seq(min(cabinet$polar), max(cabinet$polar), length = 50)
x.polar <- setx(m.wb, invest = 0, polar = polar.seq, fract = mean(cabinet$fract), 
    numst = 1, format = mean(cabinet$format), caretakr = 0)
sim.polar <- sim(m.wb, x = x.polar)

pe.polar <- apply(sim.polar$qi$ev, 2, mean)
lo.polar <- apply(sim.polar$qi$ev, 2, quantile, prob = 0.025)
hi.polar <- apply(sim.polar$qi$ev, 2, quantile, prob = 0.975)

# Make the plot
par(mar = c(4, 6, 0.1, 0.5))
plot(polar.seq, pe.polar, type = "n", xlab = "", ylab = "", ylim = c(0, 70), 
    axes = FALSE)
abline(v = seq(min(polar.seq), max(polar.seq), length = 10), col = "gray75", 
    lty = 3)
abline(h = seq(0, 70, by = 5), col = "gray75", lty = 3)
lines(polar.seq, pe.polar, lwd = 3, lty = 1)
lines(polar.seq, lo.polar, lwd = 2, lty = 2)
lines(polar.seq, hi.polar, lwd = 2, lty = 2)
title(ylab = expression("Expected Cabinet Duration"), line = 4, cex.lab = 1.5)
title(xlab = expression("Support for Extremist Parties"), line = 2.75, cex.lab = 1.5)
axis(1)
axis(2, at = seq(0, 70, by = 5), las = 2, cex.axis = 1.1)
box()
rug(jitter(cabinet$polar), ticksize = 0.015)
legend("topright", bty = "n", c(expression("Point Estimate"), expression("95% Conf. Interval")), 
    lty = c(1, 2), lwd = c(3, 2), cex = 1.25)

plot of chunk unnamed-chunk-3


## Simulate Expected Duration Time E(T) for fract Profile ##
fract.seq <- seq(min(cabinet$fract), max(cabinet$fract), length = 50)
x.fract <- setx(m.wb, invest = 0, polar = mean(cabinet$polar), fract = fract.seq, 
    numst = 1, format = mean(cabinet$format), caretakr = 0)
sim.fract <- sim(m.wb, x = x.fract)

pe.fract <- apply(sim.fract$qi$ev, 2, mean)
lo.fract <- apply(sim.fract$qi$ev, 2, quantile, prob = 0.025)
hi.fract <- apply(sim.fract$qi$ev, 2, quantile, prob = 0.975)

# Make the plot
par(mar = c(4, 6, 0.1, 0.5))
plot(fract.seq, pe.fract, type = "n", xlab = "", ylab = "", ylim = c(0, 70), 
    axes = FALSE)
abline(v = seq(min(fract.seq), max(fract.seq), length = 10), col = "gray75", 
    lty = 3)
abline(h = seq(0, 70, by = 5), col = "gray75", lty = 3)
lines(fract.seq, pe.fract, lwd = 3, lty = 1)
lines(fract.seq, lo.fract, lwd = 2, lty = 2)
lines(fract.seq, hi.fract, lwd = 2, lty = 2)
title(ylab = expression("Expected Cabinet Duration"), line = 4, cex.lab = 1.5)
title(xlab = expression("Fractionalization of Parties in Parliment"), line = 2.75, 
    cex.lab = 1.5)
axis(1)
axis(2, at = seq(0, 70, by = 5), las = 2, cex.axis = 1.1)
box()
rug(jitter(cabinet$fract), ticksize = 0.015)
legend("topright", bty = "n", c(expression("Point Estimate"), expression("95% Conf. Interval")), 
    lty = c(1, 2), lwd = c(3, 2), cex = 1.25)

plot of chunk unnamed-chunk-3

2: The Cox Model

1. Estimate your model with the Cox model using both PLM and IRR (choose the truncation level). Report the results in a \( \LaTeX \) table.

library(coxrobust)
## Create Two Cox Models ##

# Cox Model via Partial Likelihood Maximization (PLM) - standard estimator
m.cox <- zelig(Surv(durat, censor) ~ invest + polar + fract + numst + format + 
    caretakr, model = "coxph", data = cabinet, cite = F, method = "efron")  # Use Efron method if events occur simultaneously
## Warning: The robust option is depricated

# Cox Model via Iteratively Reweighted Robust (IRR) Estimator - ids and
# downweights outliers
m.irr <- coxr(Surv(durat, censor) ~ invest + polar + fract + numst + format + 
    caretakr, trunc = 0.95, data = cabinet)  # Output for partial likelihood estimator don't match that of m.cox

# Results summary
summary(m.cox)
## Call:
## zelig(formula = Surv(durat, censor) ~ invest + polar + fract + 
##     numst + format + caretakr, model = "coxph", data = cabinet, 
##     cite = F, method = "efron")
## 
##   n= 314, number of events= 271 
## 
##               coef exp(coef)  se(coef)     z Pr(>|z|)    
## invest    0.514460  1.672735  0.137697  3.74  0.00019 ***
## polar     0.027176  1.027549  0.005949  4.57  4.9e-06 ***
## fract     0.001678  1.001680  0.000905  1.85  0.06366 .  
## numst    -0.481750  0.617701  0.130932 -3.68  0.00023 ***
## format    0.049046  1.050268  0.046225  1.06  0.28868    
## caretakr  1.708637  5.521432  0.284391  6.01  1.9e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## invest       1.673      0.598     1.277     2.191
## polar        1.028      0.973     1.016     1.040
## fract        1.002      0.998     1.000     1.003
## numst        0.618      1.619     0.478     0.798
## format       1.050      0.952     0.959     1.150
## caretakr     5.521      0.181     3.162     9.641
## 
## Concordance= 0.71  (se = 0.02 )
## Rsquare= 0.337   (max possible= 1 )
## Likelihood ratio test= 129  on 6 df,   p=0
## Wald test            = 149  on 6 df,   p=0
## Score (logrank) test = 186  on 6 df,   p=0
m.irr
## 
## Call:
## coxr(formula = Surv(durat, censor) ~ invest + polar + fract +     numst + format + caretakr, data = cabinet, trunc = 0.95)
## 
## Partial likelihood estimator
##              coef exp(coef) se(coef)        p
## invest    0.50493     1.657 0.137767 2.47e-04
## polar     0.02599     1.026 0.005945 1.24e-05
## fract     0.00174     1.002 0.000906 5.49e-02
## numst    -0.47122     0.624 0.130902 3.18e-04
## format    0.04590     1.047 0.046143 3.20e-01
## caretakr  1.54992     4.711 0.281995 3.88e-08
## 
## Wald test=137 on 6 df, p=0
## 
## Robust estimator
##             coef exp(coef) se(coef)        p
## invest    0.5103     1.666  0.16389 1.85e-03
## polar     0.0293     1.030  0.00644 5.31e-06
## fract     0.0018     1.002  0.00105 8.60e-02
## numst    -0.4661     0.627  0.15670 2.94e-03
## format    0.0618     1.064  0.04702 1.89e-01
## caretakr  1.5954     4.930  0.36026 9.49e-06
## 
## Extended Wald test=86.8 on 6 df, p=1.11e-16

See \( \LaTeX \) document for results table.

It's also worth noting that although the output of m.irr also reports estimations for the “partial likelihood estimator” (which I assume should theoretically be the same as the coefficients for m.cox that is calculated via PLM), they don't match those of m.cox.

2. Perform the CVMF test and interpret the results.

work.dir <- "/Users/squishy/Dropbox/PSCI 7108/"  # Define working directory
source(paste0(work.dir, "CVMF.r"))  # Yay! Revolutionary? Mayhaps
source("CVMF.r")
## Warning: cannot open file 'CVMF.r': No such file or directory
## Error: cannot open the connection
cvmf.test <- CVMF(Surv(durat, censor) ~ invest + polar + fract + numst + format + 
    caretakr, trunc = 0.95, data = cabinet)
## IRR supported with a two-sided p-value of 0.001
cvmf.test$best  # IRR likelihood count is greater than that of PLM
## number of successes 
##               "IRR"
cvmf.test$p  # The null hypothesis is that both fit equally well; given the p-value, we reject the null and accept that the IRR model provides a better fit
## [1] 0.001

3. Compute the %change in the hazard rate between the two covariate profiles used before (point estimate and 95% confidence interval). Do this with the PLM estimates and with the IRR estimates. Present the results in the text, a \( \LaTeX \) table, or in a graph. Are there large substantive differences between the two estimators?

library(arm)
## Loading required package: Matrix
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Loading required package: lme4
## 
## arm (Version 1.6-09, built: 2013-9-23)
## 
## Working directory is /Users/squishy/Dropbox/PSCI 7108/Assignment 9
## 
## 
## Attaching package: 'arm'
## 
## The following object is masked from 'package:Zelig':
## 
##     sim
## 
## The following object is masked from 'package:boot':
## 
##     logit
## Simulate the Percent Change for m.cox Model ##

# Simulate polar Profile
set.seed(39222)
m <- 1000  # Simulations
betas.mcox <- m.cox$coef  # Collecting betas
vcv.mcox <- vcov(m.cox)  # Collecting variance-covariance matrix 
simbetas.mcox <- mvrnorm(m, betas.mcox, vcv.mcox)  # Taking 1000 draws from multivariate normal, the mean is betas.mcox, vcv estimated from m.cox (simulating from sampling distribution of the model) 

# Check to see if the simulated coefficients look like the real results
round(m.cox$coef, digits = 2)
##   invest    polar    fract    numst   format caretakr 
##     0.51     0.03     0.00    -0.48     0.05     1.71
round(head(simbetas.mcox, 10), digits = 2)
##       invest polar fract numst format caretakr
##  [1,]   0.33  0.04     0 -0.45   0.08     1.73
##  [2,]   0.62  0.02     0 -0.31   0.08     1.73
##  [3,]   0.44  0.03     0 -0.39   0.09     2.05
##  [4,]   0.64  0.02     0 -0.76   0.07     2.05
##  [5,]   0.34  0.04     0 -0.47   0.05     1.57
##  [6,]   0.56  0.02     0 -0.53   0.02     1.52
##  [7,]   0.64  0.02     0 -0.34   0.15     1.29
##  [8,]   0.64  0.03     0 -0.65   0.07     2.05
##  [9,]   0.54  0.02     0 -0.44   0.13     1.73
## [10,]   0.28  0.03     0 -0.39   0.12     1.94

# Create hypothetical profile
x.mcoxpolar <- data.frame(invest = 0, polar = polar.seq, fract = mean(cabinet$fract), 
    numst = 1, format = mean(cabinet$format), caretakr = 0)

# Compute the percent change and confidence intervals
pc.mcoxpolar <- matrix(NA, nrow = m, ncol = length(polar.seq))  # Compute the percent change 1000 times @ each of 50 polar levels

for (i in 1:m) {
    pc.mcoxpolar[i, ] <- invlogit(as.matrix(x.mcoxpolar) %*% simbetas.mcox[i, 
        ])
}

pe.mcoxpolar <- apply(pc.mcoxpolar, 2, mean)
lo.mcoxpolar <- apply(pc.mcoxpolar, 2, quantile, prob = 0.025)
hi.mcoxpolar <- apply(pc.mcoxpolar, 2, quantile, prob = 0.975)

# Make the plot
par(mar = c(4, 6, 0.1, 0.5))
plot(polar.seq, pe.mcoxpolar, type = "n", xlab = "", ylab = "", ylim = c(0, 
    1), axes = FALSE)
abline(v = seq(min(polar.seq), max(polar.seq), length = 10), col = "gray75", 
    lty = 3)
abline(h = seq(0, 1, by = 0.2), col = "gray75", lty = 3)
lines(polar.seq, pe.mcoxpolar, lwd = 3, lty = 1)
lines(polar.seq, lo.mcoxpolar, lwd = 2, lty = 2)
lines(polar.seq, hi.mcoxpolar, lwd = 2, lty = 2)
title(ylab = expression("% Change in Hazard Rate"), line = 4, cex.lab = 1.5)
title(xlab = expression("Support for Extremist Parties"), line = 2.75, cex.lab = 1.5)
axis(1)
axis(2, at = seq(0, 1, by = 0.2), las = 2, cex.axis = 1.1)
box()
rug(jitter(cabinet$polar), ticksize = 0.015)
legend("topleft", bty = "n", c(expression("Point Estimate"), expression("95% Conf. Interval")), 
    lty = c(1, 2), lwd = c(3, 2), cex = 1.25)

plot of chunk unnamed-chunk-6


# Simulate fract Profile
x.mcoxfract <- data.frame(invest = 0, polar = mean(cabinet$polar), fract = fract.seq, 
    numst = 1, format = mean(cabinet$format), caretakr = 0)

# Compute the percent change and confidence intervals
pc.mcoxfract <- matrix(NA, nrow = m, ncol = length(fract.seq))  # Compute the percent change 1000 times @ each of 50 fract levels

for (i in 1:m) {
    pc.mcoxfract[i, ] <- invlogit(as.matrix(x.mcoxfract) %*% simbetas.mcox[i, 
        ])
}

pe.mcoxfract <- apply(pc.mcoxfract, 2, mean)
lo.mcoxfract <- apply(pc.mcoxfract, 2, quantile, prob = 0.025)
hi.mcoxfract <- apply(pc.mcoxfract, 2, quantile, prob = 0.975)

# Make the plot
par(mar = c(4, 6, 0.1, 0.5))
plot(fract.seq, pe.mcoxfract, type = "n", xlab = "", ylab = "", ylim = c(0, 
    1), axes = FALSE)
abline(v = seq(min(fract.seq), max(fract.seq), length = 10), col = "gray75", 
    lty = 3)
abline(h = seq(0, 1, by = 0.2), col = "gray75", lty = 3)
lines(fract.seq, pe.mcoxfract, lwd = 3, lty = 1)
lines(fract.seq, lo.mcoxfract, lwd = 2, lty = 2)
lines(fract.seq, hi.mcoxfract, lwd = 2, lty = 2)
title(ylab = expression("% Change in Hazard Rate"), line = 4, cex.lab = 1.5)
title(xlab = expression("Fractionalization of Parties in Parliment"), line = 2.75, 
    cex.lab = 1.5)
axis(1)
axis(2, at = seq(0, 1, by = 0.2), las = 2, cex.axis = 1.1)
box()
rug(jitter(cabinet$fract), ticksize = 0.015)
legend("topleft", bty = "n", c(expression("Point Estimate"), expression("95% Conf. Interval")), 
    lty = c(1, 2), lwd = c(3, 2), cex = 1.25)

plot of chunk unnamed-chunk-6