## for Titanicp dataset
library(vcdExtra)
## for natural splines
library(splines)
## for multiple imputation
library(Amelia)
## for analysis based on the imputed dataset
library(Zelig)
The Titanicp dataset is a collection of individual-level information on predictors and the outcome (survival). The age variable are missing in 263/1309.
## Load and show
data(Titanicp)
head(Titanicp)
## pclass survived sex age sibsp parch
## 1 1st survived female 29.0000 0 0
## 2 1st survived male 0.9167 1 2
## 3 1st died female 2.0000 1 2
## 4 1st died male 30.0000 1 2
## 5 1st died female 25.0000 1 2
## 6 1st survived male 48.0000 0 0
summary(Titanicp)
## pclass survived sex age sibsp parch
## 1st:323 died :809 female:466 Min. : 0.17 Min. :0.000 Min. :0.000
## 2nd:277 survived:500 male :843 1st Qu.:21.00 1st Qu.:0.000 1st Qu.:0.000
## 3rd:709 Median :28.00 Median :0.000 Median :0.000
## Mean :29.88 Mean :0.499 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :80.00 Max. :8.000 Max. :9.000
## NA's :263
Passengers on the Titanic
Data on passengers on the RMS Titanic, excluding the Crew and some
individual identifier variables.
Format:
A data frame with 1309 observations on the following 6 variables.
‘pclass’ a factor with levels ‘1st’ ‘2nd’ ‘3rd’
‘survived’ a factor with levels ‘died’ ‘survived’
‘sex’ a factor with levels ‘female’ ‘male’
‘age’ passenger age in years (or fractions of a year, for
children), a numeric vector; AGE IS MISSING FOR 263 OF THE
PASSENGERS
‘sibsp’ number of siblings or spouses aboard, integer: ‘0:8’
‘parch’ number of parents or children aboard, integer: ‘0:6’
Missing age is associated with decreased survival (27.8%) compared to those with non-missing age (40.8%). In other words, among the dead, 23.5% of age are missing, whereas among the survived, 14.6% of age are missing. This indicates the missingness violates “missing completely at random (MCAR)”. To be able to proceed, here I assume the missingness is “missing at random (MAR)”, i.e., the probability of missingness is independent of the outcome conditional on other covariates.
## Crete a 2x2 table
tabAgeNaSurvived <- xtabs( ~ is.na(age) + survived, data = Titanicp)
## Proportion of survived
prop.table(tabAgeNaSurvived, margin = 1)
## survived
## is.na(age) died survived
## FALSE 0.5918 0.4082
## TRUE 0.7224 0.2776
## Proportion of Missing age
prop.table(tabAgeNaSurvived, margin = 2)
## survived
## is.na(age) died survived
## FALSE 0.7651 0.8540
## TRUE 0.2349 0.1460
##
modelCompleteCases <- glm(formula = survived ~ pclass + sex + ns(age, df = 5) + poly(sibsp, df = 2) + poly(parch, df = 2),
family = binomial(link = "logit"),
data = Titanicp)
summary(modelCompleteCases)
##
## Call:
## glm(formula = survived ~ pclass + sex + ns(age, df = 5) + poly(sibsp,
## df = 2) + poly(parch, df = 2), family = binomial(link = "logit"),
## data = Titanicp)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.062 -0.658 -0.452 0.643 2.553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.525 0.573 7.89 3.0e-15 ***
## pclass2nd -1.305 0.233 -5.61 2.0e-08 ***
## pclass3rd -2.095 0.233 -8.98 < 2e-16 ***
## sexmale -2.568 0.177 -14.51 < 2e-16 ***
## ns(age, df = 5)1 -2.112 0.531 -3.98 7.0e-05 ***
## ns(age, df = 5)2 -2.084 0.608 -3.43 0.00060 ***
## ns(age, df = 5)3 -2.184 0.633 -3.45 0.00056 ***
## ns(age, df = 5)4 -5.266 1.232 -4.27 1.9e-05 ***
## ns(age, df = 5)5 -2.316 1.028 -2.25 0.02422 *
## poly(sibsp, df = 2)1 -32.983 8.405 -3.92 8.7e-05 ***
## poly(sibsp, df = 2)2 -19.279 8.231 -2.34 0.01917 *
## poly(parch, df = 2)1 -4.831 4.407 -1.10 0.27297
## poly(parch, df = 2)2 -15.905 6.699 -2.37 0.01758 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1414.62 on 1045 degrees of freedom
## Residual deviance: 944.01 on 1033 degrees of freedom
## (263 observations deleted due to missingness)
## AIC: 970
##
## Number of Fisher Scoring iterations: 5
All covariates need to be numbers. In the resulting object, the imputations element is the list of imputed datasets, and the m element is the number of imputations.
## Create imputable dataset
TitanicpOrdered <- within(Titanicp, {
pclass <- factor(pclass, levels = c("1st","2nd","3rd"), labels = c(1,2,3))
pclass <- as.numeric(as.character(pclass))
survived <- factor(survived, levels = c("died","survived"), labels = c(0,1))
survived <- as.numeric(as.character(survived))
sex <- factor(sex, levels = c("female","male"), labels = c(0,1))
sex <- as.numeric(as.character(sex))
})
## Impute with Amelia
titanicAmelia <- amelia(x = TitanicpOrdered, m = 5, p2s = 0)
summary(titanicAmelia)
##
## Amelia output with 5 imputed datasets.
## Return code: 1
## Message: Normal EM convergence.
##
## Chain Lengths:
## --------------
## Imputation 1: 3
## Imputation 2: 3
## Imputation 3: 3
## Imputation 4: 3
## Imputation 5: 3
##
## Rows after Listwise Deletion: 1046
## Rows after Imputation: 1309
## Patterns of missingness in the data: 2
##
## Fraction Missing for original variables:
## -----------------------------------------
##
## Fraction Missing
## pclass 0.0000
## survived 0.0000
## sex 0.0000
## age 0.2009
## sibsp 0.0000
## parch 0.0000
## Check structure
names(titanicAmelia)
## [1] "imputations" "m" "missMatrix" "overvalues" "theta" "mu" "covMatrices" "code"
## [9] "message" "iterHist" "arguments" "orig.vars"
## Fit model for each dataset
modelsMi <- sapply(titanicAmelia$imputations,
FUN = function(DF) {
## Fit model for each dataset
modelMi <- glm(formula = survived ~ factor(pclass) + sex + ns(age, df = 5) + poly(sibsp, df = 2) + poly(parch, df = 2),
family = binomial(link = "logit"),
data = DF)
## Return
modelMi
},
simplify = FALSE)
## Extract the coefficient table for each dataset
listMatrix <- sapply(modelsMi,
FUN = function(model) {
## Extract the coefficient table
coef(summary(model))
},
simplify = FALSE)
## The points estimates are the means.
matCoefs <- sapply(listMatrix,
FUN = function(MAT) {
## Extract the coeffient column
MAT[,"Estimate"]
},
simplify = TRUE)
matCoefs
## imp1 imp2 imp3 imp4 imp5
## (Intercept) 5.275 11.857 4.939 5.568 6.696
## factor(pclass)2 -1.197 -1.318 -1.302 -1.243 -1.261
## factor(pclass)3 -1.985 -2.117 -2.127 -2.079 -2.095
## sex -2.568 -2.589 -2.573 -2.554 -2.587
## ns(age, df = 5)1 -3.008 -9.358 -2.484 -3.424 -4.300
## ns(age, df = 5)2 -2.996 -9.451 -2.490 -3.264 -4.488
## ns(age, df = 5)3 -2.567 -6.695 -2.569 -3.084 -3.341
## ns(age, df = 5)4 -6.629 -19.446 -5.939 -6.388 -9.224
## ns(age, df = 5)5 -2.803 -5.611 -2.700 -3.595 -3.756
## poly(sibsp, df = 2)1 -32.397 -36.312 -32.582 -34.537 -33.511
## poly(sibsp, df = 2)2 -21.884 -23.296 -21.362 -21.901 -21.347
## poly(parch, df = 2)1 -6.980 -6.405 -6.715 -6.547 -6.509
## poly(parch, df = 2)2 -19.207 -17.652 -19.331 -18.687 -16.472
## Show the mean
pointEstimates <- rowMeans(matCoefs)
data.frame(pointEstimates = pointEstimates)
## pointEstimates
## (Intercept) 6.867
## factor(pclass)2 -1.264
## factor(pclass)3 -2.081
## sex -2.574
## ns(age, df = 5)1 -4.515
## ns(age, df = 5)2 -4.538
## ns(age, df = 5)3 -3.651
## ns(age, df = 5)4 -9.525
## ns(age, df = 5)5 -3.693
## poly(sibsp, df = 2)1 -33.868
## poly(sibsp, df = 2)2 -21.958
## poly(parch, df = 2)1 -6.631
## poly(parch, df = 2)2 -18.270
## Collect the standard errors of the estimates
matSes <- sapply(listMatrix,
FUN = function(MAT) {
## Extract the coeffient column
MAT[,"Std. Error"]
},
simplify = TRUE)
## Square to get the variances of the estimations
(matVariances <- matSes^2)
## imp1 imp2 imp3 imp4 imp5
## (Intercept) 0.93034 6.37710 0.85875 4.75552 1.81779
## factor(pclass)2 0.04663 0.04883 0.04880 0.04788 0.04804
## factor(pclass)3 0.04119 0.04370 0.04556 0.04373 0.04364
## sex 0.02493 0.02551 0.02520 0.02500 0.02539
## ns(age, df = 5)1 0.73858 5.85601 0.66018 4.22353 1.48749
## ns(age, df = 5)2 1.00620 6.69878 0.94126 5.00392 1.95732
## ns(age, df = 5)3 0.42605 1.88010 0.40053 1.33121 0.60105
## ns(age, df = 5)4 4.55081 28.22227 4.22520 21.55446 8.66944
## ns(age, df = 5)5 1.07297 1.95325 0.95174 1.72730 1.15366
## poly(sibsp, df = 2)1 67.75721 51.37976 69.21592 70.77645 68.19162
## poly(sibsp, df = 2)2 63.30704 44.08978 64.01720 64.85012 63.12719
## poly(parch, df = 2)1 21.36723 20.89047 22.13207 21.04953 20.53102
## poly(parch, df = 2)2 48.21047 47.03847 49.43907 47.25376 46.08032
## The within component of the variance is the mean of variances across imputed datasets
## W = sum(variances)/M
withinVariances <- rowMeans(matVariances)
## The between component of the variance is the variance of the coefficients.
distanceFromMean <- sweep(x = matCoefs, # individual coefficients
MARGIN = 1,
STATS = pointEstimates, # mean coefficients
FUN = "-" # subtraction
)
## B = (distance from the mean)^2 / (M-1)
betweenVariances <- rowSums(distanceFromMean^2) / (titanicAmelia$m - 1)
## Calculate the total variances: W + (1 + 1/M) * B
totalVariances <- withinVariances + (1 + 1/titanicAmelia$m) * betweenVariances
## Calculate the total standard errors.
totalStdErrors <- sqrt(totalVariances)
## The result
finalRes <- data.frame(pointEstimates = pointEstimates, StdErrors = totalStdErrors)
finalRes$ZValues <- with(finalRes, pointEstimates / StdErrors)
finalRes$pValues <- pnorm(q = abs(finalRes$ZValues), lower.tail = FALSE) * 2
finalRes
## pointEstimates StdErrors ZValues pValues
## (Intercept) 6.867 3.5789 1.919 5.501e-02
## factor(pclass)2 -1.264 0.2254 -5.607 2.059e-08
## factor(pclass)3 -2.081 0.2178 -9.553 1.266e-21
## sex -2.574 0.1595 -16.135 1.454e-58
## ns(age, df = 5)1 -4.515 3.4524 -1.308 1.910e-01
## ns(age, df = 5)2 -4.538 3.5808 -1.267 2.050e-01
## ns(age, df = 5)3 -3.651 2.1302 -1.714 8.652e-02
## ns(age, df = 5)4 -9.525 7.2335 -1.317 1.879e-01
## ns(age, df = 5)5 -3.693 1.7359 -2.128 3.338e-02
## poly(sibsp, df = 2)1 -33.868 8.2809 -4.090 4.316e-05
## poly(sibsp, df = 2)2 -21.958 7.7870 -2.820 4.805e-03
## poly(parch, df = 2)1 -6.631 4.6103 -1.438 1.503e-01
## poly(parch, df = 2)2 -18.270 7.0244 -2.601 9.298e-03
## Fit logistic regression model for each imputed dataset at once
modelMiZelig <- zelig(formula = survived ~ factor(pclass) + sex + ns(age, df = 5) + poly(sibsp, df = 2) + poly(parch, df = 2),
model = "logit",
data = titanicAmelia$imputations)
##
##
## How to cite this model in Zelig:
## Kosuke Imai, Gary King, and Olivia Lau. 2014.
## "logit: Logistic Regression for Dichotomous Dependent Variables"
## in Kosuke Imai, Gary King, and Olivia Lau, "Zelig: Everyone's Statistical Software,"
## http://gking.harvard.edu/zelig
##
## Shwo results (no printing method is implemented?)
summary(modelMiZelig)
##
## Model: logit
## Number of multiply imputed data sets: 5
##
## Combined results:
##
## Call:
## glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## Value Std. Error t-stat p-value
## (Intercept) 6.867 3.5789 1.919 9.805e-02
## factor(pclass)2 -1.264 0.2254 -5.607 2.501e-08
## factor(pclass)3 -2.081 0.2178 -9.553 3.178e-20
## sex -2.574 0.1595 -16.135 2.166e-58
## ns(age, df = 5)1 -4.515 3.4524 -1.308 2.351e-01
## ns(age, df = 5)2 -4.538 3.5808 -1.267 2.456e-01
## ns(age, df = 5)3 -3.651 2.1302 -1.714 1.348e-01
## ns(age, df = 5)4 -9.525 7.2335 -1.317 2.280e-01
## ns(age, df = 5)5 -3.693 1.7359 -2.128 5.236e-02
## poly(sibsp, df = 2)1 -33.868 8.2809 -4.090 4.492e-05
## poly(sibsp, df = 2)2 -21.958 7.7870 -2.820 4.809e-03
## poly(parch, df = 2)1 -6.631 4.6103 -1.438 1.503e-01
## poly(parch, df = 2)2 -18.270 7.0244 -2.601 9.340e-03
##
## For combined results from datasets i to j, use summary(x, subset = i:j).
## For separate results, use print(summary(x), subset = i:j).
## print method
print.MI <- function(object) {
## Work on one element at a time
sapply(object,
function(imp) {
print(imp$result)
cat("--------------------------------------------------------------------------------\n")
})
## No meaningful return value
return(invisible(NULL))
}
## Example
print(modelMiZelig)
##
## Call: glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 5.28 -1.20 -1.98 -2.57 -3.01
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -3.00 -2.57 -6.63 -2.80 -32.40
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.88 -6.98 -19.21
##
## Degrees of Freedom: 1308 Total (i.e. Null); 1296 Residual
## Null Deviance: 1740
## Residual Deviance: 1180 AIC: 1200
## --------------------------------------------------------------------------------
##
## Call: glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 11.86 -1.32 -2.12 -2.59 -9.36
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -9.45 -6.70 -19.45 -5.61 -36.31
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -23.30 -6.40 -17.65
##
## Degrees of Freedom: 1308 Total (i.e. Null); 1296 Residual
## Null Deviance: 1740
## Residual Deviance: 1160 AIC: 1190
## --------------------------------------------------------------------------------
##
## Call: glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 4.94 -1.30 -2.13 -2.57 -2.48
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -2.49 -2.57 -5.94 -2.70 -32.58
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.36 -6.72 -19.33
##
## Degrees of Freedom: 1308 Total (i.e. Null); 1296 Residual
## Null Deviance: 1740
## Residual Deviance: 1180 AIC: 1200
## --------------------------------------------------------------------------------
##
## Call: glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 5.57 -1.24 -2.08 -2.55 -3.42
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -3.26 -3.08 -6.39 -3.60 -34.54
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.90 -6.55 -18.69
##
## Degrees of Freedom: 1308 Total (i.e. Null); 1296 Residual
## Null Deviance: 1740
## Residual Deviance: 1180 AIC: 1200
## --------------------------------------------------------------------------------
##
## Call: glm(formula = formula, weights = w, family = binomial(link = "logit"),
## model = F, data = data)
##
## Coefficients:
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 6.70 -1.26 -2.09 -2.59 -4.30
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -4.49 -3.34 -9.22 -3.76 -33.51
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.35 -6.51 -16.47
##
## Degrees of Freedom: 1308 Total (i.e. Null); 1296 Residual
## Null Deviance: 1740
## Residual Deviance: 1170 AIC: 1190
## --------------------------------------------------------------------------------
## coef method
coef.MI <- function(object, simplify = FALSE) {
## Work on one element at a time
sapply(object,
function(imp) {
## Extract coefficients from the model
coef(imp$result)
},
## Determine if the result should be simplified to a matrix
simplify = simplify)
}
## Examples
coef(modelMiZelig, simplify = TRUE)
## imp1 imp2 imp3 imp4 imp5
## (Intercept) 5.275 11.857 4.939 5.568 6.696
## factor(pclass)2 -1.197 -1.318 -1.302 -1.243 -1.261
## factor(pclass)3 -1.985 -2.117 -2.127 -2.079 -2.095
## sex -2.568 -2.589 -2.573 -2.554 -2.587
## ns(age, df = 5)1 -3.008 -9.358 -2.484 -3.424 -4.300
## ns(age, df = 5)2 -2.996 -9.451 -2.490 -3.264 -4.488
## ns(age, df = 5)3 -2.567 -6.695 -2.569 -3.084 -3.341
## ns(age, df = 5)4 -6.629 -19.446 -5.939 -6.388 -9.224
## ns(age, df = 5)5 -2.803 -5.611 -2.700 -3.595 -3.756
## poly(sibsp, df = 2)1 -32.397 -36.312 -32.582 -34.537 -33.511
## poly(sibsp, df = 2)2 -21.884 -23.296 -21.362 -21.901 -21.347
## poly(parch, df = 2)1 -6.980 -6.405 -6.715 -6.547 -6.509
## poly(parch, df = 2)2 -19.207 -17.652 -19.331 -18.687 -16.472
coef(modelMiZelig)
## $imp1
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 5.275 -1.197 -1.985 -2.568 -3.008
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -2.996 -2.567 -6.629 -2.803 -32.397
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.884 -6.980 -19.207
##
## $imp2
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 11.857 -1.318 -2.117 -2.589 -9.358
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -9.451 -6.695 -19.446 -5.611 -36.312
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -23.296 -6.405 -17.652
##
## $imp3
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 4.939 -1.302 -2.127 -2.573 -2.484
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -2.490 -2.569 -5.939 -2.700 -32.582
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.362 -6.715 -19.331
##
## $imp4
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 5.568 -1.243 -2.079 -2.554 -3.424
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -3.264 -3.084 -6.388 -3.595 -34.537
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.901 -6.547 -18.687
##
## $imp5
## (Intercept) factor(pclass)2 factor(pclass)3 sex ns(age, df = 5)1
## 6.696 -1.261 -2.095 -2.587 -4.300
## ns(age, df = 5)2 ns(age, df = 5)3 ns(age, df = 5)4 ns(age, df = 5)5 poly(sibsp, df = 2)1
## -4.488 -3.341 -9.224 -3.756 -33.511
## poly(sibsp, df = 2)2 poly(parch, df = 2)1 poly(parch, df = 2)2
## -21.347 -6.509 -16.472