## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = TRUE, fig.width = 7, fig.height = 7)
options(width = 116, scipen = 10)
setwd("~/Documents/HSPH/useR.at.HSPH/")
library(plyr)
Regression type | Model comparison measure (model vs model) | Model fit measure (model vs data) |
---|---|---|
Linear regression | Adjusted \( R^2 \), AIC, BIC; Partial F-test if nested | \( R^2 \), AIC, BIC, Model F-test, Residual analysis |
Logistic regression | Deviance, LR test for deviance, AIC, BIC, AUC, NRI | Deviance, AIC, BIC, Hosmer-Lemeshow test, (AUC) |
Poisson regression | Deviance, AIC | Deviance |
Cox regression | AIC | Residual analysis |
These measures are used to comapre two models to look for a better model.
library(gdata)
lbw <- read.xls("~/statistics/bio213/lbw.xls")
## Same data available online: http://www.umass.edu/statdata/statdata/data/index.html
## Cases 10 and 39 needs fix to make them identical to Dr. Orav's dataset
## lbw2 <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
## lbw2[c(10,39),"BWT"] <- c(2655,3035)
## Recoding
lbw <- within(lbw, {
## race relabeling
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## ftv (frequency of visit) relabeling
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
ftv.cat <- relevel(ftv.cat, ref = "Normal")
## ptl
preterm <- factor(ptl >= 1, levels = c(F,T), labels = c("=0",">=1"))
})
\( R^2 \); not defined for the null model.
For residual analysis see http://rpubs.com/kaz_yos/residuals
## Null
linear.model.null <- lm(bwt ~ 1, data = lbw)
## lwt
linear.model.lwt <- lm(bwt ~ lwt, data = lbw)
## Full
linear.model.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
## Create a list
linear.model.list <- list(null = linear.model.null, lwt = linear.model.lwt, full = linear.model.full)
## Model summaries
lapply(linear.model.list, summary)
$null
Call:
lm(formula = bwt ~ 1, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2235.8 -530.8 32.2 530.2 2045.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2945 53 55.5 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 729 on 188 degrees of freedom
$lwt
Call:
lm(formula = bwt ~ lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2192 -504 -4 508 2076
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2367.77 228.41 10.37 <2e-16 ***
lwt 4.44 1.71 2.59 0.01 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 718 on 187 degrees of freedom
Multiple R-squared: 0.0348, Adjusted R-squared: 0.0296
F-statistic: 6.73 on 1 and 187 DF, p-value: 0.0102
$full
Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1896.7 -443.3 53.2 466.1 1654.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2947.32 320.48 9.20 < 2e-16 ***
age -2.91 9.67 -0.30 0.76354
lwt 4.22 1.72 2.46 0.01488 *
smoke -307.34 109.13 -2.82 0.00541 **
ht -568.63 200.88 -2.83 0.00518 **
ui -494.11 137.23 -3.60 0.00041 ***
ftv.catNone -56.50 105.36 -0.54 0.59245
ftv.catMany -185.86 203.19 -0.91 0.36158
race.catBlack -467.30 149.78 -3.12 0.00211 **
race.catOther -322.81 117.40 -2.75 0.00658 **
preterm>=1 -207.91 136.35 -1.52 0.12907
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 647 on 178 degrees of freedom
Multiple R-squared: 0.255, Adjusted R-squared: 0.213
F-statistic: 6.08 on 10 and 178 DF, p-value: 0.0000000609
## R-squared
sapply(linear.model.list,
function(x) {
summary(x)$r.squared
})
null lwt full
0.00000 0.03476 0.25471
## Null vs lwt model by partial F-test
anova(linear.model.null, linear.model.lwt)
Analysis of Variance Table
Model 1: bwt ~ 1
Model 2: bwt ~ lwt
Res.Df RSS Df Sum of Sq F Pr(>F)
1 188 99927264
2 187 96454213 1 3473052 6.73 0.01 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## lwt vs full model by partial F-test
anova(linear.model.lwt, linear.model.full)
Analysis of Variance Table
Model 1: bwt ~ lwt
Model 2: bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm
Res.Df RSS Df Sum of Sq F Pr(>F)
1 187 96454213
2 178 74475274 9 21978939 5.84 0.00000041 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Adjusted R-squared
sapply(linear.model.list,
function(x) {
summary(x)$adj.r.squared
})
null lwt full
0.00000 0.02959 0.21283
## AIC
sapply(linear.model.list, AIC)
null lwt full
3031 3026 2995
Deviance is used for both goodness of fit and model comparison.
## Null
logistic.model.null <- glm(low ~ 1, data = lbw, family = binomial)
## lwt
logistic.model.lwt <- glm(low ~ lwt, data = lbw, family = binomial)
## Full
logistic.model.full <- glm(low ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw, family = binomial)
## Create a list
logistic.model.list <- list(null = logistic.model.null, lwt = logistic.model.lwt, full = logistic.model.full)
## Model summaries. If substantially larger than DF, oversispersion is present.
lapply(logistic.model.list, summary)
$null
Call:
glm(formula = low ~ 1, family = binomial, data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.865 -0.865 -0.865 1.526 1.526
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.790 0.157 -5.03 0.00000048 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 234.67 on 188 degrees of freedom
AIC: 236.7
Number of Fisher Scoring iterations: 4
$lwt
Call:
glm(formula = low ~ lwt, family = binomial, data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.095 -0.902 -0.802 1.361 1.982
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.99831 0.78529 1.27 0.204
lwt -0.01406 0.00617 -2.28 0.023 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 228.69 on 187 degrees of freedom
AIC: 232.7
Number of Fisher Scoring iterations: 4
$full
Call:
glm(formula = low ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, family = binomial, data = lbw)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.684 -0.810 -0.502 0.816 2.187
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.44611 1.26160 0.35 0.7236
age -0.03551 0.03842 -0.92 0.3553
lwt -0.01497 0.00697 -2.15 0.0317 *
smoke 0.76813 0.42380 1.81 0.0699 .
ht 1.80673 0.71053 2.54 0.0110 *
ui 0.72520 0.46427 1.56 0.1183
ftv.catNone 0.25824 0.39945 0.65 0.5180
ftv.catMany 0.80332 0.71606 1.12 0.2619
race.catBlack 1.19059 0.53674 2.22 0.0265 *
race.catOther 0.75304 0.45949 1.64 0.1012
preterm>=1 1.25279 0.46763 2.68 0.0074 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 195.50 on 178 degrees of freedom
AIC: 217.5
Number of Fisher Scoring iterations: 4
## Deviance.
sapply(logistic.model.list, deviance)
null lwt full
234.7 228.7 195.5
## Hosmer-Lemeshow goodness of fit test with ResourceSelection
library(ResourceSelection)
hoslem.test(x = lbw$low, y = fitted(logistic.model.full), g = 10)
Hosmer and Lemeshow goodness of fit (GOF) test
data: lbw$low, fitted(logistic.model.full)
X-squared = 3.376, df = 8, p-value = 0.9086
## Hosmer-Lemeshow goodness of fit test with MKmisc
library(MKmisc)
HLgof.test(fit = logistic.model.full$fitted.values, obs = lbw$low)$C
Hosmer-Lemeshow C statistic
data: logistic.model.full$fitted.values and lbw$low
X-squared = 3.376, df = 8, p-value = 0.9086
## Hosmer-Lemeshow goodness of fit test with PredictABEL. This one matches SAS results.
library(PredictABEL)
PBS Modelling 2.63.229 -- Copyright (C) 2005-2010 Fisheries and Oceans Canada
A complete user guide 'PBSmodelling-UG.pdf' is located at
/Library/Frameworks/R.framework/Versions/2.15/Resources/library/PBSmodelling/doc/PBSmodelling-UG.pdf
Packaged on 2011-09-09
Pacific Biological Station, Nanaimo
-------------------------------------------------------------
plotCalibration(data = lbw, cOutcome = 2, predRisk = fitted(logistic.model.full), groups = 10)
$Table_HLtest
total meanpred meanobs predicted observed
[0.0207,0.0841) 19 0.055 0.000 1.05 0
[0.0841,0.1228) 19 0.103 0.105 1.95 2
[0.1228,0.1747) 19 0.145 0.211 2.76 4
[0.1747,0.2219) 19 0.198 0.263 3.77 5
[0.2219,0.2608) 19 0.238 0.263 4.53 5
[0.2608,0.3147) 19 0.294 0.263 5.59 5
[0.3147,0.4055) 19 0.351 0.263 6.66 5
[0.4055,0.5056) 19 0.452 0.526 8.59 10
[0.5056,0.6360) 19 0.559 0.526 10.62 10
[0.6360,0.8867] 18 0.749 0.722 13.48 13
$Chi_square
[1] 3.634
$df
[1] 8
$p_value
[1] 0.8886
## Graphing and AUC with epicalc
library(epicalc)
lroc.lwt <- lroc(logistic.model.lwt, add = F, lwd = 2, line.col = "black")
lroc.full <- lroc(logistic.model.full, add = T, lwd = 2, line.col = "red")
legend("bottomright", lwd = 2, lty = 1, col = 1:2, c("lwt","full"))
## ROC comparison with pROC
library(pROC)
roc.test(lbw$low ~ fitted(logistic.model.lwt) + fitted(logistic.model.full))
DeLong's test for two correlated ROC curves
data: fitted(logistic.model.lwt) and fitted(logistic.model.full) by lbw$low (0, 1)
Z = -3.004, p-value = 0.002666
alternative hypothesis: true difference in AUC is not equal to 0
sample estimates:
AUC of roc1 AUC of roc2
0.6131 0.7558
library(PredictABEL)
## Low risk vs high risk (cutoff at 0.5)
reclassification(data = lbw, cOutcome = 2,
predrisk1 = fitted(logistic.model.lwt),
predrisk2 = fitted(logistic.model.full),
cutoff = c(0,0.5,1)
)
_________________________________________
Reclassification table
_________________________________________
Outcome: absent
Updated Model
Initial Model [0,0.5) [0.5,1] % reclassified
[0,0.5) 116 14 11
[0.5,1] 0 0 NaN
Outcome: present
Updated Model
Initial Model [0,0.5) [0.5,1] % reclassified
[0,0.5) 35 24 41
[0.5,1] 0 0 NaN
Combined Data
Updated Model
Initial Model [0,0.5) [0.5,1] % reclassified
[0,0.5) 151 38 20
[0.5,1] 0 0 NaN
_________________________________________
NRI(Categorical) [95% CI]: 0.2991 [ 0.1268 - 0.4713 ] ; p-value: 0.00067
NRI(Continuous) [95% CI]: 0.7497 [ 0.442 - 1.057 ] ; p-value: 0
IDI [95% CI]: 0.1668 [ 0.1056 - 0.2281 ] ; p-value: 0
## Low risk vs moderate risk vs high risk (cufoffs at 0.1, 0.3)
reclassification(data = lbw, cOutcome = 2,
predrisk1 = fitted(logistic.model.lwt),
predrisk2 = fitted(logistic.model.full),
cutoff = c(0, 0.1, 0.3, 1)
)
_________________________________________
Reclassification table
_________________________________________
Outcome: absent
Updated Model
Initial Model [0,0.1) [0.1,0.3) [0.3,1] % reclassified
[0,0.1) 0 3 1 100
[0.1,0.3) 18 20 9 57
[0.3,1] 7 40 32 59
Outcome: present
Updated Model
Initial Model [0,0.1) [0.1,0.3) [0.3,1] % reclassified
[0,0.1) 0 0 0 NaN
[0.1,0.3) 2 5 7 64
[0.3,1] 0 11 34 24
Combined Data
Updated Model
Initial Model [0,0.1) [0.1,0.3) [0.3,1] % reclassified
[0,0.1) 0 3 1 100
[0.1,0.3) 20 25 16 59
[0.3,1] 7 51 66 47
_________________________________________
NRI(Categorical) [95% CI]: 0.2983 [ 0.0988 - 0.4978 ] ; p-value: 0.00338
NRI(Continuous) [95% CI]: 0.7497 [ 0.442 - 1.057 ] ; p-value: 0
IDI [95% CI]: 0.1668 [ 0.1056 - 0.2281 ] ; p-value: 0
## AIC
sapply(logistic.model.list, AIC)
null lwt full
236.7 232.7 217.5