## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = TRUE, fig.width = 7, fig.height = 7)
options(width = 116, scipen = 10)
setwd("~/Documents/HSPH/useR.at.HSPH/")
library(plyr)
Performed to check for validity of assumptions and presence of outliers.
Assumptions in linear regression
library(gdata)
lbw <- read.xls("~/statistics/bio213/lbw.xls")
## Same data available online: http://www.umass.edu/statdata/statdata/data/index.html
## Cases 10 and 39 needs fix to make them identical to Dr. Orav's dataset
## lbw2 <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
## lbw2[c(10,39),"BWT"] <- c(2655,3035)
## Recoding
lbw <- within(lbw, {
## race relabeling
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## ftv (frequency of visit) relabeling
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
ftv.cat <- relevel(ftv.cat, ref = "Normal")
## ptl
preterm <- factor(ptl >= 1, levels = c(F,T), labels = c("=0",">=1"))
})
## Full
linear.model.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
## summary
summary(linear.model.full)
Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1896.7 -443.3 53.2 466.1 1654.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2947.32 320.48 9.20 < 2e-16 ***
age -2.91 9.67 -0.30 0.76354
lwt 4.22 1.72 2.46 0.01488 *
smoke -307.34 109.13 -2.82 0.00541 **
ht -568.63 200.88 -2.83 0.00518 **
ui -494.11 137.23 -3.60 0.00041 ***
ftv.catNone -56.50 105.36 -0.54 0.59245
ftv.catMany -185.86 203.19 -0.91 0.36158
race.catBlack -467.30 149.78 -3.12 0.00211 **
race.catOther -322.81 117.40 -2.75 0.00658 **
preterm>=1 -207.91 136.35 -1.52 0.12907
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 647 on 178 degrees of freedom
Multiple R-squared: 0.255, Adjusted R-squared: 0.213
F-statistic: 6.08 on 10 and 178 DF, p-value: 0.0000000609
Assumptions
Outliers
## Residual analysis for assumptions (independence, linearity, normality, and homoscedasticity)
layout(matrix(1:6, byrow = T, ncol = 3))
plot(linear.model.full, which = 1:6)
layout(matrix(1:2, byrow = T, ncol = 2))
## Residual ~ age
with(linear.model.full, plot(residuals ~ model$age))
## Residual ~ lwt
with(linear.model.full, plot(residuals ~ model$lwt))
library(gvlma)
gvlma(linear.model.full)
Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, data = lbw)
Coefficients:
(Intercept) age lwt smoke ht ui ftv.catNone
2947.32 -2.91 4.22 -307.34 -568.63 -494.11 -56.50
ftv.catMany race.catBlack race.catOther preterm>=1
-185.86 -467.30 -322.81 -207.91
ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
Level of Significance = 0.05
Call:
gvlma(x = linear.model.full)
Value p-value Decision
Global Stat 18.216 0.0011197 Assumptions NOT satisfied!
Skewness 1.316 0.2512972 Assumptions acceptable.
Kurtosis 0.376 0.5395040 Assumptions acceptable.
Link Function 0.397 0.5285123 Assumptions acceptable.
Heteroscedasticity 16.126 0.0000593 Assumptions NOT satisfied!
## Raw residual for first 10 observations
residuals(linear.model.full)[1:10]
1 2 3 4 5 6 7 8 9 10
-119.64 -446.06 -468.12 53.24 111.21 -408.47 -744.53 -372.95 -411.90 -329.92
## Studentized residuals for first 10 observations
rstudent(linear.model.full)[1:10]
1 2 3 4 5 6 7 8 9 10
-0.19427 -0.73267 -0.73613 0.08486 0.17660 -0.63745 -1.16752 -0.58717 -0.64673 -0.51655
## Cook's distance' for first 10 observations
cooks.distance(linear.model.full)[1:10]
1 2 3 4 5 6 7 8 9 10
0.00037673 0.00644733 0.00184287 0.00004556 0.00017324 0.00082857 0.00331085 0.00128855 0.00132888 0.00072915
## DFBETAS for first 10 observations
dfbetas(linear.model.full)[1:10,]
(Intercept) age lwt smoke ht ui ftv.catNone ftv.catMany race.catBlack
1 0.012848 0.006354 -0.023329 0.014455 0.0088370 -0.039604 -0.013401 -0.0016262 -0.027892
2 0.098203 -0.091959 -0.037075 0.014349 0.0268171 0.002751 -0.013141 -0.2042290 -0.006042
3 -0.078171 0.042525 0.038826 -0.069822 -0.0004036 0.025841 0.079083 0.0364423 0.021247
4 0.005285 -0.002685 -0.002617 0.007813 0.0016438 0.015726 -0.009694 -0.0037954 -0.002195
5 0.009294 -0.007792 -0.005278 0.009145 0.0010715 0.030996 0.007939 0.0008654 -0.007586
6 -0.002223 0.006908 -0.005444 0.020296 0.0170243 0.022795 -0.035699 -0.0003424 0.005648
7 -0.125442 0.053990 0.043225 0.083619 0.0028210 0.029943 0.075849 0.0330754 0.059831
8 -0.067486 0.055028 0.018272 0.002956 0.0026522 0.022458 0.070391 0.0263787 0.002117
9 0.005677 -0.045144 0.019769 -0.070154 0.0032850 0.014869 0.052950 0.0341122 0.005866
10 0.002076 -0.031980 0.027857 -0.034583 0.0064932 0.017459 -0.039006 0.0014676 0.011277
race.catOther preterm>=1
1 0.0057553 0.006126
2 -0.0805302 0.029022
3 0.0168223 0.033219
4 -0.0021617 -0.007232
5 -0.0120184 -0.012011
6 -0.0416261 0.010627
7 0.0971458 0.010737
8 -0.0502633 0.013962
9 -0.0008454 0.039082
10 0.0232173 0.023845
## DFFITS for first 10 observations
dffits(linear.model.full)[1:10]
1 2 3 4 5 6 7 8 9 10
-0.06420 -0.26596 -0.14219 0.02232 0.04353 -0.09531 -0.19103 -0.11884 -0.12071 -0.08937
layout(1)
library(car)
## Independence (Do not rely on tests): Durbin-Watson test to detect serially correlated residuals (mainly for time series).
durbinWatsonTest(linear.model.full)
lag Autocorrelation D-W Statistic p-value
1 0.6947 0.6102 0
Alternative hypothesis: rho != 0
## Normality: Normal Q-Q plot. See depature from the diagonal line.
## qqPlot(linear.model.full, id.method = "identify") # to use mouse-identification
qqPlot(linear.model.full)
## Linearity: Component plus residual plots. Compare green lines to the red predicted lines.
crPlots(linear.model.full)
## Linearity: Ceres plots
ceresPlots(linear.model.full)
Error: need finite 'ylim' values
## Homoscedasticity: Score Test for Non-Constant Error Variance. Heteroscedasticity if significant.
ncvTest(linear.model.full)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 0.004127 Df = 1 p = 0.9488
## Homoscedasticity: Spread-level plot. See if the red line is horizontal. If the suggested power trasformation is close to 0.5, use sqrt(outcome). If close to 0, use log(outcome).
spreadLevelPlot(linear.model.full)
Suggested power transformation: 0.7456
Unusual observations
## Outliers: Outlier test. Identify the largest absolute studentized residual, and test.
outlierTest(linear.model.full)
No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
rstudent unadjusted p-value Bonferonni p
132 -3.102 0.002241 0.4236
## High-leverage points: Leverage plot
leveragePlots(linear.model.full)
## Leverage values for first 10 observations
hatvalues(linear.model.full)[1:10]
1 2 3 4 5 6 7 8 9 10
0.09845 0.11643 0.03597 0.06472 0.05729 0.02187 0.02607 0.03935 0.03366 0.02907
## Influential observation: dfbetas Index Plots
dfbetaPlots(linear.model.full)
## Influential observations: Cook's distance' (not part of car package)
plot(linear.model.full, which = 4, cook.levels = cutoff)
## Influential observations: Added variable plot
avPlots(linear.model.full)
## Influential observations: Influence Plot.
## Size is proportional to Cook's distance. '
## If more on the right, high leverage. If higher than 2 or lower than -2, outlier.
influencePlot(linear.model.full, id.method = "identify")
## Another way of combining information
influenceIndexPlot(linear.model.full)
If greater than 4 (or 10?), the variable has a multicollinearity issue.
car::vif(linear.model.full)
GVIF Df GVIF^(1/(2*Df))
age 1.180 1 1.086
lwt 1.239 1 1.113
smoke 1.282 1 1.132
ht 1.084 1 1.041
ui 1.074 1 1.036
ftv.cat 1.171 2 1.040
race.cat 1.416 2 1.091
preterm 1.121 1 1.059
rms::vif(linear.model.full)
age lwt smoke ht ui ftv.catNone ftv.catMany race.catBlack
1.180 1.239 1.282 1.084 1.074 1.249 1.109 1.202
race.catOther preterm>=1
1.425 1.121