<< Data Analysis >> Note part 6

Created on 8 Aug 2013
Revised on Thu Aug 08 16:28:26 2013

1.ANOVA with multiple factors/variables

Key ideas
(1) Outcome is still quantitative
(2) You have multiple explanatory variables
(3) Goal is to identify contributions of different variables

require(RCurl)

## Loading required package: RCurl

## Loading required package: bitops

myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/movies03RT.txt", 
    ssl.verifypeer = FALSE)
movies <- read.csv(textConnection(myCsv), sep = "\t", header = T, quote = "")

head(movies)

##                  X score rating            genre box.office running.time
## 1 2 Fast 2 Furious  48.9  PG-13 action/adventure     127.15          107
## 2    28 Days Later  78.2      R           horror      45.06          113
## 3      A Guy Thing  39.5  PG-13       rom comedy      15.54          101
## 4      A Man Apart  42.9      R action/adventure      26.25          110
## 5    A Mighty Wind  79.9  PG-13           comedy      17.78           91
## 6 Agent Cody Banks  57.9     PG action/adventure      47.81          102

1.1 Relating score to rating

Si = b0 + b1!(Rai =" PG ") + b2!(Rai =" PG − 13 ") + b3!(Rai =" R ") + ei

The notation !(Rai =" PG ") is a logical value that is one if the movie rating is "PG" and zero
otherwise.

Average values
b0    = average of the G movies
b0+b1 = average of the PG movies
b0+b2 = average of the PG-13 movies
b0+b3 = average of the R movies

ANOVA in R

aovObject <- aov(movies$score ~ movies$rating)
aovObject

## Call:
##    aov(formula = movies$score ~ movies$rating)
## 
## Terms:
##                 movies$rating Residuals
## Sum of Squares            570     28149
## Deg. of Freedom             3       136
## 
## Residual standard error: 14.39
## Estimated effects may be unbalanced

aovObject$coeff

##        (Intercept)    movies$ratingPG movies$ratingPG-13 
##              67.65             -12.59             -11.81 
##     movies$ratingR 
##             -12.02

1.2 Adding a second factor

Si = b0 + b1!(Rai =" PG ") + b2!(Rai =" PG − 13 ") + b3!(Rai =" R ") + γ1!(Gi =" action ") + γ2!(Gi =" animated ")+...+ ei

The notation !(Rai =" PG ") is a logical value that is one if the movie rating is "PG" and zero
otherwise.

aovObject2 <- aov(movies$score ~ movies$rating + movies$genre)
aovObject2

## Call:
##    aov(formula = movies$score ~ movies$rating + movies$genre)
## 
## Terms:
##                 movies$rating movies$genre Residuals
## Sum of Squares            570         3935     24214
## Deg. of Freedom             3           12       124
## 
## Residual standard error: 13.97
## Estimated effects may be unbalanced

summary(aovObject2)

##                Df Sum Sq Mean Sq F value Pr(>F)  
## movies$rating   3    570     190    0.97  0.408  
## movies$genre   12   3935     328    1.68  0.079 .
## Residuals     124  24214     195                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.3 Order matters

aovObject3 <- aov(movies$score ~ movies$genre + movies$rating)
summary(aovObject3)

##                Df Sum Sq Mean Sq F value Pr(>F)  
## movies$genre   12   4222     352    1.80  0.055 .
## movies$rating   3    284      95    0.48  0.694  
## Residuals     124  24214     195                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.4 Adding a quantitative variabl

aovObject4 <- aov(movies$score ~ movies$genre + movies$rating + movies$box.office)
summary(aovObject4)

##                    Df Sum Sq Mean Sq F value  Pr(>F)    
## movies$genre       12   4222     352    2.19   0.016 *  
## movies$rating       3    284      95    0.59   0.624    
## movies$box.office   1   4421    4421   27.47 6.7e-07 ***
## Residuals         123  19793     161                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.Binary outcomes

Key ideas
 ·Frequently we care about outcomes that have two values
  -Alive/dead
  -Win/loss
  -Success/Failure
  -etc
 ·Called binary outcomes or 0/1 outcomes
 ·Linear regression (like we've seen) may not be the best

require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/ravensData.csv", 
    ssl.verifypeer = FALSE)
ravensData <- read.csv(textConnection(myCsv))

2.1 Linear regression

RWi = b0 + b1RSi + ei

RWi - 1 if a Ravens win, 0 if not
RSi - Number of points Ravens scored
b0 - probability of a Ravens win if they score 0 points
b1 - increase in probability of a Ravens win for each additional point
ei - variation due to everything we didn't measure

lmRavens <- lm(ravensData$ravenWinNum ~ ravensData$ravenScore)
summary(lmRavens)

## 
## Call:
## lm(formula = ravensData$ravenWinNum ~ ravensData$ravenScore)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.730 -0.508  0.182  0.322  0.572 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            0.28503    0.25664    1.11    0.281  
## ravensData$ravenScore  0.01590    0.00906    1.76    0.096 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.446 on 18 degrees of freedom
## Multiple R-squared:  0.146,  Adjusted R-squared:  0.0987 
## F-statistic: 3.08 on 1 and 18 DF,  p-value: 0.0963

plot(ravensData$ravenScore, lmRavens$fitted, pch = 19, col = "blue", ylab = "Prob Win", 
    xlab = "Raven Score")

plot of chunk unnamed-chunk-9

2.2 Ravens logistic regression

logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore, family = "binomial")
summary(logRegRavens)

## 
## Call:
## glm(formula = ravensData$ravenWinNum ~ ravensData$ravenScore, 
##     family = "binomial")
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.758  -1.100   0.530   0.806   1.495  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)            -1.6800     1.5541   -1.08     0.28
## ravensData$ravenScore   0.1066     0.0667    1.60     0.11
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.435  on 19  degrees of freedom
## Residual deviance: 20.895  on 18  degrees of freedom
## AIC: 24.89
## 
## Number of Fisher Scoring iterations: 5

plot(ravensData$ravenScore, logRegRavens$fitted, pch = 19, col = "blue", xlab = "Score", 
    ylab = "Prob Ravens Win")

plot of chunk unnamed-chunk-11

Odds ratios and confidence intervals

exp(logRegRavens$coeff)

##           (Intercept) ravensData$ravenScore 
##                0.1864                1.1125

exp(confint(logRegRavens))

## Waiting for profiling to be done...

##                          2.5 % 97.5 %
## (Intercept)           0.005675  3.106
## ravensData$ravenScore 0.996230  1.303

ANOVA for logistic regression

anova(logRegRavens, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: ravensData$ravenWinNum
## 
## Terms added sequentially (first to last)
## 
## 
##                       Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                                     19       24.4           
## ravensData$ravenScore  1     3.54        18       20.9     0.06 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.Count outcomes

Key ideas
 ·Many data take the form of counts
   -Calls to a call center
   -Number of flu cases in an area
   -Number of cars that cross a bridge
 ·Data may also be in the form of rates
   -Percent of children passing a test
   -Percent of hits to a website from a country
 ·Linear regression with transformation is an option

3.1 Poisson distribution

set.seed(3433)
oldpar = par(mfrow = c(1, 2))
poisData2 <- rpois(100, lambda = 100)
poisData1 <- rpois(100, lambda = 50)
hist(poisData1, col = "blue", xlim = c(0, 150))
hist(poisData2, col = "blue", xlim = c(0, 150))

plot of chunk unnamed-chunk-14

par(oldpar)

c(mean(poisData1), var(poisData1))

## [1] 49.85 49.38

c(mean(poisData2), var(poisData2))

## [1] 100.12  95.26

3.2 Example: Leek Group Website Traffic

require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/gaData.csv", ssl.verifypeer = FALSE)
gaData <- read.csv(textConnection(myCsv))
gaData$julian <- julian(as.Date(gaData$date))
head(gaData)

##       date visits simplystats julian
## 1 2011/1/1      0           0  14975
## 2 2011/1/2      0           0  14976
## 3 2011/1/3      0           0  14977
## 4 2011/1/4      0           0  14978
## 5 2011/1/5      0           0  14979
## 6 2011/1/6      0           0  14980

plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian", 
    ylab = "Visits")

plot of chunk unnamed-chunk-17

3.2.1 Linear regression

NHi = b0 + b1JDi + ei

NHi - number of hits to the website
JDi - day of the year (Julian day)
b0 - number of hits on Julian day 0 (1970-01-01)
b1 - increase in number of hits per unit day
ei - variation due to everything we didn't measure

plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian", 
    ylab = "Visits")
lm1 <- lm(gaData$visits ~ gaData$julian)
abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-18

3.2.2 Linear vs. Poisson regression

linear:
NHi = b0 + b1JDi + ei
or
E[NHi|JDi, b0, b1 ] = b0 + b1JDi

Poisson:
log(E[NHi|JDi, b0, b1]) = b0 + b1JDi
or
E[NHi|JDi, b0, b1 ] = exp(b0 + b1JDi)

3.2.3 Poisson regression in R

plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian", 
    ylab = "Visits")
glm1 <- glm(gaData$visits ~ gaData$julian, family = "poisson")
abline(lm1, col = "red", lwd = 3)
lines(gaData$julian, glm1$fitted, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-19

Mean-variance relationship?

plot(glm1$fitted, glm1$residuals, pch = 19, col = "grey", ylab = "Residuals", 
    xlab = "Date")

plot of chunk unnamed-chunk-20

Model agnostic standard errors

library(sandwich)

## Warning: package 'sandwich' was built under R version 3.0.1

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.0.1

## Attaching package: 'zoo'

## 下列对象被屏蔽了from 'package:base':
## 
## as.Date, as.Date.numeric

confint.agnostic <- function(object, parm, level = 0.95, ...) {
    cf <- coef(object)
    pnames <- names(cf)
    if (missing(parm)) 
        parm <- pnames else if (is.numeric(parm)) 
        parm <- pnames[parm]
    a <- (1 - level)/2
    a <- c(a, 1 - a)
    pct <- stats:::format.perc(a, 3)
    fac <- qnorm(a)
    ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, pct))

    ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
    ci[] <- cf[parm] + ses %o% fac
    ci
}

confint(glm1)

## Waiting for profiling to be done...

##                   2.5 %     97.5 %
## (Intercept)   -34.34658 -31.159716
## gaData$julian   0.00219   0.002396

confint.agnostic(glm1)

##                    2.5 %     97.5 %
## (Intercept)   -36.362675 -29.136997
## gaData$julian   0.002058   0.002528

3.2.4 Fitting rates in R

E[NHSSi|JDi, b0, b1 ]/NHi = exp(b0 + b1JDi)
log(E[NHSSi |JDi, b0, b1]) − log(NHi) = b0 + b1JDi
log(E[NHSSi |JDi, b0, b1]) = log(NHi) + b0 + b1JDi

glm2 <- glm(gaData$simplystats ~ julian(as.Date(gaData$date)), offset = log(visits + 
    1), family = "poisson", data = gaData)
plot(julian(as.Date(gaData$date)), glm2$fitted, col = "blue", pch = 19, xlab = "Date", 
    ylab = "Fitted Counts")
points(julian(as.Date(gaData$date)), glm1$fitted, col = "red", pch = 19)

plot of chunk unnamed-chunk-23

glm2 <- glm(gaData$simplystats ~ julian(as.Date(gaData$date)), offset = log(visits + 
    1), family = "poisson", data = gaData)
plot(julian(as.Date(gaData$date)), gaData$simplystats/(gaData$visits + 1), col = "grey", 
    xlab = "Date", ylab = "Fitted Rates", pch = 19)
lines(julian(as.Date(gaData$date)), glm2$fitted/(gaData$visits + 1), col = "blue", 
    lwd = 3)

plot of chunk unnamed-chunk-24

4.Model checking and model selection

Sometimes model checking/selection not allowed
Often it can lead to problems
  - Overfitting
  - Overtesting
  - Biased inference
But you don't want to miss something obvious

4.1 Linear regression - basic assumptions

-Variance is constant
-You are summarizing a linear trend
-You have all the right terms in the model
-There are no big outliers

4.1.1 Model checking - constant variance

set.seed(3433)
oldpar = par(mfrow = c(1, 2))
data <- rnorm(100, mean = seq(0, 3, length = 100), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = "grey")
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = "grey")
abline(c(0, 0), col = "red", lwd = 3)

plot of chunk unnamed-chunk-25

par(oldpar)

What to do?

- See if another variable explains the increased variance
- Use the vcovHC {sandwich} variance estimators (if n is big)

Using the sandwich estimate

set.seed(3433)
data <- rnorm(100, mean = seq(0, 3, length = 100), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
vcovHC(lm1)

##                         (Intercept) seq(0, 3, length = 100)
## (Intercept)                 0.05410                -0.04765
## seq(0, 3, length = 100)    -0.04765                 0.05369

summary(lm1)$cov.unscaled

##                         (Intercept) seq(0, 3, length = 100)
## (Intercept)                 0.03941                -0.01960
## seq(0, 3, length = 100)    -0.01960                 0.01307

4.1.2 Model checking - linear trend

set.seed(3433)
oldpar = par(mfrow = c(1, 2))
data <- rnorm(100, mean = seq(0, 3, length = 100)^3, sd = 2)
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = "grey")
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = "grey")
abline(c(0, 0), col = "red", lwd = 3)

plot of chunk unnamed-chunk-27

par(oldpar)

What to do?

Use Poisson regression (if it looks exponential/multiplicative)
Use a data transformation (e.g. take the log)
Smooth the data/fit a nonlinear trend
Use linear regression anyway
  -Interpret as the linear trend between the variables
  -Use the vcovHC {sandwich} variance estimators (if n is big)

4.1.3 Model checking - missing covariate

set.seed(3433)
oldpar = par(mfrow = c(1, 3))
z <- rep(c(-0.5, 0.5), 50)
data <- rnorm(100, mean = (seq(0, 3, length = 100) + z), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = ((z > 0) + 3))
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = ((z > 0) + 3))
abline(c(0, 0), col = "red", lwd = 3)
boxplot(lm1$residuals ~ z, col = ((z > 0) + 3))

plot of chunk unnamed-chunk-28

par(oldpar)

What to do?

Use exploratory analysis to identify other variables to include
Use the vcovHC {sandwich} variance estimators (if n is big)
Report unexplained patterns in the data

4.1.4 Model checking - outliers

set.seed(343)
oldpar=par(mfrow=c(1,2))
betahat <- rep(NA,100)
x <- seq(0,3,length=100)
y <- rcauchy(100)
lm1 <- lm(y ~ x)
plot(x,y,pch=19,col="blue")
abline(lm1,col="red",lwd=3)
for(i in 1:length(data)){betahat[i] <- lm(y[-i] ~ x[-i])$coeff[2]}
plot(betahat - lm1$coeff[2],col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)
par(oldpar)

What to do?

If outliers are experimental mistakes -remove and document them
If they are real - consider reporting how sensitive your estimate is to the outliers
Consider using a robust linear model fit like rlm {MASS}

4.1.5 Robust linear modeling

library(MASS)
set.seed(343)
x <- seq(0, 3, length = 100)
y <- rcauchy(100)
lm1 <- lm(y ~ x)
rlm1 <- rlm(y ~ x)
lm1$coeff

## (Intercept)           x 
##      0.3523     -0.4011

rlm1$coeff

## (Intercept)           x 
##    0.008527   -0.017892

oldpar = par(mfrow = c(1, 2))
plot(x, y, pch = 19, col = "grey")
lines(x, lm1$fitted, col = "blue", lwd = 3)
lines(x, rlm1$fitted, col = "green", lwd = 3)
plot(x, y, pch = 19, col = "grey", ylim = c(-5, 5), main = "Zoomed In")
lines(x, lm1$fitted, col = "blue", lwd = 3)
lines(x, rlm1$fitted, col = "green", lwd = 3)

plot of chunk unnamed-chunk-30

par(oldpar)

4.1.6 Model checking - default plots

set.seed(343)
oldpar = par(mfrow = c(1, 2))
x <- seq(0, 3, length = 100)
y <- rnorm(100)
lm1 <- lm(y ~ x)
plot(lm1)

plot of chunk unnamed-chunk-31

par(oldpar)

4..1.7 Model checking - deviance

Commonly reported for GLM's
Usually compares the model where every point gets its own parameter to the model you are using
On it's own it doesn't tell you what is wrong
In large samples the deviance may be big even for "conservative" models
You can not compare deviances for models with different sample sizes

R2 may be a bad summary

4.2 Model selection

·Many times you have multiple variables to evaluate
·Options for choosing variables
   -Domain-specific knowledge
   -Exploratory analysis
   -Statistical selection
·There are many statistical selection options
   -Step-wise
   -AIC
   -BIC
   -Modern approaches: Lasso, Ridge-Regression, etc.
·Statistical selection may bias your inference
   -If possible, do selection on a held out sample

Error measures
R2 alone isn't enough - more variables = bigger R2
Adjusted R2 is taking into account the number of estimated parameters
AIC also penalizes models with more parameters
BIC does the same, but with a bigger penalty

EXAMPLE

require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/movies03RT.txt", 
    ssl.verifypeer = FALSE)
movies <- read.csv(textConnection(myCsv), sep = "\t", header = T, quote = "")
head(movies)

##                  X score rating            genre box.office running.time
## 1 2 Fast 2 Furious  48.9  PG-13 action/adventure     127.15          107
## 2    28 Days Later  78.2      R           horror      45.06          113
## 3      A Guy Thing  39.5  PG-13       rom comedy      15.54          101
## 4      A Man Apart  42.9      R action/adventure      26.25          110
## 5    A Mighty Wind  79.9  PG-13           comedy      17.78           91
## 6 Agent Cody Banks  57.9     PG action/adventure      47.81          102

4.2.1 Model selection - step

movies <- movies[, -1]
lm1 <- lm(score ~ ., data = movies)
aicFormula <- step(lm1)

## Start:  AIC=727.5
## score ~ rating + genre + box.office + running.time
## 
##                Df Sum of Sq   RSS AIC
## - genre        12      2575 22132 721
## - rating        3        40 19596 722
## - running.time  1       237 19793 727
## <none>                      19556 728
## - box.office    1      3007 22563 746
## 
## Step:  AIC=720.8
## score ~ rating + box.office + running.time
## 
##                Df Sum of Sq   RSS AIC
## - rating        3       491 22623 718
## <none>                      22132 721
## - running.time  1      1192 23324 726
## - box.office    1      2456 24588 734
## 
## Step:  AIC=717.9
## score ~ box.office + running.time
## 
##                Df Sum of Sq   RSS AIC
## <none>                      22623 718
## - running.time  1       935 23557 722
## - box.office    1      3337 25959 735

aicFormula

## 
## Call:
## lm(formula = score ~ box.office + running.time, data = movies)
## 
## Coefficients:
##  (Intercept)    box.office  running.time  
##      37.2364        0.0824        0.1275

4.2.2 Model selection - regsubsets

library(leaps)

## Warning: package 'leaps' was built under R version 3.0.1

regSub <- regsubsets(score ~ ., data = movies)
plot(regSub)

plot of chunk unnamed-chunk-35

4.2.3 Model selection - bic.glm

library(BMA)

## Warning: package 'BMA' was built under R version 3.0.1

## Loading required package: survival

## Loading required package: splines

bicglm1 <- bic.glm(score ~ ., data = movies, glm.family = "gaussian")
print(bicglm1)

## 
## Call:
## bic.glm.formula(f = score ~ ., data = movies, glm.family = "gaussian")
## 
## 
##  Posterior probabilities(%): 
##  <NA>  <NA>  <NA>  <NA> 
##   0.0 100.0 100.0  18.2 
## 
##  Coefficient posterior expected values: 
##           (Intercept)               ratingPG            ratingPG-13  
##                45.263                  0.000                  0.000  
##               ratingR  genreaction/adventure          genreanimated  
##                 0.000                 -0.120                  7.628  
##           genrecomedy       genredocumentary             genredrama  
##                 2.077                  8.642                 13.041  
##          genrefantasy            genrehorror           genremusical  
##                 1.504                 -3.458                -12.255  
##       genrerom comedy            genresci-fi          genresuspense  
##                 1.244                 -3.324                  3.815  
##          genrewestern             box.office           running.time  
##                17.563                  0.100                  0.016