Created on 8 Aug 2013
Revised on Thu Aug 08 16:28:26 2013
Key ideas
(1) Outcome is still quantitative
(2) You have multiple explanatory variables
(3) Goal is to identify contributions of different variables
require(RCurl)
## Loading required package: RCurl
## Loading required package: bitops
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/movies03RT.txt",
ssl.verifypeer = FALSE)
movies <- read.csv(textConnection(myCsv), sep = "\t", header = T, quote = "")
head(movies)
## X score rating genre box.office running.time
## 1 2 Fast 2 Furious 48.9 PG-13 action/adventure 127.15 107
## 2 28 Days Later 78.2 R horror 45.06 113
## 3 A Guy Thing 39.5 PG-13 rom comedy 15.54 101
## 4 A Man Apart 42.9 R action/adventure 26.25 110
## 5 A Mighty Wind 79.9 PG-13 comedy 17.78 91
## 6 Agent Cody Banks 57.9 PG action/adventure 47.81 102
1.1 Relating score to rating
Si = b0 + b1!(Rai =" PG ") + b2!(Rai =" PG − 13 ") + b3!(Rai =" R ") + ei
The notation !(Rai =" PG ") is a logical value that is one if the movie rating is "PG" and zero
otherwise.
Average values
b0 = average of the G movies
b0+b1 = average of the PG movies
b0+b2 = average of the PG-13 movies
b0+b3 = average of the R movies
ANOVA in R
aovObject <- aov(movies$score ~ movies$rating)
aovObject
## Call:
## aov(formula = movies$score ~ movies$rating)
##
## Terms:
## movies$rating Residuals
## Sum of Squares 570 28149
## Deg. of Freedom 3 136
##
## Residual standard error: 14.39
## Estimated effects may be unbalanced
aovObject$coeff
## (Intercept) movies$ratingPG movies$ratingPG-13
## 67.65 -12.59 -11.81
## movies$ratingR
## -12.02
1.2 Adding a second factor
Si = b0 + b1!(Rai =" PG ") + b2!(Rai =" PG − 13 ") + b3!(Rai =" R ") + γ1!(Gi =" action ") + γ2!(Gi =" animated ")+...+ ei
The notation !(Rai =" PG ") is a logical value that is one if the movie rating is "PG" and zero
otherwise.
aovObject2 <- aov(movies$score ~ movies$rating + movies$genre)
aovObject2
## Call:
## aov(formula = movies$score ~ movies$rating + movies$genre)
##
## Terms:
## movies$rating movies$genre Residuals
## Sum of Squares 570 3935 24214
## Deg. of Freedom 3 12 124
##
## Residual standard error: 13.97
## Estimated effects may be unbalanced
summary(aovObject2)
## Df Sum Sq Mean Sq F value Pr(>F)
## movies$rating 3 570 190 0.97 0.408
## movies$genre 12 3935 328 1.68 0.079 .
## Residuals 124 24214 195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1.3 Order matters
aovObject3 <- aov(movies$score ~ movies$genre + movies$rating)
summary(aovObject3)
## Df Sum Sq Mean Sq F value Pr(>F)
## movies$genre 12 4222 352 1.80 0.055 .
## movies$rating 3 284 95 0.48 0.694
## Residuals 124 24214 195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1.4 Adding a quantitative variabl
aovObject4 <- aov(movies$score ~ movies$genre + movies$rating + movies$box.office)
summary(aovObject4)
## Df Sum Sq Mean Sq F value Pr(>F)
## movies$genre 12 4222 352 2.19 0.016 *
## movies$rating 3 284 95 0.59 0.624
## movies$box.office 1 4421 4421 27.47 6.7e-07 ***
## Residuals 123 19793 161
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Key ideas
·Frequently we care about outcomes that have two values
-Alive/dead
-Win/loss
-Success/Failure
-etc
·Called binary outcomes or 0/1 outcomes
·Linear regression (like we've seen) may not be the best
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/ravensData.csv",
ssl.verifypeer = FALSE)
ravensData <- read.csv(textConnection(myCsv))
2.1 Linear regression
RWi = b0 + b1RSi + ei
RWi - 1 if a Ravens win, 0 if not
RSi - Number of points Ravens scored
b0 - probability of a Ravens win if they score 0 points
b1 - increase in probability of a Ravens win for each additional point
ei - variation due to everything we didn't measure
lmRavens <- lm(ravensData$ravenWinNum ~ ravensData$ravenScore)
summary(lmRavens)
##
## Call:
## lm(formula = ravensData$ravenWinNum ~ ravensData$ravenScore)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.730 -0.508 0.182 0.322 0.572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28503 0.25664 1.11 0.281
## ravensData$ravenScore 0.01590 0.00906 1.76 0.096 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.446 on 18 degrees of freedom
## Multiple R-squared: 0.146, Adjusted R-squared: 0.0987
## F-statistic: 3.08 on 1 and 18 DF, p-value: 0.0963
plot(ravensData$ravenScore, lmRavens$fitted, pch = 19, col = "blue", ylab = "Prob Win",
xlab = "Raven Score")
2.2 Ravens logistic regression
logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore, family = "binomial")
summary(logRegRavens)
##
## Call:
## glm(formula = ravensData$ravenWinNum ~ ravensData$ravenScore,
## family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.758 -1.100 0.530 0.806 1.495
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6800 1.5541 -1.08 0.28
## ravensData$ravenScore 0.1066 0.0667 1.60 0.11
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.435 on 19 degrees of freedom
## Residual deviance: 20.895 on 18 degrees of freedom
## AIC: 24.89
##
## Number of Fisher Scoring iterations: 5
plot(ravensData$ravenScore, logRegRavens$fitted, pch = 19, col = "blue", xlab = "Score",
ylab = "Prob Ravens Win")
Odds ratios and confidence intervals
exp(logRegRavens$coeff)
## (Intercept) ravensData$ravenScore
## 0.1864 1.1125
exp(confint(logRegRavens))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.005675 3.106
## ravensData$ravenScore 0.996230 1.303
ANOVA for logistic regression
anova(logRegRavens, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: ravensData$ravenWinNum
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 24.4
## ravensData$ravenScore 1 3.54 18 20.9 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Key ideas
·Many data take the form of counts
-Calls to a call center
-Number of flu cases in an area
-Number of cars that cross a bridge
·Data may also be in the form of rates
-Percent of children passing a test
-Percent of hits to a website from a country
·Linear regression with transformation is an option
3.1 Poisson distribution
set.seed(3433)
oldpar = par(mfrow = c(1, 2))
poisData2 <- rpois(100, lambda = 100)
poisData1 <- rpois(100, lambda = 50)
hist(poisData1, col = "blue", xlim = c(0, 150))
hist(poisData2, col = "blue", xlim = c(0, 150))
par(oldpar)
c(mean(poisData1), var(poisData1))
## [1] 49.85 49.38
c(mean(poisData2), var(poisData2))
## [1] 100.12 95.26
3.2 Example: Leek Group Website Traffic
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/gaData.csv", ssl.verifypeer = FALSE)
gaData <- read.csv(textConnection(myCsv))
gaData$julian <- julian(as.Date(gaData$date))
head(gaData)
## date visits simplystats julian
## 1 2011/1/1 0 0 14975
## 2 2011/1/2 0 0 14976
## 3 2011/1/3 0 0 14977
## 4 2011/1/4 0 0 14978
## 5 2011/1/5 0 0 14979
## 6 2011/1/6 0 0 14980
plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian",
ylab = "Visits")
3.2.1 Linear regression
NHi = b0 + b1JDi + ei
NHi - number of hits to the website
JDi - day of the year (Julian day)
b0 - number of hits on Julian day 0 (1970-01-01)
b1 - increase in number of hits per unit day
ei - variation due to everything we didn't measure
plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian",
ylab = "Visits")
lm1 <- lm(gaData$visits ~ gaData$julian)
abline(lm1, col = "red", lwd = 3)
3.2.2 Linear vs. Poisson regression
linear:
NHi = b0 + b1JDi + ei
or
E[NHi|JDi, b0, b1 ] = b0 + b1JDi
Poisson:
log(E[NHi|JDi, b0, b1]) = b0 + b1JDi
or
E[NHi|JDi, b0, b1 ] = exp(b0 + b1JDi)
3.2.3 Poisson regression in R
plot(gaData$julian, gaData$visits, pch = 19, col = "darkgrey", xlab = "Julian",
ylab = "Visits")
glm1 <- glm(gaData$visits ~ gaData$julian, family = "poisson")
abline(lm1, col = "red", lwd = 3)
lines(gaData$julian, glm1$fitted, col = "blue", lwd = 3)
Mean-variance relationship?
plot(glm1$fitted, glm1$residuals, pch = 19, col = "grey", ylab = "Residuals",
xlab = "Date")
Model agnostic standard errors
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.0.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.0.1
## Attaching package: 'zoo'
## 下列对象被屏蔽了from 'package:base':
##
## as.Date, as.Date.numeric
confint.agnostic <- function(object, parm, level = 0.95, ...) {
cf <- coef(object)
pnames <- names(cf)
if (missing(parm))
parm <- pnames else if (is.numeric(parm))
parm <- pnames[parm]
a <- (1 - level)/2
a <- c(a, 1 - a)
pct <- stats:::format.perc(a, 3)
fac <- qnorm(a)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, pct))
ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
ci
}
confint(glm1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -34.34658 -31.159716
## gaData$julian 0.00219 0.002396
confint.agnostic(glm1)
## 2.5 % 97.5 %
## (Intercept) -36.362675 -29.136997
## gaData$julian 0.002058 0.002528
3.2.4 Fitting rates in R
E[NHSSi|JDi, b0, b1 ]/NHi = exp(b0 + b1JDi)
log(E[NHSSi |JDi, b0, b1]) − log(NHi) = b0 + b1JDi
log(E[NHSSi |JDi, b0, b1]) = log(NHi) + b0 + b1JDi
glm2 <- glm(gaData$simplystats ~ julian(as.Date(gaData$date)), offset = log(visits +
1), family = "poisson", data = gaData)
plot(julian(as.Date(gaData$date)), glm2$fitted, col = "blue", pch = 19, xlab = "Date",
ylab = "Fitted Counts")
points(julian(as.Date(gaData$date)), glm1$fitted, col = "red", pch = 19)
glm2 <- glm(gaData$simplystats ~ julian(as.Date(gaData$date)), offset = log(visits +
1), family = "poisson", data = gaData)
plot(julian(as.Date(gaData$date)), gaData$simplystats/(gaData$visits + 1), col = "grey",
xlab = "Date", ylab = "Fitted Rates", pch = 19)
lines(julian(as.Date(gaData$date)), glm2$fitted/(gaData$visits + 1), col = "blue",
lwd = 3)
Sometimes model checking/selection not allowed
Often it can lead to problems
- Overfitting
- Overtesting
- Biased inference
But you don't want to miss something obvious
4.1 Linear regression - basic assumptions
-Variance is constant
-You are summarizing a linear trend
-You have all the right terms in the model
-There are no big outliers
4.1.1 Model checking - constant variance
set.seed(3433)
oldpar = par(mfrow = c(1, 2))
data <- rnorm(100, mean = seq(0, 3, length = 100), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = "grey")
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = "grey")
abline(c(0, 0), col = "red", lwd = 3)
par(oldpar)
What to do?
- See if another variable explains the increased variance
- Use the vcovHC {sandwich} variance estimators (if n is big)
Using the sandwich estimate
set.seed(3433)
data <- rnorm(100, mean = seq(0, 3, length = 100), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
vcovHC(lm1)
## (Intercept) seq(0, 3, length = 100)
## (Intercept) 0.05410 -0.04765
## seq(0, 3, length = 100) -0.04765 0.05369
summary(lm1)$cov.unscaled
## (Intercept) seq(0, 3, length = 100)
## (Intercept) 0.03941 -0.01960
## seq(0, 3, length = 100) -0.01960 0.01307
4.1.2 Model checking - linear trend
set.seed(3433)
oldpar = par(mfrow = c(1, 2))
data <- rnorm(100, mean = seq(0, 3, length = 100)^3, sd = 2)
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = "grey")
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = "grey")
abline(c(0, 0), col = "red", lwd = 3)
par(oldpar)
What to do?
Use Poisson regression (if it looks exponential/multiplicative)
Use a data transformation (e.g. take the log)
Smooth the data/fit a nonlinear trend
Use linear regression anyway
-Interpret as the linear trend between the variables
-Use the vcovHC {sandwich} variance estimators (if n is big)
4.1.3 Model checking - missing covariate
set.seed(3433)
oldpar = par(mfrow = c(1, 3))
z <- rep(c(-0.5, 0.5), 50)
data <- rnorm(100, mean = (seq(0, 3, length = 100) + z), sd = seq(0.1, 3, length = 100))
lm1 <- lm(data ~ seq(0, 3, length = 100))
plot(seq(0, 3, length = 100), data, pch = 19, col = ((z > 0) + 3))
abline(lm1, col = "red", lwd = 3)
plot(seq(0, 3, length = 100), lm1$residuals, pch = 19, col = ((z > 0) + 3))
abline(c(0, 0), col = "red", lwd = 3)
boxplot(lm1$residuals ~ z, col = ((z > 0) + 3))
par(oldpar)
What to do?
Use exploratory analysis to identify other variables to include
Use the vcovHC {sandwich} variance estimators (if n is big)
Report unexplained patterns in the data
4.1.4 Model checking - outliers
set.seed(343)
oldpar=par(mfrow=c(1,2))
betahat <- rep(NA,100)
x <- seq(0,3,length=100)
y <- rcauchy(100)
lm1 <- lm(y ~ x)
plot(x,y,pch=19,col="blue")
abline(lm1,col="red",lwd=3)
for(i in 1:length(data)){betahat[i] <- lm(y[-i] ~ x[-i])$coeff[2]}
plot(betahat - lm1$coeff[2],col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)
par(oldpar)
What to do?
If outliers are experimental mistakes -remove and document them
If they are real - consider reporting how sensitive your estimate is to the outliers
Consider using a robust linear model fit like rlm {MASS}
4.1.5 Robust linear modeling
library(MASS)
set.seed(343)
x <- seq(0, 3, length = 100)
y <- rcauchy(100)
lm1 <- lm(y ~ x)
rlm1 <- rlm(y ~ x)
lm1$coeff
## (Intercept) x
## 0.3523 -0.4011
rlm1$coeff
## (Intercept) x
## 0.008527 -0.017892
oldpar = par(mfrow = c(1, 2))
plot(x, y, pch = 19, col = "grey")
lines(x, lm1$fitted, col = "blue", lwd = 3)
lines(x, rlm1$fitted, col = "green", lwd = 3)
plot(x, y, pch = 19, col = "grey", ylim = c(-5, 5), main = "Zoomed In")
lines(x, lm1$fitted, col = "blue", lwd = 3)
lines(x, rlm1$fitted, col = "green", lwd = 3)
par(oldpar)
4.1.6 Model checking - default plots
set.seed(343)
oldpar = par(mfrow = c(1, 2))
x <- seq(0, 3, length = 100)
y <- rnorm(100)
lm1 <- lm(y ~ x)
plot(lm1)
par(oldpar)
4..1.7 Model checking - deviance
Commonly reported for GLM's
Usually compares the model where every point gets its own parameter to the model you are using
On it's own it doesn't tell you what is wrong
In large samples the deviance may be big even for "conservative" models
You can not compare deviances for models with different sample sizes
R2 may be a bad summary
4.2 Model selection
·Many times you have multiple variables to evaluate
·Options for choosing variables
-Domain-specific knowledge
-Exploratory analysis
-Statistical selection
·There are many statistical selection options
-Step-wise
-AIC
-BIC
-Modern approaches: Lasso, Ridge-Regression, etc.
·Statistical selection may bias your inference
-If possible, do selection on a held out sample
Error measures
R2 alone isn't enough - more variables = bigger R2
Adjusted R2 is taking into account the number of estimated parameters
AIC also penalizes models with more parameters
BIC does the same, but with a bigger penalty
EXAMPLE
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/movies03RT.txt",
ssl.verifypeer = FALSE)
movies <- read.csv(textConnection(myCsv), sep = "\t", header = T, quote = "")
head(movies)
## X score rating genre box.office running.time
## 1 2 Fast 2 Furious 48.9 PG-13 action/adventure 127.15 107
## 2 28 Days Later 78.2 R horror 45.06 113
## 3 A Guy Thing 39.5 PG-13 rom comedy 15.54 101
## 4 A Man Apart 42.9 R action/adventure 26.25 110
## 5 A Mighty Wind 79.9 PG-13 comedy 17.78 91
## 6 Agent Cody Banks 57.9 PG action/adventure 47.81 102
4.2.1 Model selection - step
movies <- movies[, -1]
lm1 <- lm(score ~ ., data = movies)
aicFormula <- step(lm1)
## Start: AIC=727.5
## score ~ rating + genre + box.office + running.time
##
## Df Sum of Sq RSS AIC
## - genre 12 2575 22132 721
## - rating 3 40 19596 722
## - running.time 1 237 19793 727
## <none> 19556 728
## - box.office 1 3007 22563 746
##
## Step: AIC=720.8
## score ~ rating + box.office + running.time
##
## Df Sum of Sq RSS AIC
## - rating 3 491 22623 718
## <none> 22132 721
## - running.time 1 1192 23324 726
## - box.office 1 2456 24588 734
##
## Step: AIC=717.9
## score ~ box.office + running.time
##
## Df Sum of Sq RSS AIC
## <none> 22623 718
## - running.time 1 935 23557 722
## - box.office 1 3337 25959 735
aicFormula
##
## Call:
## lm(formula = score ~ box.office + running.time, data = movies)
##
## Coefficients:
## (Intercept) box.office running.time
## 37.2364 0.0824 0.1275
4.2.2 Model selection - regsubsets
library(leaps)
## Warning: package 'leaps' was built under R version 3.0.1
regSub <- regsubsets(score ~ ., data = movies)
plot(regSub)
4.2.3 Model selection - bic.glm
library(BMA)
## Warning: package 'BMA' was built under R version 3.0.1
## Loading required package: survival
## Loading required package: splines
bicglm1 <- bic.glm(score ~ ., data = movies, glm.family = "gaussian")
print(bicglm1)
##
## Call:
## bic.glm.formula(f = score ~ ., data = movies, glm.family = "gaussian")
##
##
## Posterior probabilities(%):
## <NA> <NA> <NA> <NA>
## 0.0 100.0 100.0 18.2
##
## Coefficient posterior expected values:
## (Intercept) ratingPG ratingPG-13
## 45.263 0.000 0.000
## ratingR genreaction/adventure genreanimated
## 0.000 -0.120 7.628
## genrecomedy genredocumentary genredrama
## 2.077 8.642 13.041
## genrefantasy genrehorror genremusical
## 1.504 -3.458 -12.255
## genrerom comedy genresci-fi genresuspense
## 1.244 -3.324 3.815
## genrewestern box.office running.time
## 17.563 0.100 0.016