Simple Linear Regression Example:

data(iris)
fit <- lm(Petal.Width ~ Petal.Length, data=iris)
class(fit)

## [1] "lm"

methods(class=class(fit))

##  [1] add1.lm*           alias.lm*          anova.lm*         
##  [4] case.names.lm*     confint.lm         cooks.distance.lm*
##  [7] deviance.lm*       dfbeta.lm*         dfbetas.lm*       
## [10] drop1.lm*          dummy.coef.lm      effects.lm*       
## [13] extractAIC.lm*     family.lm*         formula.lm*       
## [16] hatvalues.lm*      influence.lm*      kappa.lm          
## [19] labels.lm*         logLik.lm*         model.frame.lm*   
## [22] model.matrix.lm    nobs.lm*           plot.lm*          
## [25] predict.lm         print.lm*          proj.lm*          
## [28] qr.lm*             residuals.lm       rstandard.lm*     
## [31] rstudent.lm*       simulate.lm*       summary.lm        
## [34] variable.names.lm* vcov.lm*          
## 
##    Non-visible functions are asterisked

summary(fit) # show results

## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = iris)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.565 -0.124 -0.019  0.133  0.643 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.36308    0.03976   -9.13  4.7e-16 ***
## Petal.Length  0.41576    0.00958   43.39  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.206 on 148 degrees of freedom
## Multiple R-squared:  0.927,  Adjusted R-squared:  0.927 
## F-statistic: 1.88e+03 on 1 and 148 DF,  p-value: <2e-16

Other useful functions

coefficients(fit) # model coefficients
predict(fit) # fitted predictions
predict(fit, newdata=data.frame(Petal.Length=seq(1, 2, by=0.1)))
confint(fit, level=0.95) # CIs for model parameters 
fitted(fit) # predicted values
residuals(fit) # residuals
anova(fit) # anova table 
influence(fit) # regression diagnostics

Diagnostic plots:

par(mfrow=c(2,2))
plot(fit)

plot of chunk unnamed-chunk-3

Multiple regression:

fit2 <- lm(Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, data=iris)
summary(fit2) # show results

## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, 
##     data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6096 -0.1013 -0.0109  0.0983  0.6069 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.2403     0.1784   -1.35     0.18    
## Petal.Length   0.5241     0.0245   21.40  < 2e-16 ***
## Sepal.Length  -0.2073     0.0475   -4.36  2.4e-05 ***
## Sepal.Width    0.2228     0.0489    4.55  1.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.192 on 146 degrees of freedom
## Multiple R-squared:  0.938,  Adjusted R-squared:  0.937 
## F-statistic:  734 on 3 and 146 DF,  p-value: <2e-16

anova(fit, fit2)

## Analysis of Variance Table
## 
## Model 1: Petal.Width ~ Petal.Length
## Model 2: Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width
##   Res.Df  RSS Df Sum of Sq    F  Pr(>F)    
## 1    148 6.31                              
## 2    146 5.38  2      0.93 12.6 8.8e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interaction terms:

fit2int <- lm(Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width + Petal.Length:Sepal.Length, data=iris)
anova(fit2, fit2int)

## Analysis of Variance Table
## 
## Model 1: Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width
## Model 2: Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width + Petal.Length:Sepal.Length
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1    146 5.38                         
## 2    145 5.29  1    0.0934 2.56   0.11

“Model matrices”

tail(model.matrix(~ Species, data=iris))

##     (Intercept) Speciesversicolor Speciesvirginica
## 145           1                 0                1
## 146           1                 0                1
## 147           1                 0                1
## 148           1                 0                1
## 149           1                 0                1
## 150           1                 0                1

tail(iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica

Analysis of covariance:

fit3 <- lm(Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width + Species, data=iris)
summary(fit3)

## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width + 
##     Species, data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5924 -0.0829 -0.0135  0.0877  0.4524 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.4731     0.1766   -2.68   0.0082 ** 
## Petal.Length        0.2422     0.0488    4.96  2.0e-06 ***
## Sepal.Length       -0.0929     0.0446   -2.08   0.0389 *  
## Sepal.Width         0.2422     0.0478    5.07  1.2e-06 ***
## Speciesversicolor   0.6481     0.1231    5.26  5.0e-07 ***
## Speciesvirginica    1.0464     0.1655    6.32  3.0e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.167 on 144 degrees of freedom
## Multiple R-squared:  0.954,  Adjusted R-squared:  0.952 
## F-statistic:  595 on 5 and 144 DF,  p-value: <2e-16

anova(fit2, fit3)

## Analysis of Variance Table
## 
## Model 1: Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width
## Model 2: Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width + Species
##   Res.Df  RSS Df Sum of Sq    F  Pr(>F)    
## 1    146 5.38                              
## 2    144 4.00  2      1.38 24.9 5.1e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Make nice tables - see http://cran.r-project.org/web/packages/xtable/vignettes/xtableGallery.pdf.

library(xtable)
print(xtable(fit3), type="html")

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-0.4731	0.1766	-2.68	0.0082
Petal.Length	0.2422	0.0488	4.96	0.0000
Sepal.Length	-0.0929	0.0446	-2.08	0.0389
Sepal.Width	0.2422	0.0478	5.07	0.0000
Speciesversicolor	0.6481	0.1231	5.26	0.0000
Speciesvirginica	1.0464	0.1655	6.32	0.0000

print(xtable(anova(fit3)), type="html")

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
Petal.Length	1	80.26	80.26	2891.11	0.0000
Sepal.Length	1	0.17	0.17	5.97	0.0157
Sepal.Width	1	0.76	0.76	27.52	0.0000
Species	2	1.38	0.69	24.90	0.0000
Residuals	144	4.00	0.03

Logistic regression:

iris2 <- iris
iris2$virginica <- iris$Species == "virginica"
fit4 <- glm(virginica ~ Petal.Width + Petal.Length + Sepal.Length + Sepal.Width, data=iris2, family=binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

plot(fit4)

plot of chunk unnamed-chunk-9

Poisson regression (obviously a bad choice here):

fit5 <- glm(round(Petal.Width) ~ Petal.Length + Sepal.Length + Sepal.Width, data=iris2, family=poisson)
plot(fit5)

plot of chunk unnamed-chunk-10

summary(fit5)

## 
## Call:
## glm(formula = round(Petal.Width) ~ Petal.Length + Sepal.Length + 
##     Sepal.Width, family = poisson, data = iris2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2915  -0.4941  -0.0927   0.1675   1.4507  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.0869     0.8591   -0.10    0.919    
## Petal.Length   0.7914     0.1341    5.90  3.6e-09 ***
## Sepal.Length  -0.4154     0.2081   -2.00    0.046 *  
## Sepal.Width   -0.2873     0.2451   -1.17    0.241    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 145.993  on 149  degrees of freedom
## Residual deviance:  32.509  on 146  degrees of freedom
## AIC: 281.8
## 
## Number of Fisher Scoring iterations: 4

Generalized Estimating Equations??

library(gee)
gee(formula, id, data, ...)

Exercise 1

Plotting a linear regression with confidence and prediction intervals.

plot(Petal.Width ~ Petal.Length, col=c("black", "red", "blue")[Species], pch=(15:17)[Species], xlab="Petal Length (cm)", ylab="Petal Width (cm)", data=iris)
newx <- data.frame(Petal.Length=seq(min(iris$Petal.Length), max(iris$Petal.Length), length.out=100))
conf.interval <- predict(fit, newdata=newx, interval="confidence")
pred.interval <- predict(fit, newdata=newx, interval="prediction")
lines(conf.interval[, "fit"] ~ newx[, 1], lty=1, lw=3)
lines(conf.interval[, "lwr"] ~ newx[, 1], lty=2)
lines(conf.interval[, "upr"] ~ newx[, 1], lty=2)
lines(pred.interval[, "lwr"] ~ newx[, 1], lty=3)
lines(pred.interval[, "upr"] ~ newx[, 1], lty=3)
legend("topleft", legend=c(levels(iris$Species), "CI", "PI"), col=c("black", "red", "blue", "black", "black"), pch=c(15:17, -1, -1), lty=c(-1, -1, -1, 2, 3))

plot of chunk unnamed-chunk-12

Exercise 2:

Regression on the GSE12945 dataset. Is there an association between WIPF1 and any of the other variables? Note: this exercise is meant to do interactively with the class, otherwise the code will look too advanced for beginners.

download.file("https://www.dropbox.com/sh/pukanjaahmonmcp/AADWX-vKk70CuGgYWBqqxWjfa/datasets/GSE12945.csv", destfile="GSE12945.csv", method="wget")
gse <- read.csv("GSE12945.csv", row.names=1)  #note that row.names is needed in this case to make the first column row names.
sapply(gse, class) #make sure classes are appropriate

##                         primarysite                                   G 
##                            "factor"                           "integer" 
##                                   T                                   N 
##                           "integer"                           "integer" 
##                                   M age_at_initial_pathologic_diagnosis 
##                           "integer"                           "integer" 
##                            location                              gender 
##                            "factor"                            "factor" 
##                   lymphnodesremoved                  four.year.survivor 
##                           "integer"                            "factor" 
##                               WIPF1                               WIPF2 
##                           "numeric"                           "numeric" 
##                                IGF2 
##                           "numeric"

G, T, N, M should be ordered factors:

gse$G <- factor(gse$G, ordered=TRUE)
gse$T <- factor(gse$T, ordered=TRUE)
gse$N <- factor(gse$N, ordered=TRUE)
gse$M <- factor(gse$M, ordered=TRUE)

This is one way to select all columns except for WIPF1:

(vars.to.test <- colnames(gse)[!colnames(gse) %in% "WIPF1"])

##  [1] "primarysite"                        
##  [2] "G"                                  
##  [3] "T"                                  
##  [4] "N"                                  
##  [5] "M"                                  
##  [6] "age_at_initial_pathologic_diagnosis"
##  [7] "location"                           
##  [8] "gender"                             
##  [9] "lymphnodesremoved"                  
## [10] "four.year.survivor"                 
## [11] "WIPF2"                              
## [12] "IGF2"

all.lm <- lapply(vars.to.test, function(x){
  lm(as.formula(paste("WIPF1 ~", x)), data=gse)
})
names(all.lm) <- vars.to.test
all.p <- sapply(all.lm, function(mylm){
  fstat <- summary(mylm)$fstatistic
  1-pf(fstat[1], fstat[2], fstat[3])
})
t(all.p)

##      primarysite.value G.value T.value N.value M.value
## [1,]            0.5432  0.7705  0.6494  0.6691  0.4263
##      age_at_initial_pathologic_diagnosis.value location.value gender.value
## [1,]                                    0.1247         0.3882      0.04758
##      lymphnodesremoved.value four.year.survivor.value WIPF2.value
## [1,]                 0.06792                   0.3243      0.5729
##      IGF2.value
## [1,]     0.9386

hist(all.p)

plot of chunk unnamed-chunk-16

It doesn't look like it!