Introduction to R. Session 02.

Set working directory

# setwd("x:/path/to/your/folder")
setwd("/home/user/mydata")

Load the iris data set

data(iris)

Adding a variable to the data

Recoding exisiting data into a new element in the dataframe

# example with an ifelse statement
iris$my.factor <- as.factor(
                        ifelse(iris$Petal.Width > 1.5,
                        "Large Petal",
                        "Small Petal"))

Statistical test - anova

my.anova <- aov(Sepal.Length ~ Species * my.factor, data = iris)

# the "data" option is for convenience. Of course you can use the statement below as well.
my.anova <- aov(iris$Sepal.Length ~ iris$Species * iris$my.factor)

# a standard anova result table is given via "summary"
summary(my.anova)
##                              Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species                  2   63.2   31.61  120.71 <2e-16 ***
## iris$my.factor                1    0.9    0.91    3.48  0.064 .  
## iris$Species:iris$my.factor   1    0.1    0.08    0.30  0.585    
## Residuals                   145   38.0    0.26                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If there is no need to check for interactions between two factors, use "+" instead of "*"
my.anova <- aov(Sepal.Length ~ Species + my.factor, data = iris)
summary(my.anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Species       2   63.2   31.61   121.3 <2e-16 ***
## my.factor     1    0.9    0.91     3.5  0.063 .  
## Residuals   146   38.0    0.26                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Split-plot design

example taken from Aho (2014) Foundational and applied statistics for biologists using R. CRC Press. Page 474 ff.

Data are available from the package “asbio: A collection of statistical tools for biologists”

package installation via (only needed when it is not installed yet):

install.packages("asbio")
library(asbio)
## Loading required package: tcltk
# load split-plot data
data(alfalfa.split.plot)

# the usual overview on data
summary(alfalfa.split.plot)
##      yield      variety cut.time  block 
##  Min.   :0.88   C:24    None:18   1:12  
##  1st Qu.:1.31   L:24    O7  :18   2:12  
##  Median :1.58   R:24    S1  :18   3:12  
##  Mean   :1.60           S20 :18   4:12  
##  3rd Qu.:1.82                     5:12  
##  Max.   :2.34                     6:12
str(alfalfa.split.plot)
## 'data.frame':    72 obs. of  4 variables:
##  $ yield   : num  2.17 1.58 2.29 2.23 2.33 1.38 1.86 2.27 1.75 1.52 ...
##  $ variety : Factor w/ 3 levels "C","L","R": 2 2 2 2 1 1 1 1 3 3 ...
##  $ cut.time: Factor w/ 4 levels "None","O7","S1",..: 1 3 4 2 1 3 4 2 1 3 ...
##  $ block   : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
head(alfalfa.split.plot)
##   yield variety cut.time block
## 1  2.17       L     None     1
## 2  1.58       L       S1     1
## 3  2.29       L      S20     1
## 4  2.23       L       O7     1
## 5  2.33       C     None     1
## 6  1.38       C       S1     1
# quick overview on the number of samples for subsets of the data
# frequency table
table(alfalfa.split.plot$variety, alfalfa.split.plot$cut.time)
##    
##     None O7 S1 S20
##   C    6  6  6   6
##   L    6  6  6   6
##   R    6  6  6   6
# to make it easier to type
with(alfalfa.split.plot, table(variety, cut.time))
##        cut.time
## variety None O7 S1 S20
##       C    6  6  6   6
##       L    6  6  6   6
##       R    6  6  6   6
with(alfalfa.split.plot, table(block, variety, cut.time))
## , , cut.time = None
## 
##      variety
## block C L R
##     1 1 1 1
##     2 1 1 1
##     3 1 1 1
##     4 1 1 1
##     5 1 1 1
##     6 1 1 1
## 
## , , cut.time = O7
## 
##      variety
## block C L R
##     1 1 1 1
##     2 1 1 1
##     3 1 1 1
##     4 1 1 1
##     5 1 1 1
##     6 1 1 1
## 
## , , cut.time = S1
## 
##      variety
## block C L R
##     1 1 1 1
##     2 1 1 1
##     3 1 1 1
##     4 1 1 1
##     5 1 1 1
##     6 1 1 1
## 
## , , cut.time = S20
## 
##      variety
## block C L R
##     1 1 1 1
##     2 1 1 1
##     3 1 1 1
##     4 1 1 1
##     5 1 1 1
##     6 1 1 1
# From the description of the experiment in the book
# "The agricultural blocks of this experiment where grouped into six blocks, each with three plots. One of three varieties were randomly assigned to each plot in each block (see figure of the design). Each plot was then subdivided into split plot. one spit plot for each cutting date.

my.aov <- aov(yield ~ variety * cut.time + Error(block/variety),
              data = alfalfa.split.plot)
summary(my.aov)
## 
## Error: block
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  5   4.15    0.83               
## 
## Error: block:variety
##           Df Sum Sq Mean Sq F value Pr(>F)
## variety    2  0.178   0.089    0.65   0.54
## Residuals 10  1.362   0.136               
## 
## Error: Within
##                  Df Sum Sq Mean Sq F value  Pr(>F)    
## cut.time          3  1.962   0.654   23.39 2.8e-09 ***
## variety:cut.time  6  0.211   0.035    1.25     0.3    
## Residuals        45  1.259   0.028                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

How to test several parameters from a data frame in one go

i.e. how to automate data analysis?

# create a function that does the anova for our specific example
my.aov <- function(x) {
          out.aov <- summary(aov(x ~ iris$Species * iris$my.factor))
}
# this function can be improved later so that any data frame can be used, not just iris

# then we apply the function to the data
# using lapply (the columns that we want to test are given to lapply as a list)
my.aov.res <- lapply(iris[, 1:4], function(x) my.aov(x))

# the output is now stored in the list "my.aov.res"
# print the whole list
my.aov.res
## $Sepal.Length
##                              Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species                  2   63.2   31.61  120.71 <2e-16 ***
## iris$my.factor                1    0.9    0.91    3.48  0.064 .  
## iris$Species:iris$my.factor   1    0.1    0.08    0.30  0.585    
## Residuals                   145   38.0    0.26                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $Sepal.Width
##                              Df Sum Sq Mean Sq F value  Pr(>F)    
## iris$Species                  2  11.34    5.67    52.5 < 2e-16 ***
## iris$my.factor                1   1.29    1.29    11.9 0.00072 ***
## iris$Species:iris$my.factor   1   0.01    0.01     0.1 0.74934    
## Residuals                   145  15.66    0.11                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $Petal.Length
##                              Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species                  2    437   218.6  1259.7 <2e-16 ***
## iris$my.factor                1      2     1.9    11.1 0.0011 ** 
## iris$Species:iris$my.factor   1      0     0.1     0.8 0.3721    
## Residuals                   145     25     0.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $Petal.Width
##                              Df Sum Sq Mean Sq F value  Pr(>F)    
## iris$Species                  2   80.4    40.2 1284.59 < 2e-16 ***
## iris$my.factor                1    1.5     1.5   48.92 9.1e-11 ***
## iris$Species:iris$my.factor   1    0.1     0.1    2.78   0.098 .  
## Residuals                   145    4.5     0.0                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show the second list entry
my.aov.res[2]
## $Sepal.Width
##                              Df Sum Sq Mean Sq F value  Pr(>F)    
## iris$Species                  2  11.34    5.67    52.5 < 2e-16 ***
## iris$my.factor                1   1.29    1.29    11.9 0.00072 ***
## iris$Species:iris$my.factor   1   0.01    0.01     0.1 0.74934    
## Residuals                   145  15.66    0.11                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show the entry for "Petal.Width"
my.aov.res$Petal.Width
##                              Df Sum Sq Mean Sq F value  Pr(>F)    
## iris$Species                  2   80.4    40.2 1284.59 < 2e-16 ***
## iris$my.factor                1    1.5     1.5   48.92 9.1e-11 ***
## iris$Species:iris$my.factor   1    0.1     0.1    2.78   0.098 .  
## Residuals                   145    4.5     0.0                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear regression

my.lm <- lm(Sepal.Length ~ Petal.Width, data = iris)
summary(my.lm)
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Width, data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3882 -0.2936 -0.0439  0.2643  1.3452 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7776     0.0729    65.5   <2e-16 ***
## Petal.Width   0.8886     0.0514    17.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.478 on 148 degrees of freedom
## Multiple R-squared:  0.669,  Adjusted R-squared:  0.667 
## F-statistic:  299 on 1 and 148 DF,  p-value: <2e-16
summary.lm(my.lm)
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Width, data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3882 -0.2936 -0.0439  0.2643  1.3452 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7776     0.0729    65.5   <2e-16 ***
## Petal.Width   0.8886     0.0514    17.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.478 on 148 degrees of freedom
## Multiple R-squared:  0.669,  Adjusted R-squared:  0.667 
## F-statistic:  299 on 1 and 148 DF,  p-value: <2e-16
# to see the coefficients only
coef(my.lm)
## (Intercept) Petal.Width 
##      4.7776      0.8886

How to extract elements from such results?

# First, see what elements are available of an object
# What are the elements that coef() returns?
names(coef(my.lm))
## [1] "(Intercept)" "Petal.Width"
# put the coefficients in a new object
my.coef <- coef(my.lm)

# grab the value of the Intercept
my.coef["(Intercept)"]
## (Intercept) 
##       4.778
# or slope
my.coef["Petal.Width"]
## Petal.Width 
##      0.8886

Visualise data

via plot from the “graphics” package of any base R installation

plot(Sepal.Length ~ Petal.Width, data = iris)

plot of chunk unnamed-chunk-8

Plot is a very generic function with many applications i.e. it is possible to plot a (small) dataframe this gives a quick visualisation of all data, potential correlations… don’t try this with lots of data: it will not fail, but it can take a long time and the resulting figure might not have the resolution on screen to see much.

plot(iris)

plot of chunk unnamed-chunk-9

# but back to the specific plot of two variables
plot(Sepal.Length ~ Petal.Width, data = iris)

plot of chunk unnamed-chunk-9

# instead of point, it is possible to use lines:
plot(Sepal.Length ~ Petal.Width, data = iris, type = "l")

plot of chunk unnamed-chunk-9

# but again back to the points plot
plot(Sepal.Length ~ Petal.Width, data = iris)

# add the regression line that we calculated earlier to the plot
abline(my.lm)


# Add a line with slope "1" and intercept "4.5" for illustration purposes
# * "col" specifies colour
# * "lty" specifies linetype
# * "lwd" specified line width
abline(a = 4.5, b = 1, col = "green", lty = "dashed", lwd = 5)

plot of chunk unnamed-chunk-9

Check colours() for available colours see for example http://wiki.stdout.org/rcookbook/Graphs/Shapes%20and%20line%20types/ for symbols, linetypes, etc… or check the lty section in ?par for linetype names

Use different symbols * “pch” specify point characters * check the above website, some of the symbols allow a background via “bg = colourname”

A more complex graph extend standard margins to allow space for a nice y-axis label default margins are “par()$mar” of 5.1, 4.1, 4.1, 2.1 lines

par(mar=c(5.1, 6, 4.1, 2.1))
plot(Sepal.Length ~ Petal.Width, 
     xlim = c(0, 4),  # manually defined x-axis range
     ylim = c(0, 8),  # manually defined y-axis range
     data = iris,
     xlab = "Petal width [cm]", # axis label independent of name in data frame
     ylab = expression("Fancy function"~sqrt(x^2)~CO[2]%+-%~sd), # just for show
     pch  = 23,
     bg   = "green")
# check "demo(plotmath)" for writing expressions

# add more data to the plot
# this is done without calling "plot" again!
points(Sepal.Width ~ Petal.Width, 
        data = iris,
        pch  = 4,
        col  = "blue" )

# add the linear model to the plot
abline(my.lm, col = "red", lty = "dashed", lwd = 2)

# add a 1:1 line
abline(a = 0, b = 1, lty = "dotted")

# build a legend
legend(2.6, 2.5, 
        c("Sepal length", "Sepal width", "Abline length", "1:1"),
        pch = c(23, 4, NA, NA),
        lty = c(NA, NA, "dashed", "dotted"),
        cex = 0.8 # make the characters slightly smaller than standard text
        )

plot of chunk unnamed-chunk-10

Regarding modifying tick labels (i.e. rotating…) it involves creating the labels manually: see http://cran.r-project.org/doc/FAQ/R-FAQ.html#How-can-I-create-rotated-axis-labels_003f

ggplot2-graphs: I’ll introduce the Grammar of Graphics (ggplot2) later in the seminar. This package allows to modify many elements of plots and offers options that are not available in the base plots.

But for now we stick with base graphics.

Another very useful type of graph is a boxplot Boxplots allow a quick overview on data, look for potential outliers, …

boxplot(Petal.Length ~ Species, data = iris)

plot of chunk unnamed-chunk-11

# show aggregation of two factors as boxplot
boxplot(Petal.Length ~ Species * my.factor, data = iris)

plot of chunk unnamed-chunk-11

How to save a graph to a file

R uses “Devices” for output. Standard device is the screen, but the output can always be redicrected to anover device. the principle is, that first a “device” is opened, then something is written to this device and then the device is closed.

# Example
plot(Sepal.Length ~ Petal.Width, data = iris)

plot of chunk unnamed-chunk-12

To get a png image of this graph in the current working directory open the png device

png("myplot.png")

# then create the plot
plot(Sepal.Length ~ Petal.Width, data = iris)

# and close the device
dev.off()

The default size of a picture device is 640x480 pixels to change that provide information on the desired size

png("myplot_1024.png", width=1024, height=768)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()

Change the resolution in dpi

png("myplot_1024_high_res.png", width=1024, height=768, res = 150)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()

With resolution and width/height information, the image can be tweaked according to journal layout specifications (e.g. figure should be one column wide in the printed paper or so), etc…

It is much easier to use vector based image formats (svg, wmf, pdf) compared to bitmap based ones (png, gif, tif, bmp, jpg) BTW never, ever use jpg for figures. It is a lossy format and will look bad. jpg was designed to be used with photos, not for images with few colours and clear lines!

My preferred way to export figures is via the pdf pdf is based on postscript, which is vector based. vector based images do not have a fixed rsolution, they can be scaled without losing information also, vector based figures can easily be modified later if additional information has to be added outside of R.

pdf("myplot.pdf")
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()

# change the size of the pdf (given in inches)
pdf("myplot_larger.pdf", width = 15, height = 8)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()

# create a scalable vector graphic
svg("myplot.svg")
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()

For further information on saving a graph see: http://wiki.stdout.org/rcookbook/Graphs/Output%20to%20a%20file/

The “sink” device can be used to dump text into a file on the harddrive i.e. to store the previous anova results as a file redirect them to a text file

sink(file = "my_anova_results.txt")
print(my.aov.res)
sink()

Always remember to turn “sink” off via “sink()”!!

Non-linear least squares

curve fitting

load the Loblolly pine example data set

data(Loblolly) 

# do the usual checks
summary(Loblolly)
##      height           age            Seed   
##  Min.   : 3.46   Min.   : 3.0   329    : 6  
##  1st Qu.:10.47   1st Qu.: 5.0   327    : 6  
##  Median :34.00   Median :12.5   325    : 6  
##  Mean   :32.36   Mean   :13.0   307    : 6  
##  3rd Qu.:51.36   3rd Qu.:20.0   331    : 6  
##  Max.   :64.10   Max.   :25.0   311    : 6  
##                                 (Other):48
dim(Loblolly)
## [1] 84  3
# create simple overview graph
plot(height ~ age, 
        data = Loblolly)

plot of chunk unnamed-chunk-13

# allow some space beyond the current x and y range
plot(height ~ age, 
        data = Loblolly,
        xlim = c(0,100),
        ylim = c(0,100))

# do a nls curve fit
nls.fit <- nls(height ~ (a * age) / (b + age),
              data = Loblolly,
              start = c(a = 10, b = 15))
summary(nls.fit)
## 
## Formula: height ~ (a * age)/(b + age)
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)  
## a      684        278    2.46    0.016 *
## b      251        110    2.28    0.025 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.91 on 82 degrees of freedom
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 1.39e-06
# grab the coefficients

my.coef <- coef(nls.fit)
my.a <- my.coef[1] # or use my.coef["a"]
my.b <- my.coef[2] # or use my.coef["b"]

# add a curve based on the coefficients to the figure
curve( (my.a * x) / (my.b + x), 
        from = 0, 
        to   = 100, 
        add = TRUE)

plot of chunk unnamed-chunk-13 Another method to add the curve is available via “predict” see ?predict.nls

xValues <- seq(min(0), max(100), length.out = 100)  # create a sequence from 0 to the desired max x-value with 100 elements

# predict values from the nls.fit. 
# Predict expects a dataframe "in which to look for variables with which to predict". 
# Name has to match the nls model for the independent variable / predictor.
my.predicted <- predict(nls.fit, data.frame(age = xValues))

# Create the base plot again
plot(height ~ age, 
        data = Loblolly,
        xlim = c(0,100),
        ylim = c(0,100))

# use lines to connect the predicted values. 
lines(xValues, 
        my.predicted, 
        col = "red") 

plot of chunk unnamed-chunk-14

This is possible with the function plotfit() from package nlstools as well

# if the package is not installed do
# install.packages("nlstools", dep = TRUE)
# then load the package via "library()"
library(nlstools)
## 
## 'nlstools' has been loaded.
## 
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
# create a plot with a fitted curve
# "smooth"" must be set to "TRUE" to request fitted values
plotfit(nls.fit, 
        smooth = TRUE)

plot of chunk unnamed-chunk-15

# as other functions, plotfit allows to pass parameters to underlying functions
# this is (as usual) indicated by "..." in the helpfile ?plotfit
# we can adjust the plotted range via

plotfit(nls.fit, 
        smooth = TRUE,
        xlim   = c(0, 100),
        ylim   = c(0, 100))    

plot of chunk unnamed-chunk-15

Next time * How to do individual fits for subsets of the data?