Set working directory
# setwd("x:/path/to/your/folder")
setwd("/home/user/mydata")
Load the iris data set
data(iris)
Recoding exisiting data into a new element in the dataframe
# example with an ifelse statement
iris$my.factor <- as.factor(
ifelse(iris$Petal.Width > 1.5,
"Large Petal",
"Small Petal"))
my.anova <- aov(Sepal.Length ~ Species * my.factor, data = iris)
# the "data" option is for convenience. Of course you can use the statement below as well.
my.anova <- aov(iris$Sepal.Length ~ iris$Species * iris$my.factor)
# a standard anova result table is given via "summary"
summary(my.anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 63.2 31.61 120.71 <2e-16 ***
## iris$my.factor 1 0.9 0.91 3.48 0.064 .
## iris$Species:iris$my.factor 1 0.1 0.08 0.30 0.585
## Residuals 145 38.0 0.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If there is no need to check for interactions between two factors, use "+" instead of "*"
my.anova <- aov(Sepal.Length ~ Species + my.factor, data = iris)
summary(my.anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 63.2 31.61 121.3 <2e-16 ***
## my.factor 1 0.9 0.91 3.5 0.063 .
## Residuals 146 38.0 0.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
example taken from Aho (2014) Foundational and applied statistics for biologists using R. CRC Press. Page 474 ff.
Data are available from the package “asbio: A collection of statistical tools for biologists”
package installation via (only needed when it is not installed yet):
install.packages("asbio")
library(asbio)
## Loading required package: tcltk
# load split-plot data
data(alfalfa.split.plot)
# the usual overview on data
summary(alfalfa.split.plot)
## yield variety cut.time block
## Min. :0.88 C:24 None:18 1:12
## 1st Qu.:1.31 L:24 O7 :18 2:12
## Median :1.58 R:24 S1 :18 3:12
## Mean :1.60 S20 :18 4:12
## 3rd Qu.:1.82 5:12
## Max. :2.34 6:12
str(alfalfa.split.plot)
## 'data.frame': 72 obs. of 4 variables:
## $ yield : num 2.17 1.58 2.29 2.23 2.33 1.38 1.86 2.27 1.75 1.52 ...
## $ variety : Factor w/ 3 levels "C","L","R": 2 2 2 2 1 1 1 1 3 3 ...
## $ cut.time: Factor w/ 4 levels "None","O7","S1",..: 1 3 4 2 1 3 4 2 1 3 ...
## $ block : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
head(alfalfa.split.plot)
## yield variety cut.time block
## 1 2.17 L None 1
## 2 1.58 L S1 1
## 3 2.29 L S20 1
## 4 2.23 L O7 1
## 5 2.33 C None 1
## 6 1.38 C S1 1
# quick overview on the number of samples for subsets of the data
# frequency table
table(alfalfa.split.plot$variety, alfalfa.split.plot$cut.time)
##
## None O7 S1 S20
## C 6 6 6 6
## L 6 6 6 6
## R 6 6 6 6
# to make it easier to type
with(alfalfa.split.plot, table(variety, cut.time))
## cut.time
## variety None O7 S1 S20
## C 6 6 6 6
## L 6 6 6 6
## R 6 6 6 6
with(alfalfa.split.plot, table(block, variety, cut.time))
## , , cut.time = None
##
## variety
## block C L R
## 1 1 1 1
## 2 1 1 1
## 3 1 1 1
## 4 1 1 1
## 5 1 1 1
## 6 1 1 1
##
## , , cut.time = O7
##
## variety
## block C L R
## 1 1 1 1
## 2 1 1 1
## 3 1 1 1
## 4 1 1 1
## 5 1 1 1
## 6 1 1 1
##
## , , cut.time = S1
##
## variety
## block C L R
## 1 1 1 1
## 2 1 1 1
## 3 1 1 1
## 4 1 1 1
## 5 1 1 1
## 6 1 1 1
##
## , , cut.time = S20
##
## variety
## block C L R
## 1 1 1 1
## 2 1 1 1
## 3 1 1 1
## 4 1 1 1
## 5 1 1 1
## 6 1 1 1
# From the description of the experiment in the book
# "The agricultural blocks of this experiment where grouped into six blocks, each with three plots. One of three varieties were randomly assigned to each plot in each block (see figure of the design). Each plot was then subdivided into split plot. one spit plot for each cutting date.
my.aov <- aov(yield ~ variety * cut.time + Error(block/variety),
data = alfalfa.split.plot)
summary(my.aov)
##
## Error: block
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 5 4.15 0.83
##
## Error: block:variety
## Df Sum Sq Mean Sq F value Pr(>F)
## variety 2 0.178 0.089 0.65 0.54
## Residuals 10 1.362 0.136
##
## Error: Within
## Df Sum Sq Mean Sq F value Pr(>F)
## cut.time 3 1.962 0.654 23.39 2.8e-09 ***
## variety:cut.time 6 0.211 0.035 1.25 0.3
## Residuals 45 1.259 0.028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
i.e. how to automate data analysis?
# create a function that does the anova for our specific example
my.aov <- function(x) {
out.aov <- summary(aov(x ~ iris$Species * iris$my.factor))
}
# this function can be improved later so that any data frame can be used, not just iris
# then we apply the function to the data
# using lapply (the columns that we want to test are given to lapply as a list)
my.aov.res <- lapply(iris[, 1:4], function(x) my.aov(x))
# the output is now stored in the list "my.aov.res"
# print the whole list
my.aov.res
## $Sepal.Length
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 63.2 31.61 120.71 <2e-16 ***
## iris$my.factor 1 0.9 0.91 3.48 0.064 .
## iris$Species:iris$my.factor 1 0.1 0.08 0.30 0.585
## Residuals 145 38.0 0.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $Sepal.Width
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 11.34 5.67 52.5 < 2e-16 ***
## iris$my.factor 1 1.29 1.29 11.9 0.00072 ***
## iris$Species:iris$my.factor 1 0.01 0.01 0.1 0.74934
## Residuals 145 15.66 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $Petal.Length
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 437 218.6 1259.7 <2e-16 ***
## iris$my.factor 1 2 1.9 11.1 0.0011 **
## iris$Species:iris$my.factor 1 0 0.1 0.8 0.3721
## Residuals 145 25 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $Petal.Width
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 80.4 40.2 1284.59 < 2e-16 ***
## iris$my.factor 1 1.5 1.5 48.92 9.1e-11 ***
## iris$Species:iris$my.factor 1 0.1 0.1 2.78 0.098 .
## Residuals 145 4.5 0.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show the second list entry
my.aov.res[2]
## $Sepal.Width
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 11.34 5.67 52.5 < 2e-16 ***
## iris$my.factor 1 1.29 1.29 11.9 0.00072 ***
## iris$Species:iris$my.factor 1 0.01 0.01 0.1 0.74934
## Residuals 145 15.66 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show the entry for "Petal.Width"
my.aov.res$Petal.Width
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 80.4 40.2 1284.59 < 2e-16 ***
## iris$my.factor 1 1.5 1.5 48.92 9.1e-11 ***
## iris$Species:iris$my.factor 1 0.1 0.1 2.78 0.098 .
## Residuals 145 4.5 0.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
my.lm <- lm(Sepal.Length ~ Petal.Width, data = iris)
summary(my.lm)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3882 -0.2936 -0.0439 0.2643 1.3452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7776 0.0729 65.5 <2e-16 ***
## Petal.Width 0.8886 0.0514 17.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.478 on 148 degrees of freedom
## Multiple R-squared: 0.669, Adjusted R-squared: 0.667
## F-statistic: 299 on 1 and 148 DF, p-value: <2e-16
summary.lm(my.lm)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3882 -0.2936 -0.0439 0.2643 1.3452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7776 0.0729 65.5 <2e-16 ***
## Petal.Width 0.8886 0.0514 17.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.478 on 148 degrees of freedom
## Multiple R-squared: 0.669, Adjusted R-squared: 0.667
## F-statistic: 299 on 1 and 148 DF, p-value: <2e-16
# to see the coefficients only
coef(my.lm)
## (Intercept) Petal.Width
## 4.7776 0.8886
# First, see what elements are available of an object
# What are the elements that coef() returns?
names(coef(my.lm))
## [1] "(Intercept)" "Petal.Width"
# put the coefficients in a new object
my.coef <- coef(my.lm)
# grab the value of the Intercept
my.coef["(Intercept)"]
## (Intercept)
## 4.778
# or slope
my.coef["Petal.Width"]
## Petal.Width
## 0.8886
via plot from the “graphics” package of any base R installation
plot(Sepal.Length ~ Petal.Width, data = iris)
Plot is a very generic function with many applications i.e. it is possible to plot a (small) dataframe this gives a quick visualisation of all data, potential correlations… don’t try this with lots of data: it will not fail, but it can take a long time and the resulting figure might not have the resolution on screen to see much.
plot(iris)
# but back to the specific plot of two variables
plot(Sepal.Length ~ Petal.Width, data = iris)
# instead of point, it is possible to use lines:
plot(Sepal.Length ~ Petal.Width, data = iris, type = "l")
# but again back to the points plot
plot(Sepal.Length ~ Petal.Width, data = iris)
# add the regression line that we calculated earlier to the plot
abline(my.lm)
# Add a line with slope "1" and intercept "4.5" for illustration purposes
# * "col" specifies colour
# * "lty" specifies linetype
# * "lwd" specified line width
abline(a = 4.5, b = 1, col = "green", lty = "dashed", lwd = 5)
Check colours() for available colours see for example http://wiki.stdout.org/rcookbook/Graphs/Shapes%20and%20line%20types/ for symbols, linetypes, etc… or check the lty section in ?par for linetype names
Use different symbols * “pch” specify point characters * check the above website, some of the symbols allow a background via “bg = colourname”
A more complex graph extend standard margins to allow space for a nice y-axis label default margins are “par()$mar” of 5.1, 4.1, 4.1, 2.1 lines
par(mar=c(5.1, 6, 4.1, 2.1))
plot(Sepal.Length ~ Petal.Width,
xlim = c(0, 4), # manually defined x-axis range
ylim = c(0, 8), # manually defined y-axis range
data = iris,
xlab = "Petal width [cm]", # axis label independent of name in data frame
ylab = expression("Fancy function"~sqrt(x^2)~CO[2]%+-%~sd), # just for show
pch = 23,
bg = "green")
# check "demo(plotmath)" for writing expressions
# add more data to the plot
# this is done without calling "plot" again!
points(Sepal.Width ~ Petal.Width,
data = iris,
pch = 4,
col = "blue" )
# add the linear model to the plot
abline(my.lm, col = "red", lty = "dashed", lwd = 2)
# add a 1:1 line
abline(a = 0, b = 1, lty = "dotted")
# build a legend
legend(2.6, 2.5,
c("Sepal length", "Sepal width", "Abline length", "1:1"),
pch = c(23, 4, NA, NA),
lty = c(NA, NA, "dashed", "dotted"),
cex = 0.8 # make the characters slightly smaller than standard text
)
Regarding modifying tick labels (i.e. rotating…) it involves creating the labels manually: see http://cran.r-project.org/doc/FAQ/R-FAQ.html#How-can-I-create-rotated-axis-labels_003f
ggplot2-graphs: I’ll introduce the Grammar of Graphics (ggplot2) later in the seminar. This package allows to modify many elements of plots and offers options that are not available in the base plots.
But for now we stick with base graphics.
Another very useful type of graph is a boxplot Boxplots allow a quick overview on data, look for potential outliers, …
boxplot(Petal.Length ~ Species, data = iris)
# show aggregation of two factors as boxplot
boxplot(Petal.Length ~ Species * my.factor, data = iris)
R uses “Devices” for output. Standard device is the screen, but the output can always be redicrected to anover device. the principle is, that first a “device” is opened, then something is written to this device and then the device is closed.
# Example
plot(Sepal.Length ~ Petal.Width, data = iris)
To get a png image of this graph in the current working directory open the png device
png("myplot.png")
# then create the plot
plot(Sepal.Length ~ Petal.Width, data = iris)
# and close the device
dev.off()
The default size of a picture device is 640x480 pixels to change that provide information on the desired size
png("myplot_1024.png", width=1024, height=768)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()
Change the resolution in dpi
png("myplot_1024_high_res.png", width=1024, height=768, res = 150)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()
With resolution and width/height information, the image can be tweaked according to journal layout specifications (e.g. figure should be one column wide in the printed paper or so), etc…
It is much easier to use vector based image formats (svg, wmf, pdf) compared to bitmap based ones (png, gif, tif, bmp, jpg) BTW never, ever use jpg for figures. It is a lossy format and will look bad. jpg was designed to be used with photos, not for images with few colours and clear lines!
My preferred way to export figures is via the pdf pdf is based on postscript, which is vector based. vector based images do not have a fixed rsolution, they can be scaled without losing information also, vector based figures can easily be modified later if additional information has to be added outside of R.
pdf("myplot.pdf")
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()
# change the size of the pdf (given in inches)
pdf("myplot_larger.pdf", width = 15, height = 8)
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()
# create a scalable vector graphic
svg("myplot.svg")
plot(Sepal.Length ~ Petal.Width, data = iris)
dev.off()
For further information on saving a graph see: http://wiki.stdout.org/rcookbook/Graphs/Output%20to%20a%20file/
The “sink” device can be used to dump text into a file on the harddrive i.e. to store the previous anova results as a file redirect them to a text file
sink(file = "my_anova_results.txt")
print(my.aov.res)
sink()
Always remember to turn “sink” off via “sink()”!!
curve fitting
load the Loblolly pine example data set
data(Loblolly)
# do the usual checks
summary(Loblolly)
## height age Seed
## Min. : 3.46 Min. : 3.0 329 : 6
## 1st Qu.:10.47 1st Qu.: 5.0 327 : 6
## Median :34.00 Median :12.5 325 : 6
## Mean :32.36 Mean :13.0 307 : 6
## 3rd Qu.:51.36 3rd Qu.:20.0 331 : 6
## Max. :64.10 Max. :25.0 311 : 6
## (Other):48
dim(Loblolly)
## [1] 84 3
# create simple overview graph
plot(height ~ age,
data = Loblolly)
# allow some space beyond the current x and y range
plot(height ~ age,
data = Loblolly,
xlim = c(0,100),
ylim = c(0,100))
# do a nls curve fit
nls.fit <- nls(height ~ (a * age) / (b + age),
data = Loblolly,
start = c(a = 10, b = 15))
summary(nls.fit)
##
## Formula: height ~ (a * age)/(b + age)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 684 278 2.46 0.016 *
## b 251 110 2.28 0.025 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.91 on 82 degrees of freedom
##
## Number of iterations to convergence: 8
## Achieved convergence tolerance: 1.39e-06
# grab the coefficients
my.coef <- coef(nls.fit)
my.a <- my.coef[1] # or use my.coef["a"]
my.b <- my.coef[2] # or use my.coef["b"]
# add a curve based on the coefficients to the figure
curve( (my.a * x) / (my.b + x),
from = 0,
to = 100,
add = TRUE)
Another method to add the curve is available via “predict” see ?predict.nls
xValues <- seq(min(0), max(100), length.out = 100) # create a sequence from 0 to the desired max x-value with 100 elements
# predict values from the nls.fit.
# Predict expects a dataframe "in which to look for variables with which to predict".
# Name has to match the nls model for the independent variable / predictor.
my.predicted <- predict(nls.fit, data.frame(age = xValues))
# Create the base plot again
plot(height ~ age,
data = Loblolly,
xlim = c(0,100),
ylim = c(0,100))
# use lines to connect the predicted values.
lines(xValues,
my.predicted,
col = "red")
This is possible with the function plotfit() from package nlstools as well
# if the package is not installed do
# install.packages("nlstools", dep = TRUE)
# then load the package via "library()"
library(nlstools)
##
## 'nlstools' has been loaded.
##
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
# create a plot with a fitted curve
# "smooth"" must be set to "TRUE" to request fitted values
plotfit(nls.fit,
smooth = TRUE)
# as other functions, plotfit allows to pass parameters to underlying functions
# this is (as usual) indicated by "..." in the helpfile ?plotfit
# we can adjust the plotted range via
plotfit(nls.fit,
smooth = TRUE,
xlim = c(0, 100),
ylim = c(0, 100))
Next time * How to do individual fits for subsets of the data?