setwd("F:/Documents/Work/Courses/Stats_LUND_2012_October/Exercises/Ex_03")
PART A
# 1.
succ <- read.csv("Succ.csv")
# 2. Make a histogram of the variable succ.
names(succ)
## [1] "year" "succ"
hist(succ$succ)
# add a normal curve (using code from:
# http://www.statmethods.net/graphs/density.html)
# Add a Normal Curve (Thanks to Peter Dalgaard)
x <- succ$succ #the variable to make a histogram from
h <- hist(x, breaks = 10, col = "red", xlab = "Breeding sucess", main = "Histogram with Normal Curve") #make a histogram object based on x
xfit <- seq(min(x), max(x), length = 10) #make vector of x values for range of our data, length setting number of x values to use, i.e. a higher value will result in a smoother final curve.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x)) #predict frequency for each of above x values
yfit <- yfit * diff(h$mids[1:2]) * length(x) #somehow normalise this object to our data
lines(xfit, yfit, col = "blue", lwd = 2) #add the normal curve to out histogram plot
# 3. Save average success per year
ymean <- aggregate(succ, by = list(succ$year), FUN = mean, na.rm = TRUE)
hist(ymean$succ)
# then add normal curve
x <- ymean$succ #the variable to make a histogram from
h <- hist(x, breaks = 5, col = "red", xlab = "Breeding sucess", main = "Histogram with Normal Curve") #make a histogram object based on x
xfit <- seq(min(x), max(x), length = 10) #make vector of x values for range of our data, length setting number of x values to use, i.e. a higher value will result in a smoother final curve.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x)) #predict frequency for each of above x values
yfit <- yfit * diff(h$mids[1:2]) * length(x) #somehow normalise this object to our data
lines(xfit, yfit, col = "blue", lwd = 2) #add the normal curve to out histogram plot
What does it look like? What is this an example of?
This is an example of the mean of sample means - i.e. an estimation of the true population mean. Here estimated by taking a sample each year, then averaging across years. According to the central limit theorem this will approach a normal distribution - as indeed they do here.
# Part B Generate 30 x 30 random numbers from a binnomial distribution
# with P 0.2
randob <- list() #make a list to recieve simulation results
for (i in 1:30) randob[[i]] <- rbinom(30, 1, 0.2) #generate 30 sets of 30 obs of observations drawn from binnomial distribution with 1,0 and P 0.2
ymean <- lapply(randob, mean)
ymean <- unlist(ymean)
mean(ymean)
## [1] 0.1978
hist(ymean)
# Calculate the mean (AVERAGE() in Excel) of each column. Are the means as
# expected? What did you expect? The mean for the most recent simulation
# is
mean(ymean)
## [1] 0.1978
# this is close to the mean of the distrubtion from which the random value
# were drawn (0.2). As it is the mean of population means I expected it to
# be close to the 'real' mean.
# SD for each column:
ysd <- lapply(randob, sd)
ysd <- unlist(ysd)
hist(ysd)
3.
# calculate the SE of the means
y.se <- sd(ymean)
y.se
## [1] 0.08707
# does this match with the sample population SD? No, as the SE is always
# less than the sample standard deviations. There is less variation in the
# samples means, than there is in invidual observations.
# 4. exact standard error (sinse we know the real population paramaters)
# mean 0.2 sample sd 0.2
y.se.t <- 0.2/sqrt(30)
# mean 5, sd 3
randob <- list() #make a list to recieve simulation results
for (i in 1:30) randob[[i]] <- rnorm(30, 5, 3) #generate 30 sets of 30 obs of observations drawn from binnomial distribution with 1,0 and P 0.2
ymean <- lapply(randob, mean)
ymean <- unlist(ymean)
mean(ymean)
## [1] 5.002
hist(ymean)
mean(ymean)
## [1] 5.002
# SD for each column:
ysd <- lapply(randob, sd)
ysd <- unlist(ysd)
hist(ysd)
# calculate the SE of the means
y.se <- sd(ymean)
y.se
## [1] 0.457
# similar results to those with the binnomial distribution, the SE is less
# than the SD for each sample, or the mean sd across samples.'
norm <- rnorm(900, 5, 3)
dim(norm) <- c(30, 30)
norm10 <- norm[, c(1:10)]
# Do a two-tailed Pearson correlation, using the command cor:
norm.corr <- cor(norm10, y = NULL, use = "everything", method = "pearson")
norm.corr
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.00000 -0.23544 -0.13114 0.19432 -0.23937 -0.09564 0.15720
## [2,] -0.23544 1.00000 0.29060 -0.06188 0.08509 0.21703 -0.16364
## [3,] -0.13114 0.29060 1.00000 -0.04298 0.28387 0.21047 -0.26016
## [4,] 0.19432 -0.06188 -0.04298 1.00000 0.20228 -0.14111 -0.24701
## [5,] -0.23937 0.08509 0.28387 0.20228 1.00000 0.07117 -0.21213
## [6,] -0.09564 0.21703 0.21047 -0.14111 0.07117 1.00000 -0.21514
## [7,] 0.15720 -0.16364 -0.26016 -0.24701 -0.21213 -0.21514 1.00000
## [8,] -0.06233 0.17894 -0.17914 0.11314 -0.22471 -0.02635 -0.27707
## [9,] -0.07830 -0.20783 0.13768 0.20267 -0.16214 0.32673 0.07950
## [10,] 0.07034 -0.24385 -0.28207 0.07922 0.05430 -0.34289 0.03374
## [,8] [,9] [,10]
## [1,] -0.06233 -0.0783 0.07034
## [2,] 0.17894 -0.2078 -0.24385
## [3,] -0.17914 0.1377 -0.28207
## [4,] 0.11314 0.2027 0.07922
## [5,] -0.22471 -0.1621 0.05430
## [6,] -0.02635 0.3267 -0.34289
## [7,] -0.27707 0.0795 0.03374
## [8,] 1.00000 -0.1100 0.07723
## [9,] -0.11005 1.0000 -0.32185
## [10,] 0.07723 -0.3218 1.00000
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 2.14.2
## Loading required package: survival
## Loading required package: splines
## Hmisc library by Frank E Harrell Jr
##
## Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview') to see overall
## documentation.
##
## NOTE:Hmisc no longer redefines [.factor to drop unused levels when
## subsetting. To get the old behavior of Hmisc type dropUnusedLevels().
## Attaching package: 'Hmisc'
## The following object(s) are masked from 'package:survival':
##
## untangle.specials
## The following object(s) are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
norm.corr.p <- rcorr(as.matrix(norm10), type = "pearson")
# three p values (in this simulation) are less than 0.05, and thus
# significant at alpha 0.05. This is the number of values may expect to be
# correlated purly by chance - i.e. they are not meaningful corrections.
Part C
tits <- read.csv("tits.csv")
names(tits)
## [1] "SPE" "wei" "egg"
boxplot(tits$wei ~ tits$SPE)
boxplot(tits$egg ~ tits$SPE)
The different parts of the boxes and whiskers represent, from below, the 2.5, 25, 50, 75, and 97.5 percentiles. What does that mean? Outliers are denoted by circles.
These reflect the data disbribution, with the whiskers representing the 95% CI.
# 2.
summary(tits$wei[tits$SPE == "BM"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 8.3 10.9 11.5 11.2 11.9 12.3 3.0
summary(tits$wei[tits$SPE == "TX"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.2 16.7 17.6 17.4 18.6 19.5 23.0
# 3.
TX <- subset(tits, SPE == "TX")
se.wei.TX <- (sd(TX$wei, na.rm = TRUE))/(sqrt(length(na.omit(TX$wei))))
BM <- subset(tits, SPE == "BM")
se.wei.BM <- (sd(BM$wei, na.rm = TRUE))/(sqrt(length(na.omit(BM$wei))))
se.egg.TX <- (sd(TX$egg, na.rm = TRUE))/(sqrt(length(na.omit(TX$egg))))
se.egg.BM <- (sd(BM$egg, na.rm = TRUE))/(sqrt(length(na.omit(BM$egg))))
se.wei <- cbind(se.wei.TX, se.wei.BM)
se.egg <- cbind(se.egg.TX, se.egg.BM)
mean.egg <- cbind(mean(TX$egg, na.rm = TRUE), mean(BM$egg, na.rm = TRUE))
mean.wei <- cbind(mean(TX$wei, na.rm = TRUE), mean(BM$wei, na.rm = TRUE))
x <- c(1:2)
plot(x, mean.egg, type = "b", lty = 0, xlim = c(0.5, 2.5), xlab = "Species",
ylim = c(6, 14), ylab = "Eggs", axes = FALSE)
axis(1, at = c(0.5, 1, 2, 2.5), labels = c("", "Great tit", "Blue tit", ""))
axis(2, at = seq(0, 14, 2))
errbar(x, mean.egg, mean.egg + se.egg, mean.egg - se.egg, add = TRUE)
x <- c(1:2)
plot(x, mean.wei, type = "b", lty = 0, xlim = c(0.5, 2.5), xlab = "Species",
ylim = c(6, 20), ylab = "Weight", axes = FALSE)
axis(1, at = c(0.5, 1, 2, 2.5), labels = c("", "Great tit", "Blue tit", ""))
axis(2, at = seq(0, 20, 2))
errbar(x, mean.wei, mean.wei + se.wei, mean.wei - se.wei, add = TRUE)
# The most important difference between this SE plot and the previous
# boxplot, is that the error bars represent the 95% CI for the mean here,
# thus, if they do not overlap, we can say that, at alpha 0.05, that they
# are significantly different.
part D
# 1. read in data, directly from URL
forest <- read.csv("http://wallace.teorekol.lu.se/\nstatistics_for_biologists/03/forest.csv",
header = TRUE, sep = ";", row.names = NULL)
## Warning: cannot open: HTTP status was '0 (null)'
## Error: cannot open the connection
names(forest)
## Error: object 'forest' not found
hist(forest$oak)
## Error: object 'forest' not found
hist(forest$bea)
## Error: object 'forest' not found
hist(forest$asp)
## Error: object 'forest' not found
boxplot(forest$oak ~ forest$site)
## Error: object 'forest' not found
boxplot(forest$bea ~ forest$site)
## Error: object 'forest' not found
boxplot(forest$asp ~ forest$site)
## Error: object 'forest' not found
summary(forest)
## Error: object 'forest' not found
av.forest <- aggregate(forest, by = list(forest$site), FUN = mean, na.rm = TRUE) #for each vaiable in forest, by site calculate the mean
## Error: object 'forest' not found
var.forest <- aggregate(forest, by = list(forest$site), FUN = var, na.rm = TRUE) #now do the same by for variance
## Error: object 'forest' not found
site.names <- av.forest$Group.1
## Error: object 'av.forest' not found
n.forest <- table(forest$site) #calculates frequency for each forest site
## Error: object 'forest' not found
n.forest <- as.matrix(n.forest) #coverts to matrix
## Error: object 'n.forest' not found
n.forest <- t(n.forest) #transposes the matrix
## Error: object 'n.forest' not found
se.forest <- sqrt(var.forest/n.forest) #calculate the SE
## Error: object 'var.forest' not found
se.forest <- cbind(se.forest, site.names) #pull the Se together with the site names
## Error: object 'se.forest' not found
# Now you can plot your data using plot and errbar:
plot(site.names, av.forest$oak, ylim = c(0, 1), xlab = "Site", ylab = "Oak")
## Error: object 'site.names' not found
errbar(site.names, av.forest$oak, av.forest$oak + se.forest$oak, av.forest$oak -
se.forest$oak, add = TRUE)
## Error: object 'av.forest' not found
What is the difference between the boxplots and the SE plots?
The SE plot allows us to see which sites are significantly different from each other.
# Subset of data with site < 30
forest.sub <- subset(forest, forest$site < 30)
## Error: object 'forest' not found
boxplot(forest.sub$oak ~ forest.sub$site, xlab = "Site", ylab = "Oak")
## Error: object 'forest.sub' not found
boxplot(forest.sub$bir ~ forest.sub$site, xlab = "Site", ylab = "Birch")
## Error: object 'forest.sub' not found
boxplot(forest.sub$bea ~ forest.sub$site, xlab = "Site", ylab = "Beach")
## Error: object 'forest.sub' not found
boxplot(forest.sub$ald ~ forest.sub$site, xlab = "Site", ylab = "Alder")
## Error: object 'forest.sub' not found
boxplot(forest.sub$asp ~ forest.sub$site, xlab = "Site", ylab = "Aspen")
## Error: object 'forest.sub' not found
boxplot(forest.sub$lim ~ forest.sub$site, xlab = "Site", ylab = "Lime")
## Error: object 'forest.sub' not found