setwd("F:/Documents/Work/Courses/Stats_LUND_2012_October/Exercises/Ex_03")

PART A

# 1.
succ <- read.csv("Succ.csv")


# 2. Make a histogram of the variable succ.
names(succ)
## [1] "year" "succ"
hist(succ$succ)

plot of chunk unnamed-chunk-2


# add a normal curve (using code from:
# http://www.statmethods.net/graphs/density.html)

# Add a Normal Curve (Thanks to Peter Dalgaard)
x <- succ$succ  #the variable to make a histogram from
h <- hist(x, breaks = 10, col = "red", xlab = "Breeding sucess", main = "Histogram with Normal Curve")  #make a histogram object based on x
xfit <- seq(min(x), max(x), length = 10)  #make vector of x values for range of our data, length setting number of x values to use, i.e. a higher value will result in a smoother final curve.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))  #predict frequency for each of above x values
yfit <- yfit * diff(h$mids[1:2]) * length(x)  #somehow normalise this object to our data
lines(xfit, yfit, col = "blue", lwd = 2)  #add the normal curve to out histogram plot

plot of chunk unnamed-chunk-2


# 3. Save average success per year
ymean <- aggregate(succ, by = list(succ$year), FUN = mean, na.rm = TRUE)
hist(ymean$succ)

plot of chunk unnamed-chunk-2


# then add normal curve
x <- ymean$succ  #the variable to make a histogram from
h <- hist(x, breaks = 5, col = "red", xlab = "Breeding sucess", main = "Histogram with Normal Curve")  #make a histogram object based on x
xfit <- seq(min(x), max(x), length = 10)  #make vector of x values for range of our data, length setting number of x values to use, i.e. a higher value will result in a smoother final curve.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))  #predict frequency for each of above x values
yfit <- yfit * diff(h$mids[1:2]) * length(x)  #somehow normalise this object to our data
lines(xfit, yfit, col = "blue", lwd = 2)  #add the normal curve to out histogram plot

plot of chunk unnamed-chunk-2

What does it look like? What is this an example of?
This is an example of the mean of sample means - i.e. an estimation of the true population mean. Here estimated by taking a sample each year, then averaging across years. According to the central limit theorem this will approach a normal distribution - as indeed they do here.

# Part B Generate 30 x 30 random numbers from a binnomial distribution
# with P 0.2
randob <- list()  #make a list to recieve simulation results
for (i in 1:30) randob[[i]] <- rbinom(30, 1, 0.2)  #generate 30 sets of 30 obs of observations drawn from binnomial distribution with 1,0 and P 0.2
ymean <- lapply(randob, mean)
ymean <- unlist(ymean)
mean(ymean)
## [1] 0.1978
hist(ymean)

plot of chunk unnamed-chunk-3


# Calculate the mean (AVERAGE() in Excel) of each column. Are the means as
# expected? What did you expect? The mean for the most recent simulation
# is
mean(ymean)
## [1] 0.1978
# this is close to the mean of the distrubtion from which the random value
# were drawn (0.2). As it is the mean of population means I expected it to
# be close to the 'real' mean.

# SD for each column:
ysd <- lapply(randob, sd)
ysd <- unlist(ysd)
hist(ysd)

plot of chunk unnamed-chunk-3

3.

# calculate the SE of the means
y.se <- sd(ymean)
y.se
## [1] 0.08707
# does this match with the sample population SD? No, as the SE is always
# less than the sample standard deviations. There is less variation in the
# samples means, than there is in invidual observations.

# 4. exact standard error (sinse we know the real population paramaters)
# mean 0.2 sample sd 0.2
y.se.t <- 0.2/sqrt(30)
  1. Now as above but with a normal distribution.
# mean 5, sd 3
randob <- list()  #make a list to recieve simulation results
for (i in 1:30) randob[[i]] <- rnorm(30, 5, 3)  #generate 30 sets of 30 obs of observations drawn from binnomial distribution with 1,0 and P 0.2
ymean <- lapply(randob, mean)
ymean <- unlist(ymean)
mean(ymean)
## [1] 5.002
hist(ymean)

plot of chunk unnamed-chunk-5

mean(ymean)
## [1] 5.002
# SD for each column:
ysd <- lapply(randob, sd)
ysd <- unlist(ysd)
hist(ysd)

plot of chunk unnamed-chunk-5


# calculate the SE of the means
y.se <- sd(ymean)
y.se
## [1] 0.457

# similar results to those with the binnomial distribution, the SE is less
# than the SD for each sample, or the mean sd across samples.'
norm <- rnorm(900, 5, 3)
dim(norm) <- c(30, 30)
norm10 <- norm[, c(1:10)]

# Do a two-tailed Pearson correlation, using the command cor:
norm.corr <- cor(norm10, y = NULL, use = "everything", method = "pearson")
norm.corr
##           [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
##  [1,]  1.00000 -0.23544 -0.13114  0.19432 -0.23937 -0.09564  0.15720
##  [2,] -0.23544  1.00000  0.29060 -0.06188  0.08509  0.21703 -0.16364
##  [3,] -0.13114  0.29060  1.00000 -0.04298  0.28387  0.21047 -0.26016
##  [4,]  0.19432 -0.06188 -0.04298  1.00000  0.20228 -0.14111 -0.24701
##  [5,] -0.23937  0.08509  0.28387  0.20228  1.00000  0.07117 -0.21213
##  [6,] -0.09564  0.21703  0.21047 -0.14111  0.07117  1.00000 -0.21514
##  [7,]  0.15720 -0.16364 -0.26016 -0.24701 -0.21213 -0.21514  1.00000
##  [8,] -0.06233  0.17894 -0.17914  0.11314 -0.22471 -0.02635 -0.27707
##  [9,] -0.07830 -0.20783  0.13768  0.20267 -0.16214  0.32673  0.07950
## [10,]  0.07034 -0.24385 -0.28207  0.07922  0.05430 -0.34289  0.03374
##           [,8]    [,9]    [,10]
##  [1,] -0.06233 -0.0783  0.07034
##  [2,]  0.17894 -0.2078 -0.24385
##  [3,] -0.17914  0.1377 -0.28207
##  [4,]  0.11314  0.2027  0.07922
##  [5,] -0.22471 -0.1621  0.05430
##  [6,] -0.02635  0.3267 -0.34289
##  [7,] -0.27707  0.0795  0.03374
##  [8,]  1.00000 -0.1100  0.07723
##  [9,] -0.11005  1.0000 -0.32185
## [10,]  0.07723 -0.3218  1.00000

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 2.14.2
## Loading required package: survival
## Loading required package: splines
## Hmisc library by Frank E Harrell Jr
## 
## Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview') to see overall
## documentation.
## 
## NOTE:Hmisc no longer redefines [.factor to drop unused levels when
## subsetting.  To get the old behavior of Hmisc type dropUnusedLevels().
## Attaching package: 'Hmisc'
## The following object(s) are masked from 'package:survival':
## 
## untangle.specials
## The following object(s) are masked from 'package:base':
## 
## format.pval, round.POSIXt, trunc.POSIXt, units
norm.corr.p <- rcorr(as.matrix(norm10), type = "pearson")
# three p values (in this simulation) are less than 0.05, and thus
# significant at alpha 0.05. This is the number of values may expect to be
# correlated purly by chance - i.e. they are not meaningful corrections.

Part C

tits <- read.csv("tits.csv")
names(tits)
## [1] "SPE" "wei" "egg"
boxplot(tits$wei ~ tits$SPE)

plot of chunk unnamed-chunk-6

boxplot(tits$egg ~ tits$SPE)

plot of chunk unnamed-chunk-6

The different parts of the boxes and whiskers represent, from below, the 2.5, 25, 50, 75, and 97.5 percentiles. What does that mean? Outliers are denoted by circles.
These reflect the data disbribution, with the whiskers representing the 95% CI.

# 2.
summary(tits$wei[tits$SPE == "BM"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     8.3    10.9    11.5    11.2    11.9    12.3     3.0
summary(tits$wei[tits$SPE == "TX"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    13.2    16.7    17.6    17.4    18.6    19.5    23.0

# 3.
TX <- subset(tits, SPE == "TX")
se.wei.TX <- (sd(TX$wei, na.rm = TRUE))/(sqrt(length(na.omit(TX$wei))))
BM <- subset(tits, SPE == "BM")
se.wei.BM <- (sd(BM$wei, na.rm = TRUE))/(sqrt(length(na.omit(BM$wei))))

se.egg.TX <- (sd(TX$egg, na.rm = TRUE))/(sqrt(length(na.omit(TX$egg))))
se.egg.BM <- (sd(BM$egg, na.rm = TRUE))/(sqrt(length(na.omit(BM$egg))))

se.wei <- cbind(se.wei.TX, se.wei.BM)
se.egg <- cbind(se.egg.TX, se.egg.BM)

mean.egg <- cbind(mean(TX$egg, na.rm = TRUE), mean(BM$egg, na.rm = TRUE))
mean.wei <- cbind(mean(TX$wei, na.rm = TRUE), mean(BM$wei, na.rm = TRUE))

x <- c(1:2)
plot(x, mean.egg, type = "b", lty = 0, xlim = c(0.5, 2.5), xlab = "Species", 
    ylim = c(6, 14), ylab = "Eggs", axes = FALSE)
axis(1, at = c(0.5, 1, 2, 2.5), labels = c("", "Great tit", "Blue tit", ""))
axis(2, at = seq(0, 14, 2))
errbar(x, mean.egg, mean.egg + se.egg, mean.egg - se.egg, add = TRUE)

plot of chunk unnamed-chunk-7



x <- c(1:2)
plot(x, mean.wei, type = "b", lty = 0, xlim = c(0.5, 2.5), xlab = "Species", 
    ylim = c(6, 20), ylab = "Weight", axes = FALSE)
axis(1, at = c(0.5, 1, 2, 2.5), labels = c("", "Great tit", "Blue tit", ""))
axis(2, at = seq(0, 20, 2))
errbar(x, mean.wei, mean.wei + se.wei, mean.wei - se.wei, add = TRUE)

plot of chunk unnamed-chunk-7


# The most important difference between this SE plot and the previous
# boxplot, is that the error bars represent the 95% CI for the mean here,
# thus, if they do not overlap, we can say that, at alpha 0.05, that they
# are significantly different.

part D

# 1. read in data, directly from URL
forest <- read.csv("http://wallace.teorekol.lu.se/\nstatistics_for_biologists/03/forest.csv", 
    header = TRUE, sep = ";", row.names = NULL)
## Warning: cannot open: HTTP status was '0 (null)'
## Error: cannot open the connection

names(forest)
## Error: object 'forest' not found

hist(forest$oak)
## Error: object 'forest' not found
hist(forest$bea)
## Error: object 'forest' not found
hist(forest$asp)
## Error: object 'forest' not found
  1. Make boxplots of some of the tree species (one at a time), grouped by the study site. You may see that many species have very different distributions, depending on the site.
boxplot(forest$oak ~ forest$site)
## Error: object 'forest' not found
boxplot(forest$bea ~ forest$site)
## Error: object 'forest' not found
boxplot(forest$asp ~ forest$site)
## Error: object 'forest' not found
  1. Get descriptive statistics for the whole dataset using summary.
summary(forest)
## Error: object 'forest' not found
  1. Make 'error bar' plots of the same variables. Adding error bars is easier in this case since the data is numerical. It's calculating the means and SEs that takes some time. Use aggregate to calculate means and variances for all the variables in forest. Then calculate the number of observations (N) at each site using table. Make the table into a matrix and then transpose, so that it is in the same format as the data in forest. Now you can use the variance and N data to calculate your SE. Note that applying functions to the whole dataframe means that your values for site will also change, so it's a good idea to make a new vector including the site names.
av.forest <- aggregate(forest, by = list(forest$site), FUN = mean, na.rm = TRUE)  #for each vaiable in forest, by site calculate the mean
## Error: object 'forest' not found
var.forest <- aggregate(forest, by = list(forest$site), FUN = var, na.rm = TRUE)  #now do the same by for variance
## Error: object 'forest' not found
site.names <- av.forest$Group.1
## Error: object 'av.forest' not found
n.forest <- table(forest$site)  #calculates frequency for each forest site
## Error: object 'forest' not found
n.forest <- as.matrix(n.forest)  #coverts to matrix
## Error: object 'n.forest' not found
n.forest <- t(n.forest)  #transposes the matrix
## Error: object 'n.forest' not found
se.forest <- sqrt(var.forest/n.forest)  #calculate the SE
## Error: object 'var.forest' not found
se.forest <- cbind(se.forest, site.names)  #pull the Se together with the site names
## Error: object 'se.forest' not found

# Now you can plot your data using plot and errbar:
plot(site.names, av.forest$oak, ylim = c(0, 1), xlab = "Site", ylab = "Oak")
## Error: object 'site.names' not found
errbar(site.names, av.forest$oak, av.forest$oak + se.forest$oak, av.forest$oak - 
    se.forest$oak, add = TRUE)
## Error: object 'av.forest' not found

What is the difference between the boxplots and the SE plots?
The SE plot allows us to see which sites are significantly different from each other.

# Subset of data with site < 30
forest.sub <- subset(forest, forest$site < 30)
## Error: object 'forest' not found
boxplot(forest.sub$oak ~ forest.sub$site, xlab = "Site", ylab = "Oak")
## Error: object 'forest.sub' not found
boxplot(forest.sub$bir ~ forest.sub$site, xlab = "Site", ylab = "Birch")
## Error: object 'forest.sub' not found
boxplot(forest.sub$bea ~ forest.sub$site, xlab = "Site", ylab = "Beach")
## Error: object 'forest.sub' not found
boxplot(forest.sub$ald ~ forest.sub$site, xlab = "Site", ylab = "Alder")
## Error: object 'forest.sub' not found
boxplot(forest.sub$asp ~ forest.sub$site, xlab = "Site", ylab = "Aspen")
## Error: object 'forest.sub' not found
boxplot(forest.sub$lim ~ forest.sub$site, xlab = "Site", ylab = "Lime")
## Error: object 'forest.sub' not found