Click here for other works of the author on RPubs
Load knitr for better report quality with markdown and ggplot2 for better plots
library(knitr)
library(ggplot2)
Import data set enviANDdensity, extract density data of fish and copepod from the data set and display partial data
enviANDdensity <- read.table("enviANDdensity.txt", header = T)
fish_density = as.matrix(enviANDdensity[, 11])
copepod_density = as.matrix(enviANDdensity[, 12])
# display a part of the data
kable(head(cbind(fish_density, copepod_density)), digits = 2, col.names = c("Fish density", "Copepod density"), align = "l", caption = "Fish and copepod density data (partial)")
| Fish density | Copepod density |
|---|---|
| 137.32 | 1119 |
| 20.97 | 1153 |
| 0.00 | 1719 |
| 0.00 | 855 |
| 180.71 | 1246 |
| 88.35 | 2123 |
Y = fish_density
X = cbind(1, copepod_density)
beta = solve(t(X) %*% X) %*% (t(X) %*% Y)
resamp_n = 100000
size = length(fish_density)
beta_sample = numeric()
for (i in 1:resamp_n) {
#create a lish of indices for randomization
shuffle = order(runif(size))
#define randomized dependent variable and independent variables and calculate coefficients
Y = fish_density[shuffle]
X = cbind(1, copepod_density)
b = solve(t(X) %*% X) %*% (t(X) %*% Y)
beta_sample = append(beta_sample, b)
}
beta_sample = t(matrix(beta_sample, 2, resamp_n))
#calculate p-value for the two tailed test
p_value_q1 = sum(abs(beta_sample[, 2]) > abs(beta[2])) / resamp_n
#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q1 = c(-beta[2], seq(min(beta_sample[, 2]), max(beta_sample[, 2]), length.out=5), beta[2])
p_q1 <- ggplot(as.data.frame(beta_sample[, 2]), aes(x = beta_sample[, 2])) +
geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
geom_vline(xintercept = c(-beta[2], beta[2]), col = "red") +
labs(title = expression(Distribution~of~permuted~hat(beta)[1]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(beta)[1])) +
annotate("text", x = beta[2], y = 10, label = "Esimated~hat(beta)[1]", hjust = 1.1, parse = T) +
scale_x_continuous(breaks = lab_q1, labels = format(lab_q1, digits = 1))
#add shaded area to indicate significant values
d_q1 <- ggplot_build(p_q1)$data[[1]]
p_q1 + geom_area(data = subset(d_q1, x > beta[2]), aes(x = x, y = y), col = "red", fill = "red") +
geom_area(data = subset(d_q1, x < -beta[2]), aes(x = x, y = y), col = "red", fill = "red")
The p-value of our test on \(H_0:\beta_1=0\) is 0.00009, therefore we reject the null hypothesis and conclude that \(\beta_1\) is significantly different from 0.
Import data copepod_composition.txt, cop_density.txt and copepodSPlist.txt for analysis
#import copepod data
species <- read.table("copepod_composition.txt", header = T)
#import density data
dens <- as.vector(read.table("cop_density.txt", header = T)[[1]])
#import species name
species_name <- read.table("copepodSPlist.txt", sep = "\t")
#convert species frequency into percentage
species = species / 100
#calculate copepod density for each species for each cruise station
species.density = t(apply(species, 1, function(x) x * dens))
#extract the density of Oncaea Venusta and Canthocalanus pauper
OV = species.density[grep("Oncaea venusta", species_name[, 1]), ]
CP = species.density[grep("Canthocalanus pauper", species_name[, 1]), ]
#show partial data
kable(head(cbind(OV, CP)), digits = 2, col.names = c("Oncaea Venusta", "Canthocalanus pauper"), align = "l", caption = "Oncaea Venusta and Canthocalanus pauper density data (partial)")
| Oncaea Venusta | Canthocalanus pauper | |
|---|---|---|
| p1 | 4.48 | 0.00 |
| p3 | 4.38 | 0.00 |
| p4 | 0.00 | 0.00 |
| p6 | 18.04 | 0.00 |
| p13 | 78.75 | 4.73 |
| p16 | 56.05 | 4.67 |
diff = numeric(resamp_n)
for(i in 1:resamp_n){
shuffle = c(rep(c(1, 2), 34))[order(runif(68))]
data = cbind(c(OV, CP), shuffle)
g1 = subset(data, shuffle == 1)
g2 = subset(data, shuffle == 2)
diff[i] = mean(g1) - mean(g2)
}
#calculate estimated difference between Oncaea Venusta and Canthocalanus pauper density
est_diff = mean(OV) - mean(CP)
#calculate p-value
p_value_q2 = sum(abs(diff) > abs(est_diff)) / resamp_n
#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q2 = c(-est_diff, seq(min(diff), max(diff), length.out = 5), est_diff)
p_q2 <- ggplot(as.data.frame(diff), aes(x = diff)) +
geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
geom_vline(xintercept = c(-est_diff, est_diff), col = "red") +
labs(title = expression(Distribution~of~permuted~hat(theta)[1]-hat(theta)[2]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(theta)[1]-hat(theta)[2])) +
annotate("text", x = est_diff, y = 0.01, label = "Esimated~hat(theta)[1]-hat(theta)[2]", hjust = 1.1, parse = T) +
scale_x_continuous(breaks = lab_q2, labels = format(lab_q2, digits = 2))
#add shaded area to indicate significant values
d_q2 <- ggplot_build(p_q2)$data[[1]]
p_q2 + geom_area(data = subset(d_q2, x > est_diff), aes(x = x, y = y), col = "red", fill = "red") +
geom_area(data = subset(d_q2, x < -est_diff), aes(x = x, y = y), col = "red", fill = "red")
The p-value of our test on \(H_0:\theta_1-\theta_2=0\) is 0.01011, therefore we reject the null hypothesis and conclude that significant difference does exist between the density of Oncaea Venusta and Canthocalanus pauper.