Click here for other works of the author on RPubs
Load knitr
for better report quality with markdown and ggplot2
for better plots
library(knitr)
library(ggplot2)
Import data set enviANDdensity
, extract density data of fish and copepod from the data set and display partial data
enviANDdensity <- read.table("enviANDdensity.txt", header = T)
fish_density = as.matrix(enviANDdensity[, 11])
copepod_density = as.matrix(enviANDdensity[, 12])
# display a part of the data
kable(head(cbind(fish_density, copepod_density)), digits = 2, col.names = c("Fish density", "Copepod density"), align = "l", caption = "Fish and copepod density data (partial)")
Fish density | Copepod density |
---|---|
137.32 | 1119 |
20.97 | 1153 |
0.00 | 1719 |
0.00 | 855 |
180.71 | 1246 |
88.35 | 2123 |
Y = fish_density
X = cbind(1, copepod_density)
beta = solve(t(X) %*% X) %*% (t(X) %*% Y)
resamp_n = 100000
size = length(fish_density)
beta_sample = numeric()
for (i in 1:resamp_n) {
#create a lish of indices for randomization
shuffle = order(runif(size))
#define randomized dependent variable and independent variables and calculate coefficients
Y = fish_density[shuffle]
X = cbind(1, copepod_density)
b = solve(t(X) %*% X) %*% (t(X) %*% Y)
beta_sample = append(beta_sample, b)
}
beta_sample = t(matrix(beta_sample, 2, resamp_n))
#calculate p-value for the two tailed test
p_value_q1 = sum(abs(beta_sample[, 2]) > abs(beta[2])) / resamp_n
#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q1 = c(-beta[2], seq(min(beta_sample[, 2]), max(beta_sample[, 2]), length.out=5), beta[2])
p_q1 <- ggplot(as.data.frame(beta_sample[, 2]), aes(x = beta_sample[, 2])) +
geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
geom_vline(xintercept = c(-beta[2], beta[2]), col = "red") +
labs(title = expression(Distribution~of~permuted~hat(beta)[1]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(beta)[1])) +
annotate("text", x = beta[2], y = 10, label = "Esimated~hat(beta)[1]", hjust = 1.1, parse = T) +
scale_x_continuous(breaks = lab_q1, labels = format(lab_q1, digits = 1))
#add shaded area to indicate significant values
d_q1 <- ggplot_build(p_q1)$data[[1]]
p_q1 + geom_area(data = subset(d_q1, x > beta[2]), aes(x = x, y = y), col = "red", fill = "red") +
geom_area(data = subset(d_q1, x < -beta[2]), aes(x = x, y = y), col = "red", fill = "red")
The p-value of our test on \(H_0:\beta_1=0\) is 0.00009, therefore we reject the null hypothesis and conclude that \(\beta_1\) is significantly different from 0.
Import data copepod_composition.txt
, cop_density.txt
and copepodSPlist.txt
for analysis
#import copepod data
species <- read.table("copepod_composition.txt", header = T)
#import density data
dens <- as.vector(read.table("cop_density.txt", header = T)[[1]])
#import species name
species_name <- read.table("copepodSPlist.txt", sep = "\t")
#convert species frequency into percentage
species = species / 100
#calculate copepod density for each species for each cruise station
species.density = t(apply(species, 1, function(x) x * dens))
#extract the density of Oncaea Venusta and Canthocalanus pauper
OV = species.density[grep("Oncaea venusta", species_name[, 1]), ]
CP = species.density[grep("Canthocalanus pauper", species_name[, 1]), ]
#show partial data
kable(head(cbind(OV, CP)), digits = 2, col.names = c("Oncaea Venusta", "Canthocalanus pauper"), align = "l", caption = "Oncaea Venusta and Canthocalanus pauper density data (partial)")
Oncaea Venusta | Canthocalanus pauper | |
---|---|---|
p1 | 4.48 | 0.00 |
p3 | 4.38 | 0.00 |
p4 | 0.00 | 0.00 |
p6 | 18.04 | 0.00 |
p13 | 78.75 | 4.73 |
p16 | 56.05 | 4.67 |
diff = numeric(resamp_n)
for(i in 1:resamp_n){
shuffle = c(rep(c(1, 2), 34))[order(runif(68))]
data = cbind(c(OV, CP), shuffle)
g1 = subset(data, shuffle == 1)
g2 = subset(data, shuffle == 2)
diff[i] = mean(g1) - mean(g2)
}
#calculate estimated difference between Oncaea Venusta and Canthocalanus pauper density
est_diff = mean(OV) - mean(CP)
#calculate p-value
p_value_q2 = sum(abs(diff) > abs(est_diff)) / resamp_n
#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q2 = c(-est_diff, seq(min(diff), max(diff), length.out = 5), est_diff)
p_q2 <- ggplot(as.data.frame(diff), aes(x = diff)) +
geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
geom_vline(xintercept = c(-est_diff, est_diff), col = "red") +
labs(title = expression(Distribution~of~permuted~hat(theta)[1]-hat(theta)[2]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(theta)[1]-hat(theta)[2])) +
annotate("text", x = est_diff, y = 0.01, label = "Esimated~hat(theta)[1]-hat(theta)[2]", hjust = 1.1, parse = T) +
scale_x_continuous(breaks = lab_q2, labels = format(lab_q2, digits = 2))
#add shaded area to indicate significant values
d_q2 <- ggplot_build(p_q2)$data[[1]]
p_q2 + geom_area(data = subset(d_q2, x > est_diff), aes(x = x, y = y), col = "red", fill = "red") +
geom_area(data = subset(d_q2, x < -est_diff), aes(x = x, y = y), col = "red", fill = "red")
The p-value of our test on \(H_0:\theta_1-\theta_2=0\) is 0.01011, therefore we reject the null hypothesis and conclude that significant difference does exist between the density of Oncaea Venusta and Canthocalanus pauper.