Click here for other works of the author on RPubs

Load packages

Load knitr for better report quality with markdown and ggplot2 for better plots

library(knitr)
library(ggplot2)

Q1. Compute the regression coefficients for \(fish = \beta_0 + \beta_1 * copepod\) and use randomization method to generate null distribution of \(\beta_1\) and test whether \(\beta_1\) is significantly different from null (with randomization 5000 times). Report your p-value.

Import data

Import data set enviANDdensity, extract density data of fish and copepod from the data set and display partial data

enviANDdensity <- read.table("enviANDdensity.txt", header = T)
fish_density = as.matrix(enviANDdensity[, 11])
copepod_density = as.matrix(enviANDdensity[, 12])

# display a part of the data
kable(head(cbind(fish_density, copepod_density)), digits = 2, col.names = c("Fish density", "Copepod density"), align = "l", caption = "Fish and copepod density data (partial)")
Fish and copepod density data (partial)
Fish density Copepod density
137.32 1119
20.97 1153
0.00 1719
0.00 855
180.71 1246
88.35 2123

Calculate estimated \(\hat{\beta_0}\) & \(\hat{\beta_1}\)

Y = fish_density
X = cbind(1, copepod_density)
beta = solve(t(X) %*% X) %*% (t(X) %*% Y)

Generate samples for permutaion test under null hypothesis \(H_0:\beta_1=0\)

resamp_n = 100000
size = length(fish_density)
beta_sample = numeric()
for (i in 1:resamp_n) {
    #create a lish of indices for randomization
    shuffle = order(runif(size))
    
    #define randomized dependent variable and independent variables and calculate coefficients
    Y = fish_density[shuffle]
    X = cbind(1, copepod_density)
    b = solve(t(X) %*% X) %*% (t(X) %*% Y)
    beta_sample = append(beta_sample, b)
}
beta_sample = t(matrix(beta_sample, 2, resamp_n))

Plot the distribution of permuted \(\hat{\beta_1}\) samples and report p-value for the test

#calculate p-value for the two tailed test
p_value_q1 = sum(abs(beta_sample[, 2]) > abs(beta[2])) / resamp_n

#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q1 = c(-beta[2], seq(min(beta_sample[, 2]), max(beta_sample[, 2]), length.out=5), beta[2])
p_q1 <- ggplot(as.data.frame(beta_sample[, 2]), aes(x = beta_sample[, 2])) +
        geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
        geom_vline(xintercept = c(-beta[2], beta[2]), col = "red") +
        labs(title = expression(Distribution~of~permuted~hat(beta)[1]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(beta)[1])) +
        annotate("text", x = beta[2], y = 10, label = "Esimated~hat(beta)[1]", hjust = 1.1, parse = T) +
        scale_x_continuous(breaks = lab_q1, labels = format(lab_q1, digits = 1))

#add shaded area to indicate significant values
d_q1 <- ggplot_build(p_q1)$data[[1]]
p_q1 + geom_area(data = subset(d_q1, x > beta[2]), aes(x = x, y = y), col = "red", fill = "red") +
    geom_area(data = subset(d_q1, x < -beta[2]), aes(x = x, y = y), col = "red",  fill = "red")

The p-value of our test on \(H_0:\beta_1=0\) is 0.00009, therefore we reject the null hypothesis and conclude that \(\beta_1\) is significantly different from 0.


Q2. Randomization test whether significant difference exists between the density of Oncaea Venusta and Canthocalanus pauper. (Assume all data are independent and use all 34 stations.) Report your p-value.

Import data

Import data copepod_composition.txt, cop_density.txt and copepodSPlist.txt for analysis

#import copepod data
species <- read.table("copepod_composition.txt", header = T)

#import density data
dens <- as.vector(read.table("cop_density.txt", header = T)[[1]])

#import species name
species_name <- read.table("copepodSPlist.txt", sep = "\t")

Extract Oncaea Venusta and Canthocalanus pauper data

#convert species frequency into percentage
species = species / 100

#calculate copepod density for each species for each cruise station
species.density = t(apply(species, 1, function(x) x * dens))

#extract the density of Oncaea Venusta and Canthocalanus pauper
OV = species.density[grep("Oncaea venusta", species_name[, 1]), ]
CP = species.density[grep("Canthocalanus pauper", species_name[, 1]), ]

#show partial data
kable(head(cbind(OV, CP)), digits = 2, col.names = c("Oncaea Venusta", "Canthocalanus pauper"), align = "l", caption = "Oncaea Venusta and Canthocalanus pauper density data (partial)")
Oncaea Venusta and Canthocalanus pauper density data (partial)
Oncaea Venusta Canthocalanus pauper
p1 4.48 0.00
p3 4.38 0.00
p4 0.00 0.00
p6 18.04 0.00
p13 78.75 4.73
p16 56.05 4.67

Generate samples for permutation test for null hypothesis \(H_0:\theta_1-\theta_2=0\)

diff = numeric(resamp_n)
for(i in 1:resamp_n){
    shuffle = c(rep(c(1, 2), 34))[order(runif(68))]
    data = cbind(c(OV, CP), shuffle)
    g1 = subset(data, shuffle == 1)
    g2 = subset(data, shuffle == 2)
    diff[i] = mean(g1) - mean(g2)
}

Plot the distribution of permuted \(\hat{\theta_1}-\hat{\theta_2}\) samples and report p-value for the test

#calculate estimated difference between Oncaea Venusta and Canthocalanus pauper density
est_diff = mean(OV) - mean(CP)

#calculate p-value
p_value_q2 = sum(abs(diff) > abs(est_diff)) / resamp_n

#create density plot for the permuted samples and add vertical lines at the estimated value
lab_q2 = c(-est_diff, seq(min(diff), max(diff), length.out = 5), est_diff)
p_q2 <- ggplot(as.data.frame(diff), aes(x = diff)) +
        geom_density(fill = "#99CCFF", col = "#3399FF", alpha = 0.5) +
        geom_vline(xintercept = c(-est_diff, est_diff), col = "red") +
        labs(title = expression(Distribution~of~permuted~hat(theta)[1]-hat(theta)[2]~samples~"for"~null~hypothesis), x = expression(Permuted~hat(theta)[1]-hat(theta)[2])) +
        annotate("text", x = est_diff, y = 0.01, label = "Esimated~hat(theta)[1]-hat(theta)[2]", hjust = 1.1, parse = T) +
        scale_x_continuous(breaks = lab_q2, labels = format(lab_q2, digits = 2))

#add shaded area to indicate significant values
d_q2 <- ggplot_build(p_q2)$data[[1]]
p_q2 + geom_area(data = subset(d_q2, x > est_diff), aes(x = x, y = y), col = "red", fill = "red") +
    geom_area(data = subset(d_q2, x < -est_diff), aes(x = x, y = y), col = "red", fill = "red")

The p-value of our test on \(H_0:\theta_1-\theta_2=0\) is 0.01011, therefore we reject the null hypothesis and conclude that significant difference does exist between the density of Oncaea Venusta and Canthocalanus pauper.