Click here for other works of the author on RPubs
Load knitr
for better report quality with markdown, ggplot2
for better plot quality and reshape2
for rearranging data
library(knitr)
library(ggplot2)
library(reshape2)
Import data set enviANDdensity
and extract density data of fish and copepod from the data set
enviANDdensity <- read.table("enviANDdensity.txt", header = T)
fish_density=as.matrix(enviANDdensity[,11])
copepod_density=as.matrix(enviANDdensity[,12])
# display a part of the data
kable(head(cbind(fish_density,copepod_density)),digits=2,col.names = c("Fish density","Copepod density"),align="l",caption="Fish and copepod density data (partial)")
Fish density | Copepod density |
---|---|
137.32 | 1119 |
20.97 | 1153 |
0.00 | 1719 |
0.00 | 855 |
180.71 | 1246 |
88.35 | 2123 |
Define function resamp
for generating samples
resamp <- function(data,method,stat=mean,boot_n=1000){
FUN=match.fun(stat)
if(method=="Jackknife" | method== "jackknife"){
#create jackknife samples
resamp=numeric()
for(i in 1:length(data)){
resamp[i]=FUN(data[-i])
}
}
else if(method=="Bootstrap" | method== "bootstrap"){
#create and randomize n sets of bootstrap samples
size=length(data)
shuffle=rep(data,boot_n)[order(runif(size*boot_n))]
#split bootstrap samples and calculate the statistic of interest for each set
resamp=apply(matrix(shuffle,size,boot_n),2,FUN)
}
else{
stop("Please define a resampling method")
}
return(resamp)
}
Define function resamp.result
to report the estimated statistic and its standard error. If mean is the statistic of interested, analytical results can also be obtained by giving the original sample as an input.
resamp.result <- function(samp,method,origin.mean=NULL){
#calculate the estimated parameter and SE for jackknife
if(method=="Jackknife" | method== "jackknife"){
result=t(cbind(mean(samp),sd(samp)*sqrt((length(samp)-1)^2/length(samp))))
colnames(result) <- "Jackknife"
}
#calculate the estimated parameter and SE for bootstrap
else if(method=="Bootstrap" | method== "bootstrap"){
result=t(as.matrix(cbind(mean(samp),sd(samp))))
colnames(result) <- "Bootstrap"
}
else{
stop("Please define a resampling method")
}
# If mean is the statistic of interest, also calculate the estimated mean and SE using normal theory given the original data
if(!is.null(origin.mean)){
origin=t(as.matrix(cbind(mean(origin.mean),sd(origin.mean)/sqrt(length(origin.mean)))))
result=cbind(origin,result)
colnames(result)[1] <- "Normal theory"
}
rownames(result) <- c("Estimated parameter","Estimated standard error")
return(result)
}
Define function samp.plot
to plot histogram for sample
samp.plot <- function(data1,data2=NULL,dens=T,col.names=c("Jackknife","Bootstrap"),title="",xlab="",bins=30){
# If only one data set is given
if(is.null(data2)){
data1=as.data.frame(data1)
colnames(data1) <- "Density"
# plot the histogram using ggplot
ggp <- ggplot(data1,aes(x=Density)) +
geom_histogram(aes(y =..density..),col="red",fill="green",alpha=0.2,bins = bins) +
labs(title=title,x=xlab,y="Frequency")
if(dens){
ggp=ggp+geom_density(col="red")
}
}
else{
if(length(data1) < length(data2)){
data=cbind(c(data1,rep(NA,length(data2)-length(data1))),data2)
}
else if(length(data1) > length(data2)){
data=cbind(data1,c(data2,rep(NA,length(data1)-length(data2))))
}
else {data=cbind(data1,data2)}
colnames(data) <- col.names
data=melt(data)
data=subset(data,!is.na(value))
colnames(data) <- c("num","Method","Density")
ggp <- ggplot(as.data.frame(data),aes(x=Density,color=Method,fill=Method)) +
facet_grid(.~Method,scales="free") +
geom_histogram(aes(y =..density..),alpha=0.2,bins=bins) +
labs(title=title,
x=xlab,y="Frequency")
if(dens){
ggp=ggp+geom_density(alpha=0)
}
}
return(ggp)
}
fish_mean_jk=resamp(fish_density,"Jackknife",mean)
copepod_mean_jk=resamp(copepod_density,"Jackknife",mean)
jk_fish_mean_result=resamp.result(fish_mean_jk,"Jackknife",fish_density)
kable(jk_fish_mean_result,digits=2,caption="Comparison of normal theory v.s jackknife for fish density mean",align="l")
Normal theory | Jackknife | |
---|---|---|
Estimated parameter | 322.45 | 322.45 |
Estimated standard error | 61.23 | 61.23 |
samp.plot(fish_mean_jk,title="Jackknife samples of fish density mean",xlab="Fish density")
jk_copepod_mean_result=resamp.result(copepod_mean_jk,"Jackknife",copepod_density)
kable(jk_copepod_mean_result,digits=2,caption="Comparison of normal theory v.s jackknife for copepod density mean",align="l")
Normal theory | Jackknife | |
---|---|---|
Estimated parameter | 1972.37 | 1972.37 |
Estimated standard error | 342.55 | 342.55 |
samp.plot(copepod_mean_jk,title="Jackknife samples of copepod density mean",xlab="Copepod density")
# set dependent variable
Y=fish_density
# set intercept and independent variable
X=cbind(1,copepod_density)
# compute coefficients
B = solve(t(X) %*% X) %*% (t(X) %*% Y)
# compute the standard deviation for the coefficients
var_B <- anova(lm(Y ~ X))[[3]][2] * solve(t(X) %*% X)
std_B <- sqrt(diag(var_B))
nor_B <- t(cbind(B,std_B))
colnames(nor_B) <- c("Normal theory","Normal theory")
rownames(nor_B) <- c("Estimated parameter","Estimated standard error")
Define function lm.resamp
to generate samples for coefficients in a linear regression model for given dependent and independent variables
lm.resamp <- function(dv,iv,method,boot_n=1000){
B_sample=numeric()
sampsize=length(dv)
#generate jackknife samples for coefficients in a linear regression model
if(method=="Jackknife" | method== "jackknife"){
resamp_size=sampsize
for(i in 1:resamp_size){
samp=as.matrix(cbind(dv,iv))[-i,]
Y=samp[,1]
X=cbind(rep(1,length(samp[,1])),samp[,-1])
beta = solve(t(X) %*% X) %*% (t(X) %*% Y)
B_sample=append(B_sample,beta)
}
}
#generate bootstrap samples for coefficients in a linear regression model
else if(method=="Bootstrap" | method== "bootstrap"){
resamp_size=boot_n
for(i in 1:boot_n){
samp=as.matrix(cbind(dv,iv))[ceiling(runif(sampsize,0,sampsize)),]
Y=samp[,1]
X=cbind(rep(1,length(samp[,1])),samp[,-1])
beta = solve(t(X) %*% X) %*% (t(X) %*% Y)
B_sample=append(B_sample,beta)
}
}
else{
stop("Please define a resampling method")
}
#return samples for the coefficients
return(t(matrix(B_sample,dim(iv)[2]+1,resamp_size)))
}
Conduct jackknife on coefficients \(\beta_0\), \(\beta_1\) using the lm.resamp
function created above
lm_jk=lm.resamp(fish_density,copepod_density,"jackknife")
jk_lm_B0=resamp.result(lm_jk[,1],"jackknife")
B0_est=cbind(nor_B[,1],jk_lm_B0)
colnames(B0_est) <- c("Normal theory","Jackknife")
rownames(B0_est) <- c("Estimated parameter","Estimated standard error")
kable(B0_est,digits=2,align="l",caption="Analytical and Jackknife estimates of coefficient B0")
Normal theory | Jackknife | |
---|---|---|
Estimated parameter | 93.06 | 92.73 |
Estimated standard error | 66.86 | 69.27 |
samp.plot(lm_jk[,1],title="Histogram for jackknife samples of coefficient B0",xlab="B0")
jk_lm_B1=resamp.result(lm_jk[,2],"jackknife")
B1_est=cbind(nor_B[,2],jk_lm_B1)
colnames(B1_est) <- c("Normal theory","Jackknife")
rownames(B1_est) <- c("Estimated parameter","Estimated standard error")
kable(B1_est,digits=2,align="l",caption="Analytical and Jackknife estimates of coefficient B1")
Normal theory | Jackknife | |
---|---|---|
Estimated parameter | 0.12 | 0.12 |
Estimated standard error | 0.02 | 0.04 |
samp.plot(lm_jk[,2],title="Histogram for jackknife samples of coefficient B1",xlab="B1")
Estimates are calculated using functions created above. Bootstrap Sample size is set to 1000.
fish_mean_bt=resamp(fish_density,"bootstrap",mean)
boot_fish_mean=resamp.result(fish_mean_bt,"bootstrap")
copepod_mean_bt=resamp(copepod_density,"bootstrap",mean)
boot_copepod_mean=resamp.result(copepod_mean_bt,"bootstrap")
fish_resamp_mean=cbind(jk_fish_mean_result,boot_fish_mean)
kable(fish_resamp_mean,digits=2,caption="Comparison of normal theory, bootstrap and jackknife for fish density mean")
Normal theory | Jackknife | Bootstrap | |
---|---|---|---|
Estimated parameter | 322.45 | 322.45 | 322.45 |
Estimated standard error | 61.23 | 61.23 | 61.42 |
samp.plot(fish_mean_jk,fish_mean_bt,title="Jackknife and bootstrap samples of fish density mean",xlab="Fish density")
copepod_resamp_mean=cbind(jk_copepod_mean_result,boot_copepod_mean)
kable(copepod_resamp_mean,digits=2,caption="Comparison of normal theory, bootstrap and jackknife for copepod density mean")
Normal theory | Jackknife | Bootstrap | |
---|---|---|---|
Estimated parameter | 1972.37 | 1972.37 | 1972.37 |
Estimated standard error | 342.55 | 342.55 | 350.04 |
samp.plot(copepod_mean_jk,copepod_mean_bt,title="Jackknife and bootstrap samples of copepod density mean",xlab="Copepod density")
Conduct bootstrap on coefficients \(\beta_0\), \(\beta_1\) using the lm.resamp
function created above
lm_bt=lm.resamp(fish_density,copepod_density,"bootstrap")
boot_B0=resamp.result(lm_bt[,1],"bootstrap")
boot_B1=resamp.result(lm_bt[,2],"bootstrap")
result_lm_B0=cbind(nor_B[,1],jk_lm_B0,boot_B0)
colnames(result_lm_B0)[1] <- "Normal theory"
kable(result_lm_B0,digits=2,caption="Comparison of normal theory, bootstrap and jackknife for coefficient B0")
Normal theory | Jackknife | Bootstrap | |
---|---|---|---|
Estimated parameter | 93.06 | 92.73 | 86.10 |
Estimated standard error | 66.86 | 69.27 | 62.24 |
# compare histogram of jackknife and bootstrap samples
samp.plot(lm_jk[,1],lm_bt[,1],title="Jackknife and bootstrap samples of coefficient B0",xlab="B0")
result_lm_B1=cbind(nor_B[,2],jk_lm_B1,boot_B1)
colnames(result_lm_B1)[1] <- "Normal theory"
kable(result_lm_B1,digits=2,caption="Comparison of normal theory, bootstrap and jackknife for coefficient B1")
Normal theory | Jackknife | Bootstrap | |
---|---|---|---|
Estimated parameter | 0.12 | 0.12 | 0.12 |
Estimated standard error | 0.02 | 0.04 | 0.03 |
# compare histogram of jackknife and bootstrap samples
samp.plot(lm_jk[,2],lm_bt[,2],title="Jackknife and bootstrap samples of coefficient B1",xlab="B1")