setwd("/Users/jenniferpolson/Documents/School/2016S/Physci M200/Lab Projects/Assignment 3")
#import the data
mydata = read.csv("NKCC.csv")
#define your dataset
troutdata <- na.omit(mydata)
options(digits=2)
BL1 <- troutdata$Day1
title1 = "Day 1"
BL2 <- troutdata$Day7
title2 = "Day 7"
yaxis = "NKCC Abundance"
par(mfrow=c(2,3),oma = c(0, 0, 2, 0))
#overlapping histogram
hist(BL2, col=rgb(1,0.85,0,0.5), breaks=10, main="Fig. 1A Histogram", xlab= yaxis)
hist(BL1, col=rgb(0.07,0.59,0.42,0.5), breaks= "FD", add=T)
box()
legend("topright", c(title1, title2, "Means"), col=c(rgb(0.07,0.59,0.42,0.5), rgb(1,0.85,0,0.5), "black"), lwd=c(5,5,3))
abline(v = mean(BL1), lwd=2)
abline(v = mean(BL2), lwd=2)
#qqplot
qqnorm(BL1, main= "Fig. 1B Day 1 Normal Q-Q Plot", col=rgb(0.07,0.59,0.42,0.5));qqline(BL1, lwd=2)
qqnorm(BL2, main= "Fig. 1C Day 7 Normal Q-Q Plot", col=rgb(1,0.85,0,0.5));qqline(BL2, lwd=2)
#boxplot
boxplot(BL1, BL2, col=c(rgb(0.07,0.59,0.42,0.5), rgb(1,0.85,0,0.5)),
names = c(title1, title2), main="Fig. 1D Box Plot",
ylab=yaxis)
days <- list(title1=BL1, title2=BL2)
stripchart(days, method = "jitter",
vertical=TRUE, add=TRUE,
col=rgb(0.5,0.5,0.5,1), pch=16)
#violin plot
library(vioplot)
## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
# Set up plot
plot(1, 1, main = "Fig. 1E Violin Plots",
xlim = c(0, 2), ylim = range(c(BL1,BL2)), type = "n",
xlab = "", ylab = yaxis, xaxt = "n")
# Specify axis labels
axis(side = 1, at = c(0.5:2), labels=c(title1, title2))
#labels for the x axis (side = 1)
# Manually add each violin
vioplot(BL1, at = 0.5, col=rgb(0.07,0.59,0.42,0.5), add = TRUE)
#at = 1 > specifies x axis position (1)
vioplot(BL2, at = 1.5, col=rgb(1,0.85,0,0.5), add = TRUE)
#at = 2 > specifies x axis position (2)
#stripchart
days <- list("Day 1" =BL1, "Day 7"= BL2)
stripchart(days, method = "jitter", vertical=TRUE,
col=c(rgb(0.07,0.59,0.42,0.5), rgb(1,0.85,0,0.5)), pch=16,
main="Fig. 1F Stripchart",
ylab=yaxis)
mtext("Figure 1: Comparison of Trout NKCCs by Day 1 and Day 7", outer = TRUE, cex = 1, font = 2)
Figure 1 compares the NKCC Abundance in brown trout on Day 1 and Day 7 of saltwater exposure. The Day 1 and Day 7 measurements are represented by green and yellow, respectively, in each graph. Figure 1A represents the data of the trout in an overlapping histogram. Figures 1B and 1C depict quantile-quantile plots the Day 1 and Day 7 data, respectively, that compare these data subsets to a normal distribution. The line indicates the normal distribution, and the dots indicate the quantile values of the actual datasets. Figure 1D represents the data using box plots. The box indicates the 25th and 75th percentiles, with the black line indicating the median of the data. The dashed line plungers indicate the most extreme data point that is 1.5 times the interquartile range from the quartiles, and the blank dots indicate the most extreme data that doesnt lie in that range. Figure 1E shows the same data in a violin plot. The white dots indicate the median, with the black bars indicating the values of the upper and lower quantiles. Figure 1F depicts stripcharts of the data in a jittered formation.
The data depicted in Figure 1 of Day 1 and Day 7 measurements do not follow a Gaussian distribution. Both sets of data have one mode, and it does not seem that there are any apparent outliers in either dataset. However, the Day 1 data is positively skewed, as is the Day 7 data, as depicted by the histograms and the Q-Q plots. As such, these data cannot be fit into a Gaussian distribution. One other point of interest is the range of the two sets of measurements. The Day 1 measurements range from 0-25, while the Day 7 abundances range from 0-50.
#CIs for mean - Day 1
deals = 10000
len = length(BL1)
bootmean = rep (NA, deals)
for (i in 1 : deals) {
bootdata = sample (BL1, len, replace = TRUE)
bootmean[i] = mean(bootdata)
}
CIM1 <- sort(bootmean)[c(.025*deals, .975*deals)]
#Confidence Intervals of SD - Day 1
len = length(BL1)
bootsd = rep (NA, deals)
for (i in 1 : deals) {
bootdata = sample (BL1, len, replace = TRUE)
bootsd[i] = sd(bootdata)
}
CISD1 <- sort(bootsd)[c(.025*deals, .975*deals)]
#Graph of Day 1 Means
par(mfrow=c(2,2), oma = c(0, 0, 2, 0))
hist(bootmean, main= "Resampled Day 1 Means", xlab="Means",
ylim = c(0,2000))
abline(v = mean(bootmean),
col = "red",
lwd = 2)
# confidence plots?
abline(v = CIM1[1],
col = "red",
lwd = 1,
lty = 2)
abline(v = CIM1[2],
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Mean", "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
#histogram of bootsd for Day 1
hist(bootsd, main= "Resampled Day 1 SDs", xlab="Standard Deviations")
abline(v = mean(bootsd),
col = "red",
lwd = 2)
# confidence plots?
abline(v = CISD1[1],
col = "red",
lwd = 1,
lty = 2)
abline(v = CISD1[2],
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Mean", "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
#CIs for Mean - Day 7 Data
deals = 10000
len = length(BL2)
bootmean = rep (NA, deals)
for (i in 1 : deals) {
bootdata = sample (BL2, len, replace = TRUE)
bootmean[i] = mean(bootdata)
}
CIM2 <- sort(bootmean)[c(.025*deals, .975*deals)]
#Confidence Intervals of SD - Day 7
len = length(BL2)
bootsd = rep (NA, deals)
for (i in 1 : deals) {
bootdata = sample (BL2, len, replace = TRUE)
bootsd[i] = sd(bootdata)
}
CISD2 <- sort(bootsd)[c(.025*deals, .975*deals)]
#histogram for resampled means Day 1
hist(bootmean, main= "Resampled Day 7 Means", xlab="Means",
ylim = c(0,2000))
abline(v = mean(bootmean),
col = "red",
lwd = 2)
# confidence plots?
abline(v = CIM2[1],
col = "red",
lwd = 1,
lty = 2)
abline(v = CIM2[2],
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Mean", "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
#histogram of bootsd for Day 1
hist(bootsd, main= "Resampled Day 7 SDs", xlab="Standard Deviations",
ylim = c(0,2000))
abline(v = mean(bootsd),
col = "red",
lwd = 2)
# confidence plots?
abline(v = CISD2[1],
col = "red",
lwd = 1,
lty = 2)
abline(v = CISD2[2],
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Mean", "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
mtext("Figure 2: Histograms of Resampled Means and Standard Deviations", outer = TRUE, cex = 1, font = 2)
Based on descriptions of the data, the measure of central tendency used will be the mean. The mean of the Day 1 measurements was 10.99 (9.01, 13.05), with the standard deviation being 5.9(r CISD1[1]
, r CISD1[2]
); the mean of the measurements taken from the trout on Day 7 was 21.99 (17.92, 26.12), with the standard deviation being 11.81(r CISD2[1]
,r CISD2[2]
).
troutdata$diff = troutdata$Day7 - troutdata$Day1
deals = 10000
index = 1:nrow(troutdata)
meandiff = rep(NA,deals)
for (i in 1 : deals) {
bootindex = sample (index, length(index), replace = TRUE) #indices
bootdiff = troutdata$diff [bootindex] #use indices to get resampled
meandiff[i] = mean(bootdiff)
}
CIC <- sort(meandiff)[c(.025*deals,.975*deals)]
differencemeasuredE <- mean(troutdata$diff)
par(mfrow= c(1,1))
hist(meandiff,
main="Figure 3: Histogram of Resampled Mean Differences",
xlab = "Difference",
breaks = "FD")
abline(v = differencemeasuredE,
col = "red",
lwd = 2)
# confidence plots?
abline(v = CIC[1],
col = "red",
lwd = 1,
lty = 2)
abline(v = CIC[2],
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Effect Size", "95% CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
Based on the type of comparison being made, a paired t-test is the most adequate way to compare the measurements taken from the same fish on different days. The calculated effect size was 10.99 (9, 13.04). This indicates the acclimation to salt water has a positive effect on the number of NKCC present in brown trout gills.
deals = 10000
meandiff = rep(NA ,deals)
for (i in 1:deals) {
signs = sample(c(-1,1), length(troutdata$diff), replace = TRUE) #sample random new signs
bootdiff = signs * troutdata$diff #randomize
meandiff[i] = mean(bootdiff)
}
CI <- sort(meandiff)[.95*deals]
differencemeasuredN <- mean(troutdata$diff)
#generate the p-value thresholds
HP <- sum(meandiff > abs(differencemeasuredN))
#calculate the pvalue
pvalue1 = (HP)/deals
hist(meandiff,
main="Figure 4: Histogram of NHST",
xlab = "Difference",
breaks = "FD",
xlim = c(-12,12))
abline(v = differencemeasuredN,
col = "red",
lwd = 2)
# confidence plots?
abline(v = CI,
col = "red",
lwd = 1,
lty = 2)
legend("topright", legend = c("Effect Size", expression(alpha)), col = "Red", lty = c(1,2), lwd = c(2,1))
## import stuff
troutdata2 = read.csv("trout_hormones.csv")
mydata2 <- na.omit(troutdata2)
par(mfrow=c(2,3), oma=c(0,0,2,0))
hist(mydata2$cort,
main= "Fig 5A Histogram of Cortisol",
xlab= "Hormone Level",
breaks= "FD")
abline(v= median(mydata2$cort),
lwd= 2,
col= "green")
legend("topright", legend = "Median", col = "green", lwd = 2)
hist(mydata2$pro,
main= "Fig 5B Histogram of Prolactin",
xlab= "Hormone Level",
breaks= "FD")
abline(v= median(mydata2$pro),
lwd= 2,
col= "green")
hist(mydata2$TH,
main= "Fig 5A Histogram of Thyroid Hormone",
xlab= "Hormone Level",
breaks= "FD")
abline(v= median(mydata2$TH),
lwd= 2,
col= "green")
plot(cort ~ pro, mydata2,
xlab= "Prolactin",
ylab= "Cortisol",
main= "Fig 5C Cortisol vs. Prolactin")
plot(cort ~ TH, mydata2,
xlab= "Thyroid Hormone",
ylab= "Cortisol",
main= "Fig 5D Cortisol vs. TH")
plot(pro ~ TH, mydata2,
ylab= "Prolactin",
xlab= "Thyroid Hormone",
main= "Fig 5E Prolactin vs. TH")
mtext("Figure 5: Distributions and Correlations of Trout Hormones", outer = TRUE, cex = 1, font = 2)
Figures 5A-C depict the distributions of each hormone level. The data, all follow a similar distribution, in that the data is positively skewed in every case. The medians are depicted as a green line. Because of this skew, these data cannot be considered normal in any of the three subsets. In addition, based on the 3 plots above in Figures 5D-F, it would make the most sense to test for correlation using the Spearman association, as none of the plots appear to be linear. If they were linear, a Pearson correlation would be used.
sp.ci.cp <- spearman.CI.boot(mydata2$cort, mydata2$pro)
sp.NH.cp <- spearman.NHST.boot(mydata2$cort, mydata2$pro)
sp.ci.ct <- spearman.CI.boot(mydata2$cort, mydata2$TH)
sp.NH.ct <- spearman.NHST.boot(mydata2$cort, mydata2$TH)
sp.ci.tp <- spearman.CI.boot(mydata2$TH, mydata2$pro)
sp.NH.tp <- spearman.NHST.boot(mydata2$TH, mydata2$pro)
par(mfrow= c(2,3), oma=c(0,0,2,0))
hist(sp.ci.cp[[1]],
main="6A: Cor-Pro Resample",
xlab="Spearman Correlation")
abline (v=sp.ci.cp[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=sp.ci.cp[[3]],
col= "red",
lty= 2)
legend("topleft", legend = c(expression(rho), "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
hist(sp.ci.ct[[1]],
main= "6B: Cor-TH Resample",
xlab="Spearman Correlation")
abline (v=sp.ci.ct[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=sp.ci.ct[[3]],
col= "red",
lty= 2)
hist(sp.ci.tp[[1]],
main= "6C: Pro-TH Resample",
xlab="Spearman Correlation")
abline (v=sp.ci.tp[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=sp.ci.tp[[3]],
col= "red",
lty= 2)
hist(sp.NH.cp[[1]],
main= "6D: Cort-Pro NHST",
xlab="Spearman Correlation",
xlim= c(-0.4,1))
abline (v=sp.ci.cp[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=sp.NH.cp[[4]],
col= "red",
lty= 2)
legend("topright", legend = c(expression(rho), "95%Threshold"), col = c("blue","red"), lty = c(1,2), lwd = c(2,1))
hist(sp.NH.ct[[1]],
main= "6E: Cort-TH NHST",
xlab="Spearman Correlation")
abline (v=sp.ci.ct[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=sp.NH.ct[[4]],
col= "red",
lty= 2)
hist(sp.NH.tp[[1]],
main= "6F: Pro-TH NHST",
xlab="Spearman Correlation")
abline (v=sp.ci.tp[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=sp.NH.tp[[4]],
col= "red",
lty= 2)
mtext("Figure 6: Resampled Correlations and NHSTs of Trout Hormone Spearman Correlations", outer = TRUE, cex = 1, font = 2)
Performing Spearman Correlations on all three comparisons yielded the following results: for the comparison of Cortisol and Prolactin, the coefficient \(\rho\) was 0.91 (0.74, 0.88). For the comparison of Cortisol and Thyroid Hormone (TH), \(\rho\) was -0.05 (-0.1, 0.26), . Finally, the comparison of Prolactin and TH yielded the following \(\rho\): 0 (-0.06, 0.29).
The Null Hypothesis Significance Test performed on these datasets was two-sided and paired. Using the null hypothesis that \(\rho\) = 0, the tests generated the following results: the p-value for the Cortisol-Prolactin comparison was 0 (< 0.0001), which means that we must reject the null hypothesis that these two datasets are statistically independen. When comparing Cortisol and TH, the calculated p-value was 0.62, which means that we cannot reject the null hypothesis, meaning that it is possible that there is no correlation between the two hormone levels. The same general result was seen in the comparison of Prolactin and TH, with the calculated p-value approximating to 0.98. Therefore, from this test, one can only reject the null hypothesis of the first comparison.
mic.ci.cp <- MIC.CI.boot(mydata2$cort, mydata2$pro)
mic.nhst.cp <- MIC.NHST.boot(mydata2$cort, mydata2$pro)
mic.ci.ct <- MIC.CI.boot(mydata2$cort, mydata2$TH)
mic.nhst.ct <- MIC.NHST.boot(mydata2$cort, mydata2$TH)
mic.ci.tp <- MIC.CI.boot(mydata2$TH, mydata2$pro)
mic.nhst.tp <- MIC.NHST.boot(mydata2$TH, mydata2$pro)
par(mfrow= c(2,3), oma=c(0,0,2,0))
hist(mic.ci.cp[[1]],
main="7A: Cor-Pro MIC Resample",
xlab="MIC",
ylim= c(0,3500))
abline (v=mic.ci.cp[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=mic.ci.cp[[3]],
col= "red",
lty= 2)
legend("topright", legend = c("MIC", "95%CIs"), col = "Red", lty = c(1,2), lwd = c(2,1))
hist(mic.ci.ct[[1]],
main="7B: Cor-TH MIC Resample",
xlab="MIC")
abline (v=mic.ci.ct[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=mic.ci.ct[[3]],
col= "red",
lty= 2)
hist(mic.ci.tp[[1]],
main="7C: Pro-TH MIC Resample",
xlab="MIC",
xlim=c(-0.15,0.6))
abline (v=mic.ci.tp[[2]],
col= "red",
lwd= 2,
lty= 1)
abline (v=mic.ci.tp[[3]],
col= "red",
lty= 2)
hist(mic.nhst.cp[[1]],
main= "7D: Cort-Pro NHST",
xlab="MIC",
xlim = c(0.15, 0.7))
abline (v=mic.ci.cp[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=mic.nhst.cp[[3]],
col= "red",
lty= 2)
legend("topright", legend = c("MIC", "95%Threshold"), col = c("blue","red"), lty = c(1,2), lwd = c(2,1))
hist(mic.nhst.ct[[1]],
main= "7E: Cort-TH NHST",
xlab="MIC")
abline (v=mic.ci.ct[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=mic.nhst.ct[[3]],
col= "red",
lty= 2)
hist(mic.nhst.tp[[1]],
main= "7F: Pro-TH NHST",
xlab="MIC")
abline (v=mic.ci.tp[[2]],
col= "blue",
lwd= 2,
lty= 1)
abline (v=mic.nhst.tp[[3]],
col= "red",
lty= 2)
mtext("Figure 7: Resampled Correlations and NHSTs of Trout Hormone MICs", outer = TRUE, cex = 1, font = 2)
Comparison | Spearman Corr. | p-value | MIC | p-value |
---|---|---|---|---|
Cort-Pro | 0.91 (0.74, 0.88) | 0 | 0.61 (0.64, 0.64) | 0 |
Cort-TH | -0.05 (-0.1, 0.26) | 0.62 | 0.26 (0.29, 0.49) | 0.14 |
Pro-TH | 0 (-0.06, 0.29) | 0.98 | 0.23 (0.29, 0.51) | 0.36 |
The calculations given by the Maximal Information Coefficient (MIC), part of the mine funtion from the minerva package, are very different from those generated by the Spearman correlations, as illustrated by Table 1. A NHST was also run for all three comparisons; this was a one-sided test, due to the nature of the MIC calculation yielding a positive number between 0 and 1. The MIC for the Cortisol-Prolactin comparison is much lower [0.61 (0.64, 0.64)] than the \(\rho\) [0.91 (0.74, 0.88)]. Interestingly, the p value for both the Spearman coefficient and the MIC were < 0.0001, which indicates that the null hypothesis can be rejected in both instances. Therefore, these data have passed to methods of testing for correlation. The MICs calculated for the other two comparisons were much higher than the Spearman counterparts, as evidence in Table 1. That being said, the p-values were within the critical thresholds that were calculated for the comparison, so conclusions could not be drawn from any of these tests.
###spearman CI formula
spearman.CI.boot <- function(attributeA, attributeB) {
deals = 10000
bootcor = rep(NA,deals)
indices = c(1:length(attributeA))
for (i in 1:deals) {
bootindices = sample(indices, length(indices), replace = T)
bootcor[i] = cor(attributeA[bootindices],
attributeB[bootindices], method = "spearman")
}
spearman = cor(attributeA, attributeB)
CISP <- sort(bootcor)[c(.025*deals,.975*deals)]
return(list(bootcor, spearman, CISP))
}
###spearman NHST
spearman.NHST.boot <- function(attributeA, attributeB){
deals = 10000
bootcor = rep(NA, deals)
indices = c(1:length(attributeA))
for (i in 1:deals){
bootindices = sample(indices, length(indices), replace = F)
bootcor[i] = cor(attributeA[bootindices], attributeB,
method = "spearman")
}
spearman = cor(attributeA, attributeB)
HP <- sum(bootcor > abs(spearman))
LP <- sum(bootcor < -abs(spearman))
pvalue = (HP+LP)/deals
thresholds = sort(bootcor)[c(.025*deals,.975*deals)]
return(list(bootcor, spearman, pvalue, thresholds))
}
###MIC CI function
MIC.CI.boot <- function(attributeA, attributeB) {
deals = 10000
bootmic = rep(NA,deals)
indices = c(1:length(attributeA))
for (i in 1:deals) {
bootindices = sample(indices, length(indices), replace = T)
bootmic[i] = mine(attributeA[bootindices],
attributeB[bootindices])$MIC
}
MIC = mine(attributeA, attributeB)$MIC
CIMIC <- sort(bootmic)[c(.025*deals,.975*deals)]
return(list(bootmic, MIC, CIMIC))
}
######MIC function for p-values/NHST
MIC.NHST.boot <- function(attributeA, attributeB){
deals = 10000
bootmic = rep(NA, deals)
indices = c(1:length(attributeA))
for (i in 1:deals){
bootindices = sample(indices, length(indices), replace = F)
bootmic[i] = mine(attributeA[bootindices], attributeB)$MIC
}
MIC = mine(attributeA, attributeB)$MIC
HP <- sum(bootmic > abs(MIC))
pvalue = (HP)/deals
thresholds = sort(bootmic)[.95*deals]
return(list(bootmic,pvalue,thresholds))
}