library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
library(chron)
## Warning: package 'chron' was built under R version 3.4.4
library(vegan)
## Warning: package 'vegan' was built under R version 3.4.4
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.4.4
## Loading required package: lattice
## This is vegan 2.5-2
vwc2015 <- read.csv("C:/Users/Jack/Documents/MS Stats/556 Consulting/edge/Read in/Environmental time series/2015 VWC.csv")
vwc2016 <- read.csv("C:/Users/Jack/Documents/MS Stats/556 Consulting/edge/Read in/Environmental time series/2016 VWC.csv")
vwc2015 <- cbind(vwc2015, ts = 1:nrow(vwc2015))
vwc2015 <- cbind(vwc2015, timeInterval = rep(1:(24*4), length.out = nrow(vwc2015)))
head(vwc2015)
head(vwc2016)
Data problem 1: The original environmental data is organized in Excel worksheets with each row representing a time and each column representing a moisture measurement for a particular probe angle and plot. This information is encoded in complicated column names. For analysis this needs to be converted into “long” format, where there is only one column for each dependent variable, vmc at 45 degrees and vmc at 90 degrees, here with two dependent variables.
There are a few data cleaning tasks involved in converting the format of the dataset: -the plot number needs to be extracted from the original column labels and turned into factor levels for a plot variable -the plot number needs to be translated into factor levels for a separate treatment variable -the plot number needs to be translated into factor levels for a separate block variable -the probe angle needs to be extracted from the original column labels and turned into factor levels for an angle variable
Column names are like this: P15DRT_B5_D2M2_VWC_90_Avg The plot number follows the P. Plot 1 would be P1 not P01. The treatment follows the plot number: CON is control, ESR is partial roof, DRT is full roof. The block number follows the B. 90 is the probe angle.
First we can reshape the data to long format.
vwc2015Long <- melt(vwc2015, id.vars = c("DATE", "TIME"))
## Warning: attributes are not identical across measure variables; they will
## be dropped
head(vwc2015Long)
Break the column name into parts, separating at the underscores.
temp1 <- strsplit(as.character(vwc2015Long$variable), "_")
head(temp1)
## [[1]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
##
## [[2]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
##
## [[3]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
##
## [[4]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
##
## [[5]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
##
## [[6]]
## [1] "P30CON" "B10" "D1M1" "VWC" "45" "Avg"
Start to get these strings into variable vectors.
plot <- trt <- block <- angle <- rep(NA, nrow(vwc2015Long))
for(i in 1:nrow(vwc2015Long)){plot[i] <- temp1[[i]][1]}
for(i in 1:nrow(vwc2015Long)){trt[i] <- temp1[[i]][1]}
for(i in 1:nrow(vwc2015Long)){block[i] <- temp1[[i]][2]}
for(i in 1:nrow(vwc2015Long)){angle[i] <- temp1[[i]][5]}
This function extracts the plot number from the column labels
extractPlot <- function(myColumn){
digit1 <- as.numeric(substr(myColumn, start = 2, stop = 2))
suppressWarnings(digit2 <- as.numeric(substr(myColumn, start = 3, stop = 3))) # NAs are produced on purpose
ifelse(is.na(digit2), c(digit1), paste(digit1, digit2, sep = ""))
}
Extract the plot number.
plot <- extractPlot(plot)
## Warning in extractPlot(plot): NAs introduced by coercion
Extract the treatment. This is a little convoluted because it is not separated from plot by an underscore and because plot can be one or two digits, so it requires counting backwards from the end of the string.
for(i in 1:length(trt)){trt[i] <- substr(trt[i], start = nchar(trt[i]) - 2, stop = nchar(trt[i]))}
The other two variables, block and angle, are already usable.
Add these vectors to the data frame and remove the original column names.
vwc2015Long <- cbind(vwc2015Long, plot, trt, block, angle)
vwc2015Long <- vwc2015Long[, -3]
head(vwc2015Long)
Data problem 2: The date and time are imported as factors and need to be converted to date and time formats.
The original date format is mm/dd/yyyy. This needs to be converted from a factor to a date format in R.
Need to add seconds to convert using times
vwc2015Long$TIME <- paste(vwc2015Long$TIME,':00', sep = '')
head(vwc2015Long)
Convert the DATE and TIME columns, which are factors, into dates and times classes using the chron package. Also combine DATE and TIME into a new column dateTime, with class chron.
vwc2015Long$DATE <- dates(as.character(vwc2015Long$DATE), format = "m/d/y")
vwc2015Long$TIME <- times(as.character(vwc2015Long$TIME), format = "h:m:s")
vwc2015Long$dateTime <- chron(dates = vwc2015Long$DATE, times = vwc2015Long$TIME)
head(vwc2015Long)
Now all of the above can be repeated for the 2016 data…
vwc2016Long <- melt(vwc2016, id.vars = c("DATE", "TIME"))
## Warning: attributes are not identical across measure variables; they will
## be dropped
temp1 <- strsplit(as.character(vwc2016Long$variable), "_")
plot <- trt <- block <- angle <- rep(NA, nrow(vwc2016Long))
for(i in 1:nrow(vwc2016Long)){plot[i] <- temp1[[i]][1]}
for(i in 1:nrow(vwc2016Long)){trt[i] <- temp1[[i]][1]}
for(i in 1:nrow(vwc2016Long)){block[i] <- temp1[[i]][2]}
for(i in 1:nrow(vwc2016Long)){angle[i] <- temp1[[i]][5]}
plot <- extractPlot(plot)
for(i in 1:length(trt)){trt[i] <- substr(trt[i], start = nchar(trt[i]) - 2, stop = nchar(trt[i]))}
vwc2016Long <- cbind(vwc2016Long, plot, trt, block, angle)
vwc2016Long <- vwc2016Long[, -3]
vwc2016Long$TIME <- paste(vwc2016Long$TIME,':00', sep = '')
vwc2016Long$DATE <- dates(as.character(vwc2016Long$DATE), format = "m/d/y")
vwc2016Long$TIME <- times(as.character(vwc2016Long$TIME), format = "h:m:s")
vwc2016Long$dateTime <- chron(dates = vwc2016Long$DATE, times = vwc2016Long$TIME)
head(vwc2016Long)
and the two datasets can be combined, adding year as a variable, and converting the vwc from character to numeric.
vwc <- rbind(vwc2015Long, vwc2016Long)
vwc$year <- c(rep(0, nrow(vwc2015Long)), rep(1, nrow(vwc2016Long)))
vwc$value <- as.numeric(vwc$value)
## Warning: NAs introduced by coercion
colnames(vwc)[3] <- 'vwc'
vwc$month <- as.numeric(format(as.Date(vwc$DATE), "%m"))
vwc$day <- as.numeric(format(as.Date(vwc$DATE), "%d"))
vwc$cumDay <- ifelse(vwc$month == 5, vwc$day, vwc$day + 31)
head(vwc)
Now we have a dataset.
There are many rows with missing data, but there are still a very large number of rows with complete cases.
nrow(vwc) # total number of rows
## [1] 714128
sum(complete.cases(vwc)) # number of rows without any missing data
## [1] 650825
nrow(vwc) - sum(complete.cases(vwc)) # number of rows with missing data
## [1] 63303
Because the number of rows with complete cases is so large, it seems safe to simply omit all rows with missing data.
vwc2 <- vwc[complete.cases(vwc), ]
nrow(vwc2)
## [1] 650825
head(vwc2)
moist <- aggregate(vwc2$vwc[vwc2$year == 1], by = list(vwc2$plot[vwc2$year == 1]), FUN = mean, simplify = TRUE)
names(moist) <- c("plot", "mean(vwc)")
moist
Treating block and treatment as fixed effects allows us to see how treatment and block determine moisture levels.
fit <- lm(vwc ~ block + trt + angle + year, data = vwc2)
summary(fit)
##
## Call:
## lm(formula = vwc ~ block + trt + angle + year, data = vwc2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.093347 -0.024106 0.002048 0.023254 0.106467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.694e-02 1.499e-04 646.89 <2e-16 ***
## blockB10 3.947e-02 1.860e-04 212.14 <2e-16 ***
## blockB2 2.826e-02 1.772e-04 159.48 <2e-16 ***
## blockB3 7.339e-02 1.772e-04 414.26 <2e-16 ***
## blockB4 6.431e-02 1.772e-04 362.97 <2e-16 ***
## blockB5 6.644e-02 1.772e-04 374.99 <2e-16 ***
## blockB6 5.538e-02 1.772e-04 312.57 <2e-16 ***
## blockB7 6.960e-02 1.861e-04 373.89 <2e-16 ***
## blockB8 5.589e-02 1.901e-04 293.95 <2e-16 ***
## blockB9 3.676e-02 1.860e-04 197.60 <2e-16 ***
## trtDRT -5.223e-03 1.010e-04 -51.69 <2e-16 ***
## trtESR -1.648e-02 1.005e-04 -164.07 <2e-16 ***
## angle90 5.919e-03 8.234e-05 71.89 <2e-16 ***
## year -2.218e-02 8.283e-05 -267.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0332 on 650811 degrees of freedom
## Multiple R-squared: 0.3792, Adjusted R-squared: 0.3792
## F-statistic: 3.058e+04 on 13 and 650811 DF, p-value: < 2.2e-16
anova(fit)
The deviations for the different blocks are larger than the deviations for the treatments or probe angle or year. This means that even though treatments caused a significant decrease in soil moisture, the treatment plots are not as a whole more dry than the control plots, so that treatments did not induce drought conditions.
The columns of the original data are very complicated chemical names that do not read into R correctly, so they are simply numbered here.
met <- read.csv("C:/Users/Jack/Documents/MS Stats/556 Consulting/edge/Read in/SpecAbundAve.csv")
names(met)[1] <- "group"
# Names the columns Cmpd1, Cmpd2, etc.
names(met)[-1] <- paste("Cmpd", 1:(length(names(met)) - 1), sep = "")
head(met)
The group column contains a code: sample number-plot sampling order-plot number-plant species-metabolism type-treatment-sampling date-sampling time
This information needs to be broken up to provide separate columns. The function strsplit() splits this character data by the “-”. Each cell becomes a list of the individual components.
temp2 <- strsplit(as.character(met$group), "-")
head(temp2)
## [[1]]
## [1] "10" "5" "21" "EE" "C3" "CHR" "160601" "10.25"
##
## [[2]]
## [1] "11" "6" "26" "BG" "C4" "INT" "160601" "10.32"
##
## [[3]]
## [1] "1" "1" "6" "BG" "C4" "CHR" "160601" "9.37"
##
## [[4]]
## [1] "12" "6" "26" "EE" "C3" "INT" "160601" "10.37"
##
## [[5]]
## [1] "13" "7" "27" "BG" "C4" "control" "160601"
## [8] "10.42"
##
## [[6]]
## [1] "14" "7" "27" "EE" "C3" "control" "160601"
## [8] "10.47"
These loops turns each of the above components into separate columns of data.
# Initialize the variables
sample <- plotOrder <- plot <- plant <- fixation <- trt <- data <- time <- rep(NA, length(temp2))
# Fill each variable with the apprpriate part of the list from above.
for(i in 1:length(temp2)){sample[i] <- temp2[[i]][1]}
for(i in 1:length(temp2)){plotOrder[i] <- temp2[[i]][2]}
for(i in 1:length(temp2)){plot[i] <- temp2[[i]][3]}
for(i in 1:length(temp2)){plant[i] <- temp2[[i]][4]}
for(i in 1:length(temp2)){fixation[i] <- temp2[[i]][5]}
for(i in 1:length(temp2)){trt[i] <- temp2[[i]][6]}
for(i in 1:length(temp2)){data[i] <- temp2[[i]][7]}
for(i in 1:length(temp2)){time[i] <- temp2[[i]][8]}
# Combine all of the new variables with the rest of the data set
met <- cbind(sample, plotOrder, plot, plant, fixation, trt, data, time, met)
# Remove the orignal code column that is now redundant
met <- met[, -9]
head(met)
The only column besides the metabolic species data that will be used is the plot column, so the rest can be removed.
# Remove unused columns
met.BG <- met[met$plant == "BG", c(3, 9:ncol(met))]
met.EE <- met[met$plant == "EE", c(3, 9:ncol(met))]
# Add the soil volumetric water content variable by
# merging on plot number.
met.BG <- merge(moist, met.BG)
met.EE <- merge(moist, met.EE)
spData.BG <- met.BG[, -c(1, 2)]
spData.EE <- met.EE[, -c(1, 2)]
moist.BG <- met.BG[, 2]
moist.EE <- met.EE[, 2]
Plot histograms of the pairwise correlations among metabolic variables for each plant species.
hist(cor(spData.BG),
main = c(expression(italic("B. gracilis"))),
xlab = "Correlation",
col = "lightsteelblue3",
freq = FALSE,
xlim = c(-1, 1),
ylim = c(0, 2),
breaks = 20
)
hist(cor(spData.EE),
main = c(expression(italic("E. elymoides"))),
xlab = "Correlation",
col = "lightsteelblue3",
freq = FALSE,
xlim = c(-1, 1),
ylim = c(0, 2),
breaks = 20
)
Clearly most metabolic responses are uncorrelated, but there are still some strong correlations among the variables. The fact that some variables are strongly related suggests that Principal Components Analysis could be a useful data reduction method, but the fact that so many variables are unrelated suggests that most of the variables may be unrelated to drought tolerance, and using a variable selection method to reduce the number of variables will be necessary to obtain useful results from the PCA.
Plot of histograms of correlation between soil volumetric water content and each of the metabolic variables for each species.
hist(cor(moist.BG, spData.BG),
main = c(expression(italic("B. gracilis"))),
xlab = "Correlation",
col = "lightsteelblue3",
freq = FALSE,
xlim = c(-1, 1),
ylim = c(0, 2)
)
hist(cor(moist.EE, spData.EE),
main = c(expression(italic("E. elymoides"))),
xlab = "Correlation",
col = "lightsteelblue3",
freq = FALSE,
xlim = c(-1, 1),
ylim = c(0, 2)
)
The distribution of correlations between soil moisture and each metabolic variable is also concentrated around zero, which, again, suggests that very few of the variables actually have a strong relationship to soil moisture, and many of the variables will need to be removed in order to obtain interpretable PCA results.
Define a function that returns variance explained by PCA and R^2 for a regression of PC’s on the moisture variable.
# This is a function that performs PCA then performs regression of a single
# environmental variable on the scores for the first three PC's.
# Returns the cumulative proportion of variance explained by the PCA and
# returns the value of R^2 for the linear regression together as a vector.
# spData is a data frame with one column for each species and one row for each plot.
# envData is a vector with one entry for each plot.
PCAwithRegression <- function(spData, envData){
# Perform PCA
pcFit <- prcomp(spData, center = TRUE, scale = TRUE, retx = TRUE, rank. = 3)
# Record cumulative proportion of variance explained for PCA for fist 3 PC's
propVarExp <- summary(pcFit)$importance[3, 3]
# Perform regression of volumetric soil content on first 3 PC's
lmFit <- lm(envData ~ pcFit$x)
# Record R^2 for regression
R2 <- summary(lmFit)$r.squared
c(propVarExp, R2)
}
Define a function that performs regression of PC’s on moisture starting with the variables most highly correlated with moisture and adding one variable at a time until all variables have been included.
# spData is a data frame with one column for each species and one row for each plot.
# envData is a vector with one entry for each plot.
# pvals = TRUE will calculate p-values based on simulation by randomly selecting
# the correct number of variables, but this is very slow.
bestPCA <- function(spData, envData, pvals = FALSE){
# Rank is large for larger numbers.
# Subtracting correlations from 1
# makes rank be 1 for the largest correlation.
# This is a vector of ranks for the columns of the
# species data matrix. The correlations are between the
# environmental data and each metabolic variable.
corRanks <- rank(1 - abs(cor(envData, spData)))
# Number of metabolites
numVars <- ncol(spData)
# Initialize data frame for output variables
varSelection <- data.frame(
n = rep(NA, numVars),
propVarExp = rep(NA, numVars),
R2 = rep(NA, numVars),
SigPropVarExp = rep(NA, numVars),
SigPercentileValR2 = rep(NA, numVars)
)
for(i in 4:numVars){
varSelection[i, 1] <- i
# Perform PCA for the ith variables with the strongest correlations to environmental gradient
varSelection[i, c(2, 3)] <- PCAwithRegression(spData[, which(corRanks <= i)], envData)
# If pvals = TRUE then get simulated p-values for the two measures
# from randomly choosing i variables to include
if(pvals == TRUE){
# Initialize vectors for sampling distributions in next loop
nReps <- 10
sampDists <- data.frame(
sampPropVarExp = rep(NA, nReps),
sampR2 = rep(NA, nReps)
)
# Get sampling distribution for proportion of variance explained in PCA
# and for R^2 for regression of PC's on environmental variable
for(j in 1:nReps){
sampDists[j, ] <- PCAwithRegression(spData[, sample(x = 1:numVars, size = i)], envData)
}
# Record value of 95th percentiles based on sampling distribution
varSelection[i, 4] <- quantile(sampDists[, 1], probs = 0.95)
varSelection[i, 5] <- quantile(sampDists[, 2], probs = 0.95)
}
}
# Output is a data frame
varSelection
}
bestPCA.BG <- bestPCA(spData = spData.BG, envData = moist.BG, pvals = FALSE)
bestPCA.EE <- bestPCA(spData = spData.EE, envData = moist.EE, pvals = FALSE)
Plot \(R^2\) as a function of the number of variables included in the PCA, with one line for each plant species.
plot(bestPCA.BG[, 1], bestPCA.BG[, 3], ylim = c(0, 1),
ylab = expression("R" ^ 2),
xlab = "Number of variables",
bty = "l",
type = "l",
col = "palegreen4",
lwd = 2
)
lines(bestPCA.EE[, 1], bestPCA.EE[, 3],
col = "royalblue4",
lwd = 2
)
legend("topleft",
legend = c(expression(italic("B. gracilis")), expression(italic("E. elymoides"))),
col = c("palegreen4", "royalblue4"),
lty = 1,
bty = "n",
lwd = 2
)
The variables that are most highly correlated with soil volumetric water content are included in the PCA first, then less highly correlated variables are added one at a time. Clearly including more variables does not risk overfitting, since the value of \(R^2\) peaks and then decreases as more variables are added.
Zoom in to the area of the peak in \(R^2\).
plot(bestPCA.BG[, 1], bestPCA.BG[, 3],
ylim = c(0, 1),
xlim = c(0, 30),
ylab = expression("R" ^ 2),
xlab = "Number of variables",
bty = "l",
type = "l",
col = "palegreen4",
lwd = 2
)
lines(bestPCA.EE[, 1], bestPCA.EE[, 3],
col = "royalblue4",
lwd = 2
)
legend("topleft",
legend = c(expression(italic("B. gracilis")), expression(italic("E. elymoides"))),
col = c("palegreen4", "royalblue4"),
lty = 1,
bty = "n",
lwd = 2
)
max.BG <- which(bestPCA.BG[, 3] == max(bestPCA.BG[, 3], na.rm = TRUE))
max.EE <- which(bestPCA.EE[, 3] == max(bestPCA.EE[, 3], na.rm = TRUE))
abline(v = bestPCA.BG[max.BG, 1],
col = "palegreen4",
lty = 2,
lwd = 2
)
abline(v = bestPCA.EE[max.EE, 1],
col = "royalblue4",
lty = 2,
lwd = 2
)
The maximum value of \(R^2\) for B. gracilis occurs at 29 variables and the maximum value of \(R^2\) for E. elymoides occurs at 28 variables.
Scree plots for PCA’s at the maximum of \(R^2\).
corRanks.BG <- rank(1 - abs(cor(moist.BG, spData.BG)))
pca.BG <- prcomp(spData.BG[, which(corRanks.BG <= max.BG)], center = TRUE, scale = TRUE, retx = TRUE)
screeplot(pca.BG, main = expression(italic("B. gracilis")))
corRanks.EE <- rank(1 - abs(cor(moist.EE, spData.EE)))
pca.EE <- prcomp(spData.EE[, which(corRanks.EE <= max.EE)], center = TRUE, scale = TRUE, retx = TRUE)
screeplot(pca.EE, main = expression(italic("E. elymoides")))
The scree plots do not indicate that more than 3 PC’s would be useful.
Rerun the PCS’a getting only three PC’s as before.
pca.BG <- prcomp(spData.BG[, which(corRanks.BG <= max.BG)], center = TRUE, scale = TRUE, retx = TRUE, rank. = 3)
pca.EE <- prcomp(spData.EE[, which(corRanks.EE <= max.EE)], center = TRUE, scale = TRUE, retx = TRUE, rank. = 3)
Plots that show plot scores colored by whether moisture is greater or less than the median moisture level.
plot(pca.BG$x[, 1], pca.BG$x[, 2], col = (moist.BG > median(moist.BG)) + 1, xlab = "PC1", ylab = "PC2", bty = "l", main = expression(italic("B. gracilis")))
plot(pca.BG$x[, 2], pca.BG$x[, 3], col = (moist.BG > median(moist.BG)) + 1, xlab = "PC2", ylab = "PC3", bty = "l")
plot(pca.BG$x[, 1], pca.BG$x[, 3], col = (moist.BG > median(moist.BG)) + 1, xlab = "PC1", ylab = "PC3", bty = "l")
plot(pca.EE$x[, 1], pca.EE$x[, 2], col = (moist.EE > median(moist.EE)) + 1, xlab = "PC1", ylab = "PC2", bty = "l", main = expression(italic("E. elymoides")))
plot(pca.EE$x[, 2], pca.EE$x[, 3], col = (moist.EE > median(moist.EE)) + 1, xlab = "PC2", ylab = "PC3", bty = "l")
plot(pca.EE$x[, 1], pca.EE$x[, 3], col = (moist.EE > median(moist.EE)) + 1, xlab = "PC1", ylab = "PC3", bty = "l")
These PC’s do a good job of separating sites based on soil moisture for both plant species.
Perform a linear regression of soil moisture on the three PC’s.
summary(lm(moist.BG ~ pca.BG$x))
##
## Call:
## lm(formula = moist.BG ~ pca.BG$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.036205 -0.012062 -0.000047 0.013836 0.036501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.119180 0.004001 29.790 < 2e-16 ***
## pca.BG$xPC1 -0.006781 0.001083 -6.261 1.8e-06 ***
## pca.BG$xPC2 0.005488 0.002014 2.725 0.0118 *
## pca.BG$xPC3 -0.002846 0.002732 -1.042 0.3079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02117 on 24 degrees of freedom
## Multiple R-squared: 0.6653, Adjusted R-squared: 0.6235
## F-statistic: 15.9 on 3 and 24 DF, p-value: 6.611e-06
summary(lm(moist.EE ~ pca.EE$x))
##
## Call:
## lm(formula = moist.EE ~ pca.EE$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.032219 -0.012468 0.000068 0.011422 0.028579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1185612 0.0034672 34.195 < 2e-16 ***
## pca.EE$xPC1 0.0091215 0.0011229 8.123 3.3e-08 ***
## pca.EE$xPC2 0.0003195 0.0017838 0.179 0.85940
## pca.EE$xPC3 0.0066733 0.0023184 2.878 0.00849 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01802 on 23 degrees of freedom
## Multiple R-squared: 0.7636, Adjusted R-squared: 0.7328
## F-statistic: 24.77 on 3 and 23 DF, p-value: 2.187e-07
For B. gracilis, PC’s 1 and 2 have significant relationships with soil volumetric moisture content. For E. elymoides, PC’s 1 and 3 have significant relationships with soil volumetric moisture content.
summary(lm(moist.BG ~ prcomp(spData.BG, rank. = 3)$x))
##
## Call:
## lm(formula = moist.BG ~ prcomp(spData.BG, rank. = 3)$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.068585 -0.027867 -0.006371 0.026972 0.058277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.192e-01 6.479e-03 18.395 1.19e-15
## prcomp(spData.BG, rank. = 3)$xPC1 -1.815e-09 1.692e-09 -1.073 0.294
## prcomp(spData.BG, rank. = 3)$xPC2 2.895e-09 2.455e-09 1.179 0.250
## prcomp(spData.BG, rank. = 3)$xPC3 -2.644e-09 2.953e-09 -0.895 0.379
##
## (Intercept) ***
## prcomp(spData.BG, rank. = 3)$xPC1
## prcomp(spData.BG, rank. = 3)$xPC2
## prcomp(spData.BG, rank. = 3)$xPC3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03428 on 24 degrees of freedom
## Multiple R-squared: 0.1223, Adjusted R-squared: 0.01255
## F-statistic: 1.114 on 3 and 24 DF, p-value: 0.3628
summary(lm(moist.EE ~ prcomp(spData.EE, rank. = 3)$x))
##
## Call:
## lm(formula = moist.EE ~ prcomp(spData.EE, rank. = 3)$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.069390 -0.018788 0.000496 0.027869 0.056383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.186e-01 6.652e-03 17.824 5.86e-15
## prcomp(spData.EE, rank. = 3)$xPC1 -1.915e-09 2.090e-09 -0.916 0.369
## prcomp(spData.EE, rank. = 3)$xPC2 -2.184e-09 2.655e-09 -0.823 0.419
## prcomp(spData.EE, rank. = 3)$xPC3 -4.907e-09 3.542e-09 -1.386 0.179
##
## (Intercept) ***
## prcomp(spData.EE, rank. = 3)$xPC1
## prcomp(spData.EE, rank. = 3)$xPC2
## prcomp(spData.EE, rank. = 3)$xPC3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03456 on 23 degrees of freedom
## Multiple R-squared: 0.13, Adjusted R-squared: 0.01651
## F-statistic: 1.145 on 3 and 23 DF, p-value: 0.3518
The value of \(R^2\) is much lower than with the variable selection procedure.
Plots of the PC’s for each plant species vs the soil volumetric water content.
plot(moist.BG, pca.BG$x[, 1], xlab = "VWC", ylab = "PC1", bty = "l", ylim = c(0, 15), main = expression(italic("B. gracilis")))
plot(moist.BG, pca.BG$x[, 2], xlab = "VWC", ylab = "PC2", bty = "l", ylim = c(0, 15), main = expression(italic("B. gracilis")))
plot(moist.BG, pca.BG$x[, 3], xlab = "VWC", ylab = "PC3", bty = "l", ylim = c(0, 15), main = expression(italic("B. gracilis")))
plot(moist.EE, pca.EE$x[, 1], xlab = "VWC", ylab = "PC1", bty = "l", ylim = c(0, 15), main = expression(italic("E. elymoides")))
plot(moist.EE, pca.EE$x[, 2], xlab = "VWC", ylab = "PC2", bty = "l", ylim = c(0, 15), main = expression(italic("E. elymoides")))
plot(moist.EE, pca.EE$x[, 3], xlab = "VWC", ylab = "PC3", bty = "l", ylim = c(0, 15), main = expression(italic("E. elymoides")))
Loadings
pca.BG$rotation
## PC1 PC2 PC3
## Cmpd5 0.24145790 0.152279732 0.083069071
## Cmpd12 -0.15932554 0.089465356 -0.123035778
## Cmpd21 0.22494613 0.186667600 -0.144399048
## Cmpd60 0.23716681 0.188453897 -0.106947860
## Cmpd152 0.23638330 0.182660042 -0.142891175
## Cmpd182 0.13637459 0.068301613 0.363292784
## Cmpd271 0.13107296 -0.056731964 0.291120736
## Cmpd318 -0.15456375 0.198914003 0.191503025
## Cmpd338 -0.15067441 0.201701243 -0.014516747
## Cmpd347 -0.11589222 0.387230376 0.009840319
## Cmpd366 0.16010087 -0.092767804 0.391866820
## Cmpd377 0.20615247 0.122748591 -0.006591756
## Cmpd457 0.22845735 0.162365181 -0.136734114
## Cmpd473 0.23100260 0.134068500 0.147982695
## Cmpd493 -0.09354436 0.139285645 0.317038263
## Cmpd508 0.14387024 -0.099803477 0.465832637
## Cmpd516 0.23535324 0.158936331 -0.074547194
## Cmpd517 0.18623232 -0.023155207 -0.114238263
## Cmpd715 0.19526548 -0.004368839 0.015264733
## Cmpd845 0.19953330 -0.026204145 -0.154361587
## Cmpd875 -0.19990713 0.100899542 0.085707545
## Cmpd912 -0.09928398 0.376415063 -0.072377288
## Cmpd917 -0.13947585 0.210517407 0.002163104
## Cmpd934 -0.15064265 0.344800147 0.108674289
## Cmpd971 0.22685935 0.078292857 0.051917427
## Cmpd1102 -0.16517604 0.307242000 0.061952711
## Cmpd1152 0.23016338 0.178967133 -0.156414606
## Cmpd1164 0.10602800 -0.205779539 -0.182970366
## Cmpd1211 0.22738138 0.114681822 0.168181723
pca.EE$rotation
## PC1 PC2 PC3
## Cmpd19 -0.20053893 0.067987942 -0.154421869
## Cmpd90 -0.21504645 0.155329342 0.135070008
## Cmpd105 -0.24512781 0.115526077 0.218967144
## Cmpd139 -0.18670150 -0.100770045 -0.327140228
## Cmpd276 -0.13172242 -0.177710355 -0.231719498
## Cmpd327 -0.15883702 0.083847352 -0.251760631
## Cmpd343 0.22240779 -0.226130739 -0.005848648
## Cmpd377 -0.22558994 -0.235050279 0.237423729
## Cmpd396 -0.15960664 -0.147010073 -0.302820699
## Cmpd479 0.18797819 -0.190068785 -0.030587379
## Cmpd494 0.21631139 -0.274553389 -0.075876174
## Cmpd531 -0.26000886 -0.206180930 0.155915547
## Cmpd569 0.14952497 -0.153008367 -0.083136048
## Cmpd651 -0.20802943 -0.148593745 -0.133749664
## Cmpd683 0.20265040 -0.258195333 -0.200713384
## Cmpd711 -0.18506522 -0.129159316 0.002665762
## Cmpd732 0.06294035 -0.058401910 0.528382418
## Cmpd795 -0.13742272 0.003639979 -0.178714981
## Cmpd844 -0.18360116 -0.164161881 -0.011431152
## Cmpd894 -0.15039320 -0.246435655 0.075011086
## Cmpd926 0.20583876 -0.216388732 -0.016083081
## Cmpd930 0.21843499 -0.192234948 0.042222533
## Cmpd1005 -0.19648929 -0.165076240 0.196530660
## Cmpd1016 0.11331778 -0.192084934 0.079124204
## Cmpd1039 -0.18713128 -0.214627650 0.228783648
## Cmpd1135 0.20743747 -0.218600591 0.108502138
## Cmpd1159 -0.16075231 -0.253621480 -0.110676731
## Cmpd1174 -0.18683234 -0.317936577 0.038600380
write.csv(pca.BG$rotation, "loadingsBG.csv")
write.csv(pca.EE$rotation, "loadingsEE.csv")
Mantel test for correlation between the two metabolic species matrices for the two plant species.
temp1 <- met[met$plant == "BG", c(3, 9:ncol(met))]
temp2 <- met[met$plant == "EE", c(3, 9:ncol(met))]
temp3 <- merge(temp1, temp2, by = "plot")
names(temp2) <- paste(names(temp2), ".2", sep = "")
temp3 <- merge(temp1, temp2, by.x = "plot", by.y = "plot.2")
temp4 <- merge(moist, temp3, by = "plot")
mantel.BG <- temp4[, 3:(ncol(met) - 6)]
mantel.EE <- temp4[, (ncol(met) - 5):ncol(temp4)]
mantel(dist(mantel.BG), dist(mantel.EE))
##
## Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel(xdis = dist(mantel.BG), ydis = dist(mantel.EE))
##
## Mantel statistic r: 0.1999
## Significance: 0.026
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.126 0.161 0.193 0.258
## Permutation: free
## Number of permutations: 999
There is significant correlation between the two matrices.
Repeat for response of each variable averaged across species
spDataCombo <- data.frame(matrix(rep(NA, nrow(mantel.BG) * ncol(mantel.BG)), nrow = nrow(mantel.BG), ncol = ncol(mantel.BG)))
envDataCombo <- temp4[, 2]
for(i in 1:ncol(mantel.BG)){
spDataCombo[, i] <- rowMeans(cbind(mantel.BG[, i], mantel.EE[, i]))
}
bestPCA.Combo <- bestPCA(spData = spDataCombo, envData = envDataCombo, pvals = FALSE)
plot(bestPCA.Combo[, 1], bestPCA.Combo[, 3], ylim = c(0, 1), ylab = expression("R" ^ 2), xlab = "Number of variables", bty = "l", type = "l")
plot(bestPCA.Combo[, 1], bestPCA.Combo[, 3], ylim = c(0, 1), xlim = c(0, 50), ylab = expression("R" ^ 2), xlab = "Number of variables", bty = "l", type = "l")
abline(v = bestPCA.Combo[which(bestPCA.Combo[, 3] == max(bestPCA.Combo[, 3], na.rm = TRUE)), 1], col = "black", lty = 2)
corRanks.Combo <- rank(1 - abs(cor(envDataCombo, spDataCombo)))
pca.Combo <- prcomp(spDataCombo[, which(corRanks.Combo <= 29)], center = TRUE, scale = TRUE, retx = TRUE, rank. = 3)
pca.Combo$rotation # loadings
## PC1 PC2 PC3
## X12 -0.15666496 -0.04348644 -0.04703786
## X105 0.19673947 0.13878921 -0.04139259
## X143 -0.20755356 0.24503280 0.03586008
## X152 0.15215211 0.34535250 -0.05045044
## X290 -0.08934823 -0.23377064 -0.35292924
## X318 -0.17417814 -0.04011116 -0.19019203
## X347 -0.22852968 0.15789534 -0.09768216
## X352 -0.21696580 0.14091015 -0.05578526
## X366 0.16741021 0.11882602 -0.19877224
## X377 0.17452751 0.30951571 -0.21256550
## X496 -0.22858131 0.19192753 -0.08861260
## X596 -0.19256739 0.10539999 -0.16864061
## X683 -0.24877913 0.16450644 0.03505792
## X693 -0.20411641 0.14713490 -0.10155169
## X715 0.19585352 0.22228468 -0.26039581
## X716 0.20196648 0.10279443 -0.36365034
## X795 0.10492422 0.28306251 0.24607867
## X815 0.20758306 -0.02902471 -0.29368455
## X875 -0.18721000 -0.07005823 -0.05057955
## X894 0.09845152 0.31760991 0.09863591
## X931 -0.21522081 0.23068497 -0.11927098
## X934 -0.24602455 0.06335322 -0.11035048
## X972 0.15785704 0.16110715 -0.26964366
## X973 0.16640154 -0.06805403 -0.22466660
## X1008 0.05973677 0.26541696 0.25934673
## X1016 -0.17611045 0.08998408 -0.03383010
## X1064 -0.22668255 0.10058762 -0.12939781
## X1102 -0.21959716 0.08594533 -0.04214195
## X1120 -0.09945247 -0.24763853 -0.30112815
write.csv(pca.Combo$rotation, "loadingsCombo.csv")
plot(pca.Combo$x[, 1], pca.Combo$x[, 2], col = (envDataCombo > median(envDataCombo)) + 1, xlab = "PC1", ylab = "PC2", bty = "l")
plot(pca.Combo$x[, 2], pca.Combo$x[, 3], col = (envDataCombo > median(envDataCombo)) + 1, xlab = "PC2", ylab = "PC3", bty = "l")
plot(pca.Combo$x[, 1], pca.Combo$x[, 3], col = (envDataCombo > median(envDataCombo)) + 1, xlab = "PC1", ylab = "PC3", bty = "l")
summary(lm(envDataCombo ~ pca.Combo$x))
##
## Call:
## lm(formula = envDataCombo ~ pca.Combo$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027595 -0.017185 -0.001137 0.017791 0.035720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.117750 0.004038 29.160 < 2e-16 ***
## pca.Combo$xPC1 -0.007586 0.001145 -6.624 1.16e-06 ***
## pca.Combo$xPC2 -0.004551 0.002024 -2.249 0.0349 *
## pca.Combo$xPC3 -0.004480 0.002853 -1.570 0.1307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02059 on 22 degrees of freedom
## Multiple R-squared: 0.7003, Adjusted R-squared: 0.6594
## F-statistic: 17.13 on 3 and 22 DF, p-value: 5.775e-06
With the combined data the value of \(R^2\) is between the values for the individual species separately.
plot(envDataCombo, pca.Combo$x[, 1], xlab = "VWC", ylab = "PC1", bty = "l", main = "Combined")
plot(envDataCombo, pca.Combo$x[, 2], xlab = "VWC", ylab = "PC2", bty = "l", main = "Combined")
plot(envDataCombo, pca.Combo$x[, 3], xlab = "VWC", ylab = "PC3", bty = "l", main = "Combined")
Interpret PC in terms of physiological responses
phys <- read.csv("C:/Users/Jack/Documents/MS Stats/556 Consulting/edge/Read in/phys.csv")
head(phys)
The group column contains a code: sample number-plot sampling order-plot number-plant species-metabolism type-treatment-sampling date-sampling time-unknown-unknown
This information needs to be broken up to provide separate columns
temp2 <- strsplit(as.character(phys$code), "-")
head(temp2)
## [[1]]
## [1] "1" "1" "6" "BG" "C4" "CHR" "160601"
## [8] "9.37" "0.0596" "0.0927"
##
## [[2]]
## [1] "2" "1" "6" "EE" "C3" "CHR" "160601"
## [8] "9.53" "0.0596" "0.0927"
##
## [[3]]
## [1] "3" "2" "22" "BG" "C4" "INT" "160601"
## [8] "9.754" "0.0845" "0.0819"
##
## [[4]]
## [1] "4" "2" "22" "EE" "C3" "INT" "160601"
## [8] "9.82" "0.0845" "0.0819"
##
## [[5]]
## [1] "5" "3" "2" "BG" "C4" "CHR" "160601"
## [8] "9.9" "0.0277" "0.0343"
##
## [[6]]
## [1] "6" "3" "2" "EE" "C3" "CHR" "160601"
## [8] "9.97" "0.0277" "0.0343"
Translate each element of the above lists into the value of a new variable.
sample <- plotOrder <- plot <- plant <- fixation <- trt <- data <- time <- rep(NA, length(temp2))
for(i in 1:length(temp2)){sample[i] <- temp2[[i]][1]}
for(i in 1:length(temp2)){plotOrder[i] <- temp2[[i]][2]}
for(i in 1:length(temp2)){plot[i] <- temp2[[i]][3]}
for(i in 1:length(temp2)){plant[i] <- temp2[[i]][4]}
for(i in 1:length(temp2)){fixation[i] <- temp2[[i]][5]}
for(i in 1:length(temp2)){trt[i] <- temp2[[i]][6]}
for(i in 1:length(temp2)){data[i] <- temp2[[i]][7]}
for(i in 1:length(temp2)){time[i] <- temp2[[i]][8]}
phys <- cbind(sample, plotOrder, plot, plant, fixation, trt, data, time, phys)
phys <- phys[, -9]
head(phys)
Perform a Mantel test to test for correlation between the physiological response matrices for each plant species.
temp1 <- phys[phys$plant == "BG", c(3, 9:ncol(phys))]
temp2 <- phys[phys$plant == "EE", c(3, 9:ncol(phys))]
temp3 <- merge(temp1, temp2, by = "plot")
temp4 <- merge(moist, temp3, by = "plot")
phys.BG <- merge(temp4[, 2:(ncol(phys) - 6)], cbind(moist.BG, pca.BG$x), by.x = "mean(vwc)", by.y = "moist.BG")
phys.EE <- merge(temp4[, c(2, (ncol(phys) - 5):ncol(temp4))], cbind(moist.EE, pca.EE$x), by.x = "mean(vwc)", by.y = "moist.EE")
phys.combo <- merge(phys.BG, phys.EE, by = "mean(vwc)")
phys.mantel.BG <- phys.combo[, 2:5]
phys.mantel.EE <- phys.combo[, 9:12]
mantel(dist(phys.mantel.BG), dist(phys.mantel.EE))
##
## Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel(xdis = dist(phys.mantel.BG), ydis = dist(phys.mantel.EE))
##
## Mantel statistic r: 0.08737
## Significance: 0.228
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.215 0.279 0.339 0.440
## Permutation: free
## Number of permutations: 999
There is insufficient evidence of a correlation between the physiological response variables for the two species.
head(phys.BG)
Regreesion of of the first PC on the physiological variables.
lm.phys.BG <- lm(phys.BG[, 6] ~ phys.BG[, 2] + phys.BG[, 3] + phys.BG[, 4] + phys.BG[, 5])
lm.phys.EE <- lm(phys.EE[, 6] ~ phys.EE[, 2] + phys.EE[, 3] + phys.EE[, 4] + phys.EE[, 5])
summary(lm.phys.BG)
##
## Call:
## lm(formula = phys.BG[, 6] ~ phys.BG[, 2] + phys.BG[, 3] + phys.BG[,
## 4] + phys.BG[, 5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4975 -2.2361 -0.5836 0.6239 13.8896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.078088 1.901725 0.567 0.576
## phys.BG[, 2] -0.394070 1.493146 -0.264 0.794
## phys.BG[, 3] 5.481573 31.003382 0.177 0.861
## phys.BG[, 4] -0.025012 0.088060 -0.284 0.779
## phys.BG[, 5] -0.001392 0.003665 -0.380 0.708
##
## Residual standard error: 3.992 on 23 degrees of freedom
## Multiple R-squared: 0.04083, Adjusted R-squared: -0.126
## F-statistic: 0.2448 on 4 and 23 DF, p-value: 0.9099
summary(lm.phys.EE)
##
## Call:
## lm(formula = phys.EE[, 6] ~ phys.EE[, 2] + phys.EE[, 3] + phys.EE[,
## 4] + phys.EE[, 5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3675 -0.9751 -0.0411 1.1698 8.2812
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.850829 1.716172 -1.078 0.293
## phys.EE[, 2] 0.746236 0.708216 1.054 0.303
## phys.EE[, 3] -8.210893 10.453212 -0.785 0.441
## phys.EE[, 4] 0.059726 0.067690 0.882 0.387
## phys.EE[, 5] -0.006011 0.005536 -1.086 0.289
##
## Residual standard error: 3.12 on 22 degrees of freedom
## Multiple R-squared: 0.1679, Adjusted R-squared: 0.01666
## F-statistic: 1.11 on 4 and 22 DF, p-value: 0.3768
None of the physiological variables is related to the first PC for either plant species.
Are the physiological responses related to soil moisture?
lm.phys.BG <- lm(phys.BG[, 1] ~ phys.BG[, 2] + phys.BG[, 3] + phys.BG[, 4] + phys.BG[, 5])
lm.phys.EE <- lm(phys.EE[, 1] ~ phys.EE[, 2] + phys.EE[, 3] + phys.EE[, 4] + phys.EE[, 5])
summary(lm.phys.BG)
##
## Call:
## lm(formula = phys.BG[, 1] ~ phys.BG[, 2] + phys.BG[, 3] + phys.BG[,
## 4] + phys.BG[, 5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.077097 -0.021884 -0.002203 0.025167 0.050415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.121e-01 1.687e-02 6.645 8.86e-07 ***
## phys.BG[, 2] -4.024e-03 1.325e-02 -0.304 0.764
## phys.BG[, 3] 1.644e-01 2.750e-01 0.598 0.556
## phys.BG[, 4] -2.287e-04 7.812e-04 -0.293 0.772
## phys.BG[, 5] -8.684e-06 3.251e-05 -0.267 0.792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03541 on 23 degrees of freedom
## Multiple R-squared: 0.1027, Adjusted R-squared: -0.0534
## F-statistic: 0.6578 on 4 and 23 DF, p-value: 0.6275
summary(lm.phys.EE)
##
## Call:
## lm(formula = phys.EE[, 1] ~ phys.EE[, 2] + phys.EE[, 3] + phys.EE[,
## 4] + phys.EE[, 5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.070095 -0.028440 -0.003291 0.027615 0.051007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.016e-01 2.004e-02 5.072 4.42e-05 ***
## phys.EE[, 2] 4.779e-03 8.268e-03 0.578 0.569
## phys.EE[, 3] -5.950e-02 1.220e-01 -0.488 0.631
## phys.EE[, 4] 6.340e-04 7.903e-04 0.802 0.431
## phys.EE[, 5] -2.833e-05 6.463e-05 -0.438 0.665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03643 on 22 degrees of freedom
## Multiple R-squared: 0.07567, Adjusted R-squared: -0.09239
## F-statistic: 0.4502 on 4 and 22 DF, p-value: 0.7711
None of the physiological variables is related to soil volumetric water content for either plant species.
Repeat for response of each variable averaged across species
spDataCombo <- data.frame(matrix(rep(NA, nrow(mantel.BG) * ncol(mantel.BG)), nrow = nrow(mantel.BG), ncol = ncol(mantel.BG)))
envDataCombo <- temp4[, 2]
for(i in 1:ncol(mantel.BG)){
spDataCombo[, i] <- rowMeans(cbind(mantel.BG[, i], mantel.EE[, i]))
}