Schefferville - generalists/specialists; pairwise co-occurrence; GAM modelling

Includes GAM analyses with 34 focal species (including T. alpinum)

Author: Tammy L. Elliott

Date: February 7, 2014

R version 3.1.0

Data structure for GAMS, square root transformed

#saveRDS(data.cooc, file="cooc.z.data.rds")
data.cooc<-readRDS("cooc.z.data.rds")
head(data.cooc)

##      gen.spec   phy.dist      cooc.z
## 1 0.047371556  78.891213 -2.39264565
## 2 0.274031336 121.972198  5.85930884
## 3 0.226659781 121.972198 -0.06566243
## 4 0.000248625 121.972198 -2.71153372
## 5 0.047620180 121.972198 -0.70770244
## 6 0.274279961   9.008272 -4.68947236

phy.dist.sqrt<-sqrt(data.cooc$phy.dist)
head(phy.dist.sqrt)

## [1]  8.882073 11.044102 11.044102 11.044102 11.044102  3.001378

OLS models of co-occurrence explained by generalism and specialism and square root of phylogenetic distance

#Compare with OLS models
#do lm of cooc.z by gen.spec for entire data set
cooc.z.gen.spec.lm<-lm(data.cooc$cooc.z~data.cooc$gen.spec)
summary(cooc.z.gen.spec.lm)

## 
## Call:
## lm(formula = data.cooc$cooc.z ~ data.cooc$gen.spec)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3892 -1.6934 -0.8110  0.8703 12.0012 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          0.3329     0.1918   1.736   0.0832 . 
## data.cooc$gen.spec  -1.6000     0.4949  -3.233   0.0013 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.798 on 559 degrees of freedom
## Multiple R-squared:  0.01836,    Adjusted R-squared:  0.0166 
## F-statistic: 10.45 on 1 and 559 DF,  p-value: 0.001296

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.lm)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.lm)) #this is a little right skewed

#Are the data normally distributed
hist(data.cooc$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")

hist(data.cooc$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

# Let's verify using the Shapiro test:
shapiro.test(data.cooc$gen.spec)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc$gen.spec
## W = 0.921, p-value < 2.2e-16

shapiro.test(data.cooc$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16

#lm of cooc.z and phy.dist
cooc.z.phy.dist.lm<-lm(data.cooc$cooc.z~phy.dist.sqrt)
summary(cooc.z.phy.dist.lm)

## 
## Call:
## lm(formula = data.cooc$cooc.z ~ phy.dist.sqrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7487 -1.7057 -0.8179  0.6467 12.4636 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    -1.0583     0.3881  -2.727   0.0066 **
## phy.dist.sqrt   0.1119     0.0458   2.443   0.0149 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.81 on 559 degrees of freedom
## Multiple R-squared:  0.01056,    Adjusted R-squared:  0.00879 
## F-statistic: 5.966 on 1 and 559 DF,  p-value: 0.01489

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.phy.dist.lm)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.phy.dist.lm),col="grey50" ) #this is a little right skewed

#Are the data normally distributed
hist(data.cooc$phy.dist,col="grey50", main="Untransformed data", xlab="Pairwise phylogenetic distance")

hist(data.cooc$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

# Let's verify using the Shapiro test:
shapiro.test(phy.dist.sqrt)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  phy.dist.sqrt
## W = 0.8772, p-value < 2.2e-16

shapiro.test(data.cooc$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16

#variance is good; just need to fix normality

OLS with interaction term between predictor variables for entire data set

#OLS models with interaction term
#do lm of cooc.z by gen.spec and phy.dist in interaction term
cooc.z.gen.spec.phy.dist.lm<-lm(data.cooc$cooc.z~data.cooc$gen.spec*phy.dist.sqrt)
summary(cooc.z.gen.spec.phy.dist.lm)

## 
## Call:
## lm(formula = data.cooc$cooc.z ~ data.cooc$gen.spec * phy.dist.sqrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8298 -1.7764 -0.7972  0.9147 11.9889 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.29160    0.62080   2.081   0.0379 *  
## data.cooc$gen.spec               -7.35617    1.57059  -4.684 3.54e-06 ***
## phy.dist.sqrt                    -0.12690    0.07296  -1.739   0.0825 .  
## data.cooc$gen.spec:phy.dist.sqrt  0.75860    0.19316   3.927 9.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.754 on 557 degrees of freedom
## Multiple R-squared:  0.05267,    Adjusted R-squared:  0.04757 
## F-statistic: 10.32 on 3 and 557 DF,  p-value: 1.275e-06

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.phy.dist.lm)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.phy.dist.lm)) #this is a little right skewed

#Are the data normally distributed
hist(data.cooc$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")

hist(data.cooc$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

hist(phy.dist.sqrt,col="grey50", main="Uransformed data", xlab="Phylogenetic distance (sqrt)")

# Let's verify using the Shapiro test:
shapiro.test(data.cooc$gen.spec)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc$gen.spec
## W = 0.921, p-value < 2.2e-16

shapiro.test(data.cooc$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16

shapiro.test(phy.dist.sqrt)

## 
##  Shapiro-Wilk normality test
## 
## data:  phy.dist.sqrt
## W = 0.8772, p-value < 2.2e-16

Data structure for GAMS with only species pairs with less than 30 million years of separation, square root transformed

#####################do for same dataset, but remove phy.dist of 30 my
data.cooc.30<-subset(data.cooc[data.cooc$phy.dist<30,])
phy.dist.30.sqrt<-sqrt(data.cooc.30$phy.dist)
gen.spec.30<-data.cooc.30$gen.spec
cooc.z.30<-data.cooc.30$cooc.z
head(data.cooc.30)

##      gen.spec  phy.dist     cooc.z
## 6  0.27427996  9.008272 -4.6894724
## 15 0.06309177 14.509728  4.6917372
## 33 0.53791685 24.481046 -1.4696399
## 34 0.60100862 24.481046 -2.4803428
## 50 0.16159843 24.481046 -0.9417983
## 51 0.22469020 24.481046  0.3893929

OLS models of co-occurrence explained by generalism and specialism and square root of phylogenetic distance for only those species pairs separated by less than 30 million years

#Compare with OLS models
#do lm of cooc.z by gen.spec for entire data set
cooc.z.gen.spec.lm.30<-lm(data.cooc.30$cooc.z~data.cooc.30$gen.spec)
summary(cooc.z.gen.spec.lm.30)

## 
## Call:
## lm(formula = data.cooc.30$cooc.z ~ data.cooc.30$gen.spec)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1571 -1.5938 -0.4869  0.7882 11.6722 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             0.1875     0.3715   0.505  0.61461   
## data.cooc.30$gen.spec  -2.6245     0.8858  -2.963  0.00368 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.537 on 120 degrees of freedom
## Multiple R-squared:  0.06817,    Adjusted R-squared:  0.0604 
## F-statistic: 8.778 on 1 and 120 DF,  p-value: 0.003677

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.lm.30)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.lm.30)) #this is a little right skewed

#Are the data normally distributed
hist(data.cooc.30$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")

hist(data.cooc.30$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

# Let's verify using the Shapiro test:
shapiro.test(data.cooc.30$gen.spec)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc.30$gen.spec
## W = 0.9192, p-value = 1.827e-06

shapiro.test(data.cooc.30$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc.30$cooc.z
## W = 0.8228, p-value = 8.239e-11

#lm of cooc.z and phy.dist
cooc.z.phy.dist.lm.30<-lm(data.cooc.30$cooc.z~phy.dist.30.sqrt)
summary(cooc.z.phy.dist.lm.30)

## 
## Call:
## lm(formula = data.cooc.30$cooc.z ~ phy.dist.30.sqrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1446 -1.3318 -0.4673  0.5589 12.1081 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -2.2158     1.2186  -1.818   0.0715 .
## phy.dist.30.sqrt   0.3722     0.2893   1.287   0.2007  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.61 on 120 degrees of freedom
## Multiple R-squared:  0.01361,    Adjusted R-squared:  0.005389 
## F-statistic: 1.656 on 1 and 120 DF,  p-value: 0.2007

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.phy.dist.lm.30)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.phy.dist.lm.30),col="grey50" ) #this is a little right skewed

#Are the data normally distributed
hist(phy.dist.30.sqrt,col="grey50", main="Untransformed data", xlab="Pairwise phylogenetic distance")

hist(data.cooc.30$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

# Let's verify using the Shapiro test:
shapiro.test(phy.dist.30.sqrt)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  phy.dist.30.sqrt
## W = 0.8502, p-value = 9.04e-10

shapiro.test(data.cooc.30$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc.30$cooc.z
## W = 0.8228, p-value = 8.239e-11

#variance is good; just need to fix normality

OLS with interaction term between predictor variables for only those species separated by less than 30 million years

#do lm of cooc.z by gen.spec*phy.dist.sqrt for entire data set
cooc.z.gen.spec.phy.dist.lm.30<-lm(data.cooc.30$cooc.z~data.cooc.30$gen.spec*phy.dist.30.sqrt)
summary(cooc.z.gen.spec.phy.dist.lm.30)

## 
## Call:
## lm(formula = data.cooc.30$cooc.z ~ data.cooc.30$gen.spec * phy.dist.30.sqrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8669 -1.5452 -0.3620  0.6783 11.2057 
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                             -2.8530     1.9211  -1.485
## data.cooc.30$gen.spec                    1.3289     4.3792   0.303
## phy.dist.30.sqrt                         0.7349     0.4540   1.619
## data.cooc.30$gen.spec:phy.dist.30.sqrt  -0.9489     1.0180  -0.932
##                                        Pr(>|t|)
## (Intercept)                               0.140
## data.cooc.30$gen.spec                     0.762
## phy.dist.30.sqrt                          0.108
## data.cooc.30$gen.spec:phy.dist.30.sqrt    0.353
## 
## Residual standard error: 2.527 on 118 degrees of freedom
## Multiple R-squared:  0.09072,    Adjusted R-squared:  0.0676 
## F-statistic: 3.924 on 3 and 118 DF,  p-value: 0.01036

#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.phy.dist.lm.30)

par(opar)
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.phy.dist.lm.30)) #this is a little right skewed

#Are the data normally distributed
hist(data.cooc.30$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")

hist(data.cooc.30$cooc.z, col="grey50", main="Uransformed data", xlab="Standardized Co-occurrence")

hist(phy.dist.30.sqrt,col="grey50", main="Transformed data", xlab="Phylogenetic distance (square root)")

# Let's verify using the Shapiro test:
shapiro.test(data.cooc.30$gen.spec)   #definiately not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc.30$gen.spec
## W = 0.9192, p-value = 1.827e-06

shapiro.test(data.cooc.30$cooc.z)    #not normal either; however residuals no longer  a negative binomial or binomial relation

## 
##  Shapiro-Wilk normality test
## 
## data:  data.cooc.30$cooc.z
## W = 0.8228, p-value = 8.239e-11

shapiro.test(phy.dist.30.sqrt)

## 
##  Shapiro-Wilk normality test
## 
## data:  phy.dist.30.sqrt
## W = 0.8502, p-value = 9.04e-10

Plot relationships amongst different variables; the lower two graphs are for those data points separated by less than 30 million years

Top plots include entire dataset, while those on the bottom include only those plots separated by less than 30 million years.
x-axis for plots to the left is “Difference between specialists and generalists”, while x-axis for plots to the right is “Phylogenetic distance (square root transformed)”

#dev.new(width=2.95, height=2.95)
par(mfrow=c(2,2))
par(mar=c(3,2.5,0.75,0.75), mgp=c(1,0.25,0), las=0)

plot(data.cooc$gen.spec,data.cooc$cooc.z, xlab="", ylab="Co-occurrence", lwd=2, pch=16, col="black", cex.axis=0.75, cex.lab=0.85,bty="n", cex=0.5,tck=-0.05)
abline(cooc.z.gen.spec.lm, lwd=2)
box(bty="l", lwd=2)

plot(sqrt(data.cooc$phy.dist),data.cooc$cooc.z, xlab="", ylab="Co-occurrence", lwd=2, pch=16, col="black", cex.axis=0.75, cex.lab=0.85,bty="n", cex=0.5,tck=-0.05 )
abline(cooc.z.phy.dist.lm, lwd=2)
box(bty="l", lwd=2)

plot((data.cooc.30$gen.spec),data.cooc.30$cooc.z, xlab="", ylab="Co-occurrence", lwd=2, pch=16, col="black", cex.axis=0.75, cex.lab=0.85,bty="n", cex=0.5,tck=-0.05,)
abline(cooc.z.gen.spec.lm.30, lwd=2)
box(bty="l", lwd=2)

plot(sqrt(data.cooc.30$phy.dist),data.cooc.30$cooc.z, xlab="", ylab="Co-occurrence", lwd=2, pch=16, col="black", cex.axis=0.75, cex.lab=0.85,bty="n", cex=0.5,tck=-0.05 )
box(bty="l", lwd=2)

#textClick("(a)",cex=c(size=1, font=2) )
#textClick("(b)", cex=c(size=1, font=2))
#textClick("(c)", cex=c(size=1, font=2))
#textClick("(d)", cex=c(size=1, font=2))
#textClick("Difference in \nniche width",cex=c(size=1, font=2))
#textClick("Phylogenetic \ndistance", cex=c(size=1, font=2))
#textClick("Difference in \nniche width", cex=c(size=1, font=2))
#textClick("Phylogenetic \ndistance", cex=c(size=1, font=2))

GAM for entire dataset, with diagnostic plots

#set up GAM formula with interaction between two predictor variables####################################################################
#I am using tensor product smoothing because predictors not of same unit
#Do same GAM model with tensor product smoothing because predictors not of same unit; the second analysis includes a square root transformation of phy.dist. GAM.check
#indicates that k value too log for .log model so used k=6 instead (selected by a trial-and-error process, seeing when p-value of gam.check greater than p<0.05
#For te smooths, the upper limit of degrees of freedom is given by product of the K values provided by each marginal smooth less than one, for the construction.
#Exact choice is not generally ciritical, but should be large enough so you are reasonable sure of having enough degrees of frreedom to represent the 'truth' reasonable well, but small
#enough for comuptational efficiency

cooc.gam.sqrt.te<-gam(cooc.z~te(phy.dist.sqrt,gen.spec), dat=data.cooc)
summary(cooc.gam.sqrt.te)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## cooc.z ~ te(phy.dist.sqrt, gen.spec)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.1557     0.1148  -1.356    0.176
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## te(phy.dist.sqrt,gen.spec) 5.353  6.084 7.505 8.19e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0717   Deviance explained = 8.06%
## GCV = 7.4771  Scale est. = 7.3924    n = 561

gam.check(cooc.gam.sqrt.te)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 15 iterations.
## The RMS GCV score gradiant at convergence was 4.960206e-07 .
## The Hessian was positive definite.
## The estimated model rank was 25 (maximum possible: 25)
## Model rank =  25 / 25 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'    edf k-index p-value
## te(phy.dist.sqrt,gen.spec) 24.000  5.353   0.979    0.25

#other diagnostic checks that should be done
par(mfrow=c(1,2))
plot(fitted(cooc.gam.sqrt.te),residuals(cooc.gam.sqrt.te))
plot(data.cooc$cooc.z,residuals(cooc.gam.sqrt.te))

GAM for only those plots separated by 30 million years, with diagnostic plots

#Calculate GAM for only those species pairs with less than 30 million years of separation
cooc.gam.30<-gam(cooc.z.30~te(phy.dist.30.sqrt,gen.spec.30), dat=data.cooc.30)
summary(cooc.gam.30)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## cooc.z.30 ~ te(phy.dist.30.sqrt, gen.spec.30)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.6775     0.2288  -2.962   0.0037 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                  edf Ref.df     F p-value  
## te(phy.dist.30.sqrt,gen.spec.30)   3      3 3.924  0.0103 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0676   Deviance explained = 9.07%
## GCV = 6.6017  Scale est. = 6.3853    n = 122

gam.check(cooc.gam.30, k.rep = 1000)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 16 iterations.
## The RMS GCV score gradiant at convergence was 3.426105e-07 .
## The Hessian was positive definite.
## The estimated model rank was 25 (maximum possible: 25)
## Model rank =  25 / 25 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                      k'    edf k-index p-value
## te(phy.dist.30.sqrt,gen.spec.30) 24.000  3.000   0.948    0.23

#other diagnostic checks that should be done
par(mfrow=c(1,2))
plot(fitted(cooc.gam.30),residuals(cooc.gam.30))
plot(data.cooc.30$cooc.z,residuals(cooc.gam.30))

Plot two GAM plots together

x-axis should be “Phylogenetic distance (square root transformed)”. Plot for all species pairs is to the left, whereas plot for only those species pairs separated by 30 million years is on the right.

#Plot two GAMs side-by-side
#dev.new(width=5.95, height=4)
layout(matrix(1:2,ncol=2), width = c(1,1),height = c(1,1))
par(mar=c(4,2.5,2.75,0.75), mgp=c(1.5,0.5,0), las=0)
#par(mgp=c(axis.title.position, axis.label.position, axis.line.position))

my.vis.gam(cooc.gam.sqrt.te, type="response", plot.type = "contour", color="gray",  ylab="Difference in niche width", main="",
xlab="", cex.lab=1.1, cex.axis=0.9, cex.main=0.1, lwd=2, cex=2)
box(bty="o", lwd=2)

my.vis.gam(cooc.gam.30,type="response", plot.type = "contour", color="gray",  ylab="Difference in niche width", main="",
xlab="", cex.lab=1.1, cex.axis=0.9, cex.main=0.1, lwd=2, cex=2)
box(bty="o", lwd=2)

#textClick("(a)", cex=c(size=1, font=2))
#textClick("(b)", cex=1)
#textClick("Phylogenetic distance", cex=1)
#textClick("Phylogenetic distance", cex=1)