#Read in vegetation matrix
veg<-read.csv("Carex_2013_stats_TE.csv", header=TRUE, row.names=1)
veg<-veg[c(1:201,222:701), c(1:3,5:7, 10:12,14:22,24:39)] #Remove all columns that do not represent focal species so that every species has at least 20 representatives; also remove DEF and ALP
veg[veg>0]<-1
veg<-veg[, order(names(veg))]
##Step 1: Calculating the actual co-occurrence prevalences for the data set; this creates a species*species co-occurrence matrix
##import occurrence data (species as columns and sites as rows, with 1=presence and 0=absence)
veg.m<-as.matrix(veg)##convert to matrix
veg.out<-t(veg.m) %*% veg.m##cross-product of the occurrence matrix: each off-diagonal entry is the number of plots in which that species pair co-occurs
diag(veg.out)<-0##set self-co-occurrence to zero (technically NA, but this simplifies the procedure)
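#illustration (hypothetical 3-plot, 2-species toy matrix, not part of the data set): the
#cross-product counts joint presences, so the off-diagonal entries give pairwise co-occurrences
#and the diagonal gives each species' total number of occurrences
toy<-matrix(c(1,1, 1,0, 0,1), nrow=3, byrow=TRUE, dimnames=list(NULL, c("A","B")))
t(toy) %*% toy #off-diagonal = 1 (A and B co-occur in one plot); diagonal = 2 and 2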
#SUM<-colSums (veg.out, na.rm = FALSE, dims = 1)##calculate the total number of occurrences per species #this was in Liam's code; instead, divide by the number of plots included
#get the upper-triangular matrix with names; then get a data frame showing presence and absence of each particular combination; this is one way, but it does not account for 0-0s
veg.out.ind <- which( upper.tri(veg.out,diag=F) , arr.ind = TRUE )
veg.out.df<-data.frame( col = dimnames(veg.out)[[2]][veg.out.ind[,2]] ,
row = dimnames(veg.out)[[1]][veg.out.ind[,1]] ,
val = veg.out[ veg.out.ind ] )
#total number of plots retained is 681 (rows 1:201 plus 222:701)
colSums(veg)
## ALP ANG AQU BIG BRU CAN CAP CES CHO DIS ECH EVA EXI GYN HEL LEN LEP LIM
## 76 57 218 51 28 74 27 189 46 31 56 25 29 77 26 29 79 167
## LIV MAG OLI PAU RAR ROS RUS SAX SCI STY TEN TRI UTR VAG VES VIR
## 77 201 40 75 60 66 62 33 39 30 61 60 66 143 39 38
veg.out.df.neg<-cbind(veg.out.df,681-veg.out.df$val)##append the number of plots in which each pair does not jointly occur (681 minus the co-occurrence count)
#derive the 1:1, 1:0/0:1, and 0:0 combinations for each species pair by converting species codes to their occurrence totals
veg.out.df.neg.col<-revalue(veg.out.df.neg$col, c("ALP"=76, "AQU"=218,"BIG"=51, "BRU"=28, "CAP"=27, "CAN"=74, "CHO"=46, "DIS"=31, "ECH"=56, "EXI"=29, "GYN"=77, "HEL"=26, "LEN"=29, "LEP"=79, "LIM"=155, "LIV"=68,
"OLI"=40, "PAU"=74, "MAG"=201, "RAR"=60, "ROS"=66, "SAX"=33, "SCI"=39, "STY"=30, "TEN"=61, "TRI"=60, "UTR"=66, "VAG"=143, "VES"=39, "ANG"=57, "EVA"=25, "RUS"=62, "VIR"=38, "CES"=189))
## The following `from` values were not present in `x`: ALP
veg.out.df.neg.row<-revalue(veg.out.df.neg$row, c("ALP"=76, "AQU"=218,"BIG"=51, "BRU"=28, "CAP"=27, "CAN"=74, "CHO"=46, "DIS"=31, "ECH"=56, "EXI"=29, "GYN"=77, "HEL"=26, "LEN"=29, "LEP"=79, "LIM"=155, "LIV"=68,
"OLI"=40, "PAU"=74, "MAG"=201, "RAR"=60, "ROS"=66, "SAX"=33, "SCI"=39, "STY"=30, "TEN"=61, "TRI"=60, "UTR"=66, "VAG"=143, "VES"=39, "ANG"=57, "EVA"=25, "RUS"=62, "VIR"=38, "CES"=189))
## The following `from` values were not present in `x`: VIR
veg.out.df.neg.col.num<-as.numeric(as.character(veg.out.df.neg.col))
veg.out.df.neg.row.num<-as.numeric(as.character(veg.out.df.neg.row))
#calculate plots where exactly one species of the pair is present (1:0 or 0:1), i.e. excluding 0:0 and 1:1, for each species pair
veg.out.df.neg.row.col<-(veg.out.df.neg.col.num-veg.out.df[,3])+(veg.out.df.neg.row.num-veg.out.df[,3])
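#a less error-prone alternative (sketch, not used above): look up each species' occurrence
#total directly from colSums(veg) instead of hard-coding the counts in revalue(), so the
#mapping cannot drift out of sync with the data
occ.totals<-colSums(veg) #hypothetical helper objects for the sketch
col.num.alt<-occ.totals[as.character(veg.out.df.neg$col)]
row.num.alt<-occ.totals[as.character(veg.out.df.neg$row)]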
#use permatswap from the vegan package for randomization and creation of the null model
set.seed(4)## set the seed for randomization
veg.rand<-permatswap(veg, method="quasiswap", fixedmar="both",times = 999, burnin = 20000, thin = 500, mtype = "prab") ##shuffle the occurrence matrix while keeping both row and column sums fixed
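#quick sanity check (sketch): quasiswap should preserve both margins, so the first permuted
#matrix should have the same row and column sums as the original data
stopifnot(all(colSums(veg.rand$perm[[1]]) == colSums(veg)),
          all(rowSums(veg.rand$perm[[1]]) == rowSums(veg)))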
#get the 999 permuted matrices from veg.rand$perm[[i]]
#create 999 empty matrices; fill each with the cross-product of the corresponding permuted matrix; later take the mean and SD of each of the 561 species-pair entries
veg.rand.perm<-veg.rand$perm
veg.rand<-lapply(1:999, function(x) matrix(0, 34,34))
for (j in 1:length(veg.rand.perm)){ #loop over the 999 permuted matrices
veg.rand[[j]]<-t(veg.rand.perm[[j]]) %*% veg.rand.perm[[j]]
}
#calculate mean across 999 different matrices for each indexed position
fulldim<-34*34
meanmat<-matrix(nrow=34,ncol=34)
meanmat[1:1156]<-
apply(as.matrix(1:fulldim), 1,function(y) mean(unlist(lapply(veg.rand,function(x) x[y]))))
#y indexes the 1156 (34*34) matrix positions, i.e. [1,1], [2,1], etc. (column-major)
#x is the list element (1 to 999)
#x[y] is element y of matrix x
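#equivalently (sketch, not used above), the element-wise mean of the 999 matrices can be computed more compactly:
meanmat.alt<-Reduce(`+`, veg.rand)/length(veg.rand) #should match meanmat up to floating-point error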
head(meanmat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 76.000000 6.471471 24.158158 5.749750 3.110110 8.319319 3.079079
## [2,] 6.471471 57.000000 18.222222 4.282282 2.360360 6.166166 2.348348
## [3,] 24.158158 18.222222 218.000000 16.260260 8.958959 23.624625 8.658659
## [4,] 5.749750 4.282282 16.260260 51.000000 2.074074 5.519520 1.986987
## [5,] 3.110110 2.360360 8.958959 2.074074 28.000000 2.983984 1.187187
## [6,] 8.319319 6.166166 23.624625 5.519520 2.983984 74.000000 3.049049
## [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 21.035035 5.328328 3.511512 6.246246 2.792793 3.196196 8.697698
## [2,] 15.686687 3.931932 2.702703 4.744745 2.105105 2.483483 6.537538
## [3,] 60.200200 14.657658 9.838839 17.801802 8.019019 9.158158 24.721722
## [4,] 14.289289 3.437437 2.377377 4.176176 1.902903 2.136136 5.798799
## [5,] 7.686687 1.913914 1.287287 2.310310 0.969970 1.185185 3.129129
## [6,] 20.523524 4.897898 3.419419 6.141141 2.800801 3.110110 8.382382
## [,15] [,16] [,17] [,18] [,19] [,20] [,21]
## [1,] 2.872873 3.351351 8.924925 18.640641 8.392392 22.304304 4.354354
## [2,] 2.199199 2.489489 6.579580 13.875876 6.438438 16.806807 3.309309
## [3,] 8.315315 9.307307 25.105105 53.127127 24.732733 63.631632 12.803804
## [4,] 1.938939 2.164164 5.992993 12.388388 5.829830 15.144144 2.919920
## [5,] 1.081081 1.218218 3.211211 6.831832 3.125125 8.291291 1.608609
## [6,] 2.744745 3.260260 8.653654 17.952953 8.295295 21.718719 4.377377
## [,22] [,23] [,24] [,25] [,26] [,27] [,28]
## [1,] 8.445445 6.707708 7.430430 6.897898 3.622623 4.343343 3.429429
## [2,] 6.148148 5.059059 5.554555 5.149149 2.802803 3.201201 2.558559
## [3,] 23.933934 19.029029 21.163163 19.597598 10.540541 12.522523 9.637638
## [4,] 5.673674 4.476476 4.940941 4.604605 2.521522 2.912913 2.288288
## [5,] 3.085085 2.537538 2.674675 2.581582 1.408408 1.708709 1.234234
## [6,] 8.195195 6.613614 7.168168 6.795796 3.599600 4.222222 3.211211
## [,29] [,30] [,31] [,32] [,33] [,34]
## [1,] 6.818819 6.927928 7.409409 16.076076 4.285285 4.329329
## [2,] 4.995996 5.132132 5.580581 12.123123 3.180180 3.213213
## [3,] 19.467467 19.114114 20.976977 45.343343 12.271271 12.265265
## [4,] 4.657658 4.529530 5.013013 10.615616 2.877878 2.890891
## [5,] 2.573574 2.449449 2.682683 5.932933 1.599600 1.622623
## [6,] 6.531532 6.592593 7.220220 15.411411 4.248248 4.097097
#get the upper-triangular matrix with names for the mean; then get a data frame showing presence and absence of each particular combination; this is one way, but does not account for 0-0s
meanmat.out.ind <- which( upper.tri(meanmat,diag=F) , arr.ind = TRUE )
meanmat.out.df<-data.frame( col = dimnames(veg.out)[[2]][meanmat.out.ind[,2]] ,
row = dimnames(veg.out)[[1]][meanmat.out.ind[,1]] ,
val = meanmat[ meanmat.out.ind ] )
#Calculate the expected number of plots in which each species pair does not jointly occur (681 minus the expected co-occurrence count)
meanmat.out.df.neg<-cbind(meanmat.out.df,681-meanmat.out.df$val)
#find how many plots contain either one or the other species, but not both
meanmat.out.df.neg.col<-revalue(meanmat.out.df.neg$col, c("ALP"=76, "AQU"=218,"BIG"=51, "BRU"=28, "CAP"=27, "CAN"=74, "CHO"=46, "DIS"=31, "ECH"=56, "EXI"=29, "GYN"=77, "HEL"=26, "LEN"=29, "LEP"=79, "LIM"=155, "LIV"=68,
"OLI"=40, "PAU"=74, "MAG"=201, "RAR"=60, "ROS"=66, "SAX"=33, "SCI"=39, "STY"=30, "TEN"=61, "TRI"=60, "UTR"=66, "VAG"=143, "VES"=39, "ANG"=57, "EVA"=25, "RUS"=62, "VIR"=38, "CES"=189))
## The following `from` values were not present in `x`: ALP
meanmat.out.df.neg.row<-revalue(meanmat.out.df.neg$row, c("ALP"=76, "AQU"=218,"BIG"=51, "BRU"=28, "CAP"=27, "CAN"=74, "CHO"=46, "DIS"=31, "ECH"=56, "EXI"=29, "GYN"=77, "HEL"=26, "LEN"=29, "LEP"=79, "LIM"=155, "LIV"=68,
"OLI"=40, "PAU"=74, "MAG"=201, "RAR"=60, "ROS"=66, "SAX"=33, "SCI"=39, "STY"=30, "TEN"=61, "TRI"=60, "UTR"=66, "VAG"=143, "VES"=39, "ANG"=57, "EVA"=25, "RUS"=62, "VIR"=38, "CES"=189))
## The following `from` values were not present in `x`: VIR
meanmat.out.df.neg.col.num<-as.numeric(as.character(meanmat.out.df.neg.col))
meanmat.out.df.neg.row.num<-as.numeric(as.character(meanmat.out.df.neg.row))
meanmat.out.df.neg.row.col<-(meanmat.out.df.neg.col.num-meanmat.out.df[,3])+(meanmat.out.df.neg.row.num-meanmat.out.df[,3])
#calculate sd across 999 different matrices for each position
fulldim<-34*34
sdmat<-matrix(nrow=34,ncol=34)
sdmat[1:1156]<-
apply(as.matrix(1:fulldim), 1,function(y) sd(unlist(lapply(veg.rand,function(x) x[y]))))
#y indexes the 1156 (34*34) matrix positions, i.e. [1,1], [2,1], etc. (column-major)
#x is the list element (1 to 999)
#x[y] is element y of matrix x
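#equivalently (sketch, not used above), the element-wise SD can be computed via a 34 x 34 x 999 array:
sdmat.alt<-apply(simplify2array(veg.rand), c(1,2), sd) #should match sdmat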
head(sdmat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.000000 2.286787 3.898385 2.120479 1.632550 2.592794 1.662010
## [2,] 2.286787 0.000000 3.384313 1.811895 1.420671 2.261115 1.432088
## [3,] 3.898385 3.384313 0.000000 3.254153 2.435813 3.801201 2.359605
## [4,] 2.120479 1.811895 3.254153 0.000000 1.363906 2.220638 1.320727
## [5,] 1.632550 1.420671 2.435813 1.363906 0.000000 1.495398 1.047947
## [6,] 2.592794 2.261115 3.801201 2.220638 1.495398 0.000000 1.616733
## [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 3.842304 2.039762 1.789979 2.199796 1.5273420 1.599582 2.613113
## [2,] 3.302840 1.814927 1.565760 2.080039 1.3256994 1.475748 2.305857
## [3,] 5.434983 2.987435 2.631521 3.288523 2.3562135 2.357143 3.797299
## [4,] 3.063685 1.727950 1.441139 1.827477 1.2468725 1.404076 2.213223
## [5,] 2.301636 1.302301 1.076507 1.391285 0.9596567 1.026083 1.619750
## [6,] 3.681073 1.974686 1.719645 2.205636 1.5856822 1.618371 2.641189
## [,15] [,16] [,17] [,18] [,19] [,20] [,21]
## [1,] 1.5117625 1.725001 2.711101 3.551509 2.545022 3.639778 2.029975
## [2,] 1.4287905 1.518635 2.260895 3.061594 2.239194 3.332555 1.702710
## [3,] 2.2520700 2.496622 3.686074 5.222394 3.858941 5.484213 2.925177
## [4,] 1.2831868 1.359697 2.255466 2.945957 2.161114 3.182373 1.571156
## [5,] 0.9871075 1.060508 1.664324 2.212586 1.558916 2.438879 1.208672
## [6,] 1.4974628 1.688797 2.580597 3.603159 2.520435 3.686617 1.907355
## [,22] [,23] [,24] [,25] [,26] [,27] [,28]
## [1,] 2.607753 2.324103 2.390050 2.367868 1.780187 1.934611 1.763234
## [2,] 2.353115 2.066411 2.141483 2.050655 1.632631 1.717107 1.450010
## [3,] 3.828440 3.369996 3.567613 3.427788 2.643878 2.738291 2.486602
## [4,] 2.172800 1.927337 2.049858 2.103728 1.497924 1.547324 1.448160
## [5,] 1.620191 1.494258 1.435057 1.554972 1.138403 1.255143 1.076805
## [6,] 2.580747 2.262098 2.409015 2.397047 1.770341 1.873416 1.662517
## [,29] [,30] [,31] [,32] [,33] [,34]
## [1,] 2.345038 2.413807 2.464081 3.240249 1.913902 1.868055
## [2,] 2.077165 2.101283 2.150934 2.959285 1.670673 1.673076
## [3,] 3.440402 3.404728 3.612557 4.897674 2.882725 2.868652
## [4,] 2.007730 1.928813 2.169850 2.755646 1.516476 1.595053
## [5,] 1.462402 1.432935 1.563746 2.120259 1.216882 1.265675
## [6,] 2.357089 2.400147 2.471390 3.158776 1.884552 1.797789
#get the upper-triangular matrix with names for the SD; then get a data frame showing presence and absence of each particular combination; this is one way, but does not account for 0-0s
sd.out.ind <- which( upper.tri(sdmat,diag=F) , arr.ind = TRUE )
sd.out.df<-data.frame( col = dimnames(veg.out)[[2]][sd.out.ind[,2]] ,
row = dimnames(veg.out)[[1]][sd.out.ind[,1]] ,
val = sdmat[ sd.out.ind ] )
#Calculate co-occurrence z-scores for each species pair
#1. make a data frame with the required data
cooc.z.info<-cbind(veg.out.df, meanmat.out.df,sd.out.df)
cooc.z.comb<-(cooc.z.info[,3]-cooc.z.info[,6])/cooc.z.info[,9]
#get the cooccurrence.z list with names
cooc.z<-cbind(cooc.z.info[,c(1:2)], cooc.z.comb)
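#sanity check (sketch): the cbind above is only valid if the observed, mean, and SD data frames
#list the species pairs in the same row order; compare the label columns to confirm
stopifnot(all(as.character(cooc.z.info[,1])==as.character(cooc.z.info[,4])),
          all(as.character(cooc.z.info[,2])==as.character(cooc.z.info[,5])),
          all(as.character(cooc.z.info[,1])==as.character(cooc.z.info[,7])))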
Cyperaceae.trees<-read.nexus("Sedges_april30_comb.nex")
#calculate phy.dist for all 100 random trees
phy.dist<- cophenetic(Cyperaceae.trees)
#drop non-carex species from tree and those not focal species: exclude C. deflexa, but include T. alpinum
pruned.cyp<-drop.tip(Cyperaceae.trees,c( "DEF", "JARC", "JTRI", "ACI", "ARC","ATR", "BRA","BUX","CAT", "CVI", "DIA", "FOE","GAR","GLA","LAC","MED","NAR","NIT","WIL"))
pruned.cyp
##
## Phylogenetic tree with 34 tips and 33 internal nodes.
##
## Tip labels:
## ALP, ANG, AQU, BIG, BRU, CAN, ...
##
## Rooted; includes branch lengths.
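#sanity check (sketch): the tips remaining after drop.tip should match the focal species codes
#in the vegetation matrix; this should print TRUE
setequal(pruned.cyp$tip.label, colnames(veg))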
phy.dist.cyp<-cophenetic(pruned.cyp)
phy.dist.ind <- which( upper.tri(phy.dist.cyp,diag=F) , arr.ind = TRUE )
phy.dist.df<-data.frame( col = dimnames(phy.dist.cyp)[[2]][phy.dist.ind[,2]] ,
row = dimnames(phy.dist.cyp)[[1]][phy.dist.ind[,1]] ,
val = phy.dist.cyp[ phy.dist.ind ] )
#read in extent data
extent<-read.csv("Sedges_2_extentofoccurrence.csv", row.names=1, header=TRUE)
#choose species of interest, make dataframe and order alphabetically
extent.names<-rownames(extent)
extent.species<-cbind(extent.names, extent)
names(extent.species)<-c("Species","Area")
extent.ord<-extent.species[order(extent.species$Species),]
extent.sch<-extent.ord[c(53:55,58,60,62:63,65:66, 70:73,77:78, 80:84, 88:101),]
extent.sch.names<-extent.sch$Species
#square-root transform and convert back into a data frame
extent.sqrt.one<-sqrt(extent.sch$Area)
extent.sqrt.rescale<-rescale(extent.sqrt.one, to=c(0,1), range(extent.sqrt.one, na.rm = TRUE))
extent.sqrt<-as.data.frame(cbind(as.character(extent.sch.names), extent.sqrt.rescale))
names(extent.sqrt)<-c("Species", "Extent.sqrt")
head(extent.sqrt)
## Species Extent.sqrt
## 1 ALP 0.369492390338916
## 2 ANG 0.645463985067416
## 3 AQU 1
## 4 BIG 0.470087843564745
## 5 BRU 0.476181913627433
## 6 CAN 0.775928051064645
#choose columns for further analyses
extent<-extent.sqrt.rescale #extent without species names
extent.sp<-extent.sqrt[,c(1:2),drop=FALSE] #with species names
#create a matrix of pairwise absolute differences in extent
final.extent<-matrix(nrow=34,ncol=34)
#this is for extent
for (i in 1:nrow(final.extent)){
final.extent[i,]<-abs(extent[i]-extent)
}
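#equivalently (sketch, not used above), the loop can be replaced by a vectorized outer product:
final.extent.alt<-abs(outer(extent, extent, "-")) #same values as final.extent, without dimnames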
#put rownames and colnames on matrix
rownames(final.extent)<-extent.sch.names
colnames(final.extent)<-extent.sch.names
extent.ind <- which( upper.tri(final.extent,diag=F) , arr.ind = TRUE )
extent.df<-data.frame( col = dimnames(final.extent)[[2]][extent.ind[,2]] ,
row = dimnames(final.extent)[[1]][extent.ind[,1]] ,
val = final.extent[ extent.ind ] )
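#worth checking (sketch): the cbind below assumes extent.df, phy.dist.df, and cooc.z list the
#species pairs in the same order; this should print TRUE
all(as.character(extent.df$col)==as.character(phy.dist.df$col) & as.character(extent.df$row)==as.character(phy.dist.df$row))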
#assemble the data frame with the information for the lm's
data.cooc<-cbind(extent.df, phy.dist.df, cooc.z)
data.cooc<-data.cooc[,c(3,6,9)]
colnames(data.cooc)<-c("gen.spec", "phy.dist", "cooc.z")
head(data.cooc)
## gen.spec phy.dist cooc.z
## 1 0.2759716 78.891213 -2.39264565
## 2 0.6305076 121.972198 5.85930884
## 3 0.3545360 121.972198 -0.06566243
## 4 0.1005955 121.972198 -2.71153372
## 5 0.1753761 121.972198 -0.70770244
## 6 0.5299122 9.008272 -4.68947236
#lm of cooc.z and phy.dist
cooc.z.phy.dist.lm<-lm(data.cooc$cooc.z~sqrt(data.cooc$phy.dist))
summary(cooc.z.phy.dist.lm)
##
## Call:
## lm(formula = data.cooc$cooc.z ~ sqrt(data.cooc$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7487 -1.7057 -0.8179 0.6467 12.4636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0583 0.3881 -2.727 0.0066 **
## sqrt(data.cooc$phy.dist) 0.1119 0.0458 2.443 0.0149 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.81 on 559 degrees of freedom
## Multiple R-squared: 0.01056, Adjusted R-squared: 0.00879
## F-statistic: 5.966 on 1 and 559 DF, p-value: 0.01489
#diagnostic plots for cooc.z.phy.dist.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.phy.dist.lm)
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(cooc.z.phy.dist.lm)) #this is a little right skewed
#Are the data normally distributed
hist(sqrt(data.cooc$phy.dist),col="grey50", main="Transformed data", xlab="Pairwise phylogenetic distance")
hist(data.cooc$cooc.z, col="grey50", main="Untransformed data", xlab="Standardized Co-occurrence")
# Let's verify using the Shapiro test:
shapiro.test(sqrt(data.cooc$phy.dist)) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: sqrt(data.cooc$phy.dist)
## W = 0.8772, p-value < 2.2e-16
shapiro.test(data.cooc$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16
#variance is good; just need to fix normality
plot(sqrt(data.cooc$phy.dist), data.cooc$cooc.z)
abline(cooc.z.phy.dist.lm)
#lm of cooc.z and gen.spec for the entire data set
cooc.z.gen.spec.lm<-lm(data.cooc$cooc.z~data.cooc$gen.spec)
summary(cooc.z.gen.spec.lm)
##
## Call:
## lm(formula = data.cooc$cooc.z ~ data.cooc$gen.spec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9180 -1.6516 -0.9352 0.8152 12.3378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2873 0.1958 -1.468 0.143
## data.cooc$gen.spec 0.5211 0.6146 0.848 0.397
##
## Residual standard error: 2.823 on 559 degrees of freedom
## Multiple R-squared: 0.001284, Adjusted R-squared: -0.0005023
## F-statistic: 0.7189 on 1 and 559 DF, p-value: 0.3969
#diagnostic plots for cooc.z.gen.spec.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.lm)
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.lm)) #this is a little right skewed
#Are the data normally distributed
hist(data.cooc$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")
hist(data.cooc$cooc.z, col="grey50", main="Untransformed data", xlab="Standardized Co-occurrence")
# Let's verify using the Shapiro test:
shapiro.test(data.cooc$gen.spec) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: data.cooc$gen.spec
## W = 0.9222, p-value < 2.2e-16
shapiro.test(data.cooc$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16
#variance is alright; just need to fix normality
plot(data.cooc$gen.spec, data.cooc$cooc.z)
abline(cooc.z.gen.spec.lm)
#lm of gen.spec and sqrt(phy.dist) for the entire data set
phydist.gen.spec.lm<-lm(data.cooc$gen.spec~sqrt(data.cooc$phy.dist))
summary(phydist.gen.spec.lm)
##
## Call:
## lm(formula = data.cooc$gen.spec ~ sqrt(data.cooc$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28255 -0.15814 -0.04417 0.12122 0.75180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.322675 0.026651 12.108 < 2e-16 ***
## sqrt(data.cooc$phy.dist) -0.008672 0.003145 -2.757 0.00602 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1929 on 559 degrees of freedom
## Multiple R-squared: 0.01342, Adjusted R-squared: 0.01165
## F-statistic: 7.603 on 1 and 559 DF, p-value: 0.006017
#diagnostic plots for phydist.gen.spec.lm
opar <- par(mfrow=c(2,2))
plot(phydist.gen.spec.lm)
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(phydist.gen.spec.lm)) #this is a little right skewed
#Are the data normally distributed
hist(data.cooc$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")
hist(sqrt(data.cooc$phy.dist), col="grey50", main="Transformed data", xlab="Phylogenetic Distance")
# Let's verify using the Shapiro test:
shapiro.test(asin(sqrt(data.cooc$gen.spec))) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: asin(sqrt(data.cooc$gen.spec))
## W = 0.9754, p-value = 4.298e-08
shapiro.test(data.cooc$phy.dist) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc$phy.dist
## W = 0.8671, p-value < 2.2e-16
plot(data.cooc$gen.spec~sqrt(data.cooc$phy.dist))
abline(phydist.gen.spec.lm)
#variance is alright
#lm of cooc.z on gen.spec and sqrt(phy.dist) for the entire data set
phydist.gen.spec.cooc.z.lm<-lm(data.cooc$cooc.z ~ data.cooc$gen.spec+sqrt(data.cooc$phy.dist))
summary(phydist.gen.spec.cooc.z.lm)
##
## Call:
## lm(formula = data.cooc$cooc.z ~ data.cooc$gen.spec + sqrt(data.cooc$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6713 -1.7095 -0.8221 0.6721 12.5998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.2853 0.4359 -2.949 0.00333 **
## data.cooc$gen.spec 0.7037 0.6158 1.143 0.25362
## sqrt(data.cooc$phy.dist) 0.1180 0.0461 2.559 0.01075 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.809 on 558 degrees of freedom
## Multiple R-squared: 0.01287, Adjusted R-squared: 0.009333
## F-statistic: 3.638 on 2 and 558 DF, p-value: 0.02694
#diagnostic plots for phydist.gen.spec.cooc.z.lm
opar <- par(mfrow=c(2,2))
plot(phydist.gen.spec.cooc.z.lm) #Q-Q plot shows a strong S-shaped departure from normality; variance is highest for middle fitted values
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(phydist.gen.spec.cooc.z.lm)) #this is a little right skewed
#Are the data normally distributed
hist(data.cooc$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")
hist(sqrt(data.cooc$phy.dist), col="grey50", main="Transformed data", xlab="Phylogenetic Distance")
# Let's verify using the Shapiro test:
shapiro.test(data.cooc$gen.spec) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: data.cooc$gen.spec
## W = 0.9222, p-value < 2.2e-16
shapiro.test(data.cooc$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16
shapiro.test(sqrt(data.cooc$phy.dist)) #not normal
##
## Shapiro-Wilk normality test
##
## data: sqrt(data.cooc$phy.dist)
## W = 0.8772, p-value < 2.2e-16
#normality and variance are not good
#check whether there is an interaction between phy.dist and gen.spec
phydist.gen.spec.cooc.z.lm.int<-lm(data.cooc$cooc.z ~ data.cooc$gen.spec*sqrt(data.cooc$phy.dist))
summary(phydist.gen.spec.cooc.z.lm.int)
##
## Call:
## lm(formula = data.cooc$cooc.z ~ data.cooc$gen.spec * sqrt(data.cooc$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6994 -1.6859 -0.8242 0.6570 12.5552
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.03221 0.65810 -1.568
## data.cooc$gen.spec -0.22477 1.90967 -0.118
## sqrt(data.cooc$phy.dist) 0.08636 0.07691 1.123
## data.cooc$gen.spec:sqrt(data.cooc$phy.dist) 0.11942 0.23250 0.514
## Pr(>|t|)
## (Intercept) 0.117
## data.cooc$gen.spec 0.906
## sqrt(data.cooc$phy.dist) 0.262
## data.cooc$gen.spec:sqrt(data.cooc$phy.dist) 0.608
##
## Residual standard error: 2.811 on 557 degrees of freedom
## Multiple R-squared: 0.01334, Adjusted R-squared: 0.008024
## F-statistic: 2.51 on 3 and 557 DF, p-value: 0.05792
#####################do the same for the reduced data set with phy.dist < 30
data.cooc.dh<-data.cooc[data.cooc$phy.dist<30,]
head(data.cooc.dh)
## gen.spec phy.dist cooc.z
## 6 0.52991216 9.008272 -4.6894724
## 15 0.29974614 14.509728 4.6917372
## 33 0.02927515 24.481046 -1.4696399
## 34 0.27047098 24.481046 -2.4803428
## 50 0.39525528 24.481046 -0.9417983
## 51 0.09550914 24.481046 0.3893929
#lm of cooc.z and phy.dist
cooc.z.phy.dist.dh.lm<-lm(data.cooc.dh$cooc.z~sqrt(data.cooc.dh$phy.dist))
summary(cooc.z.phy.dist.dh.lm)
##
## Call:
## lm(formula = data.cooc.dh$cooc.z ~ sqrt(data.cooc.dh$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1446 -1.3318 -0.4673 0.5589 12.1081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.2158 1.2186 -1.818 0.0715 .
## sqrt(data.cooc.dh$phy.dist) 0.3722 0.2893 1.287 0.2007
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.61 on 120 degrees of freedom
## Multiple R-squared: 0.01361, Adjusted R-squared: 0.005389
## F-statistic: 1.656 on 1 and 120 DF, p-value: 0.2007
#diagnostic plots for cooc.z.phy.dist.dh.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.phy.dist.dh.lm)
par(opar)
# What do the residuals suggest?
par(mfrow=c(2,2))
hist(resid(cooc.z.phy.dist.dh.lm)) #this is a little right skewed
#Are the data normally distributed
hist(sqrt(data.cooc.dh$phy.dist),col="grey50", main="Transformed data", xlab="Pairwise phylogenetic distance")
hist(data.cooc.dh$cooc.z, col="grey50", main="Untransformed data", xlab="Standardized Co-occurrence")
# Let's verify using the Shapiro test:
shapiro.test(sqrt(data.cooc.dh$phy.dist)) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: sqrt(data.cooc.dh$phy.dist)
## W = 0.8502, p-value = 9.04e-10
shapiro.test(data.cooc.dh$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc.dh$cooc.z
## W = 0.8228, p-value = 8.239e-11
#variance is good; just need to fix normality
plot(sqrt(data.cooc.dh$phy.dist), data.cooc.dh$cooc.z)
#lm of cooc.z and gen.spec for the reduced data set
cooc.z.gen.spec.dh.lm<-lm(data.cooc.dh$cooc.z~data.cooc.dh$gen.spec)
summary(cooc.z.gen.spec.dh.lm)
##
## Call:
## lm(formula = data.cooc.dh$cooc.z ~ data.cooc.dh$gen.spec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1922 -1.2601 -0.5757 0.4471 12.3882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.70273 0.41350 -1.699 0.0918 .
## data.cooc.dh$gen.spec 0.08357 1.12215 0.074 0.9408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.628 on 120 degrees of freedom
## Multiple R-squared: 4.621e-05, Adjusted R-squared: -0.008287
## F-statistic: 0.005546 on 1 and 120 DF, p-value: 0.9408
#diagnostic plots for cooc.z.gen.spec.dh.lm
opar <- par(mfrow=c(2,2))
plot(cooc.z.gen.spec.dh.lm)
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(cooc.z.gen.spec.dh.lm)) #this is a little right skewed
#Are the data normally distributed
hist(data.cooc.dh$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")
hist(data.cooc.dh$cooc.z, col="grey50", main="Untransformed data", xlab="Standardized Co-occurrence")
# Let's verify using the Shapiro test:
shapiro.test(data.cooc$gen.spec) #definitely not normal
##
## Shapiro-Wilk normality test
##
## data: data.cooc$gen.spec
## W = 0.9222, p-value < 2.2e-16
shapiro.test(data.cooc$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc$cooc.z
## W = 0.8524, p-value < 2.2e-16
#variance and normality are not good
plot(data.cooc.dh$gen.spec, data.cooc.dh$cooc.z)
#lm of gen.spec and sqrt(phy.dist) for the reduced data set
phydist.gen.spec.dh.lm<-lm(data.cooc.dh$gen.spec~sqrt(data.cooc.dh$phy.dist))
summary(phydist.gen.spec.dh.lm)
##
## Call:
## lm(formula = data.cooc.dh$gen.spec ~ sqrt(data.cooc.dh$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29340 -0.18459 -0.03925 0.14527 0.67631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.38414 0.09952 3.860 0.000184 ***
## sqrt(data.cooc.dh$phy.dist) -0.02003 0.02362 -0.848 0.398321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2131 on 120 degrees of freedom
## Multiple R-squared: 0.005952, Adjusted R-squared: -0.002332
## F-statistic: 0.7185 on 1 and 120 DF, p-value: 0.3983
#diagnostic plots for phydist.gen.spec.dh.lm
opar <- par(mfrow=c(2,2))
plot(phydist.gen.spec.dh.lm)
par(opar)
par(mfrow=c(2,2))
# What do the residuals suggest?
hist(resid(phydist.gen.spec.dh.lm)) #this is a little right skewed
#Are the data normally distributed
hist(data.cooc.dh$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism") # a little right skewed
hist(sqrt(data.cooc.dh$phy.dist), col="grey50", main="Transformed data", xlab="Phylogenetic Distance") #strongly left skewed
plot(data.cooc.dh$gen.spec~sqrt(data.cooc.dh$phy.dist))
# Let's verify using the Shapiro test:
shapiro.test(data.cooc.dh$gen.spec) #closer to normal than the full data set, but still rejected (p = 2.2e-05)
##
## Shapiro-Wilk normality test
##
## data: data.cooc.dh$gen.spec
## W = 0.9366, p-value = 2.201e-05
shapiro.test(sqrt(data.cooc.dh$phy.dist)) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: sqrt(data.cooc.dh$phy.dist)
## W = 0.8502, p-value = 9.04e-10
#variance is not good
#lm of cooc.z on gen.spec and sqrt(phy.dist) for the reduced data set
phydist.gen.spec.cooc.z.dh.lm<-lm(data.cooc.dh$cooc.z ~ data.cooc.dh$gen.spec+sqrt(data.cooc.dh$phy.dist))
summary(phydist.gen.spec.cooc.z.dh.lm)
##
## Call:
## lm(formula = data.cooc.dh$cooc.z ~ data.cooc.dh$gen.spec + sqrt(data.cooc.dh$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1032 -1.3105 -0.4903 0.5374 12.1553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.2908 1.2973 -1.766 0.080 .
## data.cooc.dh$gen.spec 0.1954 1.1224 0.174 0.862
## sqrt(data.cooc.dh$phy.dist) 0.3761 0.2913 1.291 0.199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.62 on 119 degrees of freedom
## Multiple R-squared: 0.01386, Adjusted R-squared: -0.002714
## F-statistic: 0.8363 on 2 and 119 DF, p-value: 0.4359
#diagnostic plots for phydist.gen.spec.cooc.z.dh.lm
opar <- par(mfrow=c(2,2))
plot(phydist.gen.spec.cooc.z.dh.lm) #Q-Q plot shows a strong S-shaped departure from normality; variance is highest for middle fitted values and decreases toward the extremes
par(opar)
# What do the residuals suggest?
par(mfrow=c(2,2))
hist(resid(phydist.gen.spec.cooc.z.dh.lm)) #good
#Are the data normally distributed
hist(data.cooc.dh$gen.spec,col="grey50", main="Untransformed data", xlab="Generalism-Specialism")
hist(sqrt(data.cooc.dh$phy.dist), col="grey50", main="Transformed data", xlab="Phylogenetic Distance") #strongly left skewed
# Let's verify using the Shapiro test:
shapiro.test(data.cooc.dh$gen.spec) #still not normal (p = 2.2e-05), though closer than the full data set
##
## Shapiro-Wilk normality test
##
## data: data.cooc.dh$gen.spec
## W = 0.9366, p-value = 2.201e-05
shapiro.test(data.cooc.dh$cooc.z) #not normal either; however, the residuals no longer show a negative-binomial or binomial pattern
##
## Shapiro-Wilk normality test
##
## data: data.cooc.dh$cooc.z
## W = 0.8228, p-value = 8.239e-11
shapiro.test(sqrt(data.cooc.dh$phy.dist)) #not normal
##
## Shapiro-Wilk normality test
##
## data: sqrt(data.cooc.dh$phy.dist)
## W = 0.8502, p-value = 9.04e-10
#normality and variance are not good
#check whether there is an interaction between phy.dist and gen.spec
phydist.gen.spec.cooc.z.dh.lm.int<-lm(data.cooc.dh$cooc.z ~ data.cooc.dh$gen.spec*sqrt(data.cooc.dh$phy.dist))
summary(phydist.gen.spec.cooc.z.dh.lm.int)
##
## Call:
## lm(formula = data.cooc.dh$cooc.z ~ data.cooc.dh$gen.spec * sqrt(data.cooc.dh$phy.dist))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1366 -1.3286 -0.5089 0.5488 12.0553
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -3.0454 2.2718
## data.cooc.dh$gen.spec 2.4907 5.7737
## sqrt(data.cooc.dh$phy.dist) 0.5568 0.5331
## data.cooc.dh$gen.spec:sqrt(data.cooc.dh$phy.dist) -0.5550 1.3692
## t value Pr(>|t|)
## (Intercept) -1.341 0.183
## data.cooc.dh$gen.spec 0.431 0.667
## sqrt(data.cooc.dh$phy.dist) 1.045 0.298
## data.cooc.dh$gen.spec:sqrt(data.cooc.dh$phy.dist) -0.405 0.686
##
## Residual standard error: 2.63 on 118 degrees of freedom
## Multiple R-squared: 0.01523, Adjusted R-squared: -0.009805
## F-statistic: 0.6084 on 3 and 118 DF, p-value: 0.6109
#there are no interactions
Set up the GAM formula with an interaction between the two predictor variables.
I am using a tensor product smooth because the predictors are not measured in the same units.
For te() smooths, the upper limit on the degrees of freedom is given by the product of the k values supplied for each marginal smooth, less one for the identifiability constraint.
The exact choice of k is generally not critical, but it should be large enough that you can be reasonably sure of having enough degrees of freedom to represent the underlying 'truth' reasonably well, yet small enough to keep the computation efficient.
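The gam() call below assumes phy.dist.sqrt and gen.spec are available to the formula (either as columns of data.cooc or as objects created earlier); if phy.dist.sqrt has not been created yet, a line such as the following sketch would add it.
data.cooc$phy.dist.sqrt<-sqrt(data.cooc$phy.dist) #hypothetical helper column; only needed if phy.dist.sqrt is not already defined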
cooc.gam.sqrt.te<-gam(cooc.z~te(phy.dist.sqrt,gen.spec), data=data.cooc)
summary(cooc.gam.sqrt.te)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cooc.z ~ te(phy.dist.sqrt, gen.spec)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1557 0.1187 -1.312 0.19
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(phy.dist.sqrt,gen.spec) 3 3 2.51 0.0579 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00802 Deviance explained = 1.33%
## GCV = 7.9563 Scale est. = 7.8995 n = 561
gam.check(cooc.gam.sqrt.te)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 16 iterations.
## The RMS GCV score gradiant at convergence was 5.25362e-07 .
## The Hessian was positive definite.
## The estimated model rank was 25 (maximum possible: 25)
## Model rank = 25 / 25
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## te(phy.dist.sqrt,gen.spec) 24.000 3.000 0.983 0.32
#####################do the same, but keep only species pairs with phy.dist < 30 My
data.cooc.30<-data.cooc[data.cooc$phy.dist<30,]
phy.dist.30.sqrt<-sqrt(data.cooc.30$phy.dist)
gen.spec.30<-data.cooc.30$gen.spec
cooc.z.30<-data.cooc.30$cooc.z
cooc.gam.30<-gam(cooc.z.30~te(phy.dist.30.sqrt,gen.spec.30), data=data.cooc.30)
summary(cooc.gam.30)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cooc.z.30 ~ te(phy.dist.30.sqrt, gen.spec.30)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6775 0.2381 -2.846 0.00522 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(phy.dist.30.sqrt,gen.spec.30) 3 3 0.608 0.611
##
## R-sq.(adj) = -0.00981 Deviance explained = 1.52%
## GCV = 7.1498 Scale est. = 6.9154 n = 122
gam.check(cooc.gam.30, k.rep = 1000)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 19 iterations.
## The RMS GCV score gradiant at convergence was 3.445932e-07 .
## The Hessian was positive definite.
## The estimated model rank was 25 (maximum possible: 25)
## Model rank = 25 / 25
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## te(phy.dist.30.sqrt,gen.spec.30) 24.00 3.00 1.01 0.45
#other diagnostic checks that should be done
par(mfrow=c(1,2))
plot(fitted(cooc.gam.30),residuals(cooc.gam.30))
plot(data.cooc.30$cooc.z,residuals(cooc.gam.30))
#dev.new(width=7.4, height=4)
layout(matrix(1:2,ncol=2), width = c(1,1),height = c(1,1))
par(mar=c(4,2.5,2.75,0.75), mgp=c(1.5,0.5,0), las=0)
my.vis.gam(cooc.gam.sqrt.te, type = "response",color="gray",plot.type = "contour",
ylab="Difference in niche width",main="Predicted co-occurrence based on \ninteracting predictor variables",
xlab="Phylogenetic distance (square root)", cex.lab=1, cex.axis=0.75, cex.main=1, lwd=2, cex=2)
my.vis.gam(cooc.gam.30, type = "response",color="gray",plot.type = "contour",
ylab="Difference in niche width",main="Predicted co-occurrence based on \ninteracting predictor variables",
xlab="Phylogenetic distance (square root)", cex.lab=1, cex.axis=0.75, cex.main=1, lwd=2, cex=2)
#textClick("(a)", cex=1)
#textClick("(b)", cex=1)