knitr::opts_chunk$set(echo = TRUE)
rm(list= ls())
library(extrafont)
## Registering fonts with R
library(geosphere)
library(scales)
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/3_spatial_structure/data/")
ids <- read.csv("ids_PS_sup4.txt", sep = "\t", header = T) # 203 x 6
som <- read.table("gl_PS_5082snps_203inds.txt", sep = " ")
som <- t(som) # 5082 x 203
nInd <- 203
hets <- rowSums(som)/nInd
# Make sure there are no singletons and empty variants, and remove anything above 80%
idx <- which(hets>1/nInd & hets < 0.8) #5082
som <- som[idx,]
hets <- rowSums(som)/nInd
# Check heterozygosity per SNP
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP",
sub = "Pando + Subclone datasets",
col = "#2ca25f")

# ------------------------------------------------------------------------------------- #
# Detecting Spatial Signal per individual
# ------------------------------------------------------------------------------------- #
# define function - Euclidean distance
euclidean <- function(a, b) sqrt(sum((a - b)^2))
# Calculate number of shared SNP and mean Euclidean distance between pairs of samples, compare to random
n_SNP <- dim(som)[1]
n_samp <- length(ids$full_ID)
store_num_common_snp <- matrix(0, n_samp, n_samp)
space <- matrix(0, n_samp, n_samp)
store_eucl <- matrix(0, n_samp, n_samp)
n <- 1
for (i in 1:n_samp){
for (j in n:n_samp){
store_num_common_snp[i,j] <- length(intersect(which(som[,i]==1), which(som[,j]==1) ) ) / mean(length(which(som[,i]==1)), length(which(som[,j]==1)))
space[i,j] <- distVincentyEllipsoid(c(ids$long[i],ids$lat[i]), c(ids$long[j],ids$lat[j]))
store_eucl[i,j] <- euclidean(som[,i], som[,j])
}
n <- n + 1
}
# ------------------------------------------------------------------------------------- #
# Some stats
# ------------------------------------------------------------------------------------- #
# Number of SNP between each sample pair
mean(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 91
## [1] 0.2317073
min(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 42
## [1] 0.08955224
max(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 329
## [1] 0.5164835
# ------------------------------------------------------------------------------------- #
# figure
# ------------------------------------------------------------------------------------- #
#setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/5-Writing/5.Figures/F3/")
#pdf(file="panelB_2008.pdf",
# bg = "transparent", width=5, height=4, family = "Times New Roman")
# Visualization
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
plot(space[upper.tri(space)] , store_num_common_snp[upper.tri(store_num_common_snp)], pch = 16, xlab = "Distance between pairs (m)",
ylab = "Proportion of SNPs in common", col = "#bcbddc", bty = "none")

#dev.off()
# correlation between number of variants and space
val <- cor.test(space[upper.tri(space)] ,store_num_common_snp[upper.tri(store_num_common_snp)])
val
##
## Pearson's product-moment correlation
##
## data: space[upper.tri(space)] and store_num_common_snp[upper.tri(store_num_common_snp)]
## t = -19.17, df = 20501, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1461212 -0.1192270
## sample estimates:
## cor
## -0.1326985
# ------------------------------------------------------------------------------------- #
# SNPs shared by two ramets
# ------------------------------------------------------------------------------------- #
# Initialize
store.SNPpairs <- c()
store.spatial.dist <- c()
i <- 0
# Find the number of SNPs shared by 2 ramets
SNPind <- which(rowSums(som,na.rm=TRUE)==2) # 27
# Start loop
for (i in 1:length(SNPind)) {
rametsID <- which(som[SNPind[i],]==1) # pair of focus
# Calculate genetic distance between i and closest neighbour trees
spatial.dist <- as.matrix(distm(cbind(ids$long[rametsID[1]], ids$lat[rametsID[1]]), cbind(ids$long[rametsID[2]], ids$lat[rametsID[2]]), fun=distVincentyEllipsoid))
store.spatial.dist <- c(store.spatial.dist, spatial.dist)
store.SNPpairs <- c(store.SNPpairs, length(which(rowSums(som[, rametsID])==2))) # They share one unique mutation, and many more
}
store.pairs.2 <- store.SNPpairs
store.spatial.dist.2 <- store.spatial.dist
# Plot and save figure - two ramets' SNPs
# pdf(file="SNPs_shared_by_two_ramets.pdf", bg = "transparent",
# width=5.2, height=4, family = "Times New Roman")
par(family="Times New Roman", cex.lab=1.1, cex.axis=1, cex.main=1, cex.sub=1)#, col.lab="white", col.axis = "white")
plot(store.spatial.dist, store.SNPpairs, pch=16,col="#756bb1", cex=1.2, xlim = c(0,160), ylim = c(0,350),
xlab = "Spatial distance (m)",
ylab = "Number of shared SNPs")
legend("topright", c("SNPs shared by two ramets"), col = c("#756bb1"), pch=16, cex = 1, bty = "n")

# Regression line
# abline(lm(store.SNPpairs ~ store.spatial.dist), col = "red", lwd = 3)
#dev.off()
cor.val.2 <- cor(store.spatial.dist[!is.na(store.spatial.dist)], store.SNPpairs[!is.na(store.spatial.dist)]) # p val not significant
cor.val.2
## [1] -0.07895282
cor.test(store.spatial.dist[!is.na(store.spatial.dist)], store.SNPpairs[!is.na(store.spatial.dist)])
##
## Pearson's product-moment correlation
##
## data: store.spatial.dist[!is.na(store.spatial.dist)] and store.SNPpairs[!is.na(store.spatial.dist)]
## t = -2.3839, df = 906, p-value = 0.01733
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14327625 -0.01396518
## sample estimates:
## cor
## -0.07895282
# Pearson's product-moment correlation
#
# data: store.spatial.dist[!is.na(store.spatial.dist)] and store.SNPpairs[!is.na(store.spatial.dist)]
# t = -6.7839, df = 553, p-value = 3.015e-11
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.3522812 -0.1985307
# sample estimates:
# cor
# -0.2771794
# ------------------------------------------------------------------------------------- #
# Detecting Spatial Signal per SNP - new threshold
# ------------------------------------------------------------------------------------- #
# Calculate mean distance between pairs of samples presenting the mutation
n_SNP <- dim(som)[1]
store_mean_dist <- matrix(NA, n_SNP, 1)
store_mean_rand_dist <- matrix(NA, n_SNP, 1)
store_num_samp <- matrix(NA, n_SNP, 1)
dist <- c()
rand_dist <- c()
for (x in 1:n_SNP) {
#print(x)
samples <- which(som[x,]==1) # samples that have the mutation
start <- 2
dist <- c()
# Calculate every pairwise distance between samples with the mutation
for (i in 1:c(length(samples)-1)) {
for (j in start:length(samples)) {
dist <- c(dist, distVincentyEllipsoid(c(ids$long[i],ids$lat[i]), c(ids$long[j],ids$lat[j])) )
}
j <- j + 1 # do not compute the same distance twice
}
# Store mean pairwise distance for each sample
store_mean_dist[x] <- mean(c(dist))
store_num_samp[x] <- length(samples)
}
mean(store_mean_dist) # 1.52 meters
## [1] 131.8289
# Randomization - on the cluster
# rand <- hist(store_mean_rand_dist, 1000, plot = FALSE)
# rand$counts=rand$counts/sum(rand$counts)
# x <- (rand$breaks[1:length(rand$breaks)-1]+ rand$breaks[2:length(rand$breaks)])/2
# y <- rand$counts
# plot(x, y, type='l', lwd = 2, col = "#6baed6", xlab="Mean distance b/w group of samples sharing a mutation (m)",
# ylab="Density", bty="n", main = '223 SNPs')
# # Data
# data <- hist(store_mean_dist, 150, plot = FALSE)
# data$counts=data$counts/sum(data$counts)
# x <- (data$breaks[1:length(data$breaks)-1]+ data$breaks[2:length(data$breaks)])/2
# y <- data$counts
# points(x, y/10, type='l', lwd = 2, col = "#74c476")
#
# legend(500,0.02, c("Randomization", "Data"),
# pch = 15, col = c("#6baed6", "#74c476"), box.lty = 0)
#