knitr::opts_chunk$set(echo = TRUE)
rm(list= ls())

library(extrafont)
## Registering fonts with R
library(geosphere)
library(scales)


setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/3_spatial_structure/data/")
ids <- read.csv("ids_PS_sup4.txt", sep = "\t", header = T)
ids_subclone <- ids[ids$group!="pando",] # 101 x 6 
som_subclone <- read.table("gl_subclone_3047snps_101inds.txt", sep = " ")
som_subclone <- t(som_subclone) # 3047 x 101

nInd <- 101
hets <- rowSums(som_subclone)/nInd
# Make sure there are no singletons and empty variants, and remove anything above 80%
idx <- which(hets>1/nInd & hets < 0.8) #3034
som_subclone <- som_subclone[idx,]
hets <- rowSums(som_subclone)/nInd

# Check heterozygosity per SNP
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP",
     sub = "Subclone 2022 dataset",
     col = "#2ca25f")

# ------------------------------------------------------------------------------------- #
# Detecting Spatial Signal per individual
# ------------------------------------------------------------------------------------- #

# define function - Euclidean distance
euclidean <- function(a, b) sqrt(sum((a - b)^2))
# Calculate number of shared SNP and mean Euclidean distance between pairs of samples, compare to random

n_SNP <- dim(som_subclone)[1]
n_samp <- length(ids_subclone$full_ID)
store_num_common_snp <-  matrix(0, n_samp, n_samp)
space <-  matrix(0, n_samp, n_samp)
store_eucl <-  matrix(0, n_samp, n_samp)

n <- 1

for (i in 1:n_samp){  
  
  
  for (j in n:n_samp){
    
    store_num_common_snp[i,j] <- length(intersect(which(som_subclone[,i]==1), which(som_subclone[,j]==1))) / mean(length(which(som_subclone[,i]==1)), length(which(som_subclone[,j]==1)))
    space[i,j] <-  distVincentyEllipsoid(c(ids_subclone$long[i],ids_subclone$lat[i]), c(ids_subclone$long[j],ids_subclone$lat[j]))
    store_eucl[i,j] <- euclidean(som_subclone[,i], som_subclone[,j]) 
    
  }
  
  n <- n + 1
}


# ------------------------------------------------------------------------------------- #
# Some stats
# ------------------------------------------------------------------------------------- #
# Number of SNP between each sample pair
mean(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 91
## [1] 0.2657913
min(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 42
## [1] 0.07801418
max(store_num_common_snp[upper.tri(store_num_common_snp, diag = FALSE)]) # 329
## [1] 0.5177305
# ------------------------------------------------------------------------------------- #
# figure
# ------------------------------------------------------------------------------------- #

#setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/5-Writing/5.Figures/F3/")

#pdf(file="panelB_2008.pdf",
#    bg = "transparent", width=5, height=4, family = "Times New Roman")


# Visualization
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
plot(space[upper.tri(space)] , store_num_common_snp[upper.tri(store_num_common_snp)], pch = 16, xlab = "Distance between pairs (m)", 
     ylab = "Number of shared SNPs", col = "#bcbddc", bty = "none")

#dev.off()

# correlation between number of variants and space
val <- cor.test(space[upper.tri(space)] ,store_num_common_snp[upper.tri(store_num_common_snp)])
val
## 
##  Pearson's product-moment correlation
## 
## data:  space[upper.tri(space)] and store_num_common_snp[upper.tri(store_num_common_snp)]
## t = -6.9377, df = 5048, p-value = 4.487e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12443278 -0.06978997
## sample estimates:
##         cor 
## -0.09718461
# ------------------------------------------------------------------------------------- #
# SNPs shared by two ramets
# ------------------------------------------------------------------------------------- #
# Initialize 
store.SNPpairs <- c()
store.spatial.dist <- c()
i <- 0

# Find the number of SNPs shared by 2 ramets
SNPind <- which(rowSums(som_subclone,na.rm=TRUE)==2) # 27

# Start loop
for (i in 1:length(SNPind)) {

  rametsID <- which(som_subclone[SNPind[i],]==1) # pair of focus
  
  # Calculate genetic distance between i and closest neighbour trees
  spatial.dist <- as.matrix(distm(cbind(ids_subclone$long[rametsID[1]], ids_subclone$lat[rametsID[1]]), cbind(ids_subclone$long[rametsID[2]], ids_subclone$lat[rametsID[2]]), fun=distVincentyEllipsoid))
  store.spatial.dist <- c(store.spatial.dist, spatial.dist)
  
  store.SNPpairs <- c(store.SNPpairs, length(which(rowSums(som_subclone[, rametsID])==2))) # They share one unique mutation, and many more
  
}

store.pairs.2 <- store.SNPpairs
store.spatial.dist.2 <- store.spatial.dist


# Plot and save figure - two ramets' SNPs
# pdf(file="SNPs_shared_by_two_ramets.pdf", bg = "transparent",
#     width=5.2, height=4, family = "Times New Roman")

par(family="Times New Roman", cex.lab=1.1, cex.axis=1, cex.main=1, cex.sub=1)#, col.lab="white", col.axis = "white")

plot(store.spatial.dist, store.SNPpairs, pch=16,col="#756bb1", cex=1.2, xlim = c(0,160), ylim = c(0,350),
     xlab = "Spatial distance (m)",
     ylab = "Number of shared SNPs")
legend("topright", c("SNPs shared by two ramets"), col = c("#756bb1"), pch=16, cex = 1, bty = "n")

# Regression line
# abline(lm(store.SNPpairs ~ store.spatial.dist), col = "red", lwd = 3)

#dev.off()

cor.val.2 <- cor(store.spatial.dist[!is.na(store.spatial.dist)], store.SNPpairs[!is.na(store.spatial.dist)]) # p val not significant
cor.val.2
## [1] -0.2854507
cor.test(store.spatial.dist[!is.na(store.spatial.dist)], store.SNPpairs[!is.na(store.spatial.dist)]) 
## 
##  Pearson's product-moment correlation
## 
## data:  store.spatial.dist[!is.na(store.spatial.dist)] and store.SNPpairs[!is.na(store.spatial.dist)]
## t = -7.0041, df = 553, p-value = 7.247e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3601236 -0.2071433
## sample estimates:
##        cor 
## -0.2854507
#   Pearson's product-moment correlation
# 
# data:  store.spatial.dist[!is.na(store.spatial.dist)] and store.SNPpairs[!is.na(store.spatial.dist)]
# t = -6.7839, df = 553, p-value = 3.015e-11
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.3522812 -0.1985307
# sample estimates:
#        cor 
# -0.2771794 
# ------------------------------------------------------------------------------------- #
# Detecting Spatial Signal per SNP - new threshold
# ------------------------------------------------------------------------------------- #

# Calculate mean distance between pairs of samples presenting the mutation
n_SNP <- dim(som_subclone)[1]
store_mean_dist <-  matrix(NA, n_SNP, 1)
store_mean_rand_dist <-  matrix(NA, n_SNP, 1)
store_num_samp <-  matrix(NA, n_SNP, 1)
dist <- c()
rand_dist <- c()

for (x in 1:n_SNP) {
  
  #print(x)
  samples <- which(som_subclone[x,]==1) # samples that have the mutation
  start <- 2
  dist <- c()
  
  # Calculate every pairwise distance between samples with the mutation
  for (i in 1:c(length(samples)-1)) {
    
    for (j in start:length(samples)) {
      
      dist <- c(dist, distVincentyEllipsoid(c(ids_subclone$long[i],ids_subclone$lat[i]), c(ids_subclone$long[j],ids_subclone$lat[j])) )

    }
    j <- j + 1 # do not compute the same distance twice
  }
  
  # Store mean pairwise distance for each sample
  store_mean_dist[x] <- mean(c(dist))
  store_num_samp[x] <- length(samples)

}

mean(store_mean_dist) # 1.52 meters
## [1] 1.279075
# Randomization - on the cluster

# rand <- hist(store_mean_rand_dist, 1000, plot = FALSE)
# rand$counts=rand$counts/sum(rand$counts)
# x <- (rand$breaks[1:length(rand$breaks)-1]+ rand$breaks[2:length(rand$breaks)])/2
# y <- rand$counts
# plot(x, y, type='l', lwd = 2, col = "#6baed6", xlab="Mean distance b/w group of samples sharing a mutation (m)", 
#      ylab="Density", bty="n", main = '223 SNPs')
# # Data
# data <- hist(store_mean_dist, 150, plot = FALSE)
# data$counts=data$counts/sum(data$counts)
# x <- (data$breaks[1:length(data$breaks)-1]+ data$breaks[2:length(data$breaks)])/2
# y <- data$counts
# points(x, y/10, type='l', lwd = 2, col = "#74c476")
# 
# legend(500,0.02, c("Randomization", "Data"), 
#        pch = 15, col = c("#6baed6", "#74c476"), box.lty = 0)
#