Jaccard Similarity Score - the number of matching hits, divided by all possible hits between the two. The higher the score, the more similar the distribution of those genes across genomes. jaccard

Heatmap of 3600 most abundant genes across reference bacterial species

Heatmap

Using the Presence/Absence Matrix, generate Jaccard of a specific reference gene

Load in the presence/absence matrix

library(data.table)
library(tidyverse)

# read_csv or vroom are better alternatives
genes_merged <- read.csv("../../data/merged_genes_full.csv",row.names = 1, header= TRUE)
gene_list = genes_merged[,"gene_names"]
genes_merged <- column_to_rownames(genes_merged,var="gene_names")

Single Gene Comparisons

Generate Jaccard Similarity Score for each gene against a specific reference gene

# Pick the reference gene you're going to track
ref_gene <- as.numeric(genes_merged["nifH",])

# gene_count =as.data.frame(rowSums(genes_merged))
# gene_count <- tibble::rownames_to_column(gene_count, "gene_name")


# Calculate Jaccard Similarity
gene_matches <- colSums(t(genes_merged)+ref_gene==2)
gene_totalhits <- colSums(t(genes_merged)+ref_gene>0)
fave_genes <- sort(gene_matches/gene_totalhits, decreasing = TRUE)

Generates a confusion matrix of the top 10 best hits - useful to verify the matrix

# Check the confusion matrix of the top 10 best matched genes
confusion_top10 <- lapply(names(head(fave_genes, 10)), function(gene_name){
  table(as.numeric(genes_merged["acnA",]), 
        as.numeric(genes_merged[gene_name,]))
})

Plot the distribution

library(tidyverse)
# plot the ordered data based on similarity score
x_steps = seq(1, length(fave_genes)-1, 1)
y_steps = fave_genes[-1]
df = data.frame(x=x_steps,y=y_steps)
ggplot(data=df,mapping = aes(x, y))+geom_point()

### Notes from specific genes

nifH is kinda interesting - lots of metal binding genes associated amidst the other nif genes

Generate a Jaccard similarity matrix of all genes with greater than 100 counts

# Generate the Counts of the Genes
gene_count =as.data.frame(rowSums(genes_merged))
gene_count <- tibble::rownames_to_column(gene_count, "gene_name")

# Exclude any gene with fewer than min_num_genes hits
min_num_genes <- 100
gene_count$min <- ifelse(gene_count$`rowSums(genes_merged)` > min_num_genes, 1, 0)
genes_merged <- tibble::rownames_to_column(genes_merged, "gene_name")

# Filter on main
genes_merged = genes_merged[as.logical(gene_count$min),]
gene_list = genes_merged[,"gene_name"]

genes_merged <- tibble::remove_rownames(genes_merged)
genes_merged <- tibble::column_to_rownames(genes_merged, "gene_name")


# Calculate Jaccard Similarity Matrix
holder <- lapply(gene_list, function(gene_i) {

  ref_gene <- as.numeric(genes_merged[gene_i,])
  gene_matches <- colSums(t(genes_merged)+ref_gene==2)
  gene_totalhits <- colSums(t(genes_merged)+ref_gene>0)
  gene_matches/gene_totalhits
  })

jaccard_matrix <- do.call(rbind, holder)
row.names(jaccard_matrix) <- gene_list

write.csv(jaccard_matrix, "../../data/jaccard_matrix.csv", row.names=TRUE)

Find the highest values

heatmap(jaccard_matrix, sym=TRUE)

Distance Clustering - just hierachical. There are three general clusters that immediately jump out to me. Guess is that the first cluster is all of the core required genes that almost all microbes have. But what about everything else?

Verify the dendrogram - cophenetic correlation ends up being around 0.78, not amazing but reasonable to work with.

hc = hclust(dist(jaccard_matrix))
plot(hc)
# Is it possible to put data into dendrogram - close to 1 is good, close to 0 is bad
cophen = cophenetic(hc)
verify = cor(dist(jaccard_matrix), cophen)
# Order the Abundance for correct mapping to the plot
hc_labels = as.data.frame(hc$labels)
hc_labels$index = 1:length(hc$labels)

gene_abund_index = merge(hc_labels,gene_count,by.x="hc$labels",by.y="gene_name")

gene_abund_index$order <- hc$order
gene_ordered = arrange(gene_abund_index, index)[hc$order,]
colorfun = colorRamp(c("red","blue"))

cl_members <- cutree(tree = hc, k = 3)
plot(x = hc, labels =  row.names(hc), cex = 0.5, hang= -1)
points(x=1:3686,y=numeric(3686),col=rgb(colorfun(gene_count$abundance[hc$order]), maxColorValue = 255))
# rect.hclust(tree = hc, k = 3, which = 1:3, border = 1:3, cluster = cl_members)

pdf("~/sam-associationstudy/output/gene_clusters.pdf",width = 250)
plot(x = hc, labels =  row.names(hc), cex = 0.5, hang= -1)
points(x=1:3686,y=numeric(3686),col=rgb(colorfun(gene_ordered$abundance), maxColorValue = 255))
dev.off()

See saved PDF:Blue indicate highly abundant genes while Red are not very abundant genes. The first cluster are all highly expressed, cluster 3 are not highly expressed. The second cluster though has an interesting mix. Need to dive into it more

# 
# 1diag(jaccard_matrix) <- 0
corr_genes <- which(jaccard_matrix==max(jaccard_matrix), arr.ind=TRUE)
# Verify
sum(genes_merged["anfD",])
sum(genes_merged["anfG",])
num_occ = sort(rowSums(genes_merged),decreasing=TRUE)

# Find the most abundant gene
which.max(rowSums(genes_merged))
gene_count$abundance = gene_count$`rowSums(genes_merged)`/17019
library(tidyverse)
jaccard_matrix = read.csv("../../data/jaccard_matrix.csv", row.names=1, header=TRUE)
ggplot()+geom_histogram(aes(jaccard_matrix[,"frr"]),bins = 200)

ggplot()+geom_histogram(aes(jaccard_matrix[,"nifH"]),bins = 200)

ggplot()+geom_histogram(aes(jaccard_matrix[,"tssK"]),bins = 200)