Jaccard Similarity Score - the number of matching hits, divided by
all possible hits between the two. The higher the score, the more
similar the distribution of those genes across genomes.
Heatmap
Load in the presence/absence matrix
library(data.table)
library(tidyverse)
# read_csv or vroom are better alternatives
genes_merged <- read.csv("../../data/merged_genes_full.csv",row.names = 1, header= TRUE)
gene_list = genes_merged[,"gene_names"]
genes_merged <- column_to_rownames(genes_merged,var="gene_names")
Generate Jaccard Similarity Score for each gene against a specific reference gene
# Pick the reference gene you're going to track
ref_gene <- as.numeric(genes_merged["nifH",])
# gene_count =as.data.frame(rowSums(genes_merged))
# gene_count <- tibble::rownames_to_column(gene_count, "gene_name")
# Calculate Jaccard Similarity
gene_matches <- colSums(t(genes_merged)+ref_gene==2)
gene_totalhits <- colSums(t(genes_merged)+ref_gene>0)
fave_genes <- sort(gene_matches/gene_totalhits, decreasing = TRUE)
Generates a confusion matrix of the top 10 best hits - useful to verify the matrix
# Check the confusion matrix of the top 10 best matched genes
confusion_top10 <- lapply(names(head(fave_genes, 10)), function(gene_name){
table(as.numeric(genes_merged["acnA",]),
as.numeric(genes_merged[gene_name,]))
})
Plot the distribution
library(tidyverse)
# plot the ordered data based on similarity score
x_steps = seq(1, length(fave_genes)-1, 1)
y_steps = fave_genes[-1]
df = data.frame(x=x_steps,y=y_steps)
ggplot(data=df,mapping = aes(x, y))+geom_point()
### Notes from specific genes
nifH is kinda interesting - lots of metal binding genes associated amidst the other nif genes
# Generate the Counts of the Genes
gene_count =as.data.frame(rowSums(genes_merged))
gene_count <- tibble::rownames_to_column(gene_count, "gene_name")
# Exclude any gene with fewer than min_num_genes hits
min_num_genes <- 100
gene_count$min <- ifelse(gene_count$`rowSums(genes_merged)` > min_num_genes, 1, 0)
genes_merged <- tibble::rownames_to_column(genes_merged, "gene_name")
# Filter on main
genes_merged = genes_merged[as.logical(gene_count$min),]
gene_list = genes_merged[,"gene_name"]
genes_merged <- tibble::remove_rownames(genes_merged)
genes_merged <- tibble::column_to_rownames(genes_merged, "gene_name")
# Calculate Jaccard Similarity Matrix
holder <- lapply(gene_list, function(gene_i) {
ref_gene <- as.numeric(genes_merged[gene_i,])
gene_matches <- colSums(t(genes_merged)+ref_gene==2)
gene_totalhits <- colSums(t(genes_merged)+ref_gene>0)
gene_matches/gene_totalhits
})
jaccard_matrix <- do.call(rbind, holder)
row.names(jaccard_matrix) <- gene_list
write.csv(jaccard_matrix, "../../data/jaccard_matrix.csv", row.names=TRUE)
Find the highest values
heatmap(jaccard_matrix, sym=TRUE)
Distance Clustering - just hierachical. There are three general clusters that immediately jump out to me. Guess is that the first cluster is all of the core required genes that almost all microbes have. But what about everything else?
Verify the dendrogram - cophenetic correlation ends up being around 0.78, not amazing but reasonable to work with.
hc = hclust(dist(jaccard_matrix))
plot(hc)
# Is it possible to put data into dendrogram - close to 1 is good, close to 0 is bad
cophen = cophenetic(hc)
verify = cor(dist(jaccard_matrix), cophen)
# Order the Abundance for correct mapping to the plot
hc_labels = as.data.frame(hc$labels)
hc_labels$index = 1:length(hc$labels)
gene_abund_index = merge(hc_labels,gene_count,by.x="hc$labels",by.y="gene_name")
gene_abund_index$order <- hc$order
gene_ordered = arrange(gene_abund_index, index)[hc$order,]
colorfun = colorRamp(c("red","blue"))
cl_members <- cutree(tree = hc, k = 3)
plot(x = hc, labels = row.names(hc), cex = 0.5, hang= -1)
points(x=1:3686,y=numeric(3686),col=rgb(colorfun(gene_count$abundance[hc$order]), maxColorValue = 255))
# rect.hclust(tree = hc, k = 3, which = 1:3, border = 1:3, cluster = cl_members)
pdf("~/sam-associationstudy/output/gene_clusters.pdf",width = 250)
plot(x = hc, labels = row.names(hc), cex = 0.5, hang= -1)
points(x=1:3686,y=numeric(3686),col=rgb(colorfun(gene_ordered$abundance), maxColorValue = 255))
dev.off()
See saved PDF:Blue indicate highly abundant genes while Red are not very abundant genes. The first cluster are all highly expressed, cluster 3 are not highly expressed. The second cluster though has an interesting mix. Need to dive into it more
#
# 1diag(jaccard_matrix) <- 0
corr_genes <- which(jaccard_matrix==max(jaccard_matrix), arr.ind=TRUE)
# Verify
sum(genes_merged["anfD",])
sum(genes_merged["anfG",])
num_occ = sort(rowSums(genes_merged),decreasing=TRUE)
# Find the most abundant gene
which.max(rowSums(genes_merged))
gene_count$abundance = gene_count$`rowSums(genes_merged)`/17019
library(tidyverse)
jaccard_matrix = read.csv("../../data/jaccard_matrix.csv", row.names=1, header=TRUE)
ggplot()+geom_histogram(aes(jaccard_matrix[,"frr"]),bins = 200)
ggplot()+geom_histogram(aes(jaccard_matrix[,"nifH"]),bins = 200)
ggplot()+geom_histogram(aes(jaccard_matrix[,"tssK"]),bins = 200)