1 Overview

Looking at ideas to cluster Ancestry Pro Tools ICWs. Based on discussion in DNAGedcom Facebook group.

2 Read and Process ICW Data

Read DNAGedcom ICW file as an edge list. Probably better (more efficient) to do ICW/edge filtering in SQL, but this should work for a prototype.

icw <- read.csv("../Richard_Ernest_Seiter_Ancestry_ICW.csv")
#str(icw)

# This removes blank last line.  The more general solution.
icw <- icw[icw$sharedCentimorgans != 0,]

# Change field names so they work with Gephi.
# dput(colnames(icw))
colnames(icw) <- c("source", "matchname", "matchadmin", "target", "icwname", "icwadmin", "Source_DNA", "sharedCentimorgans", "confidence", "numSharedSegments")

# Read match data as well to allow filtering by match size or other characteristics
matches <- read.csv("../Richard_Ernest_Seiter_Ancestry_m.csv")
#matches[duplicated(matches$matchid),]
#str(matches)

# Modify ICW file as follows.

# Convert file to simple edge list
icw.edges <- icw[,c("source", "target", "sharedCentimorgans")]
#str(icw.edges)

# Prune by match size
cM.threshold <- 40

icw.matches <- matches[matches$sharedCM >= cM.threshold, c("matchid", "name", "sharedCM")]
#str(icw.matches)

# Only keep edges with both source and matchid in match list
icw.edges.filtered <- icw.edges[icw.edges$source %in% icw.matches$matchid,]
icw.edges.filtered <- icw.edges.filtered[icw.edges.filtered$target %in% icw.matches$matchid,]
#nrow(icw.edges.filtered)

# Transformation done later
icw.edges.filtered$weight <- icw.edges.filtered$sharedCentimorgans
#str(icw.edges.filtered)

2.1 Convert Edge List to Adjacency Matrix

See https://stackoverflow.com/questions/16584948/how-to-create-weighted-adjacency-list-matrix-from-edge-list

We use a (pseudo to handle zeroes) log2 transform of the match cMs as the weight.

# https://stackoverflow.com/questions/51856706/weighted-graph-from-a-data-frame
icw.g <- graph.data.frame(icw.edges.filtered)
#is_weighted(icw.g)
icw.mat <- as.matrix(get.adjacency(icw.g, attr='weight'))
#str(icw.mat)

#diag(icw.mat) <- 7000 # Use 2x max value for self
diag(icw.mat) <- 4096 # Try changing to 4096 to reduce gap in data values and give integer log2

# https://win-vector.com/2012/03/01/modeling-trick-the-signed-pseudo-logarithm/
# transform with pseudo log, sigma not implemented yet
pseudo_log <- function (val, sigma = 1, base = exp(1)) {
  xval <- asinh(val/2)/log(base)
}
icw.mat <- pseudo_log(icw.mat, base=2)

3 Heatmap Visualizations

General discussion of heatmaps

3.1 Pheatmap Version

https://slowkow.com/notes/pheatmap-tutorial/
Has log transform, but no way to display original values in legend.

# Create a public version
icw.mat.public <- icw.mat
rownames(icw.mat.public) <- 1:nrow(icw.mat.public)
colnames(icw.mat.public) <- 1:ncol(icw.mat.public)

# Create a different color palette which reduces the gap (0 to 6/8cM or threshold) in the data.
gap.colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(256)
data.min <- min(icw.mat.public[icw.mat.public != 0])
data.max <- max(icw.mat.public)
# Add additional colors so we use full display range for actual data range
# Leaving some gap helps see the min values in the heatmap.  Trying different values.
gap <- 1
num.added <- round(256 * ((data.min-gap) / (data.max - (data.min-gap))))
gap.fill <- gap.colors[1]
gap.filled.colors <- c(rep(gap.fill, num.added), gap.colors)

pheatmap(icw.mat.public, color=gap.filled.colors, main=paste("Heatmap of Matches ", cM.threshold, "cM and Over", sep=""))

3.2 Heatmaply Version

heatmaply(icw.mat.public, color = gap.filled.colors, hclust_method="complete",
          grid_gap=0.3, # grid_gap recommended even though it does not work well with light colored cells
          branches_lwd = 0.2, subplot_widths = c(0.95, 0.05), subplot_heights = c(0.05, 0.95),
          #colorbar_thickness = 15, # plotly only
          #row_dend_left = TRUE,
          main=paste("Heatmap of Matches ", cM.threshold, "cM and Over", sep=""))

Save as an interactive HTML page.

heatmaply(icw.mat.public, file = "icw_public_heatmaply_plot.html")
browseURL("icw_public_heatmaply_plot.html")

3.3 File History

File Initially created: Friday, October 24, 2024
Public version created: Sunday, October 27, 2024
File modified: Monday, October 28, 2024

File knitted: Mon Oct 28 11:00:01 2024