Looking at ideas to cluster Ancestry Pro Tools ICWs. Based on discussion in DNAGedcom Facebook group.
Read DNAGedcom ICW file as an edge list. Probably better (more efficient) to do ICW/edge filtering in SQL, but this should work for a prototype.
icw <- read.csv("../Richard_Ernest_Seiter_Ancestry_ICW.csv")
#str(icw)
# This removes blank last line. The more general solution.
icw <- icw[icw$sharedCentimorgans != 0,]
# Change field names so they work with Gephi.
# dput(colnames(icw))
colnames(icw) <- c("source", "matchname", "matchadmin", "target", "icwname", "icwadmin", "Source_DNA", "sharedCentimorgans", "confidence", "numSharedSegments")
# Read match data as well to allow filtering by match size or other characteristics
matches <- read.csv("../Richard_Ernest_Seiter_Ancestry_m.csv")
#matches[duplicated(matches$matchid),]
#str(matches)
# Modify ICW file as follows.
# Convert file to simple edge list
icw.edges <- icw[,c("source", "target", "sharedCentimorgans")]
#str(icw.edges)
# Prune by match size
cM.threshold <- 40
icw.matches <- matches[matches$sharedCM >= cM.threshold, c("matchid", "name", "sharedCM")]
#str(icw.matches)
# Only keep edges with both source and matchid in match list
icw.edges.filtered <- icw.edges[icw.edges$source %in% icw.matches$matchid,]
icw.edges.filtered <- icw.edges.filtered[icw.edges.filtered$target %in% icw.matches$matchid,]
#nrow(icw.edges.filtered)
# Transformation done later
icw.edges.filtered$weight <- icw.edges.filtered$sharedCentimorgans
#str(icw.edges.filtered)
We use a (pseudo to handle zeroes) log2 transform of the match cMs as the weight.
# https://stackoverflow.com/questions/51856706/weighted-graph-from-a-data-frame
icw.g <- graph.data.frame(icw.edges.filtered)
#is_weighted(icw.g)
icw.mat <- as.matrix(get.adjacency(icw.g, attr='weight'))
#str(icw.mat)
#diag(icw.mat) <- 7000 # Use 2x max value for self
diag(icw.mat) <- 4096 # Try changing to 4096 to reduce gap in data values and give integer log2
# https://win-vector.com/2012/03/01/modeling-trick-the-signed-pseudo-logarithm/
# transform with pseudo log, sigma not implemented yet
pseudo_log <- function (val, sigma = 1, base = exp(1)) {
xval <- asinh(val/2)/log(base)
}
icw.mat <- pseudo_log(icw.mat, base=2)
General discussion of heatmaps
https://slowkow.com/notes/pheatmap-tutorial/
Has log transform, but no way to display original values in legend.
# Create a public version
icw.mat.public <- icw.mat
rownames(icw.mat.public) <- 1:nrow(icw.mat.public)
colnames(icw.mat.public) <- 1:ncol(icw.mat.public)
# Create a different color palette which reduces the gap (0 to 6/8cM or threshold) in the data.
gap.colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(256)
data.min <- min(icw.mat.public[icw.mat.public != 0])
data.max <- max(icw.mat.public)
# Add additional colors so we use full display range for actual data range
# Leaving some gap helps see the min values in the heatmap. Trying different values.
gap <- 1
num.added <- round(256 * ((data.min-gap) / (data.max - (data.min-gap))))
gap.fill <- gap.colors[1]
gap.filled.colors <- c(rep(gap.fill, num.added), gap.colors)
pheatmap(icw.mat.public, color=gap.filled.colors, main=paste("Heatmap of Matches ", cM.threshold, "cM and Over", sep=""))
heatmaply(icw.mat.public, color = gap.filled.colors, hclust_method="complete",
grid_gap=0.3, # grid_gap recommended even though it does not work well with light colored cells
branches_lwd = 0.2, subplot_widths = c(0.95, 0.05), subplot_heights = c(0.05, 0.95),
#colorbar_thickness = 15, # plotly only
#row_dend_left = TRUE,
main=paste("Heatmap of Matches ", cM.threshold, "cM and Over", sep=""))
Save as an interactive HTML page.
heatmaply(icw.mat.public, file = "icw_public_heatmaply_plot.html")
browseURL("icw_public_heatmaply_plot.html")
File Initially created: Friday, October 24, 2024
Public version created: Sunday, October 27, 2024
File modified: Monday, October 28, 2024
File knitted: Mon Oct 28 11:00:01 2024