28a_Change_findTAD.algorithm

Load data

TopDom df

Window size = 3L

Change Algorithm - findTAD

match_dis_TAD function

Function to find TAD from a starting point by matching distance between starting point and local minimums

## Identify TAD by matching distance between starting point and local minimums
match_dis_TAD <- function(smooth_line, probability_table, starting_right_TAD.pos, region_segment_mag = 0.2, error_prop, threhold, low_prob_max.rm = TRUE){
  
  ### Find all local minimum position of smooth line
  local_minimum_pos = find_peaks(-smooth_line$y[smooth_line$time %% 1 == 0])
  local_minimum = probability_table$Position[local_minimum_pos]
  ### Find all local maximum position of smooth line
  local_maximum_pos = find_peaks(smooth_line$y[smooth_line$time %% 1 == 0])
  local_maximum = probability_table$Position[local_maximum_pos]
  ### Vector of local min and local max
  local_min_max_pos = c(local_minimum_pos, local_maximum_pos)
  local_min_max_prob.df = smooth_line[local_min_max_pos, ]
  
  ### Identify low and high regions for possible boundary and non-boundary
  local_min_max_density.list = density(local_min_max_prob.df$y)
  # plot(local_min_max_density.list)
  local_min_max_sd = sd(local_min_max_prob.df$y)
  local_min_max_mu = mean(local_min_max_prob.df$y)
  lower_bound.y = local_min_max_mu - region_segment_mag*local_min_max_sd
  upper_bound.y = local_min_max_mu + region_segment_mag*local_min_max_sd
  # Identify low and high regions
  low.region_pos = local_min_max_pos[which(local_min_max_prob.df$y < lower_bound.y)]
  upper.region_pos = local_min_max_pos[which(local_min_max_prob.df$y > upper_bound.y)]
  
  if(low_prob_max.rm == TRUE){
    ## Only keep maximum boundaries abve 0.5
    local_maximum_pos = local_maximum_pos[smooth_line$y[smooth_line$time %% 1 == 0][local_maximum_pos]>0.5]
    local_maximum = probability_table$Position[local_maximum_pos]
  }
  
  ## At starting point of right TAD, the local minimum points on the left side
  local_maximum = upper.region_pos; local_minimum = low.region_pos
  starting_right_TAD = local_maximum[starting_right_TAD.pos]
  
  ## Find boundaries with all local minimum
  left_TAD = center_TAD = right_TAD = table =  NULL
  for(i in 1:length(local_minimum)){
    ## distance between starting l.max and consecutive l.min
    dis = local_minimum[i] - starting_right_TAD
    ## interval for distance between right.max and min
    interval.dis = c(round(local_minimum[i] + dis - error_prop*abs(dis) ) : 
                       round(local_minimum[i] + dis + error_prop*abs(dis) ) )
    ## identify any right max in above interval and avg[d(left.max-min), d(right.max-min)] > threshold
    possible.tad = local_maximum[local_maximum %in% interval.dis][1]
    if (!is.na(possible.tad)){
      ## Create output data frame
      if(possible.tad < starting_right_TAD){
        left_TAD <- c(left_TAD, possible.tad)
        right_TAD <- c(right_TAD, starting_right_TAD)
        center_TAD <- c(center_TAD, local_minimum[i]) }
      else{
        left_TAD <- c(left_TAD, starting_right_TAD)
        right_TAD <- c(right_TAD, possible.tad)
        center_TAD <- c(center_TAD, local_minimum[i]) }
    }  
  }  
  table <- data.frame(left_TAD , center_TAD, right_TAD)
  if(nrow(table)>0){
    table <- threhold_boundary(probability_table = probability_table, all_possible_boundaries = table, threhold = threhold)
  }
  
  return(table)
  
}

TAD_identify function

Function to identify all TAD from all local min or max that is belong to high probability region.

### Function TAD Identify (All Step combine)
TAD_identify <- function( smooth_line, probability_table, region_segment_mag = 0.2,
                          parent.error_prop =0.1, parent.threhold = 0.4,
                          sub.error_prop =0.2, sub.threhold = 0.3, low_prob_max.rm = TRUE ){
  
  ## 1. Identify potential min and max
  ### Find all local minimum position of smooth line
  local_minimum_pos = find_peaks(-smooth_line$y[smooth_line$time %% 1 == 0])
  local_minimum = probability_table$Position[local_minimum_pos]
  ### Find all local maximum position of smooth line
  local_maximum_pos = find_peaks(smooth_line$y[smooth_line$time %% 1 == 0])
  local_maximum = probability_table$Position[local_maximum_pos]
  ### Vector of local min and local max
  local_min_max_pos = c(local_minimum_pos, local_maximum_pos)
  local_min_max_prob.df = smooth_line[local_min_max_pos, ]
  
  ### Identify low and high regions for possible boundary and non-boundary
  local_min_max_density.list = density(local_min_max_prob.df$y)
  # plot(local_min_max_density.list)
  local_min_max_sd = sd(local_min_max_prob.df$y)
  local_min_max_mu = mean(local_min_max_prob.df$y)
  lower_bound.y = local_min_max_mu - region_segment_mag*local_min_max_sd
  upper_bound.y = local_min_max_mu + region_segment_mag*local_min_max_sd
  # Identify low and high regions
  low.region_pos = local_min_max_pos[which(local_min_max_prob.df$y < lower_bound.y)]
  upper.region_pos = local_min_max_pos[which(local_min_max_prob.df$y > upper_bound.y)]
  
  
  if(low_prob_max.rm == TRUE){
    ## Only keep maximum boundaries abve 0.5
    local_maximum_pos = local_maximum_pos[smooth_line$y[smooth_line$time %% 1 == 0][local_maximum_pos]>0.5]
    local_maximum = probability_table$Position[local_maximum_pos]
  }
  
  ### Rank the maximimum by its probability  
  max.rank = order(smooth_line$y[smooth_line$time %% 1 == 0][upper.region_pos],
                   decreasing = T)
  order.max = local_maximum[max.rank]
  
  
  
  
  ## 2. Identify  TAD from highest to lowest probability - maxima  
  ### First big TAD from starting position 
  first_starting_list = NULL; tad.table = NULL
  if(length(local_minimum) > 0 & length(local_maximum) > 0){
    for(i in 1: length(max.rank)){  #Identify first starting point has TAD boundaries
      first_starting.pos = max.rank[i]
      tad.table.element <- match_dis_TAD(smooth_line = smooth_line, region_segment_mag = region_segment_mag,  probability_table = probability_table, starting_right_TAD.pos = first_starting.pos, error_prop = parent.error_prop, threhold = parent.threhold, low_prob_max.rm = low_prob_max.rm)
      tad.table <- unique(rbind(tad.table, tad.table.element))
    }
  } else {
    tad.table = NULL
  }
  
    return(tad.table)
}

filter_tad_round1

Function to filter TAD with tad size threshold, minimum of probability difference (t) threshold

### Function to filter TAD 

filter_tad_round1 <- function(all.tads, original_probability_vector, t.threshold, size.threshold){
  
  ## Prepare data
  outcome.vector = original_probability_vector$y
  probability_table <- data.frame(Position = c(1:length(outcome.vector)), Probability = outcome.vector)
  
  ## Parameter report
  all.tads.report = parameter_report(all.tads = all.tads, probability_table = probability_table, probability_vector = original_probability_vector )
  
  ## Filter tad size threshold
  rm.tad.size.pos = which(all.tads.report$TAD_size > size.threshold)
  ## Filter t threshold
  rm.t.pos = which(all.tads.report$minProb_diff < t.threshold)
  
  ## Filter TADs those same boundaries but different non-boundary -> choose one with smallest error from symmetric
  left.right.only = all.tads[ , c('left_TAD', 'right_TAD')]
  duplicated_tad <- left.right.only[duplicated(left.right.only), ]
  rm.dup.tads_pos = NULL
  for(i in 1:nrow(duplicated_tad)){
    tads.rows_dup_pos <- which(left.right.only$left_TAD == duplicated_tad$left_TAD[i] & left.right.only$right_TAD == duplicated_tad$right_TAD[i], )
    chosen.tad_pos = tads.rows_dup_pos[which(order(all.tads.report[tads.rows_dup_pos, 'error_symetric']) == 1)]
    rm.tad_pos = tads.rows_dup_pos[!tads.rows_dup_pos %in% chosen.tad_pos]
    rm.dup.tads_pos = c(rm.dup.tads_pos, rm.tad_pos)
  }
  
  
  ## Final TAD to plot
  rm.rad.pos = unique(c(rm.dup.tads_pos, rm.t.pos, rm.tad.size.pos))
  final.tad <- all.tads[-rm.rad.pos, ]
  
  return(final.tad)
}

Test updated Algorithm

H = 300, e = 0.13, t = 4

Finding all possible TADs

Study parameter distribution

t threshold

## t threshold
plot(density(all.tads.report$minProb_diff), main = 'Density of t')

summary(all.tads.report$minProb_diff)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.333   7.111  10.444  11.237  15.556  25.111

tad size

## tad size 
plot(density(all.tads.report$TAD_size), main = 'Density of TAD size')

summary(all.tads.report$TAD_size)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     6.0   106.0   178.0   188.0   266.2   409.0

TAD filter round 1

Filter function

tad size < 70

## try on thresholds: t=0, e<0.1, size <70, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 70)

DT::datatable(filtered.tad)

plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size <70 \n e = 0.1, t= 7'), boundary.line = T  ) # position is index

tad size < 60

## try on thresholds: t=0, e<0.1, size <60, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 60)

DT::datatable(filtered.tad)

plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size <60 \n e = 0.1, t= 7'), boundary.line = T  ) # position is index

tad size < 50

## try on thresholds: t=0, e<0.1, size <60, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 50)

DT::datatable(filtered.tad)

plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size < 50 \n e = 0.1, t= 7'), boundary.line = T  ) # position is index