Window size = 3L
Function to find TAD from a starting point by matching distance between starting point and local minimums
## Identify TAD by matching distance between starting point and local minimums
match_dis_TAD <- function(smooth_line, probability_table, starting_right_TAD.pos, region_segment_mag = 0.2, error_prop, threhold, low_prob_max.rm = TRUE){
### Find all local minimum position of smooth line
local_minimum_pos = find_peaks(-smooth_line$y[smooth_line$time %% 1 == 0])
local_minimum = probability_table$Position[local_minimum_pos]
### Find all local maximum position of smooth line
local_maximum_pos = find_peaks(smooth_line$y[smooth_line$time %% 1 == 0])
local_maximum = probability_table$Position[local_maximum_pos]
### Vector of local min and local max
local_min_max_pos = c(local_minimum_pos, local_maximum_pos)
local_min_max_prob.df = smooth_line[local_min_max_pos, ]
### Identify low and high regions for possible boundary and non-boundary
local_min_max_density.list = density(local_min_max_prob.df$y)
# plot(local_min_max_density.list)
local_min_max_sd = sd(local_min_max_prob.df$y)
local_min_max_mu = mean(local_min_max_prob.df$y)
lower_bound.y = local_min_max_mu - region_segment_mag*local_min_max_sd
upper_bound.y = local_min_max_mu + region_segment_mag*local_min_max_sd
# Identify low and high regions
low.region_pos = local_min_max_pos[which(local_min_max_prob.df$y < lower_bound.y)]
upper.region_pos = local_min_max_pos[which(local_min_max_prob.df$y > upper_bound.y)]
if(low_prob_max.rm == TRUE){
## Only keep maximum boundaries abve 0.5
local_maximum_pos = local_maximum_pos[smooth_line$y[smooth_line$time %% 1 == 0][local_maximum_pos]>0.5]
local_maximum = probability_table$Position[local_maximum_pos]
}
## At starting point of right TAD, the local minimum points on the left side
local_maximum = upper.region_pos; local_minimum = low.region_pos
starting_right_TAD = local_maximum[starting_right_TAD.pos]
## Find boundaries with all local minimum
left_TAD = center_TAD = right_TAD = table = NULL
for(i in 1:length(local_minimum)){
## distance between starting l.max and consecutive l.min
dis = local_minimum[i] - starting_right_TAD
## interval for distance between right.max and min
interval.dis = c(round(local_minimum[i] + dis - error_prop*abs(dis) ) :
round(local_minimum[i] + dis + error_prop*abs(dis) ) )
## identify any right max in above interval and avg[d(left.max-min), d(right.max-min)] > threshold
possible.tad = local_maximum[local_maximum %in% interval.dis][1]
if (!is.na(possible.tad)){
## Create output data frame
if(possible.tad < starting_right_TAD){
left_TAD <- c(left_TAD, possible.tad)
right_TAD <- c(right_TAD, starting_right_TAD)
center_TAD <- c(center_TAD, local_minimum[i]) }
else{
left_TAD <- c(left_TAD, starting_right_TAD)
right_TAD <- c(right_TAD, possible.tad)
center_TAD <- c(center_TAD, local_minimum[i]) }
}
}
table <- data.frame(left_TAD , center_TAD, right_TAD)
if(nrow(table)>0){
table <- threhold_boundary(probability_table = probability_table, all_possible_boundaries = table, threhold = threhold)
}
return(table)
}
Function to identify all TAD from all local min or max that is belong to high probability region.
### Function TAD Identify (All Step combine)
TAD_identify <- function( smooth_line, probability_table, region_segment_mag = 0.2,
parent.error_prop =0.1, parent.threhold = 0.4,
sub.error_prop =0.2, sub.threhold = 0.3, low_prob_max.rm = TRUE ){
## 1. Identify potential min and max
### Find all local minimum position of smooth line
local_minimum_pos = find_peaks(-smooth_line$y[smooth_line$time %% 1 == 0])
local_minimum = probability_table$Position[local_minimum_pos]
### Find all local maximum position of smooth line
local_maximum_pos = find_peaks(smooth_line$y[smooth_line$time %% 1 == 0])
local_maximum = probability_table$Position[local_maximum_pos]
### Vector of local min and local max
local_min_max_pos = c(local_minimum_pos, local_maximum_pos)
local_min_max_prob.df = smooth_line[local_min_max_pos, ]
### Identify low and high regions for possible boundary and non-boundary
local_min_max_density.list = density(local_min_max_prob.df$y)
# plot(local_min_max_density.list)
local_min_max_sd = sd(local_min_max_prob.df$y)
local_min_max_mu = mean(local_min_max_prob.df$y)
lower_bound.y = local_min_max_mu - region_segment_mag*local_min_max_sd
upper_bound.y = local_min_max_mu + region_segment_mag*local_min_max_sd
# Identify low and high regions
low.region_pos = local_min_max_pos[which(local_min_max_prob.df$y < lower_bound.y)]
upper.region_pos = local_min_max_pos[which(local_min_max_prob.df$y > upper_bound.y)]
if(low_prob_max.rm == TRUE){
## Only keep maximum boundaries abve 0.5
local_maximum_pos = local_maximum_pos[smooth_line$y[smooth_line$time %% 1 == 0][local_maximum_pos]>0.5]
local_maximum = probability_table$Position[local_maximum_pos]
}
### Rank the maximimum by its probability
max.rank = order(smooth_line$y[smooth_line$time %% 1 == 0][upper.region_pos],
decreasing = T)
order.max = local_maximum[max.rank]
## 2. Identify TAD from highest to lowest probability - maxima
### First big TAD from starting position
first_starting_list = NULL; tad.table = NULL
if(length(local_minimum) > 0 & length(local_maximum) > 0){
for(i in 1: length(max.rank)){ #Identify first starting point has TAD boundaries
first_starting.pos = max.rank[i]
tad.table.element <- match_dis_TAD(smooth_line = smooth_line, region_segment_mag = region_segment_mag, probability_table = probability_table, starting_right_TAD.pos = first_starting.pos, error_prop = parent.error_prop, threhold = parent.threhold, low_prob_max.rm = low_prob_max.rm)
tad.table <- unique(rbind(tad.table, tad.table.element))
}
} else {
tad.table = NULL
}
return(tad.table)
}
Function to filter TAD with tad size threshold, minimum of probability difference (t) threshold
### Function to filter TAD
filter_tad_round1 <- function(all.tads, original_probability_vector, t.threshold, size.threshold){
## Prepare data
outcome.vector = original_probability_vector$y
probability_table <- data.frame(Position = c(1:length(outcome.vector)), Probability = outcome.vector)
## Parameter report
all.tads.report = parameter_report(all.tads = all.tads, probability_table = probability_table, probability_vector = original_probability_vector )
## Filter tad size threshold
rm.tad.size.pos = which(all.tads.report$TAD_size > size.threshold)
## Filter t threshold
rm.t.pos = which(all.tads.report$minProb_diff < t.threshold)
## Filter TADs those same boundaries but different non-boundary -> choose one with smallest error from symmetric
left.right.only = all.tads[ , c('left_TAD', 'right_TAD')]
duplicated_tad <- left.right.only[duplicated(left.right.only), ]
rm.dup.tads_pos = NULL
for(i in 1:nrow(duplicated_tad)){
tads.rows_dup_pos <- which(left.right.only$left_TAD == duplicated_tad$left_TAD[i] & left.right.only$right_TAD == duplicated_tad$right_TAD[i], )
chosen.tad_pos = tads.rows_dup_pos[which(order(all.tads.report[tads.rows_dup_pos, 'error_symetric']) == 1)]
rm.tad_pos = tads.rows_dup_pos[!tads.rows_dup_pos %in% chosen.tad_pos]
rm.dup.tads_pos = c(rm.dup.tads_pos, rm.tad_pos)
}
## Final TAD to plot
rm.rad.pos = unique(c(rm.dup.tads_pos, rm.t.pos, rm.tad.size.pos))
final.tad <- all.tads[-rm.rad.pos, ]
return(final.tad)
}
H = 300, e = 0.13, t = 4
## t threshold
plot(density(all.tads.report$minProb_diff), main = 'Density of t')
summary(all.tads.report$minProb_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.333 7.111 10.444 11.237 15.556 25.111
## tad size
plot(density(all.tads.report$TAD_size), main = 'Density of TAD size')
summary(all.tads.report$TAD_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 106.0 178.0 188.0 266.2 409.0
Filter function
## try on thresholds: t=0, e<0.1, size <70, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 70)
DT::datatable(filtered.tad)
plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size <70 \n e = 0.1, t= 7'), boundary.line = T ) # position is index
## try on thresholds: t=0, e<0.1, size <60, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 60)
DT::datatable(filtered.tad)
plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size <60 \n e = 0.1, t= 7'), boundary.line = T ) # position is index
## try on thresholds: t=0, e<0.1, size <60, and duplicated tad
filtered.tad = filter_tad_round1(all.tads = all.tads2, original_probability_vector = flipped.binSignal.df, t.threshold = 7, size.threshold = 50)
DT::datatable(filtered.tad)
plot.boundary(smooth_line1, boundary_table =filtered.tad, probability_table = probability_table, parameters = paste0('Flipped TopDom Smooth - Filter size < 50 \n e = 0.1, t= 7'), boundary.line = T ) # position is index