Get TCF7L2 probes target regions
TCF7L2_targets <- read_csv("~/Downloads/Targets/getTFtargets_genomic_coordinates_mapped_to_hg19.hyper_in_escc_vs_esad_hg38_TCF7L2.csv",
col_types = readr::cols())
DT::datatable(TCF7L2_targets,options = list(scrollX = TRUE),filter = 'top')
probes <- makeGRangesFromDataFrame(TCF7L2_targets,
keep.extra.columns = T,
seqnames.field = "probe_hg19_seqnames",
start.field = "probe_hg19_start",
end.field = "probe_hg19_end",
strand.field = "probe_hg19_strand"
)
probes <- unique(probes)
probes
## GRanges object with 136 ranges and 19 metadata columns:
## seqnames ranges strand | GeneID Probe
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr6 45441433-45441434 * | ENSG00000001561 cg04757168
## [2] chr4 10447440-10447441 * | ENSG00000002587 cg11578055
## [3] chr4 11652505-11652506 * | ENSG00000002587 cg23971987
## [4] chr11 68174286-68174287 * | ENSG00000006534 cg10687219
## [5] chr19 46385115-46385116 * | ENSG00000008438 cg12067547
## ... ... ... ... . ... ...
## [132] chr15 83790129-83790130 * | ENSG00000259767 cg18593236
## [133] chr15 84168459-84168460 * | ENSG00000259767 cg03752741
## [134] chr7 41721644-41721645 * | ENSG00000261019 cg17544672
## [135] chr7 42195875-42195876 * | ENSG00000261019 cg16848937
## [136] chr7 42303654-42303655 * | ENSG00000261019 cg14434601
## Symbol Sides Raw.p FDR
## <character> <character> <numeric> <numeric>
## [1] ENPP4 R5 1.27012155256609e-21 1.93977507032146e-20
## [2] HS3ST1 R7 1.23758168414e-10 3.68138906358986e-10
## [3] HS3ST1 L3 1.23758168414e-10 3.68138906358986e-10
## [4] ALDH3B1 L10 3.4750686111754e-20 4.00485667858466e-19
## [5] PGLYRP1 R7 1.51558755453394e-05 2.16751520456186e-05
## ... ... ... ... ...
## [132] AC022558.1 L7 0.00058118811574975 0.000619966993433223
## [133] AC022558.1 L8 0.00058118811574975 0.000619966993433223
## [134] AC010132.4 R8 0.000151242555954231 0.000180329508955901
## [135] AC010132.4 R4 0.000151242555954231 0.000180329508955901
## [136] AC010132.4 R5 0.000151242555954231 0.000180329508955901
## distNearestTSS DMC_analysis_pvalue DMC_analysis_escc_Minus_esad
## <integer> <numeric> <numeric>
## [1] 656295 1.54013249119762e-27 0.342492029434533
## [2] 982780 4.83187113109079e-19 0.349687571993035
## [3] 221115 1.07408894369233e-22 0.327702012987323
## [4] 384974 6.12602939805421e-18 0.344646832765288
## [5] 141207 2.39777788986254e-48 0.344608475216899
## ... ... ... ...
## [132] 97040 3.29795962417823e-21 0.302649568499366
## [133] 475370 7.07335576164312e-19 0.303159293681504
## [134] 1220592 1.00981370327399e-19 0.327279634433238
## [135] 746362 6.27960586446515e-24 0.329610231319257
## [136] 638583 2.58107943402524e-27 0.32008766236016
## DMC_analysis_adjust.p ensembl_gene_id original_ensembl_gene_id
## <numeric> <character> <character>
## [1] 3.29196455545598e-25 ENSG00000001561 ENSG00000001561.6
## [2] 3.25783229365739e-17 ENSG00000002587 ENSG00000002587.8
## [3] 1.20247813228428e-20 ENSG00000002587 ENSG00000002587.8
## [4] 3.57956084645453e-16 ENSG00000006534 ENSG00000006534.14
## [5] 1.50581593282363e-44 ENSG00000008438 ENSG00000008438.4
## ... ... ... ...
## [132] 3.00164398341097e-19 ENSG00000259767 ENSG00000259767.1
## [133] 4.66887503103732e-17 ENSG00000259767 ENSG00000259767.1
## [134] 7.46916662935934e-18 ENSG00000261019 ENSG00000261019.1
## [135] 8.29820341694918e-22 ENSG00000261019 ENSG00000261019.1
## [136] 5.44632538941892e-25 ENSG00000261019 ENSG00000261019.1
## TF
## <character>
## [1] FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;ZNF620;ZSCAN16;ZNF774;ZNF468;ZNF33A;ZNF816;ZNF619;ZNF765;ZNF799;ZNF763;ZNF823;ZNF124;SPDEF;ELF3
## [2] GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;SPDEF;ELF3;IRF8;SOX9
## [3] GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;SOX9
## [4] HNF4A;HNF4G;NR2F2;NR2C1;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3;NR3C2
## [5] GATA6;GATA4;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;SPDEF;ELF3;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;NFATC2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;ZSCAN16;ZNF816
## ... ...
## [132] NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;HNF4A;HNF4G;NR2F2;NR2C1;TCF7L2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NFATC2;SPDEF;ELF3;ZSCAN16;ZNF816;IRF8
## [133] HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2
## [134] HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;GATA6;GATA4;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3
## [135] HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;NFATC2;SOX9;NR3C2
## [136] GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;HNF4A;HNF4G;NR2F2;NR2C1;TCF7L2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1
## probe_hg19_width gene_hg19_seqnames gene_hg19_start gene_hg19_end
## <integer> <character> <integer> <integer>
## [1] 2 chr6 46097730 46114436
## [2] 2 chr4 11394774 11431389
## [3] 2 chr4 11394774 11431389
## [4] 2 chr11 67789294 67795378
## [5] 2 chr19 46522411 46526323
## ... ... ... ... ...
## [132] 2 chr15 83690988 83693088
## [133] 2 chr15 83690988 83693088
## [134] 2 chr7 42940871 42942238
## [135] 2 chr7 42940871 42942238
## [136] 2 chr7 42940871 42942238
## gene_hg19_width gene_hg19_strand
## <integer> <character>
## [1] 16707 +
## [2] 36616 -
## [3] 36616 -
## [4] 6085 +
## [5] 3913 -
## ... ... ...
## [132] 2101 -
## [133] 2101 -
## [134] 1368 -
## [135] 1368 -
## [136] 1368 -
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
Read narrow TCF7L2 peaks files
colnames(peaks) <- c("chrom","Start","End","Name","Width","Score", "fold_enrichment","-log10(pValue)","-log10(qValue)","peak")
DT::datatable(peaks,options = list(scrollX = TRUE),filter = 'top')
peaks$chrom <- paste0("chr",peaks$chrom)
peaks.gr <- makeGRangesFromDataFrame(peaks,keep.extra.columns = T)
peaks.gr
## GRanges object with 28740 ranges and 6 metadata columns:
## seqnames ranges strand | Name Score
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 29198-29441 * | OE33_peak_1 .
## [2] chr1 713825-714329 * | OE33_peak_2 .
## [3] chr1 762669-763049 * | OE33_peak_3 .
## [4] chr1 839595-840410 * | OE33_peak_4 .
## [5] chr1 859188-859486 * | OE33_peak_5 .
## ... ... ... ... . ... ...
## [28736] chrY 13460946-13461195 * | OE33_peak_28736 .
## [28737] chrY 13485442-13485967 * | OE33_peak_28737 .
## [28738] chrY 58827136-58827588 * | OE33_peak_28738 .
## [28739] chrY 58883594-58884000 * | OE33_peak_28739 .
## [28740] chrY 58905133-58905450 * | OE33_peak_28740 .
## fold_enrichment -log10(pValue) -log10(qValue) peak
## <numeric> <numeric> <numeric> <integer>
## [1] 4.15958 7.48895 4.98834 122
## [2] 2.43645 4.85955 2.5425 215
## [3] 5.53781 10.38868 7.74738 198
## [4] 5.3282 14.39616 11.61202 461
## [5] 6.29061 10.93385 8.26844 116
## ... ... ... ... ...
## [28736] 2.42299 10.26745 7.63054 121
## [28737] 4.66213 19.55059 16.63674 177
## [28738] 26.84596 104.7633 100.97492 219
## [28739] 5.08153 7.6158 5.10279 185
## [28740] 10.16306 21.14335 18.19341 166
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
ChIPseeker::covplot(peaks.gr, weightCol="fold_enrichment")

Verify distance between probes and peaks
probes_extended_1kb <- resize(unique(probes),width = 2001,fix = "center")
probes_extended_1kb[,1:2]
## GRanges object with 136 ranges and 2 metadata columns:
## seqnames ranges strand | GeneID Probe
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr6 45440433-45442433 * | ENSG00000001561 cg04757168
## [2] chr4 10446440-10448440 * | ENSG00000002587 cg11578055
## [3] chr4 11651505-11653505 * | ENSG00000002587 cg23971987
## [4] chr11 68173286-68175286 * | ENSG00000006534 cg10687219
## [5] chr19 46384115-46386115 * | ENSG00000008438 cg12067547
## ... ... ... ... . ... ...
## [132] chr15 83789129-83791129 * | ENSG00000259767 cg18593236
## [133] chr15 84167459-84169459 * | ENSG00000259767 cg03752741
## [134] chr7 41720644-41722644 * | ENSG00000261019 cg17544672
## [135] chr7 42194875-42196875 * | ENSG00000261019 cg16848937
## [136] chr7 42302654-42304654 * | ENSG00000261019 cg14434601
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
unique(width(probes_extended_1kb[,1:2]))
## [1] 2001
x <- getTagMatrix(peaks.gr, windows=probes_extended_1kb)
p <- plotAvgProf(x, xlim=c(-1000, 1000),
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
p <- p + scale_x_continuous(breaks=c(-1000,-500,0,500,1000),labels=c(-1000,-500,"Paired distal CpG",500,1000))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
p

probes_extended_5kb <- resize(probes,width = 10001,fix = "center")
probes_extended_5kb[,1:2]
## GRanges object with 136 ranges and 2 metadata columns:
## seqnames ranges strand | GeneID Probe
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr6 45436433-45446433 * | ENSG00000001561 cg04757168
## [2] chr4 10442440-10452440 * | ENSG00000002587 cg11578055
## [3] chr4 11647505-11657505 * | ENSG00000002587 cg23971987
## [4] chr11 68169286-68179286 * | ENSG00000006534 cg10687219
## [5] chr19 46380115-46390115 * | ENSG00000008438 cg12067547
## ... ... ... ... . ... ...
## [132] chr15 83785129-83795129 * | ENSG00000259767 cg18593236
## [133] chr15 84163459-84173459 * | ENSG00000259767 cg03752741
## [134] chr7 41716644-41726644 * | ENSG00000261019 cg17544672
## [135] chr7 42190875-42200875 * | ENSG00000261019 cg16848937
## [136] chr7 42298654-42308654 * | ENSG00000261019 cg14434601
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
unique(width(probes_extended_5kb[,1:2]))
## [1] 10001
x <- getTagMatrix(peaks.gr, windows=probes_extended_5kb)
p <- plotAvgProf(x, xlim=c(-5000, 5000),
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
p <- p + scale_x_continuous(breaks=c(-5000,-2000,0,2000,5000),labels=c(-5000,-2000,"Paired distal CpG",2000,5000))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
p

Verify overlap between peaks and linked probes
hits <- suppressWarnings(findOverlaps(peaks.gr,probes))
probes.hit <- subjectHits(hits)
length(unique(probes[probes.hit])) # unique regions
## [1] 27
Verify overlap between peaks and distall probes
distal.probes <- ELMER::get.feature.probe(feature = NULL,genome = "hg19", met.platform = "450K")
## Downloading transcripts information. Using: Homo sapiens genes (GRCh37.p13)
## Returning distal probes: 163576
hits.distal <- suppressWarnings(findOverlaps(peaks.gr,distal.probes))
distal.probes.hit <- subjectHits(hits.distal)
Calculate enrichement
x <- matrix(c(
length(unique(probes[probes.hit])), # all linked probes with hits
(length(unique(probes)) - length(unique(probes[probes.hit]))), # all linked probes with no hits
length(unique(distal.probes.hit)), # all distal probes with hits (4216)
(length(distal.probes)) - length(unique(distal.probes.hit))), # all distal probes with no hits
nrow = 2,
dimnames = list(c("Overlap", "No overlap"),
c("Unique Linked probes", "Distal probes"))
)
x <- t(x)
x
## Overlap No overlap
## Unique Linked probes 27 109
## Distal probes 4256 159320
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 2.565e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 5.840886 14.257038
## sample estimates:
## odds ratio
## 9.272322
Regions overlapped
DT::datatable(tibble::as_data_frame(probes[subjectHits(hits)]),options = list(scrollX = TRUE),filter = 'top')
DT::datatable(tibble::as_data_frame(peaks.gr[queryHits(hits)]),options = list(scrollX = TRUE),filter = 'top')
Exteding overlap regions (+-1Kb)
extend.hits <- suppressWarnings(findOverlaps(peaks.gr,probes_extended_1kb))
# How many unique probes overlap the peaks (considering +-1KB)
length(unique(probes[subjectHits(extend.hits)]))
## [1] 42
Distance
y <- distanceToNearest(probes,peaks.gr,ignore.strand=T)
qplot(mcols(y)$distance,geom="histogram") + ggthemes::theme_gdocs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(log(mcols(y)$distance + 1),geom="histogram") + ggthemes::theme_gdocs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
