Get TCF7L2 probes target regions

TCF7L2_targets <- read_csv("~/Downloads/Targets/getTFtargets_genomic_coordinates_mapped_to_hg19.hyper_in_escc_vs_esad_hg38_TCF7L2.csv",
                           col_types = readr::cols())
DT::datatable(TCF7L2_targets,options = list(scrollX = TRUE),filter = 'top')
probes <- makeGRangesFromDataFrame(TCF7L2_targets,
                                   keep.extra.columns = T,
                                   seqnames.field = "probe_hg19_seqnames",
                                   start.field = "probe_hg19_start",
                                   end.field =  "probe_hg19_end",
                                   strand.field = "probe_hg19_strand"
)
probes <- unique(probes)
probes
## GRanges object with 136 ranges and 19 metadata columns:
##         seqnames            ranges strand |          GeneID       Probe
##            <Rle>         <IRanges>  <Rle> |     <character> <character>
##     [1]     chr6 45441433-45441434      * | ENSG00000001561  cg04757168
##     [2]     chr4 10447440-10447441      * | ENSG00000002587  cg11578055
##     [3]     chr4 11652505-11652506      * | ENSG00000002587  cg23971987
##     [4]    chr11 68174286-68174287      * | ENSG00000006534  cg10687219
##     [5]    chr19 46385115-46385116      * | ENSG00000008438  cg12067547
##     ...      ...               ...    ... .             ...         ...
##   [132]    chr15 83790129-83790130      * | ENSG00000259767  cg18593236
##   [133]    chr15 84168459-84168460      * | ENSG00000259767  cg03752741
##   [134]     chr7 41721644-41721645      * | ENSG00000261019  cg17544672
##   [135]     chr7 42195875-42195876      * | ENSG00000261019  cg16848937
##   [136]     chr7 42303654-42303655      * | ENSG00000261019  cg14434601
##              Symbol       Sides                Raw.p                  FDR
##         <character> <character>            <numeric>            <numeric>
##     [1]       ENPP4          R5 1.27012155256609e-21 1.93977507032146e-20
##     [2]      HS3ST1          R7    1.23758168414e-10 3.68138906358986e-10
##     [3]      HS3ST1          L3    1.23758168414e-10 3.68138906358986e-10
##     [4]     ALDH3B1         L10  3.4750686111754e-20 4.00485667858466e-19
##     [5]     PGLYRP1          R7 1.51558755453394e-05 2.16751520456186e-05
##     ...         ...         ...                  ...                  ...
##   [132]  AC022558.1          L7  0.00058118811574975 0.000619966993433223
##   [133]  AC022558.1          L8  0.00058118811574975 0.000619966993433223
##   [134]  AC010132.4          R8 0.000151242555954231 0.000180329508955901
##   [135]  AC010132.4          R4 0.000151242555954231 0.000180329508955901
##   [136]  AC010132.4          R5 0.000151242555954231 0.000180329508955901
##         distNearestTSS  DMC_analysis_pvalue DMC_analysis_escc_Minus_esad
##              <integer>            <numeric>                    <numeric>
##     [1]         656295 1.54013249119762e-27            0.342492029434533
##     [2]         982780 4.83187113109079e-19            0.349687571993035
##     [3]         221115 1.07408894369233e-22            0.327702012987323
##     [4]         384974 6.12602939805421e-18            0.344646832765288
##     [5]         141207 2.39777788986254e-48            0.344608475216899
##     ...            ...                  ...                          ...
##   [132]          97040 3.29795962417823e-21            0.302649568499366
##   [133]         475370 7.07335576164312e-19            0.303159293681504
##   [134]        1220592 1.00981370327399e-19            0.327279634433238
##   [135]         746362 6.27960586446515e-24            0.329610231319257
##   [136]         638583 2.58107943402524e-27             0.32008766236016
##         DMC_analysis_adjust.p ensembl_gene_id original_ensembl_gene_id
##                     <numeric>     <character>              <character>
##     [1]  3.29196455545598e-25 ENSG00000001561        ENSG00000001561.6
##     [2]  3.25783229365739e-17 ENSG00000002587        ENSG00000002587.8
##     [3]  1.20247813228428e-20 ENSG00000002587        ENSG00000002587.8
##     [4]  3.57956084645453e-16 ENSG00000006534       ENSG00000006534.14
##     [5]  1.50581593282363e-44 ENSG00000008438        ENSG00000008438.4
##     ...                   ...             ...                      ...
##   [132]  3.00164398341097e-19 ENSG00000259767        ENSG00000259767.1
##   [133]  4.66887503103732e-17 ENSG00000259767        ENSG00000259767.1
##   [134]  7.46916662935934e-18 ENSG00000261019        ENSG00000261019.1
##   [135]  8.29820341694918e-22 ENSG00000261019        ENSG00000261019.1
##   [136]  5.44632538941892e-25 ENSG00000261019        ENSG00000261019.1
##                                                                                                                                                                                        TF
##                                                                                                                                                                               <character>
##     [1]            FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;ZNF620;ZSCAN16;ZNF774;ZNF468;ZNF33A;ZNF816;ZNF619;ZNF765;ZNF799;ZNF763;ZNF823;ZNF124;SPDEF;ELF3
##     [2]                                           GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;SPDEF;ELF3;IRF8;SOX9
##     [3]                  GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;SOX9
##     [4]                                                                   HNF4A;HNF4G;NR2F2;NR2C1;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3;NR3C2
##     [5] GATA6;GATA4;HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;SPDEF;ELF3;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;NFATC2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;ZSCAN16;ZNF816
##     ...                                                                                                                                                                               ...
##   [132]                                              NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;HNF4A;HNF4G;NR2F2;NR2C1;TCF7L2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NFATC2;SPDEF;ELF3;ZSCAN16;ZNF816;IRF8
##   [133]                                                                                HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2
##   [134]                       HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;GATA6;GATA4;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;TCF7L2;SPDEF;ELF3
##   [135]                            HNF1A;HNF1B;PDX1;CDX2;HOXB6;HOXB5;MNX1;EVX1;CDX1;HOXA13;HOXB9;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1;TCF7L2;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;NFATC2;SOX9;NR3C2
##   [136]                                                                        GATA6;GATA4;NR1I2;PPARG;RORC;THRA;NR1H3;NR1H4;HNF4A;HNF4G;NR2F2;NR2C1;TCF7L2;FOXA3;FOXA2;FOXP4;FOXD2;FOXJ1
##         probe_hg19_width gene_hg19_seqnames gene_hg19_start gene_hg19_end
##                <integer>        <character>       <integer>     <integer>
##     [1]                2               chr6        46097730      46114436
##     [2]                2               chr4        11394774      11431389
##     [3]                2               chr4        11394774      11431389
##     [4]                2              chr11        67789294      67795378
##     [5]                2              chr19        46522411      46526323
##     ...              ...                ...             ...           ...
##   [132]                2              chr15        83690988      83693088
##   [133]                2              chr15        83690988      83693088
##   [134]                2               chr7        42940871      42942238
##   [135]                2               chr7        42940871      42942238
##   [136]                2               chr7        42940871      42942238
##         gene_hg19_width gene_hg19_strand
##               <integer>      <character>
##     [1]           16707                +
##     [2]           36616                -
##     [3]           36616                -
##     [4]            6085                +
##     [5]            3913                -
##     ...             ...              ...
##   [132]            2101                -
##   [133]            2101                -
##   [134]            1368                -
##   [135]            1368                -
##   [136]            1368                -
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Read narrow TCF7L2 peaks files

colnames(peaks) <- c("chrom","Start","End","Name","Width","Score", "fold_enrichment","-log10(pValue)","-log10(qValue)","peak")
DT::datatable(peaks,options = list(scrollX = TRUE),filter = 'top')
peaks$chrom <- paste0("chr",peaks$chrom)
peaks.gr <- makeGRangesFromDataFrame(peaks,keep.extra.columns = T)
peaks.gr
## GRanges object with 28740 ranges and 6 metadata columns:
##           seqnames            ranges strand |            Name       Score
##              <Rle>         <IRanges>  <Rle> |     <character> <character>
##       [1]     chr1       29198-29441      * |     OE33_peak_1           .
##       [2]     chr1     713825-714329      * |     OE33_peak_2           .
##       [3]     chr1     762669-763049      * |     OE33_peak_3           .
##       [4]     chr1     839595-840410      * |     OE33_peak_4           .
##       [5]     chr1     859188-859486      * |     OE33_peak_5           .
##       ...      ...               ...    ... .             ...         ...
##   [28736]     chrY 13460946-13461195      * | OE33_peak_28736           .
##   [28737]     chrY 13485442-13485967      * | OE33_peak_28737           .
##   [28738]     chrY 58827136-58827588      * | OE33_peak_28738           .
##   [28739]     chrY 58883594-58884000      * | OE33_peak_28739           .
##   [28740]     chrY 58905133-58905450      * | OE33_peak_28740           .
##           fold_enrichment -log10(pValue) -log10(qValue)      peak
##                 <numeric>      <numeric>      <numeric> <integer>
##       [1]         4.15958        7.48895        4.98834       122
##       [2]         2.43645        4.85955         2.5425       215
##       [3]         5.53781       10.38868        7.74738       198
##       [4]          5.3282       14.39616       11.61202       461
##       [5]         6.29061       10.93385        8.26844       116
##       ...             ...            ...            ...       ...
##   [28736]         2.42299       10.26745        7.63054       121
##   [28737]         4.66213       19.55059       16.63674       177
##   [28738]        26.84596       104.7633      100.97492       219
##   [28739]         5.08153         7.6158        5.10279       185
##   [28740]        10.16306       21.14335       18.19341       166
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
ChIPseeker::covplot(peaks.gr, weightCol="fold_enrichment")

Verify distance between probes and peaks

probes_extended_1kb <- resize(unique(probes),width = 2001,fix = "center")
probes_extended_1kb[,1:2]
## GRanges object with 136 ranges and 2 metadata columns:
##         seqnames            ranges strand |          GeneID       Probe
##            <Rle>         <IRanges>  <Rle> |     <character> <character>
##     [1]     chr6 45440433-45442433      * | ENSG00000001561  cg04757168
##     [2]     chr4 10446440-10448440      * | ENSG00000002587  cg11578055
##     [3]     chr4 11651505-11653505      * | ENSG00000002587  cg23971987
##     [4]    chr11 68173286-68175286      * | ENSG00000006534  cg10687219
##     [5]    chr19 46384115-46386115      * | ENSG00000008438  cg12067547
##     ...      ...               ...    ... .             ...         ...
##   [132]    chr15 83789129-83791129      * | ENSG00000259767  cg18593236
##   [133]    chr15 84167459-84169459      * | ENSG00000259767  cg03752741
##   [134]     chr7 41720644-41722644      * | ENSG00000261019  cg17544672
##   [135]     chr7 42194875-42196875      * | ENSG00000261019  cg16848937
##   [136]     chr7 42302654-42304654      * | ENSG00000261019  cg14434601
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths
unique(width(probes_extended_1kb[,1:2]))
## [1] 2001
x <- getTagMatrix(peaks.gr, windows=probes_extended_1kb)
p <- plotAvgProf(x, xlim=c(-1000, 1000),
            xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
p <- p + scale_x_continuous(breaks=c(-1000,-500,0,500,1000),labels=c(-1000,-500,"Paired distal CpG",500,1000))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
p

probes_extended_5kb <- resize(probes,width = 10001,fix = "center")
probes_extended_5kb[,1:2]
## GRanges object with 136 ranges and 2 metadata columns:
##         seqnames            ranges strand |          GeneID       Probe
##            <Rle>         <IRanges>  <Rle> |     <character> <character>
##     [1]     chr6 45436433-45446433      * | ENSG00000001561  cg04757168
##     [2]     chr4 10442440-10452440      * | ENSG00000002587  cg11578055
##     [3]     chr4 11647505-11657505      * | ENSG00000002587  cg23971987
##     [4]    chr11 68169286-68179286      * | ENSG00000006534  cg10687219
##     [5]    chr19 46380115-46390115      * | ENSG00000008438  cg12067547
##     ...      ...               ...    ... .             ...         ...
##   [132]    chr15 83785129-83795129      * | ENSG00000259767  cg18593236
##   [133]    chr15 84163459-84173459      * | ENSG00000259767  cg03752741
##   [134]     chr7 41716644-41726644      * | ENSG00000261019  cg17544672
##   [135]     chr7 42190875-42200875      * | ENSG00000261019  cg16848937
##   [136]     chr7 42298654-42308654      * | ENSG00000261019  cg14434601
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths
unique(width(probes_extended_5kb[,1:2]))
## [1] 10001
x <- getTagMatrix(peaks.gr, windows=probes_extended_5kb)
p <- plotAvgProf(x, xlim=c(-5000, 5000),
            xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
p <- p + scale_x_continuous(breaks=c(-5000,-2000,0,2000,5000),labels=c(-5000,-2000,"Paired distal CpG",2000,5000))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
p

Verify overlap between peaks and linked probes

hits <- suppressWarnings(findOverlaps(peaks.gr,probes))
probes.hit <- subjectHits(hits)
length(unique(probes[probes.hit])) # unique regions
## [1] 27

Verify overlap between peaks and distall probes

distal.probes <- ELMER::get.feature.probe(feature = NULL,genome = "hg19", met.platform = "450K")
## Downloading transcripts information. Using: Homo sapiens genes (GRCh37.p13)
## Returning distal probes: 163576
hits.distal <- suppressWarnings(findOverlaps(peaks.gr,distal.probes))
distal.probes.hit <- subjectHits(hits.distal)

Calculate enrichement

x <- matrix(c(
    length(unique(probes[probes.hit])), # all linked probes with hits
    (length(unique(probes)) - length(unique(probes[probes.hit]))), # all linked probes with no hits
    length(unique(distal.probes.hit)), # all distal probes with  hits (4216)
    (length(distal.probes)) - length(unique(distal.probes.hit))), # all distal probes with no hits
    nrow = 2,
    dimnames = list(c("Overlap", "No overlap"),
                    c("Unique Linked probes", "Distal probes"))
)
x <- t(x)
x
##                      Overlap No overlap
## Unique Linked probes      27        109
## Distal probes           4256     159320
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 2.565e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   5.840886 14.257038
## sample estimates:
## odds ratio 
##   9.272322

Regions overlapped

DT::datatable(tibble::as_data_frame(probes[subjectHits(hits)]),options = list(scrollX = TRUE),filter = 'top')
DT::datatable(tibble::as_data_frame(peaks.gr[queryHits(hits)]),options = list(scrollX = TRUE),filter = 'top')

Exteding overlap regions (+-1Kb)

extend.hits <- suppressWarnings(findOverlaps(peaks.gr,probes_extended_1kb))
# How many unique probes overlap the peaks (considering +-1KB)
length(unique(probes[subjectHits(extend.hits)]))
## [1] 42

Distance

y <- distanceToNearest(probes,peaks.gr,ignore.strand=T)
qplot(mcols(y)$distance,geom="histogram") + ggthemes::theme_gdocs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(log(mcols(y)$distance + 1),geom="histogram") + ggthemes::theme_gdocs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.