Find TFs overlapping with CLL patient deleted region

Pariksheet Nanda

2017-06-01

Search JASPAR database for binding sites

This analysis follows the TFBSTools vignette.

suppressPackageStartupMessages({
  library(JASPAR2016) # JASPAR2016
  library(TFBSTools)
  library(Biostrings) # DNAString
  library(Biobase) # cache
})

sequence_deleted <- DNAString("GGCTTCT")
sequence_all <- DNAString("ccgccgagctctcggagggcagggaagaggGGCTTCTTGCGGTTGCTGCTGC")
seqname <- "chr15"

Query JASPAR database for Homosapiens sites:

pfml <- getMatrixSet(JASPAR2016,
                     list(species = 9606)) # Homosapiens NCBI species ID
pwml <- toPWM(pfml)
system.time(siteslist <- searchSeq(pwml,
                                   subject = sequence_all,
                                   seqname = seqname))
##    user  system elapsed 
##   2.892   0.008   2.901

Find scores and remove TFs with no hits

All the original transcription factors are returned to us, even if it had no hits (i.e.Ā length 0).

Therefore find the scores and remove transcription factors with no hits.

Do not calculate p-values, as it eats up a lot of memory and takes a long time.

scores <- relScore(siteslist)
scores <- scores[sapply(scores, length) > 0]
length(scores)
## [1] 58
# Do not calculate p-values as it takes too long.
#pvals_empirical <- sapply(siteslist, pvalues, type = "TFMPvalue")

Subset the binding sites and scores

suppressPackageStartupMessages({
  library(GenomicRanges)
})


del <- pairwiseAlignment(sequence_deleted, sequence_all)
gr_del <- GRanges(as(Views(del), "IRanges"), seqnames = seqname)
# Give our deletion the same column names as the transcription factors to be
# able to concatenate it.
mcols(gr_del) <- DataFrame(
  source = "DNA",
  feature = "Deletion",
  absScore = NA,
  relScore = NA,
  ID = NA,
  TF = NA,
  class = "DNA",
  siteSeqs = as.character(sequence_deleted)
)

# Subset transcription factors to those overlapping our deletion.
gr_tf <- as(siteslist[names(scores)], "GRanges")
gr_tf <- subsetByOverlaps(gr_tf, gr_del)
length(gr_tf)
## [1] 45
gr <- c(gr_tf, gr_del)

Plot TF binding positions and relative scores

suppressPackageStartupMessages({
  library(ggbio)
  library(dplyr)
})

autoplot(gr, aes(fill = class, color = class, alpha = relScore))

ggplot(as.data.frame(gr), aes(x = ID, y = relScore, color = class)) +
  geom_label(aes(label = TF)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")
## Warning: Removed 1 rows containing missing values (geom_label).

# Summary of overlapping hits.
idx <- unlist(order(mcols(gr_tf)["relScore"], decreasing = TRUE))
as.data.frame(gr_tf[idx]) %>%
  `rownames<-`(NULL) %>%
  select(ID, TF, relScore, absScore, siteSeqs, class)
##          ID            TF  relScore     absScore              siteSeqs
## 1  MA0511.2         RUNX2 0.9777678  13.20350536             CAACCGCAA
## 2  MA0684.1         RUNX3 0.9543368  12.36617572            CAACCGCAAG
## 3  MA0719.1        RHOXF1 0.9409625   4.63861618              AGAAGCCC
## 4  MA0516.1           SP2 0.9030864   9.26761947       GCCCCTCTTCCCTGC
## 5  MA0673.1        NKX2-8 0.8939499   3.18853490             CTTCTTGCG
## 6  MA0079.3           SP1 0.8736000  -0.35827854           GCCCCTCTTCC
## 7  MA0672.1        NKX2-3 0.8722333   3.02980546            CCCTCTTCCC
## 8  MA0673.1        NKX2-8 0.8678255   0.82865142             CCCCTCTTC
## 9  MA0014.2          PAX5 0.8601111   8.27055952   GAGGGCAGGGAAGAGGGGC
## 10 MA0668.1       NEUROD2 0.8587433   0.05360028            GGCTTCTTGC
## 11 MA0831.1          TFE3 0.8555210   2.65184882            ACCGCAAGAA
## 12 MA0599.1          KLF5 0.8502467  -5.91407047            GCCCCTCTTC
## 13 MA0528.1        ZNF263 0.8469956   4.36297236 GGAGGGCAGGGAAGAGGGGCT
## 14 MA0599.1          KLF5 0.8418724  -7.11398226            CCCTCTTCCC
## 15 MA0162.2          EGR1 0.8355595  -6.42093265        CCCTCTTCCCTGCC
## 16 MA0820.1         FIGLA 0.8351795   1.31787693            AGCCCCTCTT
## 17 MA0155.1         INSM1 0.8273882   5.16895379          TTCTTGCGGTTG
## 18 MA0673.1        NKX2-8 0.8273551  -2.82713524             GCCCCTCTT
## 19 MA0508.1         PRDM1 0.8257167   2.34895033       GGCAGGGAAGAGGGG
## 20 MA0528.1        ZNF263 0.8254461   0.98025334 GAGGGCAGGGAAGAGGGGCTT
## 21 MA0759.1          ELK3 0.8252448   3.20587694            ACCGCAAGAA
## 22 MA0484.1         HNF4G 0.8233078   0.34083963       AGGGAAGAGGGGCTT
## 23 MA0076.2          ELK4 0.8228688  -2.28936357           CTTCTTGCGGT
## 24 MA0672.1        NKX2-3 0.8217014  -1.42376402            GCTTCTTGCG
## 25 MA0674.1        NKX6-1 0.8214148  -0.50389317              CTTCTTGC
## 26 MA0640.1          ELF3 0.8195680   0.40206752         CAACCGCAAGAAG
## 27 MA0162.2          EGR1 0.8185364  -9.22220118        GCCCCTCTTCCCTG
## 28 MA0028.2          ELK1 0.8179104   1.27002475            ACCGCAAGAA
## 29 MA0003.3        TFAP2A 0.8159761  -1.25274724           AACCGCAAGAA
## 30 MA0076.2          ELK4 0.8149414  -3.14622462           GGGCTTCTTGC
## 31 MA0762.1          ETV2 0.8115932  -1.42373140           AACCGCAAGAA
## 32 MA0672.1        NKX2-3 0.8103112  -2.42762554            GCCCCTCTTC
## 33 MA0831.1          TFE3 0.8102794  -1.06246532            TTCTTGCGGT
## 34 MA0057.1   MZF1(var.2) 0.8055744   3.93824320            AGAGGGGCTT
## 35 MA0122.2        NKX3-2 0.8055041   2.47159484             CCCTCTTCC
## 36 MA0827.1         OLIG3 0.8054129  -1.08026117            GCAAGAAGCC
## 37 MA0807.1          TBX5 0.8042962   2.92412208              AGGGGCTT
## 38 MA0508.1         PRDM1 0.8032390  -0.27286135       GGGCAGGGAAGAGGG
## 39 MA0671.1          NFIX 0.8015347  -0.16919924             GAAGCCCCT
## 40 MA0687.1          SPIC 0.8011496   2.57104281        AGCAACCGCAAGAA
## 41 MA0814.1 TFAP2C(var.2) 0.8011416  -0.39894797           AACCGCAAGAA
## 42 MA0528.1        ZNF263 0.8006551  -2.91130142 CGGAGGGCAGGGAAGAGGGGC
## 43 MA0468.1          DUX4 0.8005940 -14.66800940           GAAGCCCCTCT
## 44 MA0741.1         KLF16 0.8001657   1.06243306           AAGAAGCCCCT
## 45 MA0504.1         NR2C2 0.8000939  -2.40325276       CAGGGAAGAGGGGCT
##                                     class
## 1                     Runt domain factors
## 2                     Runt domain factors
## 3                    Homeo domain factors
## 4                C2H2 zinc finger factors
## 5                    Homeo domain factors
## 6                C2H2 zinc finger factors
## 7                    Homeo domain factors
## 8                    Homeo domain factors
## 9                      Paired box factors
## 10  Basic helix-loop-helix factors (bHLH)
## 11  Basic helix-loop-helix factors (bHLH)
## 12               C2H2 zinc finger factors
## 13               C2H2 zinc finger factors
## 14               C2H2 zinc finger factors
## 15               C2H2 zinc finger factors
## 16  Basic helix-loop-helix factors (bHLH)
## 17               C2H2 zinc finger factors
## 18                   Homeo domain factors
## 19               C2H2 zinc finger factors
## 20               C2H2 zinc finger factors
## 21             Tryptophan cluster factors
## 22 Nuclear receptors with C4 zinc fingers
## 23             Tryptophan cluster factors
## 24                   Homeo domain factors
## 25                   Homeo domain factors
## 26             Tryptophan cluster factors
## 27               C2H2 zinc finger factors
## 28             Tryptophan cluster factors
## 29  Basic helix-span-helix factors (bHSH)
## 30             Tryptophan cluster factors
## 31             Tryptophan cluster factors
## 32                   Homeo domain factors
## 33  Basic helix-loop-helix factors (bHLH)
## 34               C2H2 zinc finger factors
## 35                   Homeo domain factors
## 36  Basic helix-loop-helix factors (bHLH)
## 37                          T-Box factors
## 38               C2H2 zinc finger factors
## 39   SMAD/NF-1 DNA-binding domain factors
## 40             Tryptophan cluster factors
## 41  Basic helix-span-helix factors (bHSH)
## 42               C2H2 zinc finger factors
## 43                   Homeo domain factors
## 44               C2H2 zinc finger factors
## 45 Nuclear receptors with C4 zinc fingers