This analysis follows the TFBSTools vignette.
suppressPackageStartupMessages({
library(JASPAR2016) # JASPAR2016
library(TFBSTools)
library(Biostrings) # DNAString
library(Biobase) # cache
})
sequence_deleted <- DNAString("GGCTTCT")
sequence_all <- DNAString("ccgccgagctctcggagggcagggaagaggGGCTTCTTGCGGTTGCTGCTGC")
seqname <- "chr15"Query JASPAR database for Homosapiens sites:
pfml <- getMatrixSet(JASPAR2016,
list(species = 9606)) # Homosapiens NCBI species ID
pwml <- toPWM(pfml)
system.time(siteslist <- searchSeq(pwml,
subject = sequence_all,
seqname = seqname))## user system elapsed
## 2.892 0.008 2.901
All the original transcription factors are returned to us, even if it had no hits (i.e.Ā length 0).
Therefore find the scores and remove transcription factors with no hits.
Do not calculate p-values, as it eats up a lot of memory and takes a long time.
scores <- relScore(siteslist)
scores <- scores[sapply(scores, length) > 0]
length(scores)## [1] 58
# Do not calculate p-values as it takes too long.
#pvals_empirical <- sapply(siteslist, pvalues, type = "TFMPvalue")suppressPackageStartupMessages({
library(GenomicRanges)
})
del <- pairwiseAlignment(sequence_deleted, sequence_all)
gr_del <- GRanges(as(Views(del), "IRanges"), seqnames = seqname)
# Give our deletion the same column names as the transcription factors to be
# able to concatenate it.
mcols(gr_del) <- DataFrame(
source = "DNA",
feature = "Deletion",
absScore = NA,
relScore = NA,
ID = NA,
TF = NA,
class = "DNA",
siteSeqs = as.character(sequence_deleted)
)
# Subset transcription factors to those overlapping our deletion.
gr_tf <- as(siteslist[names(scores)], "GRanges")
gr_tf <- subsetByOverlaps(gr_tf, gr_del)
length(gr_tf)## [1] 45
gr <- c(gr_tf, gr_del)suppressPackageStartupMessages({
library(ggbio)
library(dplyr)
})
autoplot(gr, aes(fill = class, color = class, alpha = relScore))ggplot(as.data.frame(gr), aes(x = ID, y = relScore, color = class)) +
geom_label(aes(label = TF)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")## Warning: Removed 1 rows containing missing values (geom_label).
# Summary of overlapping hits.
idx <- unlist(order(mcols(gr_tf)["relScore"], decreasing = TRUE))
as.data.frame(gr_tf[idx]) %>%
`rownames<-`(NULL) %>%
select(ID, TF, relScore, absScore, siteSeqs, class)## ID TF relScore absScore siteSeqs
## 1 MA0511.2 RUNX2 0.9777678 13.20350536 CAACCGCAA
## 2 MA0684.1 RUNX3 0.9543368 12.36617572 CAACCGCAAG
## 3 MA0719.1 RHOXF1 0.9409625 4.63861618 AGAAGCCC
## 4 MA0516.1 SP2 0.9030864 9.26761947 GCCCCTCTTCCCTGC
## 5 MA0673.1 NKX2-8 0.8939499 3.18853490 CTTCTTGCG
## 6 MA0079.3 SP1 0.8736000 -0.35827854 GCCCCTCTTCC
## 7 MA0672.1 NKX2-3 0.8722333 3.02980546 CCCTCTTCCC
## 8 MA0673.1 NKX2-8 0.8678255 0.82865142 CCCCTCTTC
## 9 MA0014.2 PAX5 0.8601111 8.27055952 GAGGGCAGGGAAGAGGGGC
## 10 MA0668.1 NEUROD2 0.8587433 0.05360028 GGCTTCTTGC
## 11 MA0831.1 TFE3 0.8555210 2.65184882 ACCGCAAGAA
## 12 MA0599.1 KLF5 0.8502467 -5.91407047 GCCCCTCTTC
## 13 MA0528.1 ZNF263 0.8469956 4.36297236 GGAGGGCAGGGAAGAGGGGCT
## 14 MA0599.1 KLF5 0.8418724 -7.11398226 CCCTCTTCCC
## 15 MA0162.2 EGR1 0.8355595 -6.42093265 CCCTCTTCCCTGCC
## 16 MA0820.1 FIGLA 0.8351795 1.31787693 AGCCCCTCTT
## 17 MA0155.1 INSM1 0.8273882 5.16895379 TTCTTGCGGTTG
## 18 MA0673.1 NKX2-8 0.8273551 -2.82713524 GCCCCTCTT
## 19 MA0508.1 PRDM1 0.8257167 2.34895033 GGCAGGGAAGAGGGG
## 20 MA0528.1 ZNF263 0.8254461 0.98025334 GAGGGCAGGGAAGAGGGGCTT
## 21 MA0759.1 ELK3 0.8252448 3.20587694 ACCGCAAGAA
## 22 MA0484.1 HNF4G 0.8233078 0.34083963 AGGGAAGAGGGGCTT
## 23 MA0076.2 ELK4 0.8228688 -2.28936357 CTTCTTGCGGT
## 24 MA0672.1 NKX2-3 0.8217014 -1.42376402 GCTTCTTGCG
## 25 MA0674.1 NKX6-1 0.8214148 -0.50389317 CTTCTTGC
## 26 MA0640.1 ELF3 0.8195680 0.40206752 CAACCGCAAGAAG
## 27 MA0162.2 EGR1 0.8185364 -9.22220118 GCCCCTCTTCCCTG
## 28 MA0028.2 ELK1 0.8179104 1.27002475 ACCGCAAGAA
## 29 MA0003.3 TFAP2A 0.8159761 -1.25274724 AACCGCAAGAA
## 30 MA0076.2 ELK4 0.8149414 -3.14622462 GGGCTTCTTGC
## 31 MA0762.1 ETV2 0.8115932 -1.42373140 AACCGCAAGAA
## 32 MA0672.1 NKX2-3 0.8103112 -2.42762554 GCCCCTCTTC
## 33 MA0831.1 TFE3 0.8102794 -1.06246532 TTCTTGCGGT
## 34 MA0057.1 MZF1(var.2) 0.8055744 3.93824320 AGAGGGGCTT
## 35 MA0122.2 NKX3-2 0.8055041 2.47159484 CCCTCTTCC
## 36 MA0827.1 OLIG3 0.8054129 -1.08026117 GCAAGAAGCC
## 37 MA0807.1 TBX5 0.8042962 2.92412208 AGGGGCTT
## 38 MA0508.1 PRDM1 0.8032390 -0.27286135 GGGCAGGGAAGAGGG
## 39 MA0671.1 NFIX 0.8015347 -0.16919924 GAAGCCCCT
## 40 MA0687.1 SPIC 0.8011496 2.57104281 AGCAACCGCAAGAA
## 41 MA0814.1 TFAP2C(var.2) 0.8011416 -0.39894797 AACCGCAAGAA
## 42 MA0528.1 ZNF263 0.8006551 -2.91130142 CGGAGGGCAGGGAAGAGGGGC
## 43 MA0468.1 DUX4 0.8005940 -14.66800940 GAAGCCCCTCT
## 44 MA0741.1 KLF16 0.8001657 1.06243306 AAGAAGCCCCT
## 45 MA0504.1 NR2C2 0.8000939 -2.40325276 CAGGGAAGAGGGGCT
## class
## 1 Runt domain factors
## 2 Runt domain factors
## 3 Homeo domain factors
## 4 C2H2 zinc finger factors
## 5 Homeo domain factors
## 6 C2H2 zinc finger factors
## 7 Homeo domain factors
## 8 Homeo domain factors
## 9 Paired box factors
## 10 Basic helix-loop-helix factors (bHLH)
## 11 Basic helix-loop-helix factors (bHLH)
## 12 C2H2 zinc finger factors
## 13 C2H2 zinc finger factors
## 14 C2H2 zinc finger factors
## 15 C2H2 zinc finger factors
## 16 Basic helix-loop-helix factors (bHLH)
## 17 C2H2 zinc finger factors
## 18 Homeo domain factors
## 19 C2H2 zinc finger factors
## 20 C2H2 zinc finger factors
## 21 Tryptophan cluster factors
## 22 Nuclear receptors with C4 zinc fingers
## 23 Tryptophan cluster factors
## 24 Homeo domain factors
## 25 Homeo domain factors
## 26 Tryptophan cluster factors
## 27 C2H2 zinc finger factors
## 28 Tryptophan cluster factors
## 29 Basic helix-span-helix factors (bHSH)
## 30 Tryptophan cluster factors
## 31 Tryptophan cluster factors
## 32 Homeo domain factors
## 33 Basic helix-loop-helix factors (bHLH)
## 34 C2H2 zinc finger factors
## 35 Homeo domain factors
## 36 Basic helix-loop-helix factors (bHLH)
## 37 T-Box factors
## 38 C2H2 zinc finger factors
## 39 SMAD/NF-1 DNA-binding domain factors
## 40 Tryptophan cluster factors
## 41 Basic helix-span-helix factors (bHSH)
## 42 C2H2 zinc finger factors
## 43 Homeo domain factors
## 44 C2H2 zinc finger factors
## 45 Nuclear receptors with C4 zinc fingers