rm(list = ls())
###################################_method_1
library(Biostrings)
## 载入需要的程辑包:BiocGenerics
## 载入需要的程辑包:parallel
##
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## 载入需要的程辑包:S4Vectors
## 载入需要的程辑包:stats4
##
## 载入程辑包:'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## 载入需要的程辑包:IRanges
##
## 载入程辑包:'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## 载入需要的程辑包:XVector
## 载入需要的程辑包:GenomeInfoDb
##
## 载入程辑包:'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
sequence_kmers <- function(sequence, k){
k_mers <- lapply(sequence,function(x){
seq_loop_size <- length(DNAString(x))-k+1
kmers <- sapply(1:seq_loop_size, function(z){
y <- z + k -1
kmer <- substr(x=x, start=z, stop=y)
return(kmer)
})
return(kmers)
})
uniq <- unique(unlist(k_mers))
ind <- t(sapply(k_mers, function(x){
tabulate(match(x, uniq), length(uniq))
}))
colnames(ind) <- uniq
return(ind)
}
DNAlst<-list("CAAACTGATTTT","GATGAAAGTAAAATACCG","ATTATGC","TGGA","CGCGCATCAA")
dim(sequence_kmers(DNAlst,3));sequence_kmers(DNAlst,3) #[1] 5 31
## [1] 5 31
## CAA AAA AAC ACT CTG TGA GAT ATT TTT ATG GAA AAG AGT GTA TAA AAT ATA TAC
## [1,] 1 1 1 1 1 1 1 1 2 0 0 0 0 0 0 0 0 0
## [2,] 0 3 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1
## [3,] 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ACC CCG TTA TAT TGC TGG GGA CGC GCG GCA CAT ATC TCA
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 1 1 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 1 1 1 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 1 1 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 2 1 1 1 1 1
#############################################method_2
library(Biostrings)
data_1 <- t(sapply(DNAlst, function(x){x1 <- DNAString(x)
oligonucleotideFrequency(x1,3)}))
dim(data_1) #[1] 5 64
## [1] 5 64
data_2 <- data_1[ ,colSums(data_1) != 0]
dim(data_2); data_2
## [1] 5 31
## AAA AAC AAG AAT ACC ACT AGT ATA ATC ATG ATT CAA CAT CCG CGC CTG GAA GAT
## [1,] 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1
## [2,] 3 0 1 1 1 0 1 1 0 1 0 0 0 1 0 0 1 1
## [3,] 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 1 0 0 1 1 0 2 0 0 0
## GCA GCG GGA GTA TAA TAC TAT TCA TGA TGC TGG TTA TTT
## [1,] 0 0 0 0 0 0 0 0 1 0 0 0 2
## [2,] 0 0 0 1 1 1 0 0 1 0 0 0 0
## [3,] 0 0 0 0 0 0 1 0 0 1 0 1 0
## [4,] 0 0 1 0 0 0 0 0 0 0 1 0 0
## [5,] 1 1 0 0 0 0 0 1 0 0 0 0 0
#oligonucleotideFrequency(DNAStringSet(unlist(DNAlst)), 3L)
#################################################
library(stringi)
DNAlst <- list("CAAACTGATTTT","GATGAAAGTAAAATACCG","ATTATGC","TGGA","CGCGCATCAA")
dna <- do.call(paste0,expand.grid(rep(list(c('A', 'G', 'T', 'C')), 3)))
result <- t(sapply(DNAlst, stri_count_fixed,pattern=dna,overlap=TRUE))
colnames(result) <- dna
result
## AAA GAA TAA CAA AGA GGA TGA CGA ATA GTA TTA CTA ACA GCA TCA CCA AAG GAG
## [1,] 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [2,] 3 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0
## [3,] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## TAG CAG AGG GGG TGG CGG ATG GTG TTG CTG ACG GCG TCG CCG AAT GAT TAT CAT
## [1,] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
## [2,] 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0
## [3,] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
## [4,] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
## AGT GGT TGT CGT ATT GTT TTT CTT ACT GCT TCT CCT AAC GAC TAC CAC AGC GGC
## [1,] 0 0 0 0 1 0 2 0 1 0 0 0 1 0 0 0 0 0
## [2,] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [3,] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## TGC CGC ATC GTC TTC CTC ACC GCC TCC CCC
## [1,] 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 1 0 0 0
## [3,] 1 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0
## [5,] 0 2 1 0 0 0 0 0 0 0
data_2 <- result[ ,colSums(result) != 0]
dim(data_2); data_2
## [1] 5 31
## AAA GAA TAA CAA GGA TGA ATA GTA TTA GCA TCA AAG TGG ATG CTG GCG CCG AAT
## [1,] 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0
## [2,] 3 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 1 1
## [3,] 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## [4,] 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
## [5,] 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0
## GAT TAT CAT AGT ATT TTT ACT AAC TAC TGC CGC ATC ACC
## [1,] 1 0 0 0 1 2 1 1 0 0 0 0 0
## [2,] 1 0 0 1 0 0 0 0 1 0 0 0 1
## [3,] 0 1 0 0 1 0 0 0 0 1 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 1 0 0 0 0 0 0 0 2 1 0
###################################_method_3
###https://stackoverflow.com/questions/26600735/matching-and-counting-strings-k-mer-of-dna-in-r
##https://rpubs.com/TX-YXL/866139