rm(list = ls())
###############################input data
dir_path <- "/Users/xut2/Desktop/dna seq/"
dir_path_name <- list.files(pattern = ".*",dir_path,full.names = T, recursive = T)
dir_path_name
## [1] "/Users/xut2/Desktop/dna seq//AHR.fasta"
## [2] "/Users/xut2/Desktop/dna seq//DNA sequence vector.R"
## [3] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.R"
## [4] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.spin.R"
## [5] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.spin.Rmd"
rawseq <- readLines(grep("AHR",dir_path_name,value = T))
head(rawseq); length(rawseq)
## [1] ">NC_000007.14:17298652-17346147 Homo sapiens chromosome 7, GRCh38.p14 Primary Assembly"
## [2] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"
## [3] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
## [4] "AGCACCCTGGATTTAGGAAGTCCCGGGAGCAGCGCGGCGGCACCTCCCTCACCCAAGGGGCCGCGGCGAC"
## [5] "GGTCACGGGGCGCGGCGCCACCGTGAGCGACCCAGGCCAGGATTCTAAATAGACGGCCCAGGCTCCTCCT"
## [6] "CCGCCCGGGCCGCCTCACCTGCGGGCATTGCCGCGCCGCCTCCGCCGGTGTAGACGGCACCTGCGCCGCC"
## [1] 681
ahr_arr <- rawseq[2:length(rawseq)]
head(ahr_arr);length(ahr_arr) ###680 rows
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"
## [2] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
## [3] "AGCACCCTGGATTTAGGAAGTCCCGGGAGCAGCGCGGCGGCACCTCCCTCACCCAAGGGGCCGCGGCGAC"
## [4] "GGTCACGGGGCGCGGCGCCACCGTGAGCGACCCAGGCCAGGATTCTAAATAGACGGCCCAGGCTCCTCCT"
## [5] "CCGCCCGGGCCGCCTCACCTGCGGGCATTGCCGCGCCGCCTCCGCCGGTGTAGACGGCACCTGCGCCGCC"
## [6] "TTGCTCGCGGGTCTCCGCCCCTCGCCCACCCTCACTGCGCCAGGCCCAGGCAGCTCACCTGTACTGGCGC"
## [1] 680
tail(ahr_arr)
## [1] "TGGGATTTATTTCTAGATGATGTGCACATCTAAGGATATGGATGTGTCTAATTTAGTCTTTTCCTGTACC"
## [2] "AGGTTTTTCTTACAATACCTGAAGACTTACCAGTATTCTAGTGTATTATGAAGCTTTCAACATTACTATG"
## [3] "CACAAACTAGTGTTTTTCGATGTTACTAAATTTTAGGTAAATGCTTTCATGGCTTTTTTCTTCAAAATGT"
## [4] "TACTGCTTACATATATCATGCATAGATTTTTGCTTAAAGTATGATTTATAATATCCTCATTATCAAAGTT"
## [5] "GTATACAATAATATATAATAAAATAACAAATATGAA"
## [6] ""
nchar(ahr_arr[1]); nchar(ahr_arr[679])
## [1] 70
## [1] 36
ahr_arr <- ahr_arr[1:2]
ahrseq <- paste(ahr_arr, collapse="")
class(ahrseq);length(ahrseq);nchar(ahrseq);substr(ahrseq, 1, 5)
## [1] "character"
## [1] 1
## [1] 140
## [1] "AGTGG"
ahrseq
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGCAGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
strsplit(ahrseq, split=NULL)
## [[1]]
## [1] "A" "G" "T" "G" "G" "C" "T" "G" "G" "G" "G" "A" "G" "T" "C" "C" "C" "G"
## [19] "T" "C" "G" "A" "C" "G" "C" "T" "C" "T" "G" "T" "T" "C" "C" "G" "A" "G"
## [37] "A" "G" "C" "G" "T" "G" "C" "C" "C" "C" "G" "G" "A" "C" "C" "G" "C" "C"
## [55] "A" "G" "C" "T" "C" "A" "G" "A" "A" "C" "A" "G" "G" "G" "G" "C" "A" "G"
## [73] "C" "C" "G" "T" "G" "T" "A" "G" "C" "C" "G" "A" "A" "C" "G" "G" "A" "A"
## [91] "G" "C" "T" "G" "G" "G" "A" "G" "C" "A" "G" "C" "C" "G" "G" "G" "A" "C"
## [109] "T" "G" "G" "T" "G" "G" "C" "C" "C" "G" "C" "G" "C" "C" "C" "G" "A" "G"
## [127] "C" "T" "C" "C" "G" "C" "A" "G" "G" "C" "G" "G" "G" "A"
#####################################################Base complement
myFun <- function(invec) {
require(stringi)
invec <- stri_reverse(invec)
chartr(old = "AaTtGgCc", new = "TTAACCGG", invec)
}
x <- "ATCG"
myFun(x) #"CGAT"
## Loading required package: stringi
## [1] "CGAT"
####################################################
#Next, we want to extract all (overlapping) substrings of the length of the motif (??len??) in our sequence
len <- 6
leftlims <- 1:(nchar(ahrseq) - (len - 1))
rightlims <- len:nchar(ahrseq)
leftlims;rightlims;ahrseq
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135
## [1] 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [19] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## [37] 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## [55] 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
## [73] 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## [91] 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## [109] 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
## [127] 132 133 134 135 136 137 138 139 140
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGCAGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
ahrsubstrings <- mapply(substr, ahrseq, leftlims, rightlims,
USE.NAMES=FALSE)
ahrsubstrings
## [1] "AGTGGC" "GTGGCT" "TGGCTG" "GGCTGG" "GCTGGG" "CTGGGG" "TGGGGA" "GGGGAG"
## [9] "GGGAGT" "GGAGTC" "GAGTCC" "AGTCCC" "GTCCCG" "TCCCGT" "CCCGTC" "CCGTCG"
## [17] "CGTCGA" "GTCGAC" "TCGACG" "CGACGC" "GACGCT" "ACGCTC" "CGCTCT" "GCTCTG"
## [25] "CTCTGT" "TCTGTT" "CTGTTC" "TGTTCC" "GTTCCG" "TTCCGA" "TCCGAG" "CCGAGA"
## [33] "CGAGAG" "GAGAGC" "AGAGCG" "GAGCGT" "AGCGTG" "GCGTGC" "CGTGCC" "GTGCCC"
## [41] "TGCCCC" "GCCCCG" "CCCCGG" "CCCGGA" "CCGGAC" "CGGACC" "GGACCG" "GACCGC"
## [49] "ACCGCC" "CCGCCA" "CGCCAG" "GCCAGC" "CCAGCT" "CAGCTC" "AGCTCA" "GCTCAG"
## [57] "CTCAGA" "TCAGAA" "CAGAAC" "AGAACA" "GAACAG" "AACAGG" "ACAGGG" "CAGGGG"
## [65] "AGGGGC" "GGGGCA" "GGGCAG" "GGCAGC" "GCAGCC" "CAGCCG" "AGCCGT" "GCCGTG"
## [73] "CCGTGT" "CGTGTA" "GTGTAG" "TGTAGC" "GTAGCC" "TAGCCG" "AGCCGA" "GCCGAA"
## [81] "CCGAAC" "CGAACG" "GAACGG" "AACGGA" "ACGGAA" "CGGAAG" "GGAAGC" "GAAGCT"
## [89] "AAGCTG" "AGCTGG" "GCTGGG" "CTGGGA" "TGGGAG" "GGGAGC" "GGAGCA" "GAGCAG"
## [97] "AGCAGC" "GCAGCC" "CAGCCG" "AGCCGG" "GCCGGG" "CCGGGA" "CGGGAC" "GGGACT"
## [105] "GGACTG" "GACTGG" "ACTGGT" "CTGGTG" "TGGTGG" "GGTGGC" "GTGGCC" "TGGCCC"
## [113] "GGCCCG" "GCCCGC" "CCCGCG" "CCGCGC" "CGCGCC" "GCGCCC" "CGCCCG" "GCCCGA"
## [121] "CCCGAG" "CCGAGC" "CGAGCT" "GAGCTC" "AGCTCC" "GCTCCG" "CTCCGC" "TCCGCA"
## [129] "CCGCAG" "CGCAGG" "GCAGGC" "CAGGCG" "AGGCGG" "GGCGGG" "GCGGGA"
####check whether a fragment is a hit
hitcounter <- function(fragment, themotif) {ifelse (fragment==themotif, 1, 0)}
seq_1 <- "GCTCCG"
hitcounter(ahrsubstrings, seq_1)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
###############################wrap everything up
counthits <- function(sequence, motif) {
compmotif <- myFun(motif)
len <- nchar(motif)
leftlim <- 1:(nchar(sequence) - (len - 1))
rightlim <- len:nchar(sequence)
frags <- mapply(substr, sequence, leftlim, rightlim, USE.NAMES=FALSE)
scores <- ifelse(frags==motif|frags==compmotif, 1, 0)
return(sum(scores))
}
counthits(ahrseq, seq_1)
## [1] 1
#REF https://stackoverflow.com/questions/28934185/reverse-complementary-base