rm(list = ls())
###############################input data 
dir_path <- "/Users/xut2/Desktop/dna seq/"
dir_path_name <- list.files(pattern = ".*",dir_path,full.names = T, recursive = T)
dir_path_name
## [1] "/Users/xut2/Desktop/dna seq//AHR.fasta"                   
## [2] "/Users/xut2/Desktop/dna seq//DNA sequence vector.R"       
## [3] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.R"       
## [4] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.spin.R"  
## [5] "/Users/xut2/Desktop/dna seq//DNA-sequence-vector.spin.Rmd"
rawseq <- readLines(grep("AHR",dir_path_name,value = T))
head(rawseq); length(rawseq)
## [1] ">NC_000007.14:17298652-17346147 Homo sapiens chromosome 7, GRCh38.p14 Primary Assembly"
## [2] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"                
## [3] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"                
## [4] "AGCACCCTGGATTTAGGAAGTCCCGGGAGCAGCGCGGCGGCACCTCCCTCACCCAAGGGGCCGCGGCGAC"                
## [5] "GGTCACGGGGCGCGGCGCCACCGTGAGCGACCCAGGCCAGGATTCTAAATAGACGGCCCAGGCTCCTCCT"                
## [6] "CCGCCCGGGCCGCCTCACCTGCGGGCATTGCCGCGCCGCCTCCGCCGGTGTAGACGGCACCTGCGCCGCC"
## [1] 681
ahr_arr <- rawseq[2:length(rawseq)]
head(ahr_arr);length(ahr_arr)  ###680 rows
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"
## [2] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
## [3] "AGCACCCTGGATTTAGGAAGTCCCGGGAGCAGCGCGGCGGCACCTCCCTCACCCAAGGGGCCGCGGCGAC"
## [4] "GGTCACGGGGCGCGGCGCCACCGTGAGCGACCCAGGCCAGGATTCTAAATAGACGGCCCAGGCTCCTCCT"
## [5] "CCGCCCGGGCCGCCTCACCTGCGGGCATTGCCGCGCCGCCTCCGCCGGTGTAGACGGCACCTGCGCCGCC"
## [6] "TTGCTCGCGGGTCTCCGCCCCTCGCCCACCCTCACTGCGCCAGGCCCAGGCAGCTCACCTGTACTGGCGC"
## [1] 680
tail(ahr_arr)
## [1] "TGGGATTTATTTCTAGATGATGTGCACATCTAAGGATATGGATGTGTCTAATTTAGTCTTTTCCTGTACC"
## [2] "AGGTTTTTCTTACAATACCTGAAGACTTACCAGTATTCTAGTGTATTATGAAGCTTTCAACATTACTATG"
## [3] "CACAAACTAGTGTTTTTCGATGTTACTAAATTTTAGGTAAATGCTTTCATGGCTTTTTTCTTCAAAATGT"
## [4] "TACTGCTTACATATATCATGCATAGATTTTTGCTTAAAGTATGATTTATAATATCCTCATTATCAAAGTT"
## [5] "GTATACAATAATATATAATAAAATAACAAATATGAA"                                  
## [6] ""
nchar(ahr_arr[1]); nchar(ahr_arr[679])
## [1] 70
## [1] 36
ahr_arr <- ahr_arr[1:2]
ahrseq <- paste(ahr_arr, collapse="")
class(ahrseq);length(ahrseq);nchar(ahrseq);substr(ahrseq, 1, 5)
## [1] "character"
## [1] 1
## [1] 140
## [1] "AGTGG"
ahrseq
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGCAGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
strsplit(ahrseq, split=NULL)
## [[1]]
##   [1] "A" "G" "T" "G" "G" "C" "T" "G" "G" "G" "G" "A" "G" "T" "C" "C" "C" "G"
##  [19] "T" "C" "G" "A" "C" "G" "C" "T" "C" "T" "G" "T" "T" "C" "C" "G" "A" "G"
##  [37] "A" "G" "C" "G" "T" "G" "C" "C" "C" "C" "G" "G" "A" "C" "C" "G" "C" "C"
##  [55] "A" "G" "C" "T" "C" "A" "G" "A" "A" "C" "A" "G" "G" "G" "G" "C" "A" "G"
##  [73] "C" "C" "G" "T" "G" "T" "A" "G" "C" "C" "G" "A" "A" "C" "G" "G" "A" "A"
##  [91] "G" "C" "T" "G" "G" "G" "A" "G" "C" "A" "G" "C" "C" "G" "G" "G" "A" "C"
## [109] "T" "G" "G" "T" "G" "G" "C" "C" "C" "G" "C" "G" "C" "C" "C" "G" "A" "G"
## [127] "C" "T" "C" "C" "G" "C" "A" "G" "G" "C" "G" "G" "G" "A"
#####################################################Base complement
myFun <- function(invec) {
  require(stringi)
  invec <- stri_reverse(invec)
  chartr(old = "AaTtGgCc", new = "TTAACCGG", invec)
}

x <- "ATCG"
myFun(x) #"CGAT"
## Loading required package: stringi
## [1] "CGAT"
####################################################
#Next, we want to extract all (overlapping) substrings of the length of the motif (??len??) in our sequence
len <- 6
leftlims <- 1:(nchar(ahrseq) - (len - 1))
rightlims <- len:nchar(ahrseq)
leftlims;rightlims;ahrseq
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135
##   [1]   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23
##  [19]  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41
##  [37]  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
##  [55]  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
##  [73]  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##  [91]  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## [109] 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
## [127] 132 133 134 135 136 137 138 139 140
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGCAGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
ahrsubstrings <- mapply(substr, ahrseq, leftlims, rightlims,
                          USE.NAMES=FALSE)
ahrsubstrings
##   [1] "AGTGGC" "GTGGCT" "TGGCTG" "GGCTGG" "GCTGGG" "CTGGGG" "TGGGGA" "GGGGAG"
##   [9] "GGGAGT" "GGAGTC" "GAGTCC" "AGTCCC" "GTCCCG" "TCCCGT" "CCCGTC" "CCGTCG"
##  [17] "CGTCGA" "GTCGAC" "TCGACG" "CGACGC" "GACGCT" "ACGCTC" "CGCTCT" "GCTCTG"
##  [25] "CTCTGT" "TCTGTT" "CTGTTC" "TGTTCC" "GTTCCG" "TTCCGA" "TCCGAG" "CCGAGA"
##  [33] "CGAGAG" "GAGAGC" "AGAGCG" "GAGCGT" "AGCGTG" "GCGTGC" "CGTGCC" "GTGCCC"
##  [41] "TGCCCC" "GCCCCG" "CCCCGG" "CCCGGA" "CCGGAC" "CGGACC" "GGACCG" "GACCGC"
##  [49] "ACCGCC" "CCGCCA" "CGCCAG" "GCCAGC" "CCAGCT" "CAGCTC" "AGCTCA" "GCTCAG"
##  [57] "CTCAGA" "TCAGAA" "CAGAAC" "AGAACA" "GAACAG" "AACAGG" "ACAGGG" "CAGGGG"
##  [65] "AGGGGC" "GGGGCA" "GGGCAG" "GGCAGC" "GCAGCC" "CAGCCG" "AGCCGT" "GCCGTG"
##  [73] "CCGTGT" "CGTGTA" "GTGTAG" "TGTAGC" "GTAGCC" "TAGCCG" "AGCCGA" "GCCGAA"
##  [81] "CCGAAC" "CGAACG" "GAACGG" "AACGGA" "ACGGAA" "CGGAAG" "GGAAGC" "GAAGCT"
##  [89] "AAGCTG" "AGCTGG" "GCTGGG" "CTGGGA" "TGGGAG" "GGGAGC" "GGAGCA" "GAGCAG"
##  [97] "AGCAGC" "GCAGCC" "CAGCCG" "AGCCGG" "GCCGGG" "CCGGGA" "CGGGAC" "GGGACT"
## [105] "GGACTG" "GACTGG" "ACTGGT" "CTGGTG" "TGGTGG" "GGTGGC" "GTGGCC" "TGGCCC"
## [113] "GGCCCG" "GCCCGC" "CCCGCG" "CCGCGC" "CGCGCC" "GCGCCC" "CGCCCG" "GCCCGA"
## [121] "CCCGAG" "CCGAGC" "CGAGCT" "GAGCTC" "AGCTCC" "GCTCCG" "CTCCGC" "TCCGCA"
## [129] "CCGCAG" "CGCAGG" "GCAGGC" "CAGGCG" "AGGCGG" "GGCGGG" "GCGGGA"
####check whether a fragment is a hit
hitcounter <- function(fragment, themotif) {ifelse (fragment==themotif, 1, 0)}
seq_1 <- "GCTCCG"
hitcounter(ahrsubstrings, seq_1)
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
###############################wrap everything up
counthits <- function(sequence, motif) {
  compmotif <- myFun(motif)
  len <- nchar(motif)
  leftlim <- 1:(nchar(sequence) - (len - 1))
  rightlim <- len:nchar(sequence)
  frags <- mapply(substr, sequence, leftlim, rightlim, USE.NAMES=FALSE)
  scores <- ifelse(frags==motif|frags==compmotif, 1, 0)
  return(sum(scores))
}
counthits(ahrseq, seq_1) 
## [1] 1
#REF https://stackoverflow.com/questions/28934185/reverse-complementary-base