rm(list = ls())
###############################input data
dir_path <- "/Users/xut2/Desktop/ahr/"
dir_path_name <- list.files(pattern = ".*fasta",dir_path,full.names = T, recursive = T)
#dir_path_name
rawseq <- readLines(grep("AHR",dir_path_name,value = T))
head(rawseq); length(rawseq) #681
## [1] ">NC_000007.14:17298652-17346147 Homo sapiens chromosome 7, GRCh38.p14 Primary Assembly"
## [2] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"
## [3] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
## [4] "AGCACCCTGGATTTAGGAAGTCCCGGGAGCAGCGCGGCGGCACCTCCCTCACCCAAGGGGCCGCGGCGAC"
## [5] "GGTCACGGGGCGCGGCGCCACCGTGAGCGACCCAGGCCAGGATTCTAAATAGACGGCCCAGGCTCCTCCT"
## [6] "CCGCCCGGGCCGCCTCACCTGCGGGCATTGCCGCGCCGCCTCCGCCGGTGTAGACGGCACCTGCGCCGCC"
## [1] 681
ahr_arr <- rawseq[2:length(rawseq)]
#head(ahr_arr);length(ahr_arr) ###680 rows
#tail(ahr_arr)
#nchar(ahr_arr[1]); nchar(ahr_arr[679])
ahr_arr <- ahr_arr[1:2]
ahr_arr
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGC"
## [2] "AGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
ahrseq <- paste(ahr_arr, collapse="")
#class(ahrseq);length(ahrseq);nchar(ahrseq);substr(ahrseq, 1, 5)
ahrseq
## [1] "AGTGGCTGGGGAGTCCCGTCGACGCTCTGTTCCGAGAGCGTGCCCCGGACCGCCAGCTCAGAACAGGGGCAGCCGTGTAGCCGAACGGAAGCTGGGAGCAGCCGGGACTGGTGGCCCGCGCCCGAGCTCCGCAGGCGGGA"
strsplit(ahrseq, split=NULL)
## [[1]]
## [1] "A" "G" "T" "G" "G" "C" "T" "G" "G" "G" "G" "A" "G" "T" "C" "C" "C" "G"
## [19] "T" "C" "G" "A" "C" "G" "C" "T" "C" "T" "G" "T" "T" "C" "C" "G" "A" "G"
## [37] "A" "G" "C" "G" "T" "G" "C" "C" "C" "C" "G" "G" "A" "C" "C" "G" "C" "C"
## [55] "A" "G" "C" "T" "C" "A" "G" "A" "A" "C" "A" "G" "G" "G" "G" "C" "A" "G"
## [73] "C" "C" "G" "T" "G" "T" "A" "G" "C" "C" "G" "A" "A" "C" "G" "G" "A" "A"
## [91] "G" "C" "T" "G" "G" "G" "A" "G" "C" "A" "G" "C" "C" "G" "G" "G" "A" "C"
## [109] "T" "G" "G" "T" "G" "G" "C" "C" "C" "G" "C" "G" "C" "C" "C" "G" "A" "G"
## [127] "C" "T" "C" "C" "G" "C" "A" "G" "G" "C" "G" "G" "G" "A"
####################################################
#Next, we want to extract all (overlapping) substrings of the length of the motif (??len??) in our sequence
len <- 6
leftlims <- 1:(nchar(ahrseq) - (len - 1))
rightlims <- len:nchar(ahrseq)
#leftlims;rightlims;ahrseq
ahrsubstrings <- mapply(substr, ahrseq, leftlims, rightlims,
USE.NAMES=FALSE)
ahrsubstrings #140-6+1 = 135
## [1] "AGTGGC" "GTGGCT" "TGGCTG" "GGCTGG" "GCTGGG" "CTGGGG" "TGGGGA" "GGGGAG"
## [9] "GGGAGT" "GGAGTC" "GAGTCC" "AGTCCC" "GTCCCG" "TCCCGT" "CCCGTC" "CCGTCG"
## [17] "CGTCGA" "GTCGAC" "TCGACG" "CGACGC" "GACGCT" "ACGCTC" "CGCTCT" "GCTCTG"
## [25] "CTCTGT" "TCTGTT" "CTGTTC" "TGTTCC" "GTTCCG" "TTCCGA" "TCCGAG" "CCGAGA"
## [33] "CGAGAG" "GAGAGC" "AGAGCG" "GAGCGT" "AGCGTG" "GCGTGC" "CGTGCC" "GTGCCC"
## [41] "TGCCCC" "GCCCCG" "CCCCGG" "CCCGGA" "CCGGAC" "CGGACC" "GGACCG" "GACCGC"
## [49] "ACCGCC" "CCGCCA" "CGCCAG" "GCCAGC" "CCAGCT" "CAGCTC" "AGCTCA" "GCTCAG"
## [57] "CTCAGA" "TCAGAA" "CAGAAC" "AGAACA" "GAACAG" "AACAGG" "ACAGGG" "CAGGGG"
## [65] "AGGGGC" "GGGGCA" "GGGCAG" "GGCAGC" "GCAGCC" "CAGCCG" "AGCCGT" "GCCGTG"
## [73] "CCGTGT" "CGTGTA" "GTGTAG" "TGTAGC" "GTAGCC" "TAGCCG" "AGCCGA" "GCCGAA"
## [81] "CCGAAC" "CGAACG" "GAACGG" "AACGGA" "ACGGAA" "CGGAAG" "GGAAGC" "GAAGCT"
## [89] "AAGCTG" "AGCTGG" "GCTGGG" "CTGGGA" "TGGGAG" "GGGAGC" "GGAGCA" "GAGCAG"
## [97] "AGCAGC" "GCAGCC" "CAGCCG" "AGCCGG" "GCCGGG" "CCGGGA" "CGGGAC" "GGGACT"
## [105] "GGACTG" "GACTGG" "ACTGGT" "CTGGTG" "TGGTGG" "GGTGGC" "GTGGCC" "TGGCCC"
## [113] "GGCCCG" "GCCCGC" "CCCGCG" "CCGCGC" "CGCGCC" "GCGCCC" "CGCCCG" "GCCCGA"
## [121] "CCCGAG" "CCGAGC" "CGAGCT" "GAGCTC" "AGCTCC" "GCTCCG" "CTCCGC" "TCCGCA"
## [129] "CCGCAG" "CGCAGG" "GCAGGC" "CAGGCG" "AGGCGG" "GGCGGG" "GCGGGA"
data_1 <- data.frame(ahrsubstrings)
head(data_1);dim(data_1) #[1] 135 1
## ahrsubstrings
## 1 AGTGGC
## 2 GTGGCT
## 3 TGGCTG
## 4 GGCTGG
## 5 GCTGGG
## 6 CTGGGG
## [1] 135 1
write.table(data_1, paste0(dir_path,Sys.Date(),"-","word.txt"),row.names = FALSE,na = "")
####################################________word2vec
library(word2vec)
set.seed(2022)
model <- word2vec(x = unique(data_1$ahrsubstrings), type = "skip-gram",
dim = 5, iter = 20, min_count = 1) #
#model
embedding <- as.matrix(model)
#embedding
dim(embedding);head(embedding) #[1] 132 5
## [1] 133 5
## [,1] [,2] [,3] [,4] [,5]
## CGCAGG -1.42758203 1.0332497 0.05486977 0.3609640 1.3270642
## </s> 1.16630876 0.5435506 -1.40140951 0.1616713 -1.1636968
## GGGAGT -0.05267066 -1.1165233 -1.52184999 -0.5042231 1.0864314
## GCAGGC -1.57140899 -1.0731850 -1.12068594 0.0611884 0.3453507
## GGCAGC -1.73438978 -0.8986887 0.40715396 0.9391397 0.3694496
## CGAGCT -1.40603161 1.1261673 0.16232981 0.9921160 0.8626572
embedding <- data.frame(embedding)
#View(embedding)
sum(rownames(embedding) %in% unique(data_1$ahrsubstrings))
## [1] 132
rownames(embedding)[!rownames(embedding) %in% unique(data_1$ahrsubstrings)] #"</s>"
## [1] "</s>"
length(unique(data_1$ahrsubstrings)) #132
## [1] 132
embedding <- embedding[rownames(embedding) != "</s>", ]
embedding_mean <- t(data.frame(apply(embedding, 2, mean)))
#mean(embedding$X1)
class(embedding_mean)
## [1] "matrix" "array"
rownames(embedding_mean)<- colnames(data_1)
embedding_mean
## X1 X2 X3 X4 X5
## ahrsubstrings -0.03797262 -0.126068 0.0126564 0.06973255 0.1449661
write.csv(embedding_mean, paste0(dir_path,Sys.Date(),"-","word2vec.csv"),row.names = FALSE,na = "")
####################################################fasttext
# 'skipgram' function with n-gram enabled
library(fastText)
data_1 <- read.delim(paste0(dir_path,Sys.Date(),"-","word.txt"),header = F,stringsAsFactors = F)
data_1 <- unique(data_1)
head(data_1);class(data_1);dim(data_1)
## V1
## 1 ahrsubstrings
## 2 AGTGGC
## 3 GTGGCT
## 4 TGGCTG
## 5 GGCTGG
## 6 GCTGGG
## [1] "data.frame"
## [1] 133 1
list_params = list(command = 'skipgram',
lr = 0.05,
dim = 5,
input = paste0(dir_path,Sys.Date(),"-","word.txt"),
output = paste0(dir_path,Sys.Date(),"_", 'fastetxt_dna'),
thread = 1,
minCount = 1,
minn = 1,
maxn = 10)
res = fasttext_interface(list_params,
path_output = paste0(dir_path,Sys.Date(),"_", 'skipgram_logs.txt'),
MilliSecs = 5)
##
Read 0M words
## Number of words: 134
## Number of labels: 0
##
Progress: 105.0% words/sec/thread: 1436 lr: -0.002500 loss: 2.768705 ETA: 0h 0m
Progress: 100.0% words/sec/thread: 1436 lr: 0.000000 loss: 2.768705 ETA: 0h 0m
#?fasttext_interface
output <- read.table(paste0(dir_path,Sys.Date(),"_", 'fastetxt_dna.vec'),
header =T,stringsAsFactors = F, skip =1, quote = " ")
dim(output); head(output)
## [1] 133 6
## X..s. X.0.16599 X0.15665 X.0.12413 X.0.040804 X0.097398
## 1 "CAGCCG" -0.0228120 -0.0141460 0.0146650 0.0095979 -0.00035153
## 2 "GCAGCC" -0.0332710 -0.0117580 0.0195630 -0.0142900 -0.00615280
## 3 "GCTGGG" -0.0035998 0.0068568 0.0015307 -0.0290690 0.03709000
## 4 "TGGCTG" -0.0086298 -0.0120140 0.0114370 -0.0027087 0.03120800
## 5 "GGCTGG" -0.0021057 0.0017799 0.0042539 -0.0396550 0.02451600
## 6 "GTGGCT" -0.0107030 0.0265000 0.0065868 -0.0161210 0.02471100
colnames(output) <- paste0("X", 1:ncol(output))
output <- output[output$X1 != "\"ahrsubstrings\"",] #!!!
dim(output)
## [1] 132 6
apply(output[,-1], 2, mean)
## X2 X3 X4 X5 X6
## -0.010985285 -0.008698564 0.011960120 -0.019157586 0.012389787
#################################