rm(list = ls())
################# ##############input data 
dir_path <- "C:\\Users\\liyix\\Downloads\\"
dir_path_name <- list.files(pattern = ".*fna",dir_path,full.names = T, recursive = T)
dir_path_name
## [1] "C:\\Users\\liyix\\Downloads\\viral.1.1.genomic.fna.gz"
## [2] "C:\\Users\\liyix\\Downloads\\viral.2.1.genomic.fna.gz"
## [3] "C:\\Users\\liyix\\Downloads\\viral.3.1.genomic.fna.gz"
#################################
library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
dna_1 <- readDNAStringSet(grep("viral.1.1.genomic.fna.gz",dir_path_name,value = T))
dna_2 <- readDNAStringSet(grep("viral.2.1.genomic.fna.gz",dir_path_name,value = T))
dna_3 <- readDNAStringSet(grep("viral.3.1.genomic.fna.gz",dir_path_name,value = T))
length(dna_1);length(dna_2);length(dna_3)
## [1] 9142
## [1] 2634
## [1] 2944
names(dna_1)[1:2]
## [1] "NC_001798.2 Human herpesvirus 2 strain HG52, complete genome"
## [2] "NC_030692.1 Borna disease virus 2, complete genome"
name_all <- c(dna_1@ranges@NAMES, dna_2@ranges@NAMES, dna_3@ranges@NAMES)
length(name_all) #[1] 14720
## [1] 14720
length(unique(name_all)) #[1] 14720
## [1] 14720
dna_all <- c(dna_1, dna_2, dna_3)
head(dna_all)
## DNAStringSet object of length 6:
##      width seq                                              names               
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
## [2]   8908 TGTTGCGTTAACAACAAATCAAT...GTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3]   8957 CAGCTCTCGCACCAAAACCCAGT...GATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4]   9006 TGTTGCGTTAACAACAAACCAAC...CGTTGTGGTTTTGTTGTAGCGC NC_029642.1 Aquat...
## [5]   8910 GTTGCGTTAACAACAAACCACTC...TGTTGCTTTGTTGTAGCGCGTT NC_001607.1 Borna...
## [6]   5386 GAGTTTTATCGCTTCCATGACGC...TGATTGGCGTATCCAACCTGCA NC_001422.1 Colip...
length(dna_all@ranges@NAMES) #[1] 14720
## [1] 14720
################################################
head(name_all)
## [1] "NC_001798.2 Human herpesvirus 2 strain HG52, complete genome"
## [2] "NC_030692.1 Borna disease virus 2, complete genome"          
## [3] "NC_027892.1 Canary bornavirus 2, complete cds"               
## [4] "NC_029642.1 Aquatic bird bornavirus 1, complete genome"      
## [5] "NC_001607.1 Borna disease virus 1, complete sequence"        
## [6] "NC_001422.1 Coliphage phi-X174, complete genome"
name_all[1]
## [1] "NC_001798.2 Human herpesvirus 2 strain HG52, complete genome"
name_all <- data.frame(name_all)
head(name_all)
##                                                       name_all
## 1 NC_001798.2 Human herpesvirus 2 strain HG52, complete genome
## 2           NC_030692.1 Borna disease virus 2, complete genome
## 3                NC_027892.1 Canary bornavirus 2, complete cds
## 4       NC_029642.1 Aquatic bird bornavirus 1, complete genome
## 5         NC_001607.1 Borna disease virus 1, complete sequence
## 6              NC_001422.1 Coliphage phi-X174, complete genome
#Use gsub remove all string before first white space in R
name_all$virus <- sub(".*? ","", name_all$name_all)
name_all$nc <- gsub(" .*","", name_all$name_all)
head(name_all)
##                                                       name_all
## 1 NC_001798.2 Human herpesvirus 2 strain HG52, complete genome
## 2           NC_030692.1 Borna disease virus 2, complete genome
## 3                NC_027892.1 Canary bornavirus 2, complete cds
## 4       NC_029642.1 Aquatic bird bornavirus 1, complete genome
## 5         NC_001607.1 Borna disease virus 1, complete sequence
## 6              NC_001422.1 Coliphage phi-X174, complete genome
##                                              virus          nc
## 1 Human herpesvirus 2 strain HG52, complete genome NC_001798.2
## 2           Borna disease virus 2, complete genome NC_030692.1
## 3                Canary bornavirus 2, complete cds NC_027892.1
## 4       Aquatic bird bornavirus 1, complete genome NC_029642.1
## 5         Borna disease virus 1, complete sequence NC_001607.1
## 6              Coliphage phi-X174, complete genome NC_001422.1
write.csv(name_all, paste0(dir_path,Sys.Date(),"-","all_genome_virus_name.csv"),row.names = FALSE)
dna_all[grep("NC_045512.2",dna_all@ranges@NAMES),]
## DNAStringSet object of length 1:
##     width seq                                               names               
## [1] 29903 ATTAAAGGTTTATACCTTCCCAG...AAAAAAAAAAAAAAAAAAAAAAA NC_045512.2 Sever...
dna_all[1:3]
## DNAStringSet object of length 3:
##      width seq                                              names               
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
## [2]   8908 TGTTGCGTTAACAACAAATCAAT...GTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3]   8957 CAGCTCTCGCACCAAAACCCAGT...GATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
#?writeXStringSet()
writeXStringSet(dna_all, filepath = paste0(dir_path, Sys.Date(),"-all_virus_seq.fasta"), format = "fasta")
head(dna_1)
## DNAStringSet object of length 6:
##      width seq                                              names               
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
## [2]   8908 TGTTGCGTTAACAACAAATCAAT...GTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3]   8957 CAGCTCTCGCACCAAAACCCAGT...GATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4]   9006 TGTTGCGTTAACAACAAACCAAC...CGTTGTGGTTTTGTTGTAGCGC NC_029642.1 Aquat...
## [5]   8910 GTTGCGTTAACAACAAACCACTC...TGTTGCTTTGTTGTAGCGCGTT NC_001607.1 Borna...
## [6]   5386 GAGTTTTATCGCTTCCATGACGC...TGATTGGCGTATCCAACCTGCA NC_001422.1 Colip...
dim(dna_1)
## NULL
dna_1@pool
## SharedRaw_Pool of length 1
## 1: SharedRaw of length 81948929 (data starting at address 00007FF4F8BD0040)
dna_1@ranges@NAMES[1:3]
## [1] "NC_001798.2 Human herpesvirus 2 strain HG52, complete genome"
## [2] "NC_030692.1 Borna disease virus 2, complete genome"          
## [3] "NC_027892.1 Canary bornavirus 2, complete cds"
dna_1[1]
## DNAStringSet object of length 1:
##      width seq                                              names               
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
dna_1@elementType
## [1] "DNAString"
dna_1@elementMetadata
## NULL
dna_1@metadata
## list()