My_Quarto
Step by step documentation of Flowering Pipeline analysis.
Running Code
Step by step scripts to produce the requred files for hmm domain search for flowerings genes in legumes:
##STEPS
## STEP 1
#Downloaded all gene data from the CFG database for each species (protein sequences marked as *_ref.fasta).
#Used the Python script "CFG_fetch_fastas.py" to generate FASTA files for each protein, then ran OrthoFinder with existing datasets from chickpea and other legumes.
#Each CFG species FASTA was renamed as *_ref.fasta and stored in HPC at:
#/isilon/swarupsir_user/majnu_bhai/flowering_data_udita/legumes/Legumes_AA_NUC
#- Total dataset: 112 species with 82,104 proteins
#- My dataset: 36 species with 167253 flowering proteins
#OrthoFinder outputs were stored at:
#~/Downloads/Sep23_long_work/01_long_work_data/Orthogroups
#Key output files:
#- Orthogroups.GeneCount.tsv
#- Orthogroups.tsv (28,636 OGs)
#- Orthogroups_SingleCopyOrthologues.txt
#- Orthogroups_UnassignedGenes.tsv
##
# from collections import defaultdict
# import os
#
# # Input files
# species_file = "Species112.txt"
# mapping_file = "Species_protein.tsv"
# fasta_file = "protein.fa"
# output_dir = "species_fastas"
#
# # Create output directory
# os.makedirs(output_dir, exist_ok=True)
#
# # Step 1: load species list
# with open(species_file) as f:
# species_list = [line.strip() for line in f if line.strip()]
#
# # Step 2: load mapping file (Species ↔ ProteinID)
# protein_to_species = {}
# with open(mapping_file) as f:
# header = f.readline() # skip header
# for line in f:
# if not line.strip():
# continue
# parts = line.strip().split("\t")
# if len(parts) < 2:
# continue
# sp, prot = parts[0], parts[1]
# protein_to_species[prot] = sp
#
# # Step 3: open FASTA and collect sequences by species
# species_fastas = defaultdict(list)
#
# with open(fasta_file) as f:
# seq_id = None
# seq_lines = []
# for line in f:
# if line.startswith(">"): # new sequence
# if seq_id:
# if seq_id in protein_to_species:
# sp = protein_to_species[seq_id]
# species_fastas[sp].append((seq_id, "".join(seq_lines)))
# seq_id = line[1:].strip()
# seq_lines = []
# else:
# seq_lines.append(line.strip())
# if seq_id and seq_id in protein_to_species:
# sp = protein_to_species[seq_id]
# species_fastas[sp].append((seq_id, "".join(seq_lines)))
#
# # Step 4: write species-specific FASTA files
# for sp, seqs in species_fastas.items():
# if sp not in species_list:
# continue # only keep species in Species112.txt
# parts = sp.split()
# abbr = f"{parts[0][0]}. {' '.join(parts[1:])}"
# fasta_name = f"{abbr}_ref.fasta".replace(" ", "_")
# out_path = os.path.join(output_dir, fasta_name)
#
# with open(out_path, "w") as out:
# for prot_id, seq in seqs:
# out.write(f">{prot_id}\n{seq}\n")
#
# print(f"✅ {sp} → {out_path} ({len(seqs)} proteins)")
## STEP 2
#I analyzed the orthofinder results and filtered the orthogroups (OGs) associated with #flowering genes for further downstream processing.Next, I generated separate HMM profiles for each flowering gene family and then merged them into a single HMM profile database.
#The individual HMMs were stored in: /home/mustafa/Downloads/Flowering_data_CFG##/01_flowering_data/02_HMMSCAN_inputs/custom_DB/aligned
#The merged main HMM database was stored in: /home/mustafa/Downloads/Flowering_data_CFG/01_flowering_data/02_HMMSCAN_inputs/custom_DB
##Script:01_matched_OG3_cfg_5887.sh
#!/bin/bash
# ID_FILE="cfg_DB.txt" ##cfg data with 5887 protein ids
# ORTHO_FILE="Orthogroups.txt"
# OUTPUT_FILE="01_matched_hmm5887_OGs.txt" ##fetch 5887 genes OGs contains 348 OGS in output, but other flowering genes also present
# so i will drop in the next script using keep_flw_genes_ids.py and the file name is 04_hmm_OGs_req.txt
#
# awk '
# BEGIN { FS="[: ]+"; OFS="\t" } # split by colon or spaces
# NR==FNR {
# ids[$1] = 1
# next
# }
# {
# og = $1 # orthogroup ID before colon
# matched = 0
# for (i = 2; i <= NF; i++) {
# if ($i in ids) {
# matched = 1
# break
# }
# }
# if (matched) {
# print og, $0
# }
# }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
#
# echo "✅ Done! Matches written to: $OUTPUT_FILE"
##Script:02_matched_OG3_CFG_82104.sh
#!/bin/bash
ID_FILE="id_list_flower.txt"
ORTHO_FILE="Orthogroups.txt"
OUTPUT_FILE="02_matched_All_flower_82104_OGs.txt" ## i fetchout here flower OGs(748) for 82104 genes of CFG data
# awk '
# BEGIN { FS="[: ]+"; OFS="\t" } # split by colon or spaces
# NR==FNR {
# ids[$1] = 1
# next
# }
# {
# og = $1 # orthogroup ID before colon
# matched = 0
# for (i = 2; i <= NF; i++) {
# if ($i in ids) {
# matched = 1
# break
# }
# }
# if (matched) {
# print og, $0
# }
# }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
#
# echo "✅ Done! Matches written to: $OUTPUT_FILE"
#
##Script:03_matched_OG3_legumes_1397779.sh
#!/bin/bash
# ID_FILE="legumes_ID.txt"
# ORTHO_FILE="02_matched_All_flower_82104_OGs.txt"
# OUTPUT_FILE="03_matched_All_flower_OGs.txt" ## i fetchout here legumes OGs(526) for 1397779 genes of my data (legumes)
# #then run this script (03_keep_legumes_genes_ids.py) to exclude other species protein id and keep only mydata legumes flowering genes 167253 proteins in 526 OG's
# #03_matched_All_flower_OGs2.txt
#
# awk '
# BEGIN { FS="[: ]+"; OFS="\t" } # split by colon or spaces
# NR==FNR {
# ids[$1] = 1
# next
# }
# {
# og = $1 # orthogroup ID before colon
# matched = 0
# for (i = 2; i <= NF; i++) {
# if ($i in ids) {
# matched = 1
# break
# }
# }
# if (matched) {
# print og, $0
# }
# }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
#
# echo "✅ Done! Matches written to: $OUTPUT_FILE"
##Script4:03_keep_legumes_genes_ids.py
# Files
# id_file = "legumes_ID.txt"
# input_file = "03_matched_All_flower_1397779_OGs.txt"
# output_file = "03_matched_All_flower_1397779_OGs2.txt" ## This will contains only 9 legumes ids 167253 genes ids
#
# # Load IDs to KEEP
# with open(id_file) as f:
# keep_ids = set(line.strip() for line in f if line.strip())
#
# with open(input_file) as fin, open(output_file, "w") as fout:
# for line in fin:
# parts = line.strip().split()
# if not parts:
# continue
# og = parts[0] # OG ID
# ids = parts[1:] # member IDs
# # Keep only IDs that are in cfg_DB.txt
# filtered_ids = [i for i in ids if i in keep_ids]
# if filtered_ids: # only save OG if it still has members
# fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
#
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")
#
##Script5:04_keep_flw_genes_ids.py
# Files
# id_file = "cfg_DB.txt"
# input_file = "01_matched_hmm5887_OGs.txt"
# output_file = "04_hmm_OGs_req.txt" ## This will contains only 9 legumes + A. thaliana and Oryza sativa ids for hmm
#
# # Load IDs to KEEP
# with open(id_file) as f:
# keep_ids = set(line.strip() for line in f if line.strip())
#
# with open(input_file) as fin, open(output_file, "w") as fout:
# for line in fin:
# parts = line.strip().split()
# if not parts:
# continue
# og = parts[0] # OG ID
# ids = parts[1:] # member IDs
# # Keep only IDs that are in cfg_DB.txt
# filtered_ids = [i for i in ids if i in keep_ids]
# if filtered_ids: # only save OG if it still has members
# fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
#
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")
##Script6:05_og_fetch_for_hmm.py
# id_file = "cfg_DB.txt"
# input_file = "01_matched_hmm5887_OGs.txt"
# output_file = "04_hmm_OGs_req.txt" ## This will contains only 9 legumes + A. thaliana and Oryza sativa ids for hmm
#
# # Load IDs to KEEP
# with open(id_file) as f:
# keep_ids = set(line.strip() for line in f if line.strip())
#
# with open(input_file) as fin, open(output_file, "w") as fout:
# for line in fin:
# parts = line.strip().split()
# if not parts:
# continue
# og = parts[0] # OG ID
# ids = parts[1:] # member IDs
# # Keep only IDs that are in cfg_DB.txt
# filtered_ids = [i for i in ids if i in keep_ids]
# if filtered_ids: # only save OG if it still has members
# fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
#
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")
## STEP 4
# 1. Run HMM search:
#
# **hmmscan --cpu 48 --acc -E 1e-5 --tblout pfam_scan.tblout --domtblout pfam_scan.domtblout custom_DB/custom_DB.hmm myData_proteins_HMM.faa > pfam_scan_myDATA.stdout**
#
# 2. Process HMMER results in R:
# - Use the hmmer2.R script to filter and analyze the output.
#
# 3. Run DIAMOND BLASTP:
# - Use the legume dataset (5,887 sequences) as the database.
# - Query with protein IDs from HMMER results (36 species).
# - Process BLAST results using the same hmmer2.R script.
#
# 4. Annotate with NR protein database:
# - Run BLASTP against NR using the same protein IDs.
# - Collect hit and annotation data.
#
# *Note: All filtering and processing parameters are defined within the R script.
##load the packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##path
setwd("/Users/Tina/Downloads/Sep24_OG_filter/")
##here I define the colnames of hmmer results
colnames_hmmer <- c(
"target_name", "target_accession", "tlen",
"query_name", "query_accession", "qlen",
"fullseq_Evalue", "fullseq_score", "fullseq_bias",
"domain_num", "domain_of",
"c_Evalue", "i_Evalue", "domain_score", "domain_bias",
"hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to",
"acc", "description"
)
##read hmmer hits table
hmmer <- read_table2("~/Downloads/Sep24_OG_filter/05_og_fetch_for_hmm_results/og_fastas/pfam_scan.domtblout",
comment = "#",
col_names = colnames_hmmer
)
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## target_name = col_character(),
## target_accession = col_character(),
## query_name = col_character(),
## query_accession = col_character(),
## description = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
class(hmmer)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
hmmer <- as.data.frame(hmmer) %>% tibble::as_tibble()
hmmer <- hmmer %>%
mutate(
fullseq_Evalue = as.numeric(fullseq_Evalue),
fullseq_score = as.numeric(fullseq_score)
)
######################################################################
##isoform single ids from legumes data
isoform_single<-read.table("~/Downloads/Sep24_OG_filter/single_isoform_id.txt",header = T)
##merge with hmm data
hmmer <- merge(hmmer, isoform_single, by = "query_name", all = TRUE)
##remove isofrom i.e not available
hmmer<-hmmer[complete.cases(hmmer), ]
##select the best hit based on c_value domains(86167)
hmmer <- hmmer %>%
group_by(query_name) %>%
slice_min(order_by = c_Evalue,n=1,with_ties = FALSE) %>%
ungroup()
hmmer <-hmmer %>% filter(c_Evalue <=1e-5 )
##now load the all protein blast tsv 5887 flowering genes here
myData_proteins_HMM_5887<-read.table("~/Downloads/Sep24_OG_filter/myData_proteins_HMM_5887.tsv",header = T)
#calculate the query coverage
myData_proteins_HMM_5887<-myData_proteins_HMM_5887 %>%
mutate(query_coverage = ((qend - qstart + 1) / length) * 100)
#calculate the best hit total protein was 167253 and i got hits 108304
myData_proteins_HMM_5887 <- myData_proteins_HMM_5887 %>%
group_by(qseqid) %>%
slice_min(order_by = evalue,n=1,with_ties = FALSE) %>%
ungroup()
myData_proteins_HMM_5887<-myData_proteins_HMM_5887 %>% mutate(query_name = qseqid)
##merge the blast hits in hmmer df
hmmer <- merge(hmmer,myData_proteins_HMM_5887 , by = "query_name", all = TRUE)
#hmmer <- hmmer[!duplicated(hmmer$query_name), ]
hmmer2 <- hmmer %>%
group_by(query_name) %>%
slice_min(order_by = c_Evalue,n=1,with_ties = FALSE) %>%
ungroup()
hmmer2<-hmmer2[complete.cases(hmmer2), ]
################################################################################
# List of unique species from your column
species_list <- unique(hmmer2$Species_level)
# Function to apply your pipeline for one species
filter_species <- function(species_name) {
hmmer2 %>%
filter(Species_level == species_name) %>%
filter(query_coverage >= 70) %>%
group_by(stitle) %>%
slice_min(order_by = evalue, n = 1, with_ties = FALSE) %>%
ungroup() %>%
filter(!is.na(Pident) & Pident >= 55)
}
# Apply the function to each species and store in a named list
species_results <- lapply(species_list, filter_species)
names(species_results) <- species_list
# Now you can access like:
Glycine_max <- species_results[["Glycine_max"]]
species_results[["C_reticulatum"]]
## # A tibble: 729 × 39
## query_name target_name target_accession tlen query_accession qlen
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 ret_00995.1 OG0000087_align - 221 - 291
## 2 ret_01480.1 OG0000154_align - 1125 - 141
## 3 ret_05571.1 OG0000255_align - 987 - 1234
## 4 ret_26161.1 OG0001087_align - 161 - 161
## 5 ret_26185.1 OG0000573_align - 273 - 269
## 6 ret_05427.1 OG0000040_align - 536 - 553
## 7 ret_03725.1 OG0000169_align - 666 - 638
## 8 ret_06361.1 OG0000067_align - 186 - 169
## 9 ret_01025.1 OG0000088_align - 551 - 1038
## 10 ret_12124.1 OG0000111_align - 303 - 303
## # ℹ 719 more rows
## # ℹ 33 more variables: fullseq_Evalue <dbl>, fullseq_score <dbl>,
## # fullseq_bias <dbl>, domain_num <dbl>, domain_of <dbl>, c_Evalue <dbl>,
## # i_Evalue <dbl>, domain_score <dbl>, domain_bias <dbl>, hmm_from <dbl>,
## # hmm_to <dbl>, ali_from <dbl>, ali_to <dbl>, env_from <dbl>, env_to <dbl>,
## # acc <dbl>, description <chr>, Species_level <chr>, qseqid <chr>,
## # sseqid <chr>, Pident <dbl>, length <int>, mismatch <int>, gapopen <int>, …
Desi_PBA_Striker<-species_results[["Desi_PBA_Striker"]]
Chickpea_XJ01<-species_results[["Chickpea_XJ01"]]
CDC<-species_results[["CDC"]]
Desi_Kyabra<-species_results[["Desi_Kyabra"]]
################################################################################
##add family
gene_family<-read.csv("~/Downloads/Sep24_OG_filter/gene_protein_family_CFG.csv")
colnames(gene_family)[3]<-"stitle"
gene_family<-gene_family[1:3]
################################################################################
#below code written for checking purpose only for indivisual species
################################################################################
################################################################################
##found 1336 genes
Glycine_max <- species_results[["Glycine_max"]]
#merge gene family with Glycine max data
Glycine_max <-merge(Glycine_max, gene_family,by="stitle",all=TRUE)
#Glycine_max<-Glycine_max[complete.cases(Glycine_max), ]
Glycine_max <- Glycine_max[!is.na(Glycine_max$query_name), ]
################################################################################
#merge gene family with Glycine max data from NR
gene_family_NR<-read.csv("~/Downloads/Sep24_OG_filter/myData_proteins_HMM_NR.tsv",sep = "\t")
gene_family_NR<-gene_family_NR %>%
mutate(query_coverage_NR = ((qend_NR - qstart_NR + 1) / length_NR) * 100)
gene_family_NR<-gene_family_NR %>% filter(Pident_NR >= 55)
plant_accessions<-fread("~/Downloads/Sep24_OG_filter/gene_protein_family_CFG.csv") %>%
mutate(source="NR_GREEN") #%>% select(-length)
colnames(plant_accessions)[1]<-'sseqid_NR'
colnames(plant_accessions)[2]<-'description_NR'
gene_family_NR<-left_join(gene_family_NR, plant_accessions, by = "sseqid_NR")
gene_family_NR <- gene_family_NR %>%
group_by(qseqid_NR) %>%
slice_min(order_by = evalue_NR,n=1,with_ties = FALSE) %>%
ungroup()
gene_family_NR<-gene_family_NR %>% mutate(query_name=qseqid_NR)
################################################################################
##1336 hits from Glycine max query_coverage >= 70,Pident >=55
Glycine_max <-merge(Glycine_max, gene_family_NR,by="query_name",all=TRUE)
Glycine_max <- Glycine_max[!is.na(Glycine_max$stitle), ]
Glycine_max<-select(Glycine_max,-mismatch,-gapopen,-target_accession,-query_accession,-description)
################################################################################
##Chickpea Desi hits 734
Chickpea_XJ01 <- species_results[["Chickpea_XJ01"]]
#merge gene family with Glycine max data
Chickpea_XJ01 <-merge(Chickpea_XJ01, gene_family,by="stitle",all=TRUE)
#Glycine_max<-Glycine_max[complete.cases(Glycine_max), ]
Chickpea_XJ01 <- Chickpea_XJ01[!is.na(Chickpea_XJ01$query_name), ]
Chickpea_XJ01 <-merge(Chickpea_XJ01, gene_family_NR,by="query_name",all=TRUE)
Chickpea_XJ01 <- Chickpea_XJ01[!is.na(Chickpea_XJ01$stitle), ]
Chickpea_XJ01<-select(Chickpea_XJ01,-mismatch,-gapopen,-target_accession,-query_accession,-description)