My_Quarto

Step by step documentation of Flowering Pipeline analysis.

Running Code

Step by step scripts to produce the requred files for hmm domain search for flowerings genes in legumes:

##STEPS
## STEP 1
#Downloaded all gene data from the CFG database for each species (protein sequences marked as *_ref.fasta).  
#Used the Python script "CFG_fetch_fastas.py" to generate FASTA files for each protein, then ran OrthoFinder with existing datasets from chickpea and other legumes.  

#Each CFG species FASTA was renamed as *_ref.fasta and stored in HPC at:  
#/isilon/swarupsir_user/majnu_bhai/flowering_data_udita/legumes/Legumes_AA_NUC  

#- Total dataset: 112 species with 82,104 proteins  
#- My dataset: 36 species with 167253 flowering proteins  

#OrthoFinder outputs were stored at:  
#~/Downloads/Sep23_long_work/01_long_work_data/Orthogroups  

#Key output files:  
#- Orthogroups.GeneCount.tsv  
#- Orthogroups.tsv (28,636 OGs)  
#- Orthogroups_SingleCopyOrthologues.txt  
#- Orthogroups_UnassignedGenes.tsv
##
# from collections import defaultdict
# import os
# 
# # Input files
# species_file = "Species112.txt"
# mapping_file = "Species_protein.tsv"
# fasta_file = "protein.fa"
# output_dir = "species_fastas"
# 
# # Create output directory
# os.makedirs(output_dir, exist_ok=True)
# 
# # Step 1: load species list
# with open(species_file) as f:
#     species_list = [line.strip() for line in f if line.strip()]
# 
# # Step 2: load mapping file (Species ↔ ProteinID)
# protein_to_species = {}
# with open(mapping_file) as f:
#     header = f.readline()  # skip header
#     for line in f:
#         if not line.strip():
#             continue
#         parts = line.strip().split("\t")
#         if len(parts) < 2:
#             continue
#         sp, prot = parts[0], parts[1]
#         protein_to_species[prot] = sp
# 
# # Step 3: open FASTA and collect sequences by species
# species_fastas = defaultdict(list)
# 
# with open(fasta_file) as f:
#     seq_id = None
#     seq_lines = []
#     for line in f:
#         if line.startswith(">"):  # new sequence
#             if seq_id:
#                 if seq_id in protein_to_species:
#                     sp = protein_to_species[seq_id]
#                     species_fastas[sp].append((seq_id, "".join(seq_lines)))
#             seq_id = line[1:].strip()
#             seq_lines = []
#         else:
#             seq_lines.append(line.strip())
#     if seq_id and seq_id in protein_to_species:
#         sp = protein_to_species[seq_id]
#         species_fastas[sp].append((seq_id, "".join(seq_lines)))
# 
# # Step 4: write species-specific FASTA files
# for sp, seqs in species_fastas.items():
#     if sp not in species_list:
#         continue  # only keep species in Species112.txt
#     parts = sp.split()
#     abbr = f"{parts[0][0]}. {' '.join(parts[1:])}"
#     fasta_name = f"{abbr}_ref.fasta".replace(" ", "_")
#     out_path = os.path.join(output_dir, fasta_name)
#     
#     with open(out_path, "w") as out:
#         for prot_id, seq in seqs:
#             out.write(f">{prot_id}\n{seq}\n")
#     
#     print(f"✅ {sp} → {out_path} ({len(seqs)} proteins)")
## STEP 2
#I analyzed the orthofinder results and filtered the orthogroups (OGs) associated with #flowering genes for further downstream processing.Next, I generated separate HMM profiles for each flowering gene family and then merged them into a single HMM profile database.  
#The individual HMMs were stored in: /home/mustafa/Downloads/Flowering_data_CFG##/01_flowering_data/02_HMMSCAN_inputs/custom_DB/aligned  
#The merged main HMM database was stored in: /home/mustafa/Downloads/Flowering_data_CFG/01_flowering_data/02_HMMSCAN_inputs/custom_DB

##Script:01_matched_OG3_cfg_5887.sh
#!/bin/bash

# ID_FILE="cfg_DB.txt" ##cfg data with 5887 protein ids
# ORTHO_FILE="Orthogroups.txt"
# OUTPUT_FILE="01_matched_hmm5887_OGs.txt" ##fetch 5887 genes OGs contains 348 OGS in output, but other flowering genes also present
# so i will drop in the next script using keep_flw_genes_ids.py and the file name is 04_hmm_OGs_req.txt    
# 
# awk '
#     BEGIN { FS="[: ]+"; OFS="\t" }  # split by colon or spaces
#     NR==FNR {
#         ids[$1] = 1
#         next
#     }
#     {
#         og = $1   # orthogroup ID before colon
#         matched = 0
#         for (i = 2; i <= NF; i++) {
#             if ($i in ids) {
#                 matched = 1
#                 break
#             }
#         }
#         if (matched) {
#             print og, $0
#         }
#     }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
# 
# echo "✅ Done! Matches written to: $OUTPUT_FILE"

##Script:02_matched_OG3_CFG_82104.sh
#!/bin/bash

ID_FILE="id_list_flower.txt"
ORTHO_FILE="Orthogroups.txt"
OUTPUT_FILE="02_matched_All_flower_82104_OGs.txt" ## i fetchout here flower OGs(748) for 82104 genes of CFG data

# awk '
#     BEGIN { FS="[: ]+"; OFS="\t" }  # split by colon or spaces
#     NR==FNR {
#         ids[$1] = 1
#         next
#     }
#     {
#         og = $1   # orthogroup ID before colon
#         matched = 0
#         for (i = 2; i <= NF; i++) {
#             if ($i in ids) {
#                 matched = 1
#                 break
#             }
#         }
#         if (matched) {
#             print og, $0
#         }
#     }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
# 
# echo "✅ Done! Matches written to: $OUTPUT_FILE"
#  

##Script:03_matched_OG3_legumes_1397779.sh
#!/bin/bash

# ID_FILE="legumes_ID.txt"
# ORTHO_FILE="02_matched_All_flower_82104_OGs.txt"
# OUTPUT_FILE="03_matched_All_flower_OGs.txt" ## i fetchout here legumes OGs(526) for 1397779 genes of my data (legumes) 
# #then run this script (03_keep_legumes_genes_ids.py) to exclude other species protein id and keep only mydata legumes flowering genes 167253 proteins in 526 OG's
# #03_matched_All_flower_OGs2.txt
# 
# awk '
#     BEGIN { FS="[: ]+"; OFS="\t" }  # split by colon or spaces
#     NR==FNR {
#         ids[$1] = 1
#         next
#     }
#     {
#         og = $1   # orthogroup ID before colon
#         matched = 0
#         for (i = 2; i <= NF; i++) {
#             if ($i in ids) {
#                 matched = 1
#                 break
#             }
#         }
#         if (matched) {
#             print og, $0
#         }
#     }
# ' "$ID_FILE" "$ORTHO_FILE" > "$OUTPUT_FILE"
# 
# echo "✅ Done! Matches written to: $OUTPUT_FILE"

##Script4:03_keep_legumes_genes_ids.py
# Files
# id_file = "legumes_ID.txt"
# input_file = "03_matched_All_flower_1397779_OGs.txt"
# output_file = "03_matched_All_flower_1397779_OGs2.txt" ## This will contains only 9 legumes  ids 167253 genes ids  
# 
# # Load IDs to KEEP
# with open(id_file) as f:
#     keep_ids = set(line.strip() for line in f if line.strip())
# 
# with open(input_file) as fin, open(output_file, "w") as fout:
#     for line in fin:
#         parts = line.strip().split()
#         if not parts:
#             continue
#         og = parts[0]         # OG ID
#         ids = parts[1:]       # member IDs
#         # Keep only IDs that are in cfg_DB.txt
#         filtered_ids = [i for i in ids if i in keep_ids]
#         if filtered_ids:  # only save OG if it still has members
#             fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
# 
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")
# 
##Script5:04_keep_flw_genes_ids.py
# Files
# id_file = "cfg_DB.txt"
# input_file = "01_matched_hmm5887_OGs.txt"
# output_file = "04_hmm_OGs_req.txt" ## This will contains only 9 legumes + A. thaliana and Oryza sativa ids for hmm 
# 
# # Load IDs to KEEP
# with open(id_file) as f:
#     keep_ids = set(line.strip() for line in f if line.strip())
# 
# with open(input_file) as fin, open(output_file, "w") as fout:
#     for line in fin:
#         parts = line.strip().split()
#         if not parts:
#             continue
#         og = parts[0]         # OG ID
#         ids = parts[1:]       # member IDs
#         # Keep only IDs that are in cfg_DB.txt
#         filtered_ids = [i for i in ids if i in keep_ids]
#         if filtered_ids:  # only save OG if it still has members
#             fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
# 
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")

##Script6:05_og_fetch_for_hmm.py

# id_file = "cfg_DB.txt"
# input_file = "01_matched_hmm5887_OGs.txt"
# output_file = "04_hmm_OGs_req.txt" ## This will contains only 9 legumes + A. thaliana and Oryza sativa ids for hmm 
# 
# # Load IDs to KEEP
# with open(id_file) as f:
#     keep_ids = set(line.strip() for line in f if line.strip())
# 
# with open(input_file) as fin, open(output_file, "w") as fout:
#     for line in fin:
#         parts = line.strip().split()
#         if not parts:
#             continue
#         og = parts[0]         # OG ID
#         ids = parts[1:]       # member IDs
#         # Keep only IDs that are in cfg_DB.txt
#         filtered_ids = [i for i in ids if i in keep_ids]
#         if filtered_ids:  # only save OG if it still has members
#             fout.write(og + "\t" + "\t".join(filtered_ids) + "\n")
# 
# print(f"✅ Filtered OGs (only IDs in cfg_DB.txt) saved to {output_file}")
## STEP 4

# 1. Run HMM search:
# 
#    **hmmscan --cpu 48 --acc -E 1e-5 --tblout pfam_scan.tblout --domtblout pfam_scan.domtblout custom_DB/custom_DB.hmm myData_proteins_HMM.faa > pfam_scan_myDATA.stdout**
# 
# 2. Process HMMER results in R:
#    - Use the hmmer2.R script to filter and analyze the output.
# 
# 3. Run DIAMOND BLASTP:
#    - Use the legume dataset (5,887 sequences) as the database.
#    - Query with protein IDs from HMMER results (36 species).
#    - Process BLAST results using the same hmmer2.R script.
# 
# 4. Annotate with NR protein database:
#    - Run BLASTP against NR using the same protein IDs.
#    - Collect hit and annotation data.
# 
# *Note: All filtering and processing parameters are defined within the R script.
##load the packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
##path
setwd("/Users/Tina/Downloads/Sep24_OG_filter/")

##here I define the colnames of hmmer results
colnames_hmmer <- c(
  "target_name", "target_accession", "tlen",
  "query_name", "query_accession", "qlen",
  "fullseq_Evalue", "fullseq_score", "fullseq_bias",
  "domain_num", "domain_of",
  "c_Evalue", "i_Evalue", "domain_score", "domain_bias",
  "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to",
  "acc", "description"
)
##read hmmer hits table
hmmer <- read_table2("~/Downloads/Sep24_OG_filter/05_og_fetch_for_hmm_results/og_fastas/pfam_scan.domtblout",
  comment = "#",
  col_names = colnames_hmmer
)
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   target_name = col_character(),
##   target_accession = col_character(),
##   query_name = col_character(),
##   query_accession = col_character(),
##   description = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
class(hmmer)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
hmmer <- as.data.frame(hmmer) %>% tibble::as_tibble()

hmmer <- hmmer %>%
  mutate(
    fullseq_Evalue = as.numeric(fullseq_Evalue),
    fullseq_score  = as.numeric(fullseq_score)
  )
######################################################################
##isoform single ids from legumes data
isoform_single<-read.table("~/Downloads/Sep24_OG_filter/single_isoform_id.txt",header = T)

##merge with hmm data
hmmer <- merge(hmmer, isoform_single, by = "query_name", all = TRUE)

##remove isofrom i.e not available
hmmer<-hmmer[complete.cases(hmmer), ]

##select the best hit based on c_value domains(86167)
hmmer <- hmmer %>%
  group_by(query_name) %>%
  slice_min(order_by = c_Evalue,n=1,with_ties = FALSE) %>%
  ungroup()
hmmer <-hmmer %>% filter(c_Evalue <=1e-5 )

##now load the all protein blast tsv  5887 flowering genes here
myData_proteins_HMM_5887<-read.table("~/Downloads/Sep24_OG_filter/myData_proteins_HMM_5887.tsv",header = T)

#calculate the query coverage
myData_proteins_HMM_5887<-myData_proteins_HMM_5887 %>%
  mutate(query_coverage = ((qend - qstart + 1) / length) * 100)

#calculate the best hit total protein was 167253 and i got hits 108304
myData_proteins_HMM_5887 <- myData_proteins_HMM_5887 %>%
  group_by(qseqid) %>%
  slice_min(order_by = evalue,n=1,with_ties = FALSE) %>%
  ungroup()
myData_proteins_HMM_5887<-myData_proteins_HMM_5887 %>% mutate(query_name = qseqid)

##merge the blast hits in hmmer df
hmmer <- merge(hmmer,myData_proteins_HMM_5887 , by = "query_name", all = TRUE)
#hmmer <- hmmer[!duplicated(hmmer$query_name), ]
hmmer2 <- hmmer %>%
  group_by(query_name) %>%
  slice_min(order_by = c_Evalue,n=1,with_ties = FALSE) %>%
  ungroup()
hmmer2<-hmmer2[complete.cases(hmmer2), ]
################################################################################
# List of unique species from your column
species_list <- unique(hmmer2$Species_level)

# Function to apply your pipeline for one species
filter_species <- function(species_name) {
  hmmer2 %>%
    filter(Species_level == species_name) %>%
    filter(query_coverage >= 70) %>%
    group_by(stitle) %>%
    slice_min(order_by = evalue, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    filter(!is.na(Pident) & Pident >= 55)
}

# Apply the function to each species and store in a named list
species_results <- lapply(species_list, filter_species)
names(species_results) <- species_list

# Now you can access like:
Glycine_max <- species_results[["Glycine_max"]]

species_results[["C_reticulatum"]]
## # A tibble: 729 × 39
##    query_name  target_name     target_accession  tlen query_accession  qlen
##    <chr>       <chr>           <chr>            <dbl> <chr>           <dbl>
##  1 ret_00995.1 OG0000087_align -                  221 -                 291
##  2 ret_01480.1 OG0000154_align -                 1125 -                 141
##  3 ret_05571.1 OG0000255_align -                  987 -                1234
##  4 ret_26161.1 OG0001087_align -                  161 -                 161
##  5 ret_26185.1 OG0000573_align -                  273 -                 269
##  6 ret_05427.1 OG0000040_align -                  536 -                 553
##  7 ret_03725.1 OG0000169_align -                  666 -                 638
##  8 ret_06361.1 OG0000067_align -                  186 -                 169
##  9 ret_01025.1 OG0000088_align -                  551 -                1038
## 10 ret_12124.1 OG0000111_align -                  303 -                 303
## # ℹ 719 more rows
## # ℹ 33 more variables: fullseq_Evalue <dbl>, fullseq_score <dbl>,
## #   fullseq_bias <dbl>, domain_num <dbl>, domain_of <dbl>, c_Evalue <dbl>,
## #   i_Evalue <dbl>, domain_score <dbl>, domain_bias <dbl>, hmm_from <dbl>,
## #   hmm_to <dbl>, ali_from <dbl>, ali_to <dbl>, env_from <dbl>, env_to <dbl>,
## #   acc <dbl>, description <chr>, Species_level <chr>, qseqid <chr>,
## #   sseqid <chr>, Pident <dbl>, length <int>, mismatch <int>, gapopen <int>, …
Desi_PBA_Striker<-species_results[["Desi_PBA_Striker"]]

Chickpea_XJ01<-species_results[["Chickpea_XJ01"]]

CDC<-species_results[["CDC"]]

Desi_Kyabra<-species_results[["Desi_Kyabra"]]
################################################################################
##add family
gene_family<-read.csv("~/Downloads/Sep24_OG_filter/gene_protein_family_CFG.csv")
colnames(gene_family)[3]<-"stitle"
gene_family<-gene_family[1:3]
################################################################################
#below code written for checking purpose only for indivisual species
################################################################################
################################################################################
##found 1336 genes
Glycine_max <- species_results[["Glycine_max"]]

#merge gene family with Glycine max data
Glycine_max <-merge(Glycine_max, gene_family,by="stitle",all=TRUE)

#Glycine_max<-Glycine_max[complete.cases(Glycine_max), ]
Glycine_max <- Glycine_max[!is.na(Glycine_max$query_name), ]
################################################################################
#merge gene family with Glycine max data from NR
gene_family_NR<-read.csv("~/Downloads/Sep24_OG_filter/myData_proteins_HMM_NR.tsv",sep = "\t")

gene_family_NR<-gene_family_NR %>%
  mutate(query_coverage_NR = ((qend_NR - qstart_NR + 1) / length_NR) * 100)

gene_family_NR<-gene_family_NR %>% filter(Pident_NR >= 55)

plant_accessions<-fread("~/Downloads/Sep24_OG_filter/gene_protein_family_CFG.csv") %>% 
  mutate(source="NR_GREEN") #%>% select(-length)

colnames(plant_accessions)[1]<-'sseqid_NR'

colnames(plant_accessions)[2]<-'description_NR'

gene_family_NR<-left_join(gene_family_NR, plant_accessions, by = "sseqid_NR")

gene_family_NR <- gene_family_NR %>%
  group_by(qseqid_NR) %>%
  slice_min(order_by = evalue_NR,n=1,with_ties = FALSE) %>%
  ungroup()

gene_family_NR<-gene_family_NR %>% mutate(query_name=qseqid_NR)
################################################################################
##1336 hits from Glycine max query_coverage >= 70,Pident >=55
Glycine_max <-merge(Glycine_max, gene_family_NR,by="query_name",all=TRUE)

Glycine_max <- Glycine_max[!is.na(Glycine_max$stitle), ]

Glycine_max<-select(Glycine_max,-mismatch,-gapopen,-target_accession,-query_accession,-description)
################################################################################
##Chickpea Desi hits 734
Chickpea_XJ01 <- species_results[["Chickpea_XJ01"]]

#merge gene family with Glycine max data
Chickpea_XJ01 <-merge(Chickpea_XJ01, gene_family,by="stitle",all=TRUE)

#Glycine_max<-Glycine_max[complete.cases(Glycine_max), ]
Chickpea_XJ01 <- Chickpea_XJ01[!is.na(Chickpea_XJ01$query_name), ]

Chickpea_XJ01 <-merge(Chickpea_XJ01, gene_family_NR,by="query_name",all=TRUE)

Chickpea_XJ01 <- Chickpea_XJ01[!is.na(Chickpea_XJ01$stitle), ]

Chickpea_XJ01<-select(Chickpea_XJ01,-mismatch,-gapopen,-target_accession,-query_accession,-description)