knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
knitr::opts_chunk$set(comment = NA)
library(stringr)
library(ggplot2)
library(gridExtra)
library(svglite)
library(grid)

Data Preparation

In this section, This section involves reading data from two tab-delimited files . Additionally, the code performs various data manipulation tasks, including extracting scan IDs, selecting columns of interest, adding binary columns for decoy and target, and concatenating the data frames side by side.

# Files
file.target <-'PSM_results_search_real_sequences_1.txt' 
file.decoy <-'PSM_results_decoy_search_1.txt'

# Read data from a tab-delimited file named 'PSM_results.txt'
data.target <- read.table(file.target, header = TRUE, sep = '\t')
data.decoy <- read.table(file.decoy, header = TRUE, sep = '\t')

# Extract the numeric part after "scan=" in the 'Spectrum.Title' column and assign it to a new column 'id'
data.target$id <- as.numeric(str_match(str_extract(data.target[,"Spectrum.Title"], "scan=\\d+"), "scan=(\\d+)")[, 2])
data.decoy$id <- as.numeric(str_match(str_extract(data.decoy[,"Spectrum.Title"], "scan=\\d+"), "scan=(\\d+)")[, 2])

# Columns that we are interested in
data.cols <- c(
  "id",
  "Theoretical.Mass",
  "Identification.Charge",
  "Protein.s.",
  "Sequence",
  "Raw.Score",
  "Score"
)

# Subset the data to include only the columns of interest
data.target <- data.target[, data.cols]
data.decoy <- data.decoy[, data.cols]

# Add decoy and target binary
data.target$Decoy <- numeric(dim(data.target)[1])
data.decoy$Decoy <- numeric(dim(data.decoy)[1])+1

# Sort the data frame by the 'score' column
data.target <- data.target[order(data.target$Score), ]
data.decoy <- data.decoy[order(data.decoy$Score), ]

# Concatenated data base side by side
data.concat <- merge(data.target, data.decoy, by = "id", suffixes = c("_target", "_decoy"))

head(data.target)
       id Theoretical.Mass Identification.Charge Protein.s.        Sequence
10   5255         718.3610                     2     P53688        TVNGGSGK
37  57295        1716.9814                     2     Q12291 KFPIQFLIDTILVGN
58   3726         977.3326                     2     P53717        MTCSMVSM
91   3378         734.3963                     2     P33202        GAGAFLLS
119  1448         754.3497                     2     P33329        GVTGFSST
140 54160        1188.6575                     2     Q99252      NLTRTSTLQR
    Raw.Score     Score Decoy
10        999 -29.99565     0
37        999 -29.99565     0
58        999 -29.99565     0
91        999 -29.99565     0
119       999 -29.99565     0
140       999 -29.99565     0

Exploring Distributions

This section focuses on exploratory data analysis (EDA), including creating histograms for target and decoy data, sorting concatenated data by score, and visualizing the distribution of scores through multiple histograms. The plots are then arranged and saved as an SVG file.

library(grid)
# Concantenated data base sort by score
db.concatenated <-rbind(data.target, data.decoy)
db.concatenated <-db.concatenated[order(-db.concatenated$Score),]

# Histogram for data1
hist1 <- ggplot(data.concat, aes(x = Score_target)) +
  geom_density(fill = "orange") +
  ggtitle("Histogram Target") +
  theme(plot.title = element_text(size = 10))

# Histogram for data2
hist2 <- ggplot(data.concat, aes(x = Score_decoy)) +
  geom_density(fill = "aquamarine") +
  ggtitle("Histogram decoy") +
  theme(plot.title = element_text(size = 10))

# Histogram for data3
hist3 <- ggplot(db.concatenated, aes(x = Score, fill = factor(Decoy))) +
  geom_density( alpha = 0.5) +
  ggtitle("Histogram target-decoy")+
  scale_fill_manual(values = c("orange", "aquamarine"))+
  labs(fill = "0 = D, 1 = T")+
  theme(legend.position = c(0.8,0.7),
        plot.title = element_text(size = 10),
        legend.title = element_text(size = 8))

# Arrange the plots side by side
grid.arrange(hist1, hist2, hist3, ncol = 3, top=textGrob( "Distribution of Scores", gp=gpar(fontsize=12,font=2)))

Exploring FDR

In this section, the code calculates False Discovery Rate (FDR) using concatenated and separated strategies. It computes cumulative sums, calculates FDR using different strategies, and creates a data frame (FDR2) for comparison.

library(grid)
#Exploring FDR 
db.concatenated$sumdecoy <- cumsum(db.concatenated$Decoy)
db.concatenated$sumtarget <- cumsum(db.concatenated$Decoy == 0)

#Exploring FDR TD strategy concatenated 2Nd/(Nd+Nt)
db.concatenated$FDR <-100*2*(cumsum(db.concatenated$Decoy))/(cumsum(db.concatenated$Decoy == 0)+cumsum(db.concatenated$Decoy))

# Exploring FDR TD strategy separated: Nd/Nt)
score <- seq(from = -30, to = 100, length.out = 10000)
FDR2 <- data.frame()

for (s in score){
  nt <- length(data.target[data.target$Score > s,"Score"])
  nd <- length(data.decoy[data.decoy$Score > s,"Score"])
  
  # Create a new row as a data frame
  new.row <- data.frame(Score = s, ND = nd, NT = nt, FDR = 100 * nd / nt)
  
  # Append the new row to FDR2 using 
  FDR2 <- rbind(FDR2, new.row)
}

gg <- ggplot(db.concatenated, aes(x = Score, y = FDR)) +
  geom_line(aes(color = "FDR")) +
  geom_line(data = FDR2, aes(x = Score, y = FDR, color = "FDR2"))+
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +  # Add a horizontal line at y = 1
  labs(x = "Score",y = NULL,
       title = "FDR Comparison")+
  scale_x_reverse()+
  theme_bw()+
  scale_color_manual(values = c("FDR" = "blue", "FDR2" = "black"), 
                     labels = c("FDR" = "Nd/Nt", "FDR2" = "2Nd/(Nd+Nt)"))+
  theme(legend.position = c(0.3,0.6),
        legend.title = element_blank(),
        plot.title = element_text(size = 10)) 
 

gg.zoom <- ggplot(db.concatenated, aes(x = Score, y = FDR)) +
  geom_line(color = "blue") +
  geom_line(data = FDR2, aes(x = Score, y = FDR))+
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +  # Add a horizontal line at y = 1
  labs(x = "Score",y = NULL,
       title = "Augmented view")+
  scale_x_reverse()+
  theme_bw()+
  coord_cartesian(xlim = c(40, 20), ylim = c(0,2.5)) +
  theme(plot.title = element_text(size = 10))

grid.arrange(gg, gg.zoom, ncol = 2, top=textGrob( "Comparison of FDR strategies",gp=gpar(fontsize=12, font = 2)))

in1 <- which.min(abs(db.concatenated$FDR - 1))
in2 <- which.min(abs(FDR2$FDR - 1))

in1<-cbind(db.concatenated$Score[in1], 
           db.concatenated$FDR[in1], 
           db.concatenated$sumdecoy[in1], 
           db.concatenated$sumtarget[in1] )
in2<-cbind(FDR2$Score[in2], 
           FDR2$FDR[in2], 
           FDR2$ND[in2], 
           FDR2$NT[in2])
FDR.strt <- rbind(in1,in2)
rownames(FDR.strt) <-c("Strategy 1 (Concat)", "Strategy 2 (Separate)")
colnames(FDR.strt) <-c("Score", "FDR", "Nd", "Nt")
FDR.strt
                         Score       FDR  Nd    Nt
Strategy 1 (Concat)   29.82967 0.9985528  69 13751
Strategy 2 (Separate) 24.07241 1.0028204 160 15955