knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
knitr::opts_chunk$set(comment = NA)
library(stringr)
library(ggplot2)
library(gridExtra)
library(svglite)
library(grid)
In this section, This section involves reading data from two tab-delimited files . Additionally, the code performs various data manipulation tasks, including extracting scan IDs, selecting columns of interest, adding binary columns for decoy and target, and concatenating the data frames side by side.
# Files
file.target <-'PSM_results_search_real_sequences_1.txt'
file.decoy <-'PSM_results_decoy_search_1.txt'
# Read data from a tab-delimited file named 'PSM_results.txt'
data.target <- read.table(file.target, header = TRUE, sep = '\t')
data.decoy <- read.table(file.decoy, header = TRUE, sep = '\t')
# Extract the numeric part after "scan=" in the 'Spectrum.Title' column and assign it to a new column 'id'
data.target$id <- as.numeric(str_match(str_extract(data.target[,"Spectrum.Title"], "scan=\\d+"), "scan=(\\d+)")[, 2])
data.decoy$id <- as.numeric(str_match(str_extract(data.decoy[,"Spectrum.Title"], "scan=\\d+"), "scan=(\\d+)")[, 2])
# Columns that we are interested in
data.cols <- c(
"id",
"Theoretical.Mass",
"Identification.Charge",
"Protein.s.",
"Sequence",
"Raw.Score",
"Score"
)
# Subset the data to include only the columns of interest
data.target <- data.target[, data.cols]
data.decoy <- data.decoy[, data.cols]
# Add decoy and target binary
data.target$Decoy <- numeric(dim(data.target)[1])
data.decoy$Decoy <- numeric(dim(data.decoy)[1])+1
# Sort the data frame by the 'score' column
data.target <- data.target[order(data.target$Score), ]
data.decoy <- data.decoy[order(data.decoy$Score), ]
# Concatenated data base side by side
data.concat <- merge(data.target, data.decoy, by = "id", suffixes = c("_target", "_decoy"))
head(data.target)
id Theoretical.Mass Identification.Charge Protein.s. Sequence
10 5255 718.3610 2 P53688 TVNGGSGK
37 57295 1716.9814 2 Q12291 KFPIQFLIDTILVGN
58 3726 977.3326 2 P53717 MTCSMVSM
91 3378 734.3963 2 P33202 GAGAFLLS
119 1448 754.3497 2 P33329 GVTGFSST
140 54160 1188.6575 2 Q99252 NLTRTSTLQR
Raw.Score Score Decoy
10 999 -29.99565 0
37 999 -29.99565 0
58 999 -29.99565 0
91 999 -29.99565 0
119 999 -29.99565 0
140 999 -29.99565 0
This section focuses on exploratory data analysis (EDA), including creating histograms for target and decoy data, sorting concatenated data by score, and visualizing the distribution of scores through multiple histograms. The plots are then arranged and saved as an SVG file.
library(grid)
# Concantenated data base sort by score
db.concatenated <-rbind(data.target, data.decoy)
db.concatenated <-db.concatenated[order(-db.concatenated$Score),]
# Histogram for data1
hist1 <- ggplot(data.concat, aes(x = Score_target)) +
geom_density(fill = "orange") +
ggtitle("Histogram Target") +
theme(plot.title = element_text(size = 10))
# Histogram for data2
hist2 <- ggplot(data.concat, aes(x = Score_decoy)) +
geom_density(fill = "aquamarine") +
ggtitle("Histogram decoy") +
theme(plot.title = element_text(size = 10))
# Histogram for data3
hist3 <- ggplot(db.concatenated, aes(x = Score, fill = factor(Decoy))) +
geom_density( alpha = 0.5) +
ggtitle("Histogram target-decoy")+
scale_fill_manual(values = c("orange", "aquamarine"))+
labs(fill = "0 = D, 1 = T")+
theme(legend.position = c(0.8,0.7),
plot.title = element_text(size = 10),
legend.title = element_text(size = 8))
# Arrange the plots side by side
grid.arrange(hist1, hist2, hist3, ncol = 3, top=textGrob( "Distribution of Scores", gp=gpar(fontsize=12,font=2)))
In this section, the code calculates False Discovery Rate (FDR) using concatenated and separated strategies. It computes cumulative sums, calculates FDR using different strategies, and creates a data frame (FDR2) for comparison.
library(grid)
#Exploring FDR
db.concatenated$sumdecoy <- cumsum(db.concatenated$Decoy)
db.concatenated$sumtarget <- cumsum(db.concatenated$Decoy == 0)
#Exploring FDR TD strategy concatenated 2Nd/(Nd+Nt)
db.concatenated$FDR <-100*2*(cumsum(db.concatenated$Decoy))/(cumsum(db.concatenated$Decoy == 0)+cumsum(db.concatenated$Decoy))
# Exploring FDR TD strategy separated: Nd/Nt)
score <- seq(from = -30, to = 100, length.out = 10000)
FDR2 <- data.frame()
for (s in score){
nt <- length(data.target[data.target$Score > s,"Score"])
nd <- length(data.decoy[data.decoy$Score > s,"Score"])
# Create a new row as a data frame
new.row <- data.frame(Score = s, ND = nd, NT = nt, FDR = 100 * nd / nt)
# Append the new row to FDR2 using
FDR2 <- rbind(FDR2, new.row)
}
gg <- ggplot(db.concatenated, aes(x = Score, y = FDR)) +
geom_line(aes(color = "FDR")) +
geom_line(data = FDR2, aes(x = Score, y = FDR, color = "FDR2"))+
geom_hline(yintercept = 1, linetype = "dashed", color = "red") + # Add a horizontal line at y = 1
labs(x = "Score",y = NULL,
title = "FDR Comparison")+
scale_x_reverse()+
theme_bw()+
scale_color_manual(values = c("FDR" = "blue", "FDR2" = "black"),
labels = c("FDR" = "Nd/Nt", "FDR2" = "2Nd/(Nd+Nt)"))+
theme(legend.position = c(0.3,0.6),
legend.title = element_blank(),
plot.title = element_text(size = 10))
gg.zoom <- ggplot(db.concatenated, aes(x = Score, y = FDR)) +
geom_line(color = "blue") +
geom_line(data = FDR2, aes(x = Score, y = FDR))+
geom_hline(yintercept = 1, linetype = "dashed", color = "red") + # Add a horizontal line at y = 1
labs(x = "Score",y = NULL,
title = "Augmented view")+
scale_x_reverse()+
theme_bw()+
coord_cartesian(xlim = c(40, 20), ylim = c(0,2.5)) +
theme(plot.title = element_text(size = 10))
grid.arrange(gg, gg.zoom, ncol = 2, top=textGrob( "Comparison of FDR strategies",gp=gpar(fontsize=12, font = 2)))
in1 <- which.min(abs(db.concatenated$FDR - 1))
in2 <- which.min(abs(FDR2$FDR - 1))
in1<-cbind(db.concatenated$Score[in1],
db.concatenated$FDR[in1],
db.concatenated$sumdecoy[in1],
db.concatenated$sumtarget[in1] )
in2<-cbind(FDR2$Score[in2],
FDR2$FDR[in2],
FDR2$ND[in2],
FDR2$NT[in2])
FDR.strt <- rbind(in1,in2)
rownames(FDR.strt) <-c("Strategy 1 (Concat)", "Strategy 2 (Separate)")
colnames(FDR.strt) <-c("Score", "FDR", "Nd", "Nt")
FDR.strt
Score FDR Nd Nt
Strategy 1 (Concat) 29.82967 0.9985528 69 13751
Strategy 2 (Separate) 24.07241 1.0028204 160 15955