Load all the libraraies necessary for the resulting data operations
library(dplyr)
library(reshape2)
library(ggplot2)
library(readr)
library(UpSetR)
Import Human proteome data and the transcriptome data and conditioning it for set comparisons
HPA_proteins <- read.csv(file = "Data-HPA/normal_tissue.csv")
HPA_transcript <- read.csv(file = "Data-HPA/rna_tissue.csv")
#Change the HPA Transcript Header to Match
#the Labelling Scheme in the HPA proteins Data
colnames(HPA_transcript) <- gsub(colnames(HPA_transcript),
pattern = "Sample",
replacement = "Tissue")
Building Common data table for quick accession between genes, protein data generated by the HPA project and our experiments.
Combined_HPA_map <- merge(HPA_proteins,HPA_transcript,by = c("Gene.name","Tissue"))
Combined_HPA_map <- Combined_HPA_map %>% mutate(Level = gsub( Level,pattern = "Not detected", replacement = 0)) %>%
mutate(Level = gsub( Level,pattern = "Low", replacement = 10)) %>%
mutate(Level = gsub( Level,pattern = "Medium", replacement = 100)) %>%
mutate(Level = gsub( Level,pattern = "High", replacement = 1000))
To check the HPA against our data, we will look at high confidnece “Supported” proteins that are expressed exclusively in one tissue or all to determine the specificity of the expression in the HPA. For this we must first load in count data that indicates which protein fragments were found in which organs.
Exp_protein <-read.csv(file = "Tissue_Specificity.csv")
Exp_protein[,1] <- toupper(Exp_protein[,1]) #Capitalizing Gene.names to so it is formatted like gene names in the HPA
This funcitons takes in two arguments:
The number of fragments of a protein that can be found for it to be considered “Expressed in Exclusively in the Tissue”. This is set to one by default. The higher the higher the more proteins, proteins with many fragments are allowed
The number of total tissues the protein can be found in. If set to 1, the function returns proteins only found in 1 tissue. If set to five, the function returns proteins found in all tissues and so on for all n tissue combinations
Plot_Selective_Expression <- function(nFragment_in_Tissue_cutoff, Tissue_sum_cut_off){
Upset_Model_Matrix <- Exp_protein[,c(5:9)]
Upset_Model_Matrix <- lapply(Upset_Model_Matrix, function(x) as.numeric(x >= nFragment_in_Tissue_cutoff))
Upset_Model_Matrix <- data.frame(Upset_Model_Matrix)
Upset_Model_Matrix$Total <- rowSums(Upset_Model_Matrix)
Upset_Model_Matrix$Protein <- Exp_protein[,1]
Upset_Model_Matrix <- Upset_Model_Matrix[c(7,1:6)]
Upset_Model_Matrix$Protein <-toupper(Upset_Model_Matrix$Protein)
colnames(Upset_Model_Matrix) <- gsub(colnames(Upset_Model_Matrix),
pattern = "Protein",replacement = "Gene.name")
Upset_Model_Matrix <- na.omit(Upset_Model_Matrix)
Upset_Model_Matrix <- filter(Upset_Model_Matrix, Total == Tissue_sum_cut_off )
return(Upset_Model_Matrix)}
This inital grpah illistrates the Subset of Fragments that exist in ONLY in one tissue
upset(Plot_Selective_Expression(1,1)) #Proteins with 1 fragment, in only 1 tissue
This second graph shows Subset of Fragments that exist across all assayed Tissues. There are close to 8000 fragments deteched across all proteins which belong to a non-unique set of proteins.
upset(Plot_Selective_Expression(1,5)) #Proteins with 1 fragments detected in All Tissues
This can be done Using Plot_Selective_Expression(1,1). Gives Proteins with only 1 fragment expressed in only 1 tissue. The BAT is removed from the Analysis as there is no comperable tissue in the HPA dataset. Furthermore, only a single highest intensity fragment is taken from each protein.
Expressed_in_Tissue <- function(nFragment_in_Tissue_cutoff, Tissue_sum_cut_off){
data <- Plot_Selective_Expression(nFragment_in_Tissue_cutoff,Tissue_sum_cut_off)
upset(data, sets = c("Brain", "Quad", "Liver","BAT","Heart"),
mainbar.y.label = paste("Number of Proteins with Selected Cut Offs",
nFragment_in_Tissue_cutoff, Tissue_sum_cut_off))
#This plot should show no intersections as it displays proteins that are specific to one tissue
data <- data %>% select(-one_of("BAT"))
data_melt <- melt(data)
colnames(data_melt) <- c("Gene.name","Tissue","Expressed")
data_melt$Tissue <- gsub(data_melt$Tissue,pattern = "Heart",replacement = "heart muscle")
data_melt$Tissue <- gsub(data_melt$Tissue,pattern = "Liver",replacement = "liver")
data_melt$Tissue <- gsub(data_melt$Tissue,pattern = "Brain",replacement = "cerebral cortex")
data_melt$Tissue <- gsub(data_melt$Tissue,pattern = "Quad",replacement = "skeletal muscle")
return(data_melt)}
This required first building a merged data fram using the gene names of the protein identified in the experiments and the HPA. The total sum of protein in the HPA is quite large and therefore, this resultant dataframe contains only proteins that are found at least once in one of the tissue, allowing for direct comparisons.
Generate_Full_Table <- function(data_melted) {
Combined_HPA_EXP <- merge(Combined_HPA_map,
data_melted,
by = c("Gene.name","Tissue"))
Combined_HPA_EXP <- Combined_HPA_EXP %>%
filter(Reliability == "Supported") %>%
select(-one_of("Gene.x", "Gene.y")) %>%
select(-one_of(c("Reliability","Unit")))
colnames(Combined_HPA_EXP)
colnames(Combined_HPA_EXP) <- c("Gene.name","Tissue","Cell.type","HPA.Prot.Expressed",
"HPA.mRNA.Expressed","Expreimental.Expressed")
#Remove the Dulicated Rows
df_dups <- Combined_HPA_EXP[c("Gene.name", "Tissue")]
return((Combined_HPA_EXP <- Combined_HPA_EXP[!duplicated(df_dups),]))
}
Proteins that are labelled “Not detected” are placed in the UnExpressed.HPA protein set. Proteins with low, medium or high expression are all bundled into the Expressed.HPA.Protein set.
In terms of the Experimental data. Proteins for any given Tissue which have an mScore below 2 and have an intensity above 0 are considered expressed. Additionally proteins with an mScore above 2 are not considered in the anaylsis. Protein with an intensity below 1000 are considered expressed.
Upset_tissue_plot <- function(Tissue_name, Tissue_sets){
Combined_HPA_EXP <- Generate_Full_Table(data_melted)
Combined_HPA_EXP_model <- Combined_HPA_EXP[,c(1,4,5,6,2)]
Combined_HPA_EXP_model <- Combined_HPA_EXP_model %>% filter(Tissue %in% Tissue_name)
Combined_HPA_EXP_model <- Combined_HPA_EXP_model %>% select(-one_of("Tissue"))
Combined_HPA_EXP_model$HPA.Prot.Expressed <- as.numeric(Combined_HPA_EXP_model$HPA.Prot.Expressed > 0)
Combined_HPA_EXP_model$HPA.Prot.UnExpressed <- as.numeric(Combined_HPA_EXP_model$HPA.Prot.Expressed == 0)
Combined_HPA_EXP_model$Expreimental.Expressed <- as.numeric(Combined_HPA_EXP_model$Expreimental.Expressed > 0)
Combined_HPA_EXP_model$Expreimental.UnExpressed <- as.numeric(Combined_HPA_EXP_model$Expreimental.Expressed == 0)
Combined_HPA_EXP_model$HPA.mRNA.Expressed <- as.numeric(Combined_HPA_EXP_model$HPA.mRNA.Expressed > 0)
Combined_HPA_EXP_model$HPA.mRNA.UnExpressed <- as.numeric(Combined_HPA_EXP_model$HPA.mRNA.Expressed == 0)
upset(Combined_HPA_EXP_model, order.by = "freq",
sets = Tissue_sets,
mainbar.y.label = paste(Tissue_name,"Proteins in Set")
)}
Using Expressed_in_Tissue(8,5) returns Proteins with up 8 fragmentines expressed in all 5 tissues. The figure below shows the proteins selected for which have up to 8 fragments in any tissue AND is seen in all tissues.
data_melted <- Expressed_in_Tissue(8,5)
With this data set there are no proteins that are NOT seem in our experiments beacuse we have filtered for only proteins that we know are found in ALL tissues. We subsequenetly run the analysis on the set to detemine the level of coherence with the HPA expression patterns.
All_tissues_sets <- c("HPA.Prot.Expressed","HPA.Prot.UnExpressed","Expreimental.Expressed")
Upset_tissue_plot("liver",All_tissues_sets)
Upset_tissue_plot("cerebral cortex",All_tissues_sets)
Upset_tissue_plot("skeletal muscle",All_tissues_sets)
Upset_tissue_plot("heart muscle",All_tissues_sets)
data_melted <- Expressed_in_Tissue(1,1)
There are the proteins that are only found in 1 of the tissues. Thus there are 4 tissues in which the tissue will not be found. The Tissues that have not expressed there proteins have 0 fragments from the proteins found within the samples.
Single_tissues_sets <- c("HPA.Prot.Expressed","HPA.Prot.UnExpressed",
"Expreimental.Expressed", "Expreimental.UnExpressed")
Upset_tissue_plot("liver",Single_tissues_sets)
Upset_tissue_plot("cerebral cortex",Single_tissues_sets)
Upset_tissue_plot("skeletal muscle",Single_tissues_sets)
Upset_tissue_plot("heart muscle",Single_tissues_sets)