rm(list = ls())
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#BiocManager::install("STRINGdb")
#########################################1.1 library
#Loading the package
library(STRINGdb)
#browseVignettes("STRINGdb")
#STRINGdb$help("get_graph") # To visualize their documentation.
STRINGdb$methods() ## To list all the methods available.
##  [1] ".objectPackage"                      ".objectParent"                      
##  [3] "add_diff_exp_color"                  "add_proteins_description"           
##  [5] "benchmark_ppi"                       "benchmark_ppi_pathway_view"         
##  [7] "callSuper"                           "copy"                               
##  [9] "enrichment_heatmap"                  "export"                             
## [11] "field"                               "get_aliases"                        
## [13] "get_annotations"                     "get_bioc_graph"                     
## [15] "get_clusters"                        "get_enrichment"                     
## [17] "get_graph"                           "get_homologs"                       
## [19] "get_homologs_besthits"               "get_homology_graph"                 
## [21] "get_interactions"                    "get_link"                           
## [23] "get_neighbors"                       "get_paralogs"                       
## [25] "get_pathways_benchmarking_blackList" "get_png"                            
## [27] "get_ppi_enrichment"                  "get_ppi_enrichment_full"            
## [29] "get_proteins"                        "get_pubmed"                         
## [31] "get_pubmed_interaction"              "get_subnetwork"                     
## [33] "get_summary"                         "get_term_proteins"                  
## [35] "getClass"                            "getRefClass"                        
## [37] "import"                              "initFields"                         
## [39] "initialize"                          "load"                               
## [41] "load_all"                            "map"                                
## [43] "mp"                                  "plot_network"                       
## [45] "plot_ppi_enrichment"                 "post_payload"                       
## [47] "ppi_enrichment"                      "remove_homologous_interactions"     
## [49] "set_background"                      "show"                               
## [51] "show#envRefClass"                    "trace"                              
## [53] "untrace"                             "usingMethods"
#9606 for Human, 10090 for mouse
string_db <- STRINGdb$new( version="11", species=9606,
                           score_threshold=500, input_directory="")
class(string_db)
## [1] "STRINGdb"
## attr(,"package")
## [1] "STRINGdb"
#Importing the list with the positional candiate genes
data(diff_exp_example1)
head(diff_exp_example1)
##      pvalue    logFC         gene
## 1 0.0001018 3.333461       VSTM2L
## 2 0.0001392 3.822383       TBC1D2
## 3 0.0001720 3.306056        LENG9
## 4 0.0001739 3.024605       TMEM27
## 5 0.0001990 3.854414 LOC100506014
## 6 0.0002393 3.082052       TSPAN1
dim(diff_exp_example1) #[1] 20861     3
## [1] 20861     3
#View(diff_exp_example1)
#The map function adds an additional column with STRING identifiers to 
#the dataframe that is passed as first parameter
example1_mapped <- string_db$map(diff_exp_example1, "gene", removeUnmappedRows = TRUE )
## Warning:  we couldn't map to STRING 15% of your identifiers
dim(example1_mapped) #[1] 17748     4
## [1] 17708     4
#View(example1_mapped)
hits <- example1_mapped$STRING_id[1:200]
string_db$plot_network(hits)

##################################################2 PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(example1_mapped, pvalue<0.05),20)
##       gene     pvalue     logFC            STRING_id
## 1   VSTM2L 0.00010180  3.333461 9606.ENSP00000362560
## 2   TBC1D2 0.00013920  3.822383 9606.ENSP00000481721
## 3    LENG9 0.00017200  3.306056 9606.ENSP00000479355
## 4   TMEM27 0.00017390  3.024605 9606.ENSP00000369699
## 5   TSPAN1 0.00023930  3.082052 9606.ENSP00000361072
## 6    TNNC1 0.00029210  2.932060 9606.ENSP00000232975
## 7     MGAM 0.00030510  2.369738 9606.ENSP00000447378
## 8   TRIM22 0.00032350  2.293125 9606.ENSP00000369299
## 9    KLK11 0.00035400  3.266867 9606.ENSP00000473047
## 10  TYROBP 0.00035930  2.410288 9606.ENSP00000262629
## 11  IGDCC4 0.00051270  2.409806 9606.ENSP00000319623
## 12   DEFB1 0.00056170  3.080354 9606.ENSP00000297439
## 13 TMEM52B 0.00056200  5.299586 9606.ENSP00000371348
## 14   UPK3B 0.00056430  2.073072 9606.ENSP00000257632
## 15 SLC52A1 0.00059290  3.214998 9606.ENSP00000399979
## 16    TMC1 0.00069220  2.050402 9606.ENSP00000297784
## 17    CPA6 0.00070690  2.091150 9606.ENSP00000297770
## 18   GRHL3 0.00071440  2.611668 9606.ENSP00000288955
## 19 SPATA18 0.00076575  2.436230 9606.ENSP00000295213
## 20    RBP3 0.00084440 -2.217963 9606.ENSP00000463151
dim(example1_mapped[example1_mapped$pvalue < 0.05,])
## [1] 1057    4
example1_mapped_pval05 <- string_db$add_diff_exp_color(subset(example1_mapped, pvalue<0.05),
                                                        logFcColStr="logFC" )
head(example1_mapped_pval05)
##     gene    pvalue    logFC            STRING_id     color
## 1 VSTM2L 0.0001018 3.333461 9606.ENSP00000362560 #FFDCDCFF
## 2 TBC1D2 0.0001392 3.822383 9606.ENSP00000481721 #FFC6C6FF
## 3  LENG9 0.0001720 3.306056 9606.ENSP00000479355 #FFDDDDFF
## 4 TMEM27 0.0001739 3.024605 9606.ENSP00000369699 #FFE6E6FF
## 5 TSPAN1 0.0002393 3.082052 9606.ENSP00000361072 #FFE4E4FF
## 6  TNNC1 0.0002921 2.932060 9606.ENSP00000232975 #FFE8E8FF
table(example1_mapped_pval05$color)
## 
## #00FF00FF #20FF20FF #47FF47FF #4EFF4EFF #51FF51FF #52FF52FF #56FF56FF #60FF60FF 
##         1         1         1         2         1         1         1         1 
## #68FF68FF #6CFF6CFF #78FF78FF #7BFF7BFF #7EFF7EFF #86FF86FF #8AFF8AFF #8DFF8DFF 
##         1         1         1         1         1         1         1         1 
## #92FF92FF #93FF93FF #98FF98FF #9BFF9BFF #9DFF9DFF #9FFF9FFF #A4FFA4FF #A6FFA6FF 
##         1         1         2         2         1         1         1         1 
## #A7FFA7FF #ADFFADFF #AEFFAEFF #AFFFAFFF #B0FFB0FF #B2FFB2FF #B3FFB3FF #B5FFB5FF 
##         1         1         1         1         1         2         1         1 
## #B9FFB9FF #BCFFBCFF #BDFFBDFF #BEFFBEFF #C0FFC0FF #C2FFC2FF #C4FFC4FF #C6FFC6FF 
##         1         1         2         1         1         1         1         2 
## #C7FFC7FF #CAFFCAFF #CBFFCBFF #CCFFCCFF #CEFFCEFF #CFFFCFFF #D0FFD0FF #D1FFD1FF 
##         1         2         1         2         1         1         2         1 
## #D2FFD2FF #D3FFD3FF #D4FFD4FF #D5FFD5FF #D6FFD6FF #D7FFD7FF #D8FFD8FF #D9FFD9FF 
##         2         3         4         1         2         3         3         2 
## #DAFFDAFF #DBFFDBFF #DCFFDCFF #DDFFDDFF #DEFFDEFF #E0FFE0FF #E1FFE1FF #E2FFE2FF 
##         3         1         2         2         3         1         3         6 
## #E3FFE3FF #E4FFE4FF #E5FFE5FF #E6FFE6FF #E7FFE7FF #E8FFE8FF #E9FFE9FF #EAFFEAFF 
##         3         5         3         4         7         1         4         9 
## #EBFFEBFF #ECFFECFF #EDFFEDFF #EEFFEEFF #EFFFEFFF #F0FFF0FF #F1FFF1FF #F2FFF2FF 
##        10         8         9        10        11        13        12        10 
## #F3FFF3FF #F4FFF4FF #F5FFF5FF #F6FFF6FF #F7FFF7FF #F8FFF8FF #F9FFF9FF #FAFFFAFF 
##        13        20        10         4         5         1         1         1 
## #FCFFFCFF #FEFFFEFF #FF0000FF #FFADADFF #FFC6C6FF #FFC7C7FF #FFCBCBFF #FFCECEFF 
##         2         1         1         1         1         2         2         1 
## #FFD6D6FF #FFD7D7FF #FFD8D8FF #FFDBDBFF #FFDCDCFF #FFDDDDFF #FFDFDFFF #FFE0E0FF 
##         1         1         1         1         2         1         2         1 
## #FFE2E2FF #FFE3E3FF #FFE4E4FF #FFE6E6FF #FFE7E7FF #FFE8E8FF #FFE9E9FF #FFEAEAFF 
##         2         1         2         3         1         2         2         1 
## #FFEBEBFF #FFECECFF #FFEDEDFF #FFEEEEFF #FFEFEFFF #FFF0F0FF #FFF1F1FF #FFF2F2FF 
##         2         1         6         3         6         6         8        12 
## #FFF3F3FF #FFF4F4FF #FFF5F5FF #FFF6F6FF #FFF7F7FF #FFF8F8FF #FFF9F9FF #FFFAFAFF 
##         3         8         6        13         8        22        24        29 
## #FFFBFBFF #FFFCFCFF #FFFDFDFF #FFFEFEFF #FFFFFFFF 
##        59        96       157       273         8
dim(example1_mapped_pval05)
## [1] 1057    5
example1_mapped_pval05[example1_mapped_pval05$color == '#D7FFD7FF', ]
##         gene    pvalue     logFC            STRING_id     color
## 202 PSORS1C1 0.0064309 -1.276350 9606.ENSP00000259881 #D7FFD7FF
## 313  CCDC121 0.0106570 -1.270537 9606.ENSP00000412150 #D7FFD7FF
## 326    IQCF6 0.0110005 -1.276934 9606.ENSP00000381760 #D7FFD7FF
example1_mapped_pval05[example1_mapped_pval05$color == '#FFEDEDFF', ]
##         gene    pvalue    logFC            STRING_id     color
## 45      GPR1 0.0013288 2.737352 9606.ENSP00000480405 #FFEDEDFF
## 112 PPP1R14C 0.0032367 2.712630 9606.ENSP00000355260 #FFEDEDFF
## 186  CXORF21 0.0059798 2.689851 9606.ENSP00000368245 #FFEDEDFF
## 362      LUM 0.0122549 2.708596 9606.ENSP00000266718 #FFEDEDFF
## 441  ZCCHC13 0.0161518 2.691808 9606.ENSP00000345633 #FFEDEDFF
## 863   GOLT1A 0.0383622 2.720178 9606.ENSP00000308535 #FFEDEDFF
example1_mapped_pval05[example1_mapped_pval05$color == '#FFF6F6FF', ]
##        gene      pvalue    logFC            STRING_id     color
## 14    UPK3B 0.000564300 2.073072 9606.ENSP00000257632 #FFF6F6FF
## 16     TMC1 0.000692200 2.050402 9606.ENSP00000297784 #FFF6F6FF
## 17     CPA6 0.000706900 2.091150 9606.ENSP00000297770 #FFF6F6FF
## 32    TIMP4 0.000960300 2.124868 9606.ENSP00000287814 #FFF6F6FF
## 59    PSMB8 0.001680700 2.086213 9606.ENSP00000364016 #FFF6F6FF
## 99   CSF2RA 0.002901725 2.115444 9606.ENSP00000394227 #FFF6F6FF
## 101   MGARP 0.002943500 2.081564 9606.ENSP00000381928 #FFF6F6FF
## 109 PLA2G2A 0.003131100 2.102297 9606.ENSP00000383364 #FFF6F6FF
## 512   ACTL8 0.020269500 2.068434 9606.ENSP00000364555 #FFF6F6FF
## 591   PRTN3 0.023486300 2.075010 9606.ENSP00000234347 #FFF6F6FF
## 616    EFHB 0.025116600 2.117086 9606.ENSP00000295824 #FFF6F6FF
## 714   MMP10 0.030393100 2.132074 9606.ENSP00000279441 #FFF6F6FF
## 856    PYGM 0.037511600 2.056828 9606.ENSP00000164139 #FFF6F6FF
# post payload information to the STRING server
payload_id <- string_db$post_payload( example1_mapped_pval05$STRING_id,
                                      colors=example1_mapped_pval05$color )
# display a STRING network png with the "halo"
string_db$plot_network(hits, payload_id=payload_id)

################################################3 ENRICHMENT
head(hits)
## [1] "9606.ENSP00000362560" "9606.ENSP00000481721" "9606.ENSP00000479355"
## [4] "9606.ENSP00000369699" "9606.ENSP00000361072" "9606.ENSP00000232975"
enrichment <- string_db$get_enrichment(hits)
head(enrichment, n=2)
##    category       term number_of_genes number_of_genes_in_background
## 1 Component GO.0005576              45                          2505
## 2      KEGG   hsa04115               6                            68
##   ncbiTaxonId
## 1        9606
## 2        9606
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         inputGenes
## 1 9606.ENSP00000008938,9606.ENSP00000216286,9606.ENSP00000223642,9606.ENSP00000243611,9606.ENSP00000245907,9606.ENSP00000249842,9606.ENSP00000254722,9606.ENSP00000258613,9606.ENSP00000261172,9606.ENSP00000262776,9606.ENSP00000263413,9606.ENSP00000264474,9606.ENSP00000278853,9606.ENSP00000282091,9606.ENSP00000287814,9606.ENSP00000296370,9606.ENSP00000297439,9606.ENSP00000297770,9606.ENSP00000301258,9606.ENSP00000321853,9606.ENSP00000334042,9606.ENSP00000337466,9606.ENSP00000342012,9606.ENSP00000345751,9606.ENSP00000347979,9606.ENSP00000349709,9606.ENSP00000362108,9606.ENSP00000362560,9606.ENSP00000368771,9606.ENSP00000375778,9606.ENSP00000377047,9606.ENSP00000383364,9606.ENSP00000383894,9606.ENSP00000385451,9606.ENSP00000385452,9606.ENSP00000388001,9606.ENSP00000394227,9606.ENSP00000421725,9606.ENSP00000433209,9606.ENSP00000442304,9606.ENSP00000451768,9606.ENSP00000463151,9606.ENSP00000473047,9606.ENSP00000480821,9606.ENSP00000480826
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    9606.ENSP00000238721,9606.ENSP00000256996,9606.ENSP00000347979,9606.ENSP00000360025,9606.ENSP00000384849,9606.ENSP00000393762
##                                                                                                                                                                                                                                                              preferredNames
## 1 PGLYRP1,NID2,C5,C4BPB,C3,ISLR,SERPINF1,THSD1,EPYC,LGALS3BP,C6,CSTA,ZP1,PTH,TIMP4,S100P,DEFB1,CPA6,PSCA,SERPINF2,TAC4,WFDC10B,DMKN,SCNN1B,FAS,RAET1E,LCN2,VSTM2L,MUC3A,LAMB3,PTPRZ1,PLA2G2A,MATN3,MDK,CGREF1,OAS1,CSF2RA,GC,GSDMD,FAM132B,C6orf174,RBP3,KLK11,LSR,HIST1H3A
## 2                                                                                                                                                                                                                                      TP53I3,DDB2,FAS,GADD45A,CDKN1A,SESN1
##   p_value    fdr           description
## 1 0.00012 0.0472  extracellular region
## 2 0.00010 0.0189 p53 signaling pathway
#
#If you have performed your experiment on a predefined set of proteins, it is important to run the
#enrichment statistics using that set as a background
backgroundV <- example1_mapped$STRING_id[1:2000]
#string_db$set_background(backgroundV)
#You can also set the background when you instantiate the STRINGdb object
#string_db <- STRINGdb$new( score_threshold=200, backgroundV = backgroundV )
##############################################4 CLUSTERING
#example1_mapped$STRING_id[1:3] #[1] "9606.ENSP00000362560" "9606.ENSP00000364207" "9606.ENSP00000331647"
#clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:300])
# plot first 4 clusters
#par(mfrow=c(2,1))
#for(i in seq(1:2)){
#   string_db$plot_network(clustersList[[i]])
#   }
##############################################5 help 
STRINGdb$help("map")
## Call:
## $map(my_data_frame, my_data_frame_id_col_names, takeFirst = , removeUnmappedRows = , quiet = )
## 
## 
## 
## Description:
##   Maps the gene identifiers of the input dataframe to STRING identifiers.
##   It returns the input dataframe with the "STRING_id" additional column.
## 
## Input parameters:
##   "my_data_frame"                 data frame provided as input. 
##   "my_data_frame_id_col_names"    vector contatining the names of the columns of "my_data_frame" that have to be used for the mapping.
##   "takeFirst"                     boolean indicating what to do in case of multiple STRING proteins that map to the same name. 
##                                       If TRUE, only the first of those is taken. Otherwise all of them are used. (default TRUE)
##   "removeUnmappedRows"            remove the rows that cannot be mapped to STRING 
##                                       (by default those lines are left and their STRING_id is set to NA)
##   "quiet"                         Setting this variable to TRUE we can avoid printing the warning relative to the unmapped values.
## 
## Author(s):
##    Andrea Franceschini
############################################5 ADDITIONAL PROTEIN INFORMATION
#You can get a table that contains all the proteins that are present in our database of the species of
#interest. The protein table also include the preferred name, the size and a short description of each
#protein.
string_proteins <- string_db$get_proteins()
#In the following section we will show how to query STRING with R on some specific proteins. In
#the examples, we will use the famous tumor proteins AHR and CYP1A1
#First we need to get the STRING identifier of those proteins, using our mp method:
tp53 = string_db$mp( "tp53" )
atm = string_db$mp( "atm" )
tp53; atm
## [1] "9606.ENSP00000269305"
## [1] "9606.ENSP00000278616"
#Using the following method, you can see the proteins that interact with one or more of your proteins:
string_db$get_neighbors( c(tp53, atm) ) [1:10]
##  [1] "9606.ENSP00000001008" "9606.ENSP00000002829" "9606.ENSP00000005260"
##  [4] "9606.ENSP00000006015" "9606.ENSP00000013807" "9606.ENSP00000014914"
##  [7] "9606.ENSP00000019317" "9606.ENSP00000025008" "9606.ENSP00000027335"
## [10] "9606.ENSP00000044462"
#It is also possible to retrieve the interactions that connect certain input proteins between each other.
#Using the ”get interactions” method we can clearly see that TP53 and ATM interact with each other
#with a good evidence/score.
string_db$get_interactions( c(tp53, atm) )
##                   from                   to combined_score
## 1 9606.ENSP00000269305 9606.ENSP00000278616            999
## 2 9606.ENSP00000269305 9606.ENSP00000278616            999
#STRING provides a way to get homologous proteins: in our database we store ALL-AGAINSTALL alignments within all 5090 organisms. You can retrive all of the paralogs of the protein using
#”get paralogs” method.
string_db$get_paralogs(tp53)
## [1] X9606             ENSP00000269305   X9606.1           ENSP00000269305.1
## [5] X815.8           
## <0 rows> (or 0-length row.names)