rm(list = ls())
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("STRINGdb")
#########################################1.1 library
#Loading the package
library(STRINGdb)
#browseVignettes("STRINGdb")
#STRINGdb$help("get_graph") # To visualize their documentation.
STRINGdb$methods() ## To list all the methods available.
## [1] ".objectPackage" ".objectParent"
## [3] "add_diff_exp_color" "add_proteins_description"
## [5] "benchmark_ppi" "benchmark_ppi_pathway_view"
## [7] "callSuper" "copy"
## [9] "enrichment_heatmap" "export"
## [11] "field" "get_aliases"
## [13] "get_annotations" "get_bioc_graph"
## [15] "get_clusters" "get_enrichment"
## [17] "get_graph" "get_homologs"
## [19] "get_homologs_besthits" "get_homology_graph"
## [21] "get_interactions" "get_link"
## [23] "get_neighbors" "get_paralogs"
## [25] "get_pathways_benchmarking_blackList" "get_png"
## [27] "get_ppi_enrichment" "get_ppi_enrichment_full"
## [29] "get_proteins" "get_pubmed"
## [31] "get_pubmed_interaction" "get_subnetwork"
## [33] "get_summary" "get_term_proteins"
## [35] "getClass" "getRefClass"
## [37] "import" "initFields"
## [39] "initialize" "load"
## [41] "load_all" "map"
## [43] "mp" "plot_network"
## [45] "plot_ppi_enrichment" "post_payload"
## [47] "ppi_enrichment" "remove_homologous_interactions"
## [49] "set_background" "show"
## [51] "show#envRefClass" "trace"
## [53] "untrace" "usingMethods"
#9606 for Human, 10090 for mouse
string_db <- STRINGdb$new( version="11", species=9606,
score_threshold=500, input_directory="")
class(string_db)
## [1] "STRINGdb"
## attr(,"package")
## [1] "STRINGdb"
#Importing the list with the positional candiate genes
data(diff_exp_example1)
head(diff_exp_example1)
## pvalue logFC gene
## 1 0.0001018 3.333461 VSTM2L
## 2 0.0001392 3.822383 TBC1D2
## 3 0.0001720 3.306056 LENG9
## 4 0.0001739 3.024605 TMEM27
## 5 0.0001990 3.854414 LOC100506014
## 6 0.0002393 3.082052 TSPAN1
dim(diff_exp_example1) #[1] 20861 3
## [1] 20861 3
#View(diff_exp_example1)
#The map function adds an additional column with STRING identifiers to
#the dataframe that is passed as first parameter
example1_mapped <- string_db$map(diff_exp_example1, "gene", removeUnmappedRows = TRUE )
## Warning: we couldn't map to STRING 15% of your identifiers
dim(example1_mapped) #[1] 17748 4
## [1] 17708 4
#View(example1_mapped)
hits <- example1_mapped$STRING_id[1:200]
string_db$plot_network(hits)

##################################################2 PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(example1_mapped, pvalue<0.05),20)
## gene pvalue logFC STRING_id
## 1 VSTM2L 0.00010180 3.333461 9606.ENSP00000362560
## 2 TBC1D2 0.00013920 3.822383 9606.ENSP00000481721
## 3 LENG9 0.00017200 3.306056 9606.ENSP00000479355
## 4 TMEM27 0.00017390 3.024605 9606.ENSP00000369699
## 5 TSPAN1 0.00023930 3.082052 9606.ENSP00000361072
## 6 TNNC1 0.00029210 2.932060 9606.ENSP00000232975
## 7 MGAM 0.00030510 2.369738 9606.ENSP00000447378
## 8 TRIM22 0.00032350 2.293125 9606.ENSP00000369299
## 9 KLK11 0.00035400 3.266867 9606.ENSP00000473047
## 10 TYROBP 0.00035930 2.410288 9606.ENSP00000262629
## 11 IGDCC4 0.00051270 2.409806 9606.ENSP00000319623
## 12 DEFB1 0.00056170 3.080354 9606.ENSP00000297439
## 13 TMEM52B 0.00056200 5.299586 9606.ENSP00000371348
## 14 UPK3B 0.00056430 2.073072 9606.ENSP00000257632
## 15 SLC52A1 0.00059290 3.214998 9606.ENSP00000399979
## 16 TMC1 0.00069220 2.050402 9606.ENSP00000297784
## 17 CPA6 0.00070690 2.091150 9606.ENSP00000297770
## 18 GRHL3 0.00071440 2.611668 9606.ENSP00000288955
## 19 SPATA18 0.00076575 2.436230 9606.ENSP00000295213
## 20 RBP3 0.00084440 -2.217963 9606.ENSP00000463151
dim(example1_mapped[example1_mapped$pvalue < 0.05,])
## [1] 1057 4
example1_mapped_pval05 <- string_db$add_diff_exp_color(subset(example1_mapped, pvalue<0.05),
logFcColStr="logFC" )
head(example1_mapped_pval05)
## gene pvalue logFC STRING_id color
## 1 VSTM2L 0.0001018 3.333461 9606.ENSP00000362560 #FFDCDCFF
## 2 TBC1D2 0.0001392 3.822383 9606.ENSP00000481721 #FFC6C6FF
## 3 LENG9 0.0001720 3.306056 9606.ENSP00000479355 #FFDDDDFF
## 4 TMEM27 0.0001739 3.024605 9606.ENSP00000369699 #FFE6E6FF
## 5 TSPAN1 0.0002393 3.082052 9606.ENSP00000361072 #FFE4E4FF
## 6 TNNC1 0.0002921 2.932060 9606.ENSP00000232975 #FFE8E8FF
table(example1_mapped_pval05$color)
##
## #00FF00FF #20FF20FF #47FF47FF #4EFF4EFF #51FF51FF #52FF52FF #56FF56FF #60FF60FF
## 1 1 1 2 1 1 1 1
## #68FF68FF #6CFF6CFF #78FF78FF #7BFF7BFF #7EFF7EFF #86FF86FF #8AFF8AFF #8DFF8DFF
## 1 1 1 1 1 1 1 1
## #92FF92FF #93FF93FF #98FF98FF #9BFF9BFF #9DFF9DFF #9FFF9FFF #A4FFA4FF #A6FFA6FF
## 1 1 2 2 1 1 1 1
## #A7FFA7FF #ADFFADFF #AEFFAEFF #AFFFAFFF #B0FFB0FF #B2FFB2FF #B3FFB3FF #B5FFB5FF
## 1 1 1 1 1 2 1 1
## #B9FFB9FF #BCFFBCFF #BDFFBDFF #BEFFBEFF #C0FFC0FF #C2FFC2FF #C4FFC4FF #C6FFC6FF
## 1 1 2 1 1 1 1 2
## #C7FFC7FF #CAFFCAFF #CBFFCBFF #CCFFCCFF #CEFFCEFF #CFFFCFFF #D0FFD0FF #D1FFD1FF
## 1 2 1 2 1 1 2 1
## #D2FFD2FF #D3FFD3FF #D4FFD4FF #D5FFD5FF #D6FFD6FF #D7FFD7FF #D8FFD8FF #D9FFD9FF
## 2 3 4 1 2 3 3 2
## #DAFFDAFF #DBFFDBFF #DCFFDCFF #DDFFDDFF #DEFFDEFF #E0FFE0FF #E1FFE1FF #E2FFE2FF
## 3 1 2 2 3 1 3 6
## #E3FFE3FF #E4FFE4FF #E5FFE5FF #E6FFE6FF #E7FFE7FF #E8FFE8FF #E9FFE9FF #EAFFEAFF
## 3 5 3 4 7 1 4 9
## #EBFFEBFF #ECFFECFF #EDFFEDFF #EEFFEEFF #EFFFEFFF #F0FFF0FF #F1FFF1FF #F2FFF2FF
## 10 8 9 10 11 13 12 10
## #F3FFF3FF #F4FFF4FF #F5FFF5FF #F6FFF6FF #F7FFF7FF #F8FFF8FF #F9FFF9FF #FAFFFAFF
## 13 20 10 4 5 1 1 1
## #FCFFFCFF #FEFFFEFF #FF0000FF #FFADADFF #FFC6C6FF #FFC7C7FF #FFCBCBFF #FFCECEFF
## 2 1 1 1 1 2 2 1
## #FFD6D6FF #FFD7D7FF #FFD8D8FF #FFDBDBFF #FFDCDCFF #FFDDDDFF #FFDFDFFF #FFE0E0FF
## 1 1 1 1 2 1 2 1
## #FFE2E2FF #FFE3E3FF #FFE4E4FF #FFE6E6FF #FFE7E7FF #FFE8E8FF #FFE9E9FF #FFEAEAFF
## 2 1 2 3 1 2 2 1
## #FFEBEBFF #FFECECFF #FFEDEDFF #FFEEEEFF #FFEFEFFF #FFF0F0FF #FFF1F1FF #FFF2F2FF
## 2 1 6 3 6 6 8 12
## #FFF3F3FF #FFF4F4FF #FFF5F5FF #FFF6F6FF #FFF7F7FF #FFF8F8FF #FFF9F9FF #FFFAFAFF
## 3 8 6 13 8 22 24 29
## #FFFBFBFF #FFFCFCFF #FFFDFDFF #FFFEFEFF #FFFFFFFF
## 59 96 157 273 8
dim(example1_mapped_pval05)
## [1] 1057 5
example1_mapped_pval05[example1_mapped_pval05$color == '#D7FFD7FF', ]
## gene pvalue logFC STRING_id color
## 202 PSORS1C1 0.0064309 -1.276350 9606.ENSP00000259881 #D7FFD7FF
## 313 CCDC121 0.0106570 -1.270537 9606.ENSP00000412150 #D7FFD7FF
## 326 IQCF6 0.0110005 -1.276934 9606.ENSP00000381760 #D7FFD7FF
example1_mapped_pval05[example1_mapped_pval05$color == '#FFEDEDFF', ]
## gene pvalue logFC STRING_id color
## 45 GPR1 0.0013288 2.737352 9606.ENSP00000480405 #FFEDEDFF
## 112 PPP1R14C 0.0032367 2.712630 9606.ENSP00000355260 #FFEDEDFF
## 186 CXORF21 0.0059798 2.689851 9606.ENSP00000368245 #FFEDEDFF
## 362 LUM 0.0122549 2.708596 9606.ENSP00000266718 #FFEDEDFF
## 441 ZCCHC13 0.0161518 2.691808 9606.ENSP00000345633 #FFEDEDFF
## 863 GOLT1A 0.0383622 2.720178 9606.ENSP00000308535 #FFEDEDFF
example1_mapped_pval05[example1_mapped_pval05$color == '#FFF6F6FF', ]
## gene pvalue logFC STRING_id color
## 14 UPK3B 0.000564300 2.073072 9606.ENSP00000257632 #FFF6F6FF
## 16 TMC1 0.000692200 2.050402 9606.ENSP00000297784 #FFF6F6FF
## 17 CPA6 0.000706900 2.091150 9606.ENSP00000297770 #FFF6F6FF
## 32 TIMP4 0.000960300 2.124868 9606.ENSP00000287814 #FFF6F6FF
## 59 PSMB8 0.001680700 2.086213 9606.ENSP00000364016 #FFF6F6FF
## 99 CSF2RA 0.002901725 2.115444 9606.ENSP00000394227 #FFF6F6FF
## 101 MGARP 0.002943500 2.081564 9606.ENSP00000381928 #FFF6F6FF
## 109 PLA2G2A 0.003131100 2.102297 9606.ENSP00000383364 #FFF6F6FF
## 512 ACTL8 0.020269500 2.068434 9606.ENSP00000364555 #FFF6F6FF
## 591 PRTN3 0.023486300 2.075010 9606.ENSP00000234347 #FFF6F6FF
## 616 EFHB 0.025116600 2.117086 9606.ENSP00000295824 #FFF6F6FF
## 714 MMP10 0.030393100 2.132074 9606.ENSP00000279441 #FFF6F6FF
## 856 PYGM 0.037511600 2.056828 9606.ENSP00000164139 #FFF6F6FF
# post payload information to the STRING server
payload_id <- string_db$post_payload( example1_mapped_pval05$STRING_id,
colors=example1_mapped_pval05$color )
# display a STRING network png with the "halo"
string_db$plot_network(hits, payload_id=payload_id)

################################################3 ENRICHMENT
head(hits)
## [1] "9606.ENSP00000362560" "9606.ENSP00000481721" "9606.ENSP00000479355"
## [4] "9606.ENSP00000369699" "9606.ENSP00000361072" "9606.ENSP00000232975"
enrichment <- string_db$get_enrichment(hits)
head(enrichment, n=2)
## category term number_of_genes number_of_genes_in_background
## 1 Component GO.0005576 45 2505
## 2 KEGG hsa04115 6 68
## ncbiTaxonId
## 1 9606
## 2 9606
## inputGenes
## 1 9606.ENSP00000008938,9606.ENSP00000216286,9606.ENSP00000223642,9606.ENSP00000243611,9606.ENSP00000245907,9606.ENSP00000249842,9606.ENSP00000254722,9606.ENSP00000258613,9606.ENSP00000261172,9606.ENSP00000262776,9606.ENSP00000263413,9606.ENSP00000264474,9606.ENSP00000278853,9606.ENSP00000282091,9606.ENSP00000287814,9606.ENSP00000296370,9606.ENSP00000297439,9606.ENSP00000297770,9606.ENSP00000301258,9606.ENSP00000321853,9606.ENSP00000334042,9606.ENSP00000337466,9606.ENSP00000342012,9606.ENSP00000345751,9606.ENSP00000347979,9606.ENSP00000349709,9606.ENSP00000362108,9606.ENSP00000362560,9606.ENSP00000368771,9606.ENSP00000375778,9606.ENSP00000377047,9606.ENSP00000383364,9606.ENSP00000383894,9606.ENSP00000385451,9606.ENSP00000385452,9606.ENSP00000388001,9606.ENSP00000394227,9606.ENSP00000421725,9606.ENSP00000433209,9606.ENSP00000442304,9606.ENSP00000451768,9606.ENSP00000463151,9606.ENSP00000473047,9606.ENSP00000480821,9606.ENSP00000480826
## 2 9606.ENSP00000238721,9606.ENSP00000256996,9606.ENSP00000347979,9606.ENSP00000360025,9606.ENSP00000384849,9606.ENSP00000393762
## preferredNames
## 1 PGLYRP1,NID2,C5,C4BPB,C3,ISLR,SERPINF1,THSD1,EPYC,LGALS3BP,C6,CSTA,ZP1,PTH,TIMP4,S100P,DEFB1,CPA6,PSCA,SERPINF2,TAC4,WFDC10B,DMKN,SCNN1B,FAS,RAET1E,LCN2,VSTM2L,MUC3A,LAMB3,PTPRZ1,PLA2G2A,MATN3,MDK,CGREF1,OAS1,CSF2RA,GC,GSDMD,FAM132B,C6orf174,RBP3,KLK11,LSR,HIST1H3A
## 2 TP53I3,DDB2,FAS,GADD45A,CDKN1A,SESN1
## p_value fdr description
## 1 0.00012 0.0472 extracellular region
## 2 0.00010 0.0189 p53 signaling pathway
#
#If you have performed your experiment on a predefined set of proteins, it is important to run the
#enrichment statistics using that set as a background
backgroundV <- example1_mapped$STRING_id[1:2000]
#string_db$set_background(backgroundV)
#You can also set the background when you instantiate the STRINGdb object
#string_db <- STRINGdb$new( score_threshold=200, backgroundV = backgroundV )
##############################################4 CLUSTERING
#example1_mapped$STRING_id[1:3] #[1] "9606.ENSP00000362560" "9606.ENSP00000364207" "9606.ENSP00000331647"
#clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:300])
# plot first 4 clusters
#par(mfrow=c(2,1))
#for(i in seq(1:2)){
# string_db$plot_network(clustersList[[i]])
# }
##############################################5 help
STRINGdb$help("map")
## Call:
## $map(my_data_frame, my_data_frame_id_col_names, takeFirst = , removeUnmappedRows = , quiet = )
##
##
##
## Description:
## Maps the gene identifiers of the input dataframe to STRING identifiers.
## It returns the input dataframe with the "STRING_id" additional column.
##
## Input parameters:
## "my_data_frame" data frame provided as input.
## "my_data_frame_id_col_names" vector contatining the names of the columns of "my_data_frame" that have to be used for the mapping.
## "takeFirst" boolean indicating what to do in case of multiple STRING proteins that map to the same name.
## If TRUE, only the first of those is taken. Otherwise all of them are used. (default TRUE)
## "removeUnmappedRows" remove the rows that cannot be mapped to STRING
## (by default those lines are left and their STRING_id is set to NA)
## "quiet" Setting this variable to TRUE we can avoid printing the warning relative to the unmapped values.
##
## Author(s):
## Andrea Franceschini
############################################5 ADDITIONAL PROTEIN INFORMATION
#You can get a table that contains all the proteins that are present in our database of the species of
#interest. The protein table also include the preferred name, the size and a short description of each
#protein.
string_proteins <- string_db$get_proteins()
#In the following section we will show how to query STRING with R on some specific proteins. In
#the examples, we will use the famous tumor proteins AHR and CYP1A1
#First we need to get the STRING identifier of those proteins, using our mp method:
tp53 = string_db$mp( "tp53" )
atm = string_db$mp( "atm" )
tp53; atm
## [1] "9606.ENSP00000269305"
## [1] "9606.ENSP00000278616"
#Using the following method, you can see the proteins that interact with one or more of your proteins:
string_db$get_neighbors( c(tp53, atm) ) [1:10]
## [1] "9606.ENSP00000001008" "9606.ENSP00000002829" "9606.ENSP00000005260"
## [4] "9606.ENSP00000006015" "9606.ENSP00000013807" "9606.ENSP00000014914"
## [7] "9606.ENSP00000019317" "9606.ENSP00000025008" "9606.ENSP00000027335"
## [10] "9606.ENSP00000044462"
#It is also possible to retrieve the interactions that connect certain input proteins between each other.
#Using the ”get interactions” method we can clearly see that TP53 and ATM interact with each other
#with a good evidence/score.
string_db$get_interactions( c(tp53, atm) )
## from to combined_score
## 1 9606.ENSP00000269305 9606.ENSP00000278616 999
## 2 9606.ENSP00000269305 9606.ENSP00000278616 999
#STRING provides a way to get homologous proteins: in our database we store ALL-AGAINSTALL alignments within all 5090 organisms. You can retrive all of the paralogs of the protein using
#”get paralogs” method.
string_db$get_paralogs(tp53)
## [1] X9606 ENSP00000269305 X9606.1 ENSP00000269305.1
## [5] X815.8
## <0 rows> (or 0-length row.names)