This notebook is an adaptation of the 2015 introductory vignette by Andrea Franceschini, available here.
STRING is a database of known and predicted
protein-protein interactions. The interactions include direct (physical)
and indirect (functional) associations. The database contains
information from numerous sources, including experimental repositories,
computational prediction methods and public text collections. Each
interaction is associated with a combined confidence score that
integrates the various evidences.
As you will learn in this guide, the STRING database can
be useful to add meaning to list of genes (e.g. the best hits coming out
from a screen or the most differentially expressed genes coming out from
a Microarray/RNAseq experiment.)
As an example, we use the analyzed data of a microarray study taken
from GEO (Gene Expression Omnibus. GSE9008). This study investigates the
activity of Resveratrol, a natural phytoestrogen found in red winde and
a variety of plants, in A549 lung cancer cells. Microarray gene
expression profiling after 48 hours exposure to Resveratrol has been
performed and compared to a control composed by A549 lung cancer cells
threated only with ethanol. This data is already analyzed for
differential expression using the limma package: the genes
are sorted by fdr corrected pvalues and log fold change of the
differential expression is also reported in the table.
# Importing the list with the positional candiate genes
utils::data(diff_exp_example1)
head(diff_exp_example1)
## pvalue logFC gene
## 1 0.0001018 3.333461 VSTM2L
## 2 0.0001392 3.822383 TBC1D2
## 3 0.0001720 3.306056 LENG9
## 4 0.0001739 3.024605 TMEM27
## 5 0.0001990 3.854414 LOC100506014
## 6 0.0002393 3.082052 TSPAN1
# load string database
#9606 for Human, 10090 for mouse
string_db <- STRINGdb$new(version = "11.5", species = 9606, score_threshold = 200, input_directory="")
class(string_db)
## [1] "STRINGdb"
## attr(,"package")
## [1] "STRINGdb"
As a first step, we map the gene names to the STRING
database identifiers using the map method.
The map function adds an additional column with
STRING identifiers to the dataframe that is passed as first
parameter.
example1_mapped <- string_db$map(diff_exp_example1, "gene", removeUnmappedRows = TRUE )
## Warning: we couldn't map to STRING 15% of your identifiers
dim(example1_mapped)
## [1] 17684 4
#View(example1_mapped)
hits <- example1_mapped$STRING_id[1:200]
string_db$plot_network(hits)
# PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(example1_mapped, log10(pvalue) >= -log10(0.01) | abs(logFC) >= 0.5))
## gene pvalue logFC STRING_id
## 1 VSTM2L 0.0001018 3.333461 9606.ENSP00000362560
## 2 TBC1D2 0.0001392 3.822383 9606.ENSP00000481721
## 3 LENG9 0.0001720 3.306056 9606.ENSP00000479355
## 4 TMEM27 0.0001739 3.024605 9606.ENSP00000369699
## 5 TSPAN1 0.0002393 3.082052 9606.ENSP00000361072
## 6 TNNC1 0.0002921 2.932060 9606.ENSP00000232975
example1_mapped_sig <- string_db$add_diff_exp_color(subset(example1_mapped, log10(pvalue) >= -log10(0.01) | abs(logFC) >= 0.5),
logFcColStr="logFC" )
head(example1_mapped_sig)
## gene pvalue logFC STRING_id color
## 1 VSTM2L 0.0001018 3.333461 9606.ENSP00000362560 #FFDDDDFF
## 2 TBC1D2 0.0001392 3.822383 9606.ENSP00000481721 #FFC6C6FF
## 3 LENG9 0.0001720 3.306056 9606.ENSP00000479355 #FFDEDEFF
## 4 TMEM27 0.0001739 3.024605 9606.ENSP00000369699 #FFE7E7FF
## 5 TSPAN1 0.0002393 3.082052 9606.ENSP00000361072 #FFE5E5FF
## 6 TNNC1 0.0002921 2.932060 9606.ENSP00000232975 #FFE9E9FF
table(example1_mapped_sig$color)
##
## #00FF00FF #04FF04FF #0CFF0CFF #21FF21FF #39FF39FF #49FF49FF #51FF51FF #54FF54FF
## 1 1 2 1 1 1 1 1
## #55FF55FF #59FF59FF #5BFF5BFF #5DFF5DFF #62FF62FF #63FF63FF #68FF68FF #6AFF6AFF
## 1 1 1 1 1 1 1 1
## #6DFF6DFF #70FF70FF #75FF75FF #7BFF7BFF #7DFF7DFF #80FF80FF #83FF83FF #85FF85FF
## 1 1 1 1 1 1 1 1
## #86FF86FF #8CFF8CFF #8DFF8DFF #8FFF8FFF #92FF92FF #94FF94FF #95FF95FF #98FF98FF
## 1 3 2 1 2 2 1 1
## #99FF99FF #9BFF9BFF #9CFF9CFF #9DFF9DFF #9EFF9EFF #A0FFA0FF #A1FFA1FF #A2FFA2FF
## 1 3 1 1 2 2 1 1
## #A4FFA4FF #A5FFA5FF #A7FFA7FF #A9FFA9FF #AAFFAAFF #ACFFACFF #ADFFADFF #AEFFAEFF
## 2 1 1 1 2 2 4 1
## #AFFFAFFF #B1FFB1FF #B2FFB2FF #B4FFB4FF #B5FFB5FF #B6FFB6FF #B7FFB7FF #B8FFB8FF
## 2 2 1 5 2 4 3 2
## #B9FFB9FF #BAFFBAFF #BBFFBBFF #BCFFBCFF #BFFFBFFF #C0FFC0FF #C1FFC1FF #C2FFC2FF
## 4 1 2 4 1 2 1 2
## #C3FFC3FF #C4FFC4FF #C5FFC5FF #C6FFC6FF #C7FFC7FF #C8FFC8FF #C9FFC9FF #CAFFCAFF
## 1 3 2 3 2 2 2 6
## #CBFFCBFF #CCFFCCFF #CDFFCDFF #CEFFCEFF #CFFFCFFF #D0FFD0FF #D1FFD1FF #D2FFD2FF
## 2 2 6 4 4 1 2 8
## #D3FFD3FF #D4FFD4FF #D5FFD5FF #D6FFD6FF #D7FFD7FF #D8FFD8FF #D9FFD9FF #DAFFDAFF
## 5 5 4 2 6 10 8 8
## #DBFFDBFF #DCFFDCFF #DDFFDDFF #DEFFDEFF #DFFFDFFF #E0FFE0FF #E1FFE1FF #E2FFE2FF
## 9 8 12 12 14 9 13 10
## #E3FFE3FF #E4FFE4FF #E5FFE5FF #E6FFE6FF #E7FFE7FF #E8FFE8FF #E9FFE9FF #EAFFEAFF
## 13 11 9 12 16 12 21 18
## #EBFFEBFF #ECFFECFF #EDFFEDFF #EEFFEEFF #EFFFEFFF #F0FFF0FF #F1FFF1FF #F2FFF2FF
## 25 22 29 26 26 39 34 26
## #F3FFF3FF #F4FFF4FF #F5FFF5FF #F6FFF6FF #F7FFF7FF #F8FFF8FF #F9FFF9FF #FAFFFAFF
## 34 57 53 62 71 75 89 86
## #FBFFFBFF #FCFFFCFF #FDFFFDFF #FEFFFEFF #FF0000FF #FFAEAEFF #FFC6C6FF #FFC7C7FF
## 113 137 133 169 1 1 1 1
## #FFC8C8FF #FFCBCBFF #FFCCCCFF #FFCFCFFF #FFD7D7FF #FFD8D8FF #FFDCDCFF #FFDDDDFF
## 1 1 1 1 2 1 1 2
## #FFDEDEFF #FFDFDFFF #FFE0E0FF #FFE1E1FF #FFE3E3FF #FFE4E4FF #FFE5E5FF #FFE6E6FF
## 2 1 1 1 2 2 2 1
## #FFE7E7FF #FFE8E8FF #FFE9E9FF #FFEAEAFF #FFEBEBFF #FFECECFF #FFEDEDFF #FFEEEEFF
## 2 1 2 2 1 4 2 9
## #FFEFEFFF #FFF0F0FF #FFF1F1FF #FFF2F2FF #FFF3F3FF #FFF4F4FF #FFF5F5FF #FFF6F6FF
## 4 11 10 9 11 4 13 12
## #FFF7F7FF #FFF8F8FF #FFF9F9FF #FFFAFAFF #FFFBFBFF #FFFCFCFF #FFFDFDFF #FFFEFEFF
## 26 20 46 67 82 159 315 727
## #FFFFFFFF
## 1096
dim(example1_mapped_sig)
## [1] 4366 5
example1_mapped_sig[example1_mapped_sig$color == '#D7FFD7FF', ]
## gene pvalue logFC STRING_id color
## 157 MT4 0.0051062 -1.402174 9606.ENSP00000219162 #D7FFD7FF
## 2597 DOHH 0.1452406 -1.407355 9606.ENSP00000398882 #D7FFD7FF
## 2997 TECTB 0.1658944 -1.402841 9606.ENSP00000358430 #D7FFD7FF
## 4785 TTLL2 0.2553177 -1.401929 9606.ENSP00000239587 #D7FFD7FF
## 6021 ZNF225 0.3087013 -1.412648 9606.ENSP00000262894 #D7FFD7FF
## 6435 TTC40 0.3263855 -1.408716 9606.ENSP00000357575 #D7FFD7FF
example1_mapped_sig[example1_mapped_sig$color == '#FFEDEDFF', ]
## gene pvalue logFC STRING_id color
## 45 GPR1 0.0013288 2.737352 9606.ENSP00000480405 #FFEDEDFF
## 102 NCF2 0.0029710 2.772363 9606.ENSP00000356505 #FFEDEDFF
example1_mapped_sig[example1_mapped_sig$color == '#FFF6F6FF', ]
## gene pvalue logFC STRING_id color
## 32 TIMP4 0.000960300 2.124868 9606.ENSP00000287814 #FFF6F6FF
## 50 FAM212A 0.001464200 2.157259 9606.ENSP00000329735 #FFF6F6FF
## 62 SLC25A31 0.001743400 2.173779 9606.ENSP00000281154 #FFF6F6FF
## 99 CSF2RA 0.002901725 2.115444 9606.ENSP00000394227 #FFF6F6FF
## 161 PKD2L1 0.005162900 2.175662 9606.ENSP00000325296 #FFF6F6FF
## 354 RNF128 0.011995900 2.158618 9606.ENSP00000255499 #FFF6F6FF
## 606 FABP2 0.024723100 2.172116 9606.ENSP00000274024 #FFF6F6FF
## 613 EFHB 0.025116600 2.117086 9606.ENSP00000295824 #FFF6F6FF
## 711 MMP10 0.030393100 2.132074 9606.ENSP00000279441 #FFF6F6FF
## 1597 DSC1 0.083357100 2.183146 9606.ENSP00000257198 #FFF6F6FF
## 2807 BNC1 0.155771500 2.156336 9606.ENSP00000307041 #FFF6F6FF
## 10431 KLHDC1 0.476332350 2.122160 9606.ENSP00000352282 #FFF6F6FF
# post payload information to the STRING server
payload_id <- string_db$post_payload( example1_mapped_sig$STRING_id,
colors=example1_mapped_sig$color )
# display a STRING network png with the "halo"
string_db$plot_network(hits, payload_id=payload_id)
clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600])
# plot first 4 clusters
par(mfrow=c(2,2))
for(i in seq(1:4)){
string_db$plot_network(clustersList[[i]])
}
#GO enrichment analysis
enrichmentGO <- string_db$get_enrichment( hits, category = "Process" )
head(enrichmentGO)
## category term number_of_genes number_of_genes_in_background ncbiTaxonId
## 1 Process GO:0006952 34 1296 9606
## 2 Process GO:0010951 12 248 9606
## 3 Process GO:0051707 31 1256 9606
## inputGenes
## 1 9606.ENSP00000008938,9606.ENSP00000223642,9606.ENSP00000224337,9606.ENSP00000243611,9606.ENSP00000245907,9606.ENSP00000262629,9606.ENSP00000262776,9606.ENSP00000263413,9606.ENSP00000264908,9606.ENSP00000265052,9606.ENSP00000297439,9606.ENSP00000321853,9606.ENSP00000325589,9606.ENSP00000332659,9606.ENSP00000334042,9606.ENSP00000337466,9606.ENSP00000349238,9606.ENSP00000349709,9606.ENSP00000355340,9606.ENSP00000356505,9606.ENSP00000359512,9606.ENSP00000360869,9606.ENSP00000361471,9606.ENSP00000362108,9606.ENSP00000364016,9606.ENSP00000369299,9606.ENSP00000379003,9606.ENSP00000383364,9606.ENSP00000385451,9606.ENSP00000388001,9606.ENSP00000406478,9606.ENSP00000426613,9606.ENSP00000433209,9606.ENSP00000475084
## 2 9606.ENSP00000223642,9606.ENSP00000237696,9606.ENSP00000245907,9606.ENSP00000254722,9606.ENSP00000264474,9606.ENSP00000265598,9606.ENSP00000287814,9606.ENSP00000321853,9606.ENSP00000337466,9606.ENSP00000407375,9606.ENSP00000433560,9606.ENSP00000434412
## 3 9606.ENSP00000008938,9606.ENSP00000223642,9606.ENSP00000243611,9606.ENSP00000245907,9606.ENSP00000262629,9606.ENSP00000263413,9606.ENSP00000264908,9606.ENSP00000287814,9606.ENSP00000297439,9606.ENSP00000310110,9606.ENSP00000332659,9606.ENSP00000337466,9606.ENSP00000341610,9606.ENSP00000349238,9606.ENSP00000349709,9606.ENSP00000355340,9606.ENSP00000356505,9606.ENSP00000359512,9606.ENSP00000360869,9606.ENSP00000361471,9606.ENSP00000362108,9606.ENSP00000364016,9606.ENSP00000365318,9606.ENSP00000369299,9606.ENSP00000383364,9606.ENSP00000388001,9606.ENSP00000404705,9606.ENSP00000407375,9606.ENSP00000426613,9606.ENSP00000433209,9606.ENSP00000475084
## preferredNames
## 1 PGLYRP1,C5,BLNK,C4BPB,C3,TYROBP,LGALS3BP,C6,ANXA3,MGLL,DEFB1,SERPINF2,PTPRCAP,CCR4,TAC4,WFDC10B,CCDC88B,RAET1E,APOBEC3C,NCF2,GBP3,IFIT1,ASS1,LCN2,PSMB8,TRIM22,NUPR1,PLA2G2A,MDK,OAS1,LILRB5,TNIP3,GSDMD,DUOX2
## 2 C5,RARRES1,C3,SERPINF1,CSTA,LAMP3,TIMP4,SERPINF2,WFDC10B,GPX1,CRYAB,SERPINH1
## 3 PGLYRP1,C5,C4BPB,C3,TYROBP,C6,ANXA3,TIMP4,DEFB1,TMEM255A,CCR4,WFDC10B,C15orf48,CCDC88B,RAET1E,APOBEC3C,NCF2,GBP3,IFIT1,ASS1,LCN2,PSMB8,NFKBIL1,TRIM22,PLA2G2A,OAS1,NAALADL2,GPX1,TNIP3,GSDMD,DUOX2
## p_value fdr description
## 1 5.07e-07 0.0065 Defense response
## 2 1.43e-05 0.0378 Negative regulation of endopeptidase activity
## 3 5.89e-06 0.0378 Response to other organism
#KEGG enrichment analysis
enrichmentKEGG <- string_db$get_enrichment( hits, category = "KEGG" )
head(enrichmentKEGG)
## category term number_of_genes number_of_genes_in_background ncbiTaxonId
## 12 KEGG hsa04115 6 72 9606
## inputGenes
## 12 9606.ENSP00000238721,9606.ENSP00000256996,9606.ENSP00000347979,9606.ENSP00000360025,9606.ENSP00000384849,9606.ENSP00000393762
## preferredNames p_value fdr description
## 12 TP53I3,DDB2,FAS,GADD45A,CDKN1A,SESN1 0.00014 0.0469 p53 signaling pathway
Seurat# specify your cell type
my_celltype <- "Ast"
# specify time point of interest
my_timept <- "2mo"
# Load Seurat data
# Change the filepath in quotes "" below to match where you
# have your celltype data saved
# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds"
# (where `my-celltype` is your celltype code)
my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds")
# Subset my celltype-specific Seurat object by time
my_seurat <- subset(x = my_celltype_data, subset = Age == my_timept)
# set active identities of my subset seurat to "Mt" attribute (denoting variant)
# for DE testing
Idents(my_seurat) <- "Mt"
summary(as.factor(my_seurat$Mt))
## V337M V337V
## 81 98
# Run D.E. test to identify genes differentially expressed between
# between diseased and normal cells, for my celltype and at this timepoint
marker_genes.df <- FindMarkers(my_seurat,
ident.1 = "V337M", ident.2 = "V337V")
# order results by p-value
marker_genes.df <- marker_genes.df %>% arrange(p_val)
head(marker_genes.df)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## HMGXB4 2.964098e-05 -0.4700985 0.247 0.541 1
## SNRPGP10 6.754363e-05 0.3308939 0.222 0.031 1
## TTBK2 1.044893e-04 -0.3491722 0.049 0.265 1
## RAB28 1.456217e-04 0.3951541 0.383 0.143 1
## HIRIP3 3.108470e-04 -0.2753372 0.037 0.224 1
## DOK5 3.644166e-04 0.4949083 0.617 0.357 1
# format a new dataframe using the contents of
# marker_genes.df, to pass to STRINGdb
stringdb.df <- marker_genes.df %>%
tibble::rownames_to_column("geneIDs") %>%
dplyr::select(geneIDs, avg_log2FC, p_val) %>%
mutate(logpval = -log10(p_val))
colnames(stringdb.df) <- c("geneIDs","logFC","pvalue","logpval")
# mapping
seurat_mapped <- string_db$map(stringdb.df, "geneIDs", removeUnmappedRows = TRUE )
## Warning: we couldn't map to STRING 5% of your identifiers
dim(seurat_mapped)
## [1] 317 5
#View(seurat_mapped)
hits_seurat <- seurat_mapped$STRING_id[1:200]
string_db$plot_network(hits_seurat)
# PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(seurat_mapped, logpval >= -log10(0.01) & abs(logFC) >= 0.5))
## geneIDs logFC pvalue logpval STRING_id
## 9 HMGN3 0.5064855 0.001051848 2.978047 9606.ENSP00000482613
## 16 PCLO -0.5419183 0.002314014 2.635634 9606.ENSP00000334319
## 35 PTGDS -1.0792829 0.004890413 2.310654 9606.ENSP00000360687
## 49 KIDINS220 -0.5072945 0.006594584 2.180813 9606.ENSP00000256707
## 55 RUNX1T1 -0.5372801 0.007386023 2.131589 9606.ENSP00000402257
## 63 TMSB4X 0.5183016 0.008903777 2.050426 9606.ENSP00000370010
seurat_mapped_sig <- string_db$add_diff_exp_color(subset(seurat_mapped, logpval >= -log10(0.01) & abs(logFC) >= 0.5),
logFcColStr="logFC" )
head(seurat_mapped_sig)
## geneIDs logFC pvalue logpval STRING_id color
## 9 HMGN3 0.5064855 0.001051848 2.978047 9606.ENSP00000482613 #FFFFFFFF
## 63 TMSB4X 0.5183016 0.008903777 2.050426 9606.ENSP00000370010 #FF0000FF
## 68 CARHSP1 0.5097236 0.009482226 2.023090 9606.ENSP00000379838 #FFB9B9FF
## 16 PCLO -0.5419183 0.002314014 2.635634 9606.ENSP00000334319 #F3FFF3FF
## 35 PTGDS -1.0792829 0.004890413 2.310654 9606.ENSP00000360687 #00FF00FF
## 49 KIDINS220 -0.5072945 0.006594584 2.180813 9606.ENSP00000256707 #FFFFFFFF
table(seurat_mapped_sig$color)
##
## #00FF00FF #F3FFF3FF #F5FFF5FF #FF0000FF #FFB9B9FF #FFFFFFFF
## 1 1 1 1 1 2
dim(seurat_mapped_sig)
## [1] 7 6
seurat_mapped_sig$halo <- c("#FF0000FF", "#00FF00", "#00FF00", "#00FF00", "#00FF00", "#FF0000FF", "#FF0000FF")
# post payload information to the STRING server
payload_id_seurat <- string_db$post_payload( seurat_mapped_sig$STRING_id,
colors = seurat_mapped_sig$halo )
# display a STRING network png with the "halo"
string_db$plot_network(hits_seurat, payload_id = payload_id_seurat)
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 2mo",
cex.sub = 1, font.sub = 3, col.sub = "darkgreen"
)
clustersList_seurat <- string_db$get_clusters(seurat_mapped$STRING_id[1:600])
# plot first 4 clusters
par(mfrow=c(2,2))
for(i in seq(1:4)){
string_db$plot_network(clustersList_seurat[[i]])
}
# specify your cell type
my_celltype <- "Ast"
# specify time point of interest
my_timept_2 <- "4mo"
# Load Seurat data
# Change the filepath in quotes "" below to match where you
# have your celltype data saved
# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds"
# (where `my-celltype` is your celltype code)
my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds")
# Subset my celltype-specific Seurat object by time
my_seurat_2 <- subset(x = my_celltype_data, subset = Age == my_timept_2)
# set active identities of my subset seurat to "Mt" attribute (denoting variant)
# for DE testing
Idents(my_seurat_2) <- "Mt"
summary(as.factor(my_seurat_2$Mt))
## V337M V337V
## 83 93
# Run D.E. test to identify genes differentially expressed between
# between diseased and normal cells, for my celltype and at this timepoint
marker_genes.df_2 <- FindMarkers(my_seurat_2,
ident.1 = "V337M", ident.2 = "V337V")
# order results by p-value
marker_genes.df_2 <- marker_genes.df_2 %>% arrange(p_val)
head(marker_genes.df_2)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## RPL9 2.436679e-06 -0.4309564 1.000 1.000 0.09770109
## CHCHD2 4.873454e-05 0.9258074 0.639 0.452 1.00000000
## RPS5 6.022260e-05 -0.3538973 1.000 1.000 1.00000000
## RPL24 6.876765e-05 -0.2768466 1.000 1.000 1.00000000
## RPS13 8.903810e-05 -0.3510350 1.000 1.000 1.00000000
## HSP90AB1 1.384171e-04 -0.3997375 1.000 1.000 1.00000000
# format a new dataframe using the contents of
# marker_genes.df, to pass to STRINGdb
stringdb.df_2 <- marker_genes.df_2 %>%
tibble::rownames_to_column("geneIDs") %>%
dplyr::select(geneIDs, avg_log2FC, p_val) %>%
mutate(logpval = -log10(p_val))
colnames(stringdb.df_2) <- c("geneIDs","logFC","pvalue","logpval")
# mapping
seurat_mapped_2 <- string_db$map(stringdb.df_2, "geneIDs", removeUnmappedRows = TRUE )
## Warning: we couldn't map to STRING 7% of your identifiers
dim(seurat_mapped_2)
## [1] 386 5
#View(seurat_mapped)
hits_seurat_2 <- seurat_mapped_2$STRING_id[1:200]
# PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(seurat_mapped_2, logpval >= -log10(0.01) & abs(logFC) >= 0.5))
## geneIDs logFC pvalue logpval STRING_id
## 2 CHCHD2 0.9258074 4.873454e-05 4.312163 9606.ENSP00000378812
## 9 CNTN1 -0.6736092 3.473495e-04 3.459233 9606.ENSP00000447006
## 31 RSPO2 -0.8157027 2.437428e-03 2.613068 9606.ENSP00000276659
## 33 LRRC3B 0.5120427 2.567023e-03 2.590570 9606.ENSP00000379880
## 34 SCG3 0.5644354 2.588237e-03 2.586996 9606.ENSP00000220478
## 73 ITM2C 0.5206845 7.820685e-03 2.106755 9606.ENSP00000322730
seurat_mapped_pval05_2 <- string_db$add_diff_exp_color(subset(seurat_mapped_2, logpval >= -log10(0.01) & abs(logFC) >= 0.5),
logFcColStr="logFC" )
head(seurat_mapped_pval05_2)
## geneIDs logFC pvalue logpval STRING_id color
## 2 CHCHD2 0.9258074 4.873454e-05 4.312163 9606.ENSP00000378812 #FF0000FF
## 33 LRRC3B 0.5120427 2.567023e-03 2.590570 9606.ENSP00000379880 #FFFFFFFF
## 34 SCG3 0.5644354 2.588237e-03 2.586996 9606.ENSP00000220478 #FFE4E4FF
## 73 ITM2C 0.5206845 7.820685e-03 2.106755 9606.ENSP00000322730 #FFFBFBFF
## 78 TFPI 0.6270914 8.997844e-03 2.045862 9606.ENSP00000233156 #FFC2C2FF
## 9 CNTN1 -0.6736092 3.473495e-04 3.459233 9606.ENSP00000447006 #FFFFFFFF
table(seurat_mapped_pval05_2$color)
##
## #00FF00FF #DBFFDBFF #FF0000FF #FFC2C2FF #FFE4E4FF #FFFBFBFF #FFFFFFFF
## 1 1 1 1 1 1 2
dim(seurat_mapped_pval05_2)
## [1] 8 6
seurat_mapped_pval05_2$halo <- c("#FF0000FF", "#00FF00", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#00FF00", "#00FF00")
# post payload information to the STRING server
payload_id_seurat_2 <- string_db$post_payload( seurat_mapped_pval05_2$STRING_id,
colors = seurat_mapped_pval05_2$halo )
string_db$plot_network(hits_seurat_2, payload_id = payload_id_seurat_2)
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 4mo",
cex.sub = 1, font.sub = 3, col.sub = "darkgreen"
)
clustersList_seurat_2 <- string_db$get_clusters(seurat_mapped_2$STRING_id[1:600])
# plot first 4 clusters
par(mfrow=c(2,2))
for(i in seq(1:4)){
string_db$plot_network(clustersList_seurat_2[[i]])
}
# specify your cell type
my_celltype <- "Ast"
# specify time point of interest
my_timept_3 <- "6mo"
# Load Seurat data
# Change the filepath in quotes "" below to match where you
# have your celltype data saved
# The expected filepath is "~/AlzheimersDS/student-notebooks/FinalMergedData-my_celltype.rds"
# (where `my-celltype` is your celltype code)
my_celltype_data <- readRDS("/data/Alzheimers_DS/FinalMergedData-Ast.rds")
# Subset my celltype-specific Seurat object by time
my_seurat_3 <- subset(x = my_celltype_data, subset = Age == my_timept_3)
# set active identities of my subset seurat to "Mt" attribute (denoting variant)
# for DE testing
Idents(my_seurat_3) <- "Mt"
summary(as.factor(my_seurat_3$Mt))
## V337M V337V
## 120 206
# Run D.E. test to identify genes differentially expressed between
# between diseased and normal cells, for my celltype and at this timepoint
marker_genes.df_3 <- FindMarkers(my_seurat_3,
ident.1 = "V337M", ident.2 = "V337V")
# order results by p-value
marker_genes.df_3 <- marker_genes.df_3 %>% arrange(p_val)
head(marker_genes.df_3)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## FAM183A 1.057416e-06 0.3126794 0.208 0.039 0.04239813
## RSPH1 6.416327e-06 0.5408231 0.292 0.107 0.25726906
## SDR39U1 8.923202e-06 0.2776002 0.367 0.146 0.35778471
## CHCHD2 7.523072e-05 0.5406251 0.742 0.534 1.00000000
## GSTO1 1.188225e-04 0.2758435 0.483 0.277 1.00000000
## KIF5B 2.266152e-04 -0.2973771 0.917 0.961 1.00000000
# format a new dataframe using the contents of
# marker_genes.df, to pass to STRINGdb
stringdb.df_3 <- marker_genes.df_3 %>%
tibble::rownames_to_column("geneIDs") %>%
dplyr::select(geneIDs, avg_log2FC, p_val) %>%
mutate(logpval = -log10(p_val))
colnames(stringdb.df_3) <- c("geneIDs","logFC","pvalue","logpval")
# mapping
seurat_mapped_3 <- string_db$map(stringdb.df_3, "geneIDs", removeUnmappedRows = TRUE )
## Warning: we couldn't map to STRING 4% of your identifiers
dim(seurat_mapped_3)
## [1] 174 5
#View(seurat_mapped)
hits_seurat_3 <- seurat_mapped_3$STRING_id[1:200]
# PAYLOAD MECHANISM
# filter by p-value and add a color column
# (i.e. green down-regulated gened and red for up-regulated genes)
head(subset(seurat_mapped_3, logpval >= -log10(0.01) & abs(logFC) >= 0.5))
## geneIDs logFC pvalue logpval STRING_id
## 2 RSPH1 0.5408231 6.416327e-06 5.192713 9606.ENSP00000291536
## 4 CHCHD2 0.5406251 7.523072e-05 4.123605 9606.ENSP00000378812
## 20 FOS 0.5067945 7.649047e-04 3.116393 9606.ENSP00000306245
## 24 C1orf194 0.5333269 8.290246e-04 3.081433 9606.ENSP00000358965
## 25 JUNB 0.6154213 9.516035e-04 3.021544 9606.ENSP00000303315
## 42 ZFP36 0.5511778 3.236093e-03 2.489979 9606.ENSP00000469647
seurat_mapped_pval05_3 <- string_db$add_diff_exp_color(subset(seurat_mapped_3, logpval >= -log10(0.01) & abs(logFC) >= 0.5),
logFcColStr="logFC" )
head(seurat_mapped_pval05_3)
## geneIDs logFC pvalue logpval STRING_id color
## 2 RSPH1 0.5408231 6.416327e-06 5.192713 9606.ENSP00000291536 #FFB2B2FF
## 4 CHCHD2 0.5406251 7.523072e-05 4.123605 9606.ENSP00000378812 #FFB3B3FF
## 20 FOS 0.5067945 7.649047e-04 3.116393 9606.ENSP00000306245 #FFFFFFFF
## 24 C1orf194 0.5333269 8.290246e-04 3.081433 9606.ENSP00000358965 #FFC3C3FF
## 25 JUNB 0.6154213 9.516035e-04 3.021544 9606.ENSP00000303315 #FF0000FF
## 42 ZFP36 0.5511778 3.236093e-03 2.489979 9606.ENSP00000469647 #FF9A9AFF
table(seurat_mapped_pval05_3$color)
##
## #FF0000FF #FF9A9AFF #FFB2B2FF #FFB3B3FF #FFBDBDFF #FFC3C3FF #FFFFFFFF
## 1 1 1 1 1 1 2
dim(seurat_mapped_pval05_3)
## [1] 8 6
c <- subset(seurat_mapped_3, logpval >= -log10(0.01) & abs(logFC) >= 0.5)
c
## geneIDs logFC pvalue logpval STRING_id
## 2 RSPH1 0.5408231 6.416327e-06 5.192713 9606.ENSP00000291536
## 4 CHCHD2 0.5406251 7.523072e-05 4.123605 9606.ENSP00000378812
## 20 FOS 0.5067945 7.649047e-04 3.116393 9606.ENSP00000306245
## 24 C1orf194 0.5333269 8.290246e-04 3.081433 9606.ENSP00000358965
## 25 JUNB 0.6154213 9.516035e-04 3.021544 9606.ENSP00000303315
## 42 ZFP36 0.5511778 3.236093e-03 2.489979 9606.ENSP00000469647
## 70 GADD45B 0.5359409 9.291844e-03 2.031898 9606.ENSP00000215631
## 73 GFAP -0.7652751 9.937663e-03 2.002716 9606.ENSP00000468500
seurat_mapped_pval05_3$halo <- c("#FF0000FF", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#FF0000FF", "#00FF00")
# post payload information to the STRING server
payload_id_seurat_3 <- string_db$post_payload( seurat_mapped_pval05_3$STRING_id,
colors = seurat_mapped_pval05_3$halo )
string_db$plot_network(hits_seurat_3, payload_id = payload_id_seurat_3)
title(sub = "STRINGdb: protein-protein interaction of Astrocytes at 6mo",
cex.sub = 1, font.sub = 3, col.sub = "darkgreen"
)
clustersList_seurat_3 <- string_db$get_clusters(seurat_mapped_3$STRING_id[1:600])
# plot first 4 clusters
par(mfrow=c(2,2))
for(i in seq(1:4)){
string_db$plot_network(clustersList_seurat_3[[i]])
}