Dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE125620
library(Seurat)
library(dplyr)
library(Matrix)
library(readxl)
library(ggplot2)
library(rhdf5)
library(patchwork)
setwd("/home/itxaso/Desktop/Project")
This matrix is in h5 format, we need special packages to upload it.
counts_matrix <- Read10X_h5("GSM3578982_raw_gene_bc_matrices_h5.h5")
Getting the initial dimensions of the matrix: (genes)x(cel.num)
dim(counts_matrix)
## [1] 28694 737280
The metadata provided with the experiment dataset doesn’t correspond to each cell, it is related only to the experiment. As a result we can’t add it to the Seurat object in a regular way.
metadata <- read.csv("SraRunTable.csv", row.names = 1)
print(metadata)
## Assay Type AvgSpotLen Bases BioProject BioSample Bytes
## SRR8485394 RNA-Seq 134 5411582496 PRJNA516927 SAMN10815725 2827208337 Embryoid
## SRR8485395 RNA-Seq 134 5066920024 PRJNA516927 SAMN10815725 2648158132 Embryoid
## SRR8485396 RNA-Seq 134 5609275108 PRJNA516927 SAMN10815725 2933035210 Embryoid
## SRR8485397 RNA-Seq 134 5198843024 PRJNA516927 SAMN10815725 2716994279 Embryoid
## cell_type Center Name Consent DATASTORE
## SRR8485394 Bodies GEO public run.zq,sra,fastq ncbi,gs,s3
## SRR8485395 Bodies GEO public run.zq,sra,fastq s3,ncbi,gs
## SRR8485396 Bodies GEO public sra,fastq,run.zq ncbi,gs,s3
## SRR8485397 Bodies GEO public sra,run.zq,fastq ncbi,gs,s3
## filetype DATASTORE.1 provider
## SRR8485394 ncbi.public,s3.us-east-1,gs.us-east1 SRX5290059 pooled
## SRR8485395 s3.us-east-1,gs.us-east1,ncbi.public SRX5290059 pooled
## SRR8485396 s3.us-east-1,gs.us-east1,ncbi.public SRX5290059 pooled
## SRR8485397 s3.us-east-1,gs.us-east1,ncbi.public SRX5290059 pooled
## DATASTORE.2 region Experiment genotype GEO_Accession
## SRR8485394 Ainv15(iAscl1.V5) and Ainv15(iNeurog2 Tubb3::GFP) GSM3578982
## SRR8485395 Ainv15(iAscl1.V5) and Ainv15(iNeurog2 Tubb3::GFP) GSM3578982
## SRR8485396 Ainv15(iAscl1.V5) and Ainv15(iNeurog2 Tubb3::GFP) GSM3578982
## SRR8485397 Ainv15(iAscl1.V5) and Ainv15(iNeurog2 Tubb3::GFP) GSM3578982
## X.exp. Instrument LibraryLayout LibrarySelection LibrarySource
## SRR8485394 NextSeq 500 PAIRED cDNA TRANSCRIPTOMIC
## SRR8485395 NextSeq 500 PAIRED cDNA TRANSCRIPTOMIC
## SRR8485396 NextSeq 500 PAIRED cDNA TRANSCRIPTOMIC
## SRR8485397 NextSeq 500 PAIRED cDNA TRANSCRIPTOMIC
## Organism Platform ReleaseDate create_date
## SRR8485394 Mus musculus ILLUMINA 2019-02-28T00:00:00Z
## SRR8485395 Mus musculus ILLUMINA 2019-02-28T00:00:00Z
## SRR8485396 Mus musculus ILLUMINA 2019-02-28T00:00:00Z
## SRR8485397 Mus musculus ILLUMINA 2019-02-28T00:00:00Z
## version Sample Name.1 source_name SRA Study
## SRR8485394 2019-01-25T03:03:00Z 1 GSM3578982 ES-derived embryoid bodies
## SRR8485395 2019-01-25T03:25:00Z 1 GSM3578982 ES-derived embryoid bodies
## SRR8485396 2019-01-25T02:15:00Z 1 GSM3578982 ES-derived embryoid bodies
## SRR8485397 2019-01-25T02:23:00Z 1 GSM3578982 ES-derived embryoid bodies
## treatment X X.1 X.2 X.3 X.4 X.5 X.6
## SRR8485394 with induced transcription factors SRP181957 Dox induction +
## SRR8485395 with induced transcription factors SRP181957 Dox induction +
## SRR8485396 with induced transcription factors SRP181957 Dox induction +
## SRR8485397 with induced transcription factors SRP181957 Dox induction +
## X.7 X.8
## SRR8485394 48 hours
## SRR8485395 48 hours
## SRR8485396 48 hours
## SRR8485397 48 hours
seurat_obj <- CreateSeuratObject(counts = counts_matrix)
As it was previously explained, we don’t have a real metadata file. We will add manually the available metadata useful for our analysis.
seurat_obj$sample <- "GSM3578982"
seurat_obj$run <- "SRR8485394-97"
seurat_obj$condition <- "Dox induction + 48 hours"
seurat_obj$cell_type <- "ES-derived embryoid bodies with induced transcription factors"
seurat_obj$species <- "Mus musculus"
#See the result
seurat_obj
## An object of class Seurat
## 28694 features across 737280 samples within 1 assay
## Active assay: RNA (28694 features, 0 variable features)
## 1 layer present: counts
head(seurat_obj@meta.data)
## orig.ident nCount_RNA nFeature_RNA sample
## AAACCTGAGAAACCAT-1 SeuratProject 0 0 GSM3578982
## AAACCTGAGAAACCGC-1 SeuratProject 0 0 GSM3578982
## AAACCTGAGAAACCTA-1 SeuratProject 274 211 GSM3578982
## AAACCTGAGAAACGAG-1 SeuratProject 0 0 GSM3578982
## AAACCTGAGAAACGCC-1 SeuratProject 1 1 GSM3578982
## AAACCTGAGAAAGTGG-1 SeuratProject 0 0 GSM3578982
## run condition
## AAACCTGAGAAACCAT-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACCGC-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACCTA-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACGAG-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACGCC-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAAGTGG-1 SRR8485394-97 Dox induction + 48 hours
## cell_type
## AAACCTGAGAAACCAT-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACCGC-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACCTA-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACGAG-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACGCC-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAAGTGG-1 ES-derived embryoid bodies with induced transcription factors
## species
## AAACCTGAGAAACCAT-1 Mus musculus
## AAACCTGAGAAACCGC-1 Mus musculus
## AAACCTGAGAAACCTA-1 Mus musculus
## AAACCTGAGAAACGAG-1 Mus musculus
## AAACCTGAGAAACGCC-1 Mus musculus
## AAACCTGAGAAAGTGG-1 Mus musculus
28694 features across 737280 samples this indicates that a huge number of genes have 0 counts in some cells, it’s necessary to do the filtering, normalization and scaling.
erccs <- grep("^ERCC-", rownames(seurat_obj), value = TRUE)
if (length(erccs) > 0) {
counts <- GetAssayData(seurat_obj, assay = "RNA", layer = "counts")
percent.ercc <- Matrix::colSums(counts[erccs, , drop = FALSE]) /
Matrix::colSums(counts)
seurat_obj <- AddMetaData(seurat_obj, percent.ercc, col.name = "percent.ercc")
}
#Our dataset is already filtered for ERC, but just in case we apply the filtering
#Filtering: erasing low quality cells and outliers (doublets, too high expression)
seurat_obj <- subset(
seurat_obj,
subset =
nFeature_RNA > 500 &
nFeature_RNA < 8000 &
nCount_RNA > 5000 &
nCount_RNA < 60000
)
#Normalization
seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 1e4)
## Normalizing layer: counts
# Variable genes
seurat_obj <- FindVariableFeatures(seurat_obj)
## Finding variable features for layer counts
# Scaling
seurat_obj <- ScaleData(seurat_obj)
## Centering and scaling data matrix
dim(seurat_obj)
## [1] 28694 3477
p <- VlnPlot(
seurat_obj,
features = c("nFeature_RNA", "nCount_RNA"),
ncol = 2
)
p & theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
seurat_obj <- FindVariableFeatures(object = seurat_obj)
## Finding variable features for layer counts
top10 <- head(VariableFeatures(seurat_obj), 10)
plot1 <- VariableFeaturePlot(seurat_obj)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
plot2
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj))
## PC_ 1
## Positive: Npm1, Rps2, Mdk, H2afv, Rpl27a, Rpl12, Gapdh, Rplp1, Ran, Hmgn2
## Hspe1, Hmgb2, Slc2a1, Rpl36a, Siva1, Mif, H2afz, Hmgb1, Rpl10a, 2810417H13Rik
## Rps12, Hes5, Cdca7, Id3, Rrm2, Ranbp1, Sox21, Dek, Birc5, Wls
## Negative: Rtn1, Mllt11, Onecut2, Map1b, Gng3, Tubb2b, Stmn2, Ebf1, Ina, Basp1
## Gap43, Dcx, Snhg11, Crmp1, Kif5c, Stmn3, Stmn1, Tagln3, Rufy3, Nefl
## Klc1, Tubb3, Syt11, Ank3, Tmsb10, App, Tubb2a, Aplp1, Ebf3, Stmn4
## PC_ 2
## Positive: Nbl1, Ckb, Pkib, Homer2, Tfap2b, Chrna4, Tlx3, Podxl2, Rgs16, Pkd2l1
## Lzts1, Sec11c, Hes6, Snca, Tpm2, Mxra7, Fam19a4, Lgals1, Ngfr, Dmpk
## Hepacam2, Ptn, Plk3, Gadd45g, Cyba, Dner, Ola1, Dpysl3, Peg10, Tubb3
## Negative: Top2a, Cdca8, Birc5, Spc25, Ccna2, Nusap1, Cenpf, 2810417H13Rik, Cdk1, Ccnb1
## Cks2, Ube2c, Tpx2, Cenpe, Cdca3, Tuba1b, Fam64a, Plk1, H2afx, Spc24
## Rrm2, Mki67, Ckap2l, Cenpa, Hmmr, Cdc20, Hmgb2, Prc1, Incenp, Pbk
## PC_ 3
## Positive: Neurog2, Mfap4, Ppp1r14a, Rps21, Vim, Prmt8, Rpl10a, Rps12, Neurod1, Rpl27a
## Ly6e, Rplp1, Rpl36a, Adssl1, Eef1b2, Rpl12, Gapdh, Nhlh2, Nhlh1, Irx3
## Kcnmb2, Epb41, tubb3gfp, Pcdh17, Btg2, Rps12-ps3, Phlda1, Plagl1, Rgs13, Litaf
## Negative: Tlx3, Snca, Sec11c, Hepacam2, Podxl2, Arl6ip1, Tpm2, Dmpk, Lgals1, P2rx3
## Tfap2b, Pkib, Dner, Nusap1, Ccnb1, Tubb4b, Plk1, Ola1, Cenpf, Nbl1
## Mthfd2, Prc1, Cnr1, Ptn, Ngfr, Ccna2, Lzts1, F2rl2, Chrna4, Mxra7
## PC_ 4
## Positive: Sparc, Cntnap2, Eif4ebp1, Apoe, Id3, Slc2a3, Ung, Bmp7, Mt1, Ubash3b
## Gpm6a, Mapt, Tagln2, Irf2bpl, Epcam, Cdh1, Anxa2, Nefl, Hrk, Perp
## Fhl1, Gsta4, Kcna6, Ifitm3, Pou6f2, Hes1, Ccng1, Dscam, Gyltl1b, Pmm1
## Negative: Sox11, Slc1a2, Mfap4, Marcks, Ebf2, Foxp2, Prmt8, Neurod4, Elavl3, Rgs13
## Plagl1, Elavl2, Elavl4, Btg2, Celsr2, Lhx5, Kcnmb2, Tmem2, Nhlh2, Ube2c
## Igsf8, Angpt1, Gadd45g, Racgap1, 3110035E14Rik, St18, Akr1b8, Neurog2, Sox4, Ly6e
## PC_ 5
## Positive: Chchd10, Ldha, Igfbp2, Csrp1, Tpi1, Aldoa, Adssl1, Eno1, Slc16a3, Tagln2
## Ndufa4l2, Smtnl2, Epcam, Dppa5a, Anxa2, Cdh1, Rasgrp2, Krt8, Ifitm1, Apoe
## Tcea3, Crip1, Igsf8, Krt18, Ndrg1, Ppp1r14a, Cox6b2, Perp, Cyba, Mfap4
## Negative: Meis2, Fez1, Sox21, Pdzrn4, Igdcc3, Cdc25b, RP23-59P20.3, Gata3, Frzb, Ddah1
## Hes5, Fhl1, Nrep, Prss23, Ptn, Sox11, Ptprz1, Rgma, Sox9, Pdpn
## Fndc3c1, Tcf7l2, Foxb1, Celf2, Marcks, Lmo4, Peg12, Ckb, Peg3, Pantr1
ElbowPlot(seurat_obj, ndims = 50)
DimHeatmap(seurat_obj, dims = 1:15, cells = 500, balanced = TRUE)
We choose 25 as the number of sgnificant PC to use for the subsequent UMAP.
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:25)
## Computing nearest neighbor graph
## Computing SNN
seurat_obj <- FindClusters(seurat_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3477
## Number of edges: 125983
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8570
## Number of communities: 7
## Elapsed time: 0 seconds
# ----- UMAP -----
seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:24:38 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:24:38 Read 3477 rows and found 25 numeric columns
## 11:24:38 Using Annoy for neighbor search, n_neighbors = 30
## 11:24:38 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:24:39 Writing NN index file to temp file /tmp/Rtmp37y3HI/file586559185041
## 11:24:39 Searching Annoy index using 1 thread, search_k = 3000
## 11:24:39 Annoy recall = 100%
## 11:24:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:24:40 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:24:40 Commencing optimization for 500 epochs, with 143368 positive edges
## 11:24:40 Using rng type: pcg
## 11:24:42 Optimization finished
#Number of clusters
levels(seurat_obj)
## [1] "0" "1" "2" "3" "4" "5" "6"
DimPlot(seurat_obj, reduction = "umap", group.by = "sample")
DimPlot(seurat_obj, reduction = "umap", label = TRUE)
We will save the top markers for each cluster in order to identify the different cell types present in our dataset.
markers <- FindAllMarkers(seurat_obj,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25)
## Calculating cluster 0
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
top_markers <- markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
print(top_markers)
## # A tibble: 35 × 7
## # Groups: cluster [7]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 7.91e- 76 2.17 0.45 0.169 2.27e- 71 0 Pdgfa
## 2 1.37e- 58 2.11 0.371 0.133 3.93e- 54 0 Hes1
## 3 5.06e- 47 2.33 0.259 0.079 1.45e- 42 0 Etl4
## 4 1.23e- 35 2.17 0.287 0.123 3.53e- 31 0 Adcyap1r1
## 5 1.23e- 31 2.14 0.255 0.107 3.53e- 27 0 Kcnh7
## 6 0 3.81 0.888 0.177 0 1 Tlx3
## 7 1.30e-218 3.67 0.597 0.089 3.73e-214 1 Hepacam2
## 8 1.45e-117 3.52 0.426 0.086 4.16e-113 1 Cldn5
## 9 4.70e-105 4.19 0.257 0.023 1.35e-100 1 Eva1c
## 10 6.55e- 98 3.91 0.254 0.026 1.88e- 93 1 Cdh22
## # ℹ 25 more rows
sel <- dplyr::select(top_markers, cluster, gene)
sel$gene <- toupper(sel$gene)
Based on literature we set the following labels:
seurat_obj <- RenameIdents(seurat_obj,
`0` = "Early neural progenitors/Radial‑glia–like",
`1` = "Sensory/Dorsal interneurons Tlx3+",
`2` = "Dorsal interneurons/Sensory neuroblast",
`3` = "Mitotic/Proliferating progenitors",
`4` = "Mesenchymal-like/Intermediate Progenitors",
`5` = "Immature/Post‑mitotic glutamatergic neurons",
`6` = "Autonomic Neurons")
DimPlot(seurat_obj, reduction = "umap", label.size = 3) +
labs(title = "Cell type prediction") +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
aspect.ratio = 0.7
)
How is the gene’s expression distributed in our data?
FeaturePlot(seurat_obj, features = "Neurog2", label = TRUE, repel = T, label.size = 3)
VlnPlot(seurat_obj, features = "Neurog2") +
theme(
plot.margin = margin(20, 20, 20, 20),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10)
)
FeaturePlot(seurat_obj, features = "Ascl1", label = TRUE, repel = T, label.size = 3)
#VGLUT2 expression for the glutamatergic identity confirmation.
FeaturePlot(seurat_obj, features = "Slc17a6", label = TRUE, repel = T, label.size = 3)
Based on the results obtained from the ChIP-Seq significant genes were selected, the sequences where Neurog2 acts. Some of the most significant genes were selected, selecting the biggest values in the V5 column, corresponding to the -log(p-value).
chip<- read.table("CHIPIP1_summits_resized_annotation_mm10.tsv",
header = TRUE,
sep = "\t",
check.names = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : number of items read is not a multiple of the number of columns
sum(duplicated(chip[,1]))
## [1] 0
head(chip)
## V4 SYMBOL GENENAME
## 1 IP1_peak_980 4930517M08Rik RIKEN cDNA 4930517M08 gene
## 2 IP1_peak_1027 Dpp9 dipeptidylpeptidase 9
## 3 IP1_peak_2016 Prb1c proline-rich protein BstNI subfamily 1C
## 4 IP1_peak_664 Enc1 ectodermal-neural cortex 1
## 5 IP1_peak_305 4933427E13Rik RIKEN cDNA 4933427E13 gene
## 6 IP1_peak_776 Pou4f1 POU domain, class 4, transcription factor 1
## seqnames start end width strand V5
## 1 chr17 4626460 4626460 1 * 151.34766
## 2 chr17 56200054 56200054 1 * 72.42954
## 3 chr6 132463771 132463771 1 * 71.38757
## 4 chr13 97292182 97292182 1 * 70.13784
## 5 chr11 25100171 25100171 1 * 67.13946
## 6 chr14 104336989 104336989 1 * 66.92199
## annotation geneChr
## 1 Distal Intergenic 17
## 2 Intron (ENSMUST00000223616.1/224897, intron 7 of 18) 17
## 3 Distal Intergenic 6
## 4 Distal Intergenic 13
## 5 Distal Intergenic 11
## 6 Intron (ENSMUST00000228095.1/ENSMUST00000228095.1, intron 1 of 4) 14
## geneStart geneEnd geneLength geneStrand geneId transcriptId
## 1 4634978 4637471 2494 1 75117 ENSMUST00000232624.1
## 2 56186807 56209208 22402 2 224897 ENSMUST00000223616.1
## 3 132361041 132364134 3094 2 667929 ENSMUST00000080849.5
## 4 97241105 97253034 11930 1 13803 ENSMUST00000041623.8
## 5 25326107 25367137 41031 1 71234 ENSMUST00000125234.7
## 6 104463973 104465365 1393 2 18996 ENSMUST00000139505.1
## distanceToTSS ENSEMBL
## 1 -8518 ENSMUSG00000116767
## 2 9154 ENSMUSG00000001229
## 3 -99637 ENSMUSG00000030143
## 4 51077 ENSMUSG00000041773
## 5 -225936 ENSMUSG00000033015
## 6 128376 ENSMUSG00000048349
Using the markers extracted from the Single-cell analysis, the ones with a significant p-value, and a logFC bigger than 0.25 (logfc.threshold = 0.25), we overlapped the significant genes from the ChIP-Seq analysis. We expect a coexpression of these genes with NeuroG2.
#Only saving the gene name and -log(p-value)
#For the gene name we will use the ENSEMBL ID, to make sure the same gene ID is contained in the chip and seurat object
chip_ids <- chip[, c("ENSEMBL", "V5")]
chip_ids <- chip_ids[!is.na(chip_ids$ENSEMBL), ]
library(biomaRt)
mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
mapping <- getBM(
attributes = c("ensembl_gene_id", "mgi_symbol"),
filters = "ensembl_gene_id",
values = unique(chip_ids$ENSEMBL),
mart = mart
)
chip_mapped <- merge(
chip_ids,
mapping,
by.x = "ENSEMBL",
by.y = "ensembl_gene_id",
all.x = TRUE
)
mark<- markers[, c("p_val_adj", "gene", "avg_log2FC")]
#Once we ensured same gene notation we can overlap the significant markers with the chipseq results
#overlap <- chip_mapped[chip_mapped$mgi_symbol %in% markers$gene, ]
overlap <- merge(
x = mark,
y = chip_mapped,
by.x = "gene",
by.y = "mgi_symbol",
all = FALSE
)
overlap <- overlap[order(-overlap$V5), ]
head(overlap)
## gene p_val_adj avg_log2FC ENSEMBL V5
## 160 Enc1 2.866558e-24 1.2499249 ENSMUSG00000041773 70.13784
## 162 Enc1 1.048743e-17 0.9415913 ENSMUSG00000041773 70.13784
## 164 Enc1 9.255236e-20 0.8501110 ENSMUSG00000041773 70.13784
## 430 Pou4f1 4.300340e-163 2.6123279 ENSMUSG00000048349 66.92199
## 431 Pou4f1 4.838743e-65 1.2978100 ENSMUSG00000048349 66.92199
## 41 Btbd17 1.219519e-54 0.8959460 ENSMUSG00000000202 61.20669
highV5 <- quantile(overlap$V5, 0.75) # top 25%
highFC <- quantile(overlap$avg_log2FC, 0.75) # top 25%
strong_overlap <- overlap[
overlap$V5 >= highV5 & overlap$avg_log2FC >= highFC,
]
strong_overlap
## gene p_val_adj avg_log2FC ENSEMBL V5
## 160 Enc1 2.866558e-24 1.249925 ENSMUSG00000041773 70.13784
## 430 Pou4f1 4.300340e-163 2.612328 ENSMUSG00000048349 66.92199
## 431 Pou4f1 4.838743e-65 1.297810 ENSMUSG00000048349 66.92199
## 398 Olig3 1.792863e-89 3.325452 ENSMUSG00000045591 56.45586
## 227 Hes6 4.541142e-101 1.415463 ENSMUSG00000067071 44.41752
## 329 Mtus1 8.249217e-54 2.242453 ENSMUSG00000045636 34.98355
## 201 Gadd45g 4.039895e-121 2.354110 ENSMUSG00000021453 34.71003
## 526 Sncaip 2.639564e-34 1.436981 ENSMUSG00000024534 30.88683
## 497 Serinc5 2.419223e-13 1.542958 ENSMUSG00000021703 28.46338
## 498 Serinc5 5.950352e-04 1.570370 ENSMUSG00000021703 28.46338
## 499 Sesn1 2.345517e-16 1.172481 ENSMUSG00000038332 28.30834
## 387 Ntm 5.518425e-05 1.430251 ENSMUSG00000059974 15.53478
## 17 App 7.121729e-79 1.200616 ENSMUSG00000022892 15.40514
## 19 App 2.717924e-16 1.256128 ENSMUSG00000022892 15.40514
## 93 Crnde 2.547980e-07 1.228318 ENSMUSG00000031736 15.40514
## 403 Otx2 2.595010e-25 1.236738 ENSMUSG00000021848 15.40514
## 416 Phf24 3.291399e-61 1.672574 ENSMUSG00000036062 15.35059
## 417 Phf24 4.094788e-149 2.673707 ENSMUSG00000036062 15.35059
## 71 Celf4 5.295431e-185 2.713502 ENSMUSG00000024268 15.05017
## 75 Celf4 6.627962e-128 1.390621 ENSMUSG00000024268 15.05017
## 423 Plcb1 1.057820e-37 1.824285 ENSMUSG00000051177 14.75609
## 319 Mfap4 6.067849e-69 1.503434 ENSMUSG00000042436 14.67002
## 320 Mfap4 5.833026e-119 1.783510 ENSMUSG00000042436 14.67002
## 445 Rabgap1l 5.376378e-74 2.314538 ENSMUSG00000026721 14.46189
## 291 Lzts1 9.841750e-207 2.708025 ENSMUSG00000036306 11.67319
## 315 Mdga1 4.992229e-37 1.328536 ENSMUSG00000043557 11.67319
## 424 Plcb1 1.057820e-37 1.824285 ENSMUSG00000051177 11.67319
## 590 Uncx 5.193775e-142 1.997435 ENSMUSG00000029546 11.60869
## 592 Uncx 6.878583e-105 1.659642 ENSMUSG00000029546 11.60869
## 200 Gadd45g 4.039895e-121 2.354110 ENSMUSG00000021453 11.34462
## 455 Rcn1 1.175142e-40 1.328328 ENSMUSG00000005973 11.28587
## 539 Spock1 7.952841e-77 1.956003 ENSMUSG00000056222 11.24010
## 540 Spock1 9.587964e-39 2.861777 ENSMUSG00000056222 11.24010
## 126 Dok4 2.752426e-36 1.442075 ENSMUSG00000040631 10.77882
## 171 Epha5 6.196087e-97 1.749136 ENSMUSG00000029245 10.77882
## 172 Epha5 2.738633e-148 2.425795 ENSMUSG00000029245 10.77882
## 354 Nefl 7.392488e-154 2.493698 ENSMUSG00000022055 10.77882
## 356 Nefl 1.384559e-34 2.378868 ENSMUSG00000022055 10.77882
FeaturePlot(seurat_obj, features = "Enc1", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Pou4f1", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Btbd17", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Olig3", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Tead1", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Neurod2", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Celf4", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Phf24", label = TRUE, repel = T, label.size = 3)
FeaturePlot(seurat_obj, features = "Epha4", label = TRUE, repel = T, label.size = 3)