Single-Cell-Analysis

Setting up necessary files/packages

Dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE125620

Molecular basis of neuronal subtype bias introduced by proneural factors Ascl1 and Neurog2 (single-cell RNA-seq)
Mus musculus
Embrionic bodies exposed to differentiation factors and a Doxycycline treatment for 48 hours.

library(Seurat)
library(dplyr)
library(Matrix)
library(readxl)
library(ggplot2)
library(rhdf5)
library(patchwork)

setwd("/home/itxaso/Desktop/Project")

Uploading the raw count matrix

This matrix is in h5 format, we need special packages to upload it.

counts_matrix <- Read10X_h5("GSM3578982_raw_gene_bc_matrices_h5.h5")

Getting the initial dimensions of the matrix: (genes)x(cel.num)

dim(counts_matrix)

## [1]  28694 737280

Metadata

The metadata provided with the experiment dataset doesn’t correspond to each cell, it is related only to the experiment. As a result we can’t add it to the Seurat object in a regular way.

metadata <- read.csv("SraRunTable.csv", row.names = 1)
print(metadata)

##              Assay Type AvgSpotLen       Bases   BioProject  BioSample    Bytes
## SRR8485394 RNA-Seq  134 5411582496 PRJNA516927 SAMN10815725 2827208337 Embryoid
## SRR8485395 RNA-Seq  134 5066920024 PRJNA516927 SAMN10815725 2648158132 Embryoid
## SRR8485396 RNA-Seq  134 5609275108 PRJNA516927 SAMN10815725 2933035210 Embryoid
## SRR8485397 RNA-Seq  134 5198843024 PRJNA516927 SAMN10815725 2716994279 Embryoid
##            cell_type Center   Name          Consent  DATASTORE
## SRR8485394    Bodies    GEO public run.zq,sra,fastq ncbi,gs,s3
## SRR8485395    Bodies    GEO public run.zq,sra,fastq s3,ncbi,gs
## SRR8485396    Bodies    GEO public sra,fastq,run.zq ncbi,gs,s3
## SRR8485397    Bodies    GEO public sra,run.zq,fastq ncbi,gs,s3
##                                        filetype DATASTORE.1 provider
## SRR8485394 ncbi.public,s3.us-east-1,gs.us-east1  SRX5290059   pooled
## SRR8485395 s3.us-east-1,gs.us-east1,ncbi.public  SRX5290059   pooled
## SRR8485396 s3.us-east-1,gs.us-east1,ncbi.public  SRX5290059   pooled
## SRR8485397 s3.us-east-1,gs.us-east1,ncbi.public  SRX5290059   pooled
##                  DATASTORE.2 region      Experiment    genotype GEO_Accession
## SRR8485394 Ainv15(iAscl1.V5)    and Ainv15(iNeurog2 Tubb3::GFP)    GSM3578982
## SRR8485395 Ainv15(iAscl1.V5)    and Ainv15(iNeurog2 Tubb3::GFP)    GSM3578982
## SRR8485396 Ainv15(iAscl1.V5)    and Ainv15(iNeurog2 Tubb3::GFP)    GSM3578982
## SRR8485397 Ainv15(iAscl1.V5)    and Ainv15(iNeurog2 Tubb3::GFP)    GSM3578982
##             X.exp. Instrument LibraryLayout LibrarySelection  LibrarySource
## SRR8485394 NextSeq        500        PAIRED             cDNA TRANSCRIPTOMIC
## SRR8485395 NextSeq        500        PAIRED             cDNA TRANSCRIPTOMIC
## SRR8485396 NextSeq        500        PAIRED             cDNA TRANSCRIPTOMIC
## SRR8485397 NextSeq        500        PAIRED             cDNA TRANSCRIPTOMIC
##            Organism Platform ReleaseDate          create_date
## SRR8485394      Mus musculus    ILLUMINA 2019-02-28T00:00:00Z
## SRR8485395      Mus musculus    ILLUMINA 2019-02-28T00:00:00Z
## SRR8485396      Mus musculus    ILLUMINA 2019-02-28T00:00:00Z
## SRR8485397      Mus musculus    ILLUMINA 2019-02-28T00:00:00Z
##                         version Sample     Name.1 source_name      SRA  Study
## SRR8485394 2019-01-25T03:03:00Z      1 GSM3578982  ES-derived embryoid bodies
## SRR8485395 2019-01-25T03:25:00Z      1 GSM3578982  ES-derived embryoid bodies
## SRR8485396 2019-01-25T02:15:00Z      1 GSM3578982  ES-derived embryoid bodies
## SRR8485397 2019-01-25T02:23:00Z      1 GSM3578982  ES-derived embryoid bodies
##            treatment       X           X.1     X.2       X.3 X.4       X.5 X.6
## SRR8485394      with induced transcription factors SRP181957 Dox induction   +
## SRR8485395      with induced transcription factors SRP181957 Dox induction   +
## SRR8485396      with induced transcription factors SRP181957 Dox induction   +
## SRR8485397      with induced transcription factors SRP181957 Dox induction   +
##            X.7   X.8
## SRR8485394  48 hours
## SRR8485395  48 hours
## SRR8485396  48 hours
## SRR8485397  48 hours

Seurat object creation

seurat_obj <- CreateSeuratObject(counts = counts_matrix)

As it was previously explained, we don’t have a real metadata file. We will add manually the available metadata useful for our analysis.

seurat_obj$sample <- "GSM3578982"
seurat_obj$run <- "SRR8485394-97"
seurat_obj$condition <- "Dox induction + 48 hours"
seurat_obj$cell_type <- "ES-derived embryoid bodies with induced transcription factors"
seurat_obj$species <- "Mus musculus"

#See the result
seurat_obj

## An object of class Seurat 
## 28694 features across 737280 samples within 1 assay 
## Active assay: RNA (28694 features, 0 variable features)
##  1 layer present: counts

head(seurat_obj@meta.data)

##                       orig.ident nCount_RNA nFeature_RNA     sample
## AAACCTGAGAAACCAT-1 SeuratProject          0            0 GSM3578982
## AAACCTGAGAAACCGC-1 SeuratProject          0            0 GSM3578982
## AAACCTGAGAAACCTA-1 SeuratProject        274          211 GSM3578982
## AAACCTGAGAAACGAG-1 SeuratProject          0            0 GSM3578982
## AAACCTGAGAAACGCC-1 SeuratProject          1            1 GSM3578982
## AAACCTGAGAAAGTGG-1 SeuratProject          0            0 GSM3578982
##                              run                condition
## AAACCTGAGAAACCAT-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACCGC-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACCTA-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACGAG-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAACGCC-1 SRR8485394-97 Dox induction + 48 hours
## AAACCTGAGAAAGTGG-1 SRR8485394-97 Dox induction + 48 hours
##                                                                        cell_type
## AAACCTGAGAAACCAT-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACCGC-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACCTA-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACGAG-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAACGCC-1 ES-derived embryoid bodies with induced transcription factors
## AAACCTGAGAAAGTGG-1 ES-derived embryoid bodies with induced transcription factors
##                         species
## AAACCTGAGAAACCAT-1 Mus musculus
## AAACCTGAGAAACCGC-1 Mus musculus
## AAACCTGAGAAACCTA-1 Mus musculus
## AAACCTGAGAAACGAG-1 Mus musculus
## AAACCTGAGAAACGCC-1 Mus musculus
## AAACCTGAGAAAGTGG-1 Mus musculus

Filtering and normalizing

28694 features across 737280 samples this indicates that a huge number of genes have 0 counts in some cells, it’s necessary to do the filtering, normalization and scaling.

erccs <- grep("^ERCC-", rownames(seurat_obj), value = TRUE)

if (length(erccs) > 0) {

  counts <- GetAssayData(seurat_obj, assay = "RNA", layer = "counts")

  percent.ercc <- Matrix::colSums(counts[erccs, , drop = FALSE]) /
                  Matrix::colSums(counts)

  seurat_obj <- AddMetaData(seurat_obj, percent.ercc, col.name = "percent.ercc")
}
#Our dataset is already filtered for ERC, but just in case we apply the filtering

#Filtering: erasing low quality cells and outliers (doublets, too high expression)
seurat_obj <- subset(
  seurat_obj,
  subset =
    nFeature_RNA > 500 &
    nFeature_RNA < 8000 &
    nCount_RNA > 5000 &
    nCount_RNA < 60000
)



#Normalization
seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 1e4)

## Normalizing layer: counts

# Variable genes
seurat_obj <- FindVariableFeatures(seurat_obj)

## Finding variable features for layer counts

# Scaling
seurat_obj <- ScaleData(seurat_obj)

## Centering and scaling data matrix

dim(seurat_obj)

## [1] 28694  3477

Violin plots: QC

p <- VlnPlot(
  seurat_obj,
  features = c("nFeature_RNA", "nCount_RNA"),
  ncol = 2
)

p & theme(
  axis.title.x = element_blank(),
  axis.text.x  = element_blank(),
  axis.ticks.x = element_blank()
)

seurat_obj <- FindVariableFeatures(object = seurat_obj)

## Finding variable features for layer counts

top10 <- head(VariableFeatures(seurat_obj), 10)
plot1 <- VariableFeaturePlot(seurat_obj)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

plot1

## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

plot2

## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

PCA

seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj))

## PC_ 1 
## Positive:  Npm1, Rps2, Mdk, H2afv, Rpl27a, Rpl12, Gapdh, Rplp1, Ran, Hmgn2 
##     Hspe1, Hmgb2, Slc2a1, Rpl36a, Siva1, Mif, H2afz, Hmgb1, Rpl10a, 2810417H13Rik 
##     Rps12, Hes5, Cdca7, Id3, Rrm2, Ranbp1, Sox21, Dek, Birc5, Wls 
## Negative:  Rtn1, Mllt11, Onecut2, Map1b, Gng3, Tubb2b, Stmn2, Ebf1, Ina, Basp1 
##     Gap43, Dcx, Snhg11, Crmp1, Kif5c, Stmn3, Stmn1, Tagln3, Rufy3, Nefl 
##     Klc1, Tubb3, Syt11, Ank3, Tmsb10, App, Tubb2a, Aplp1, Ebf3, Stmn4 
## PC_ 2 
## Positive:  Nbl1, Ckb, Pkib, Homer2, Tfap2b, Chrna4, Tlx3, Podxl2, Rgs16, Pkd2l1 
##     Lzts1, Sec11c, Hes6, Snca, Tpm2, Mxra7, Fam19a4, Lgals1, Ngfr, Dmpk 
##     Hepacam2, Ptn, Plk3, Gadd45g, Cyba, Dner, Ola1, Dpysl3, Peg10, Tubb3 
## Negative:  Top2a, Cdca8, Birc5, Spc25, Ccna2, Nusap1, Cenpf, 2810417H13Rik, Cdk1, Ccnb1 
##     Cks2, Ube2c, Tpx2, Cenpe, Cdca3, Tuba1b, Fam64a, Plk1, H2afx, Spc24 
##     Rrm2, Mki67, Ckap2l, Cenpa, Hmmr, Cdc20, Hmgb2, Prc1, Incenp, Pbk 
## PC_ 3 
## Positive:  Neurog2, Mfap4, Ppp1r14a, Rps21, Vim, Prmt8, Rpl10a, Rps12, Neurod1, Rpl27a 
##     Ly6e, Rplp1, Rpl36a, Adssl1, Eef1b2, Rpl12, Gapdh, Nhlh2, Nhlh1, Irx3 
##     Kcnmb2, Epb41, tubb3gfp, Pcdh17, Btg2, Rps12-ps3, Phlda1, Plagl1, Rgs13, Litaf 
## Negative:  Tlx3, Snca, Sec11c, Hepacam2, Podxl2, Arl6ip1, Tpm2, Dmpk, Lgals1, P2rx3 
##     Tfap2b, Pkib, Dner, Nusap1, Ccnb1, Tubb4b, Plk1, Ola1, Cenpf, Nbl1 
##     Mthfd2, Prc1, Cnr1, Ptn, Ngfr, Ccna2, Lzts1, F2rl2, Chrna4, Mxra7 
## PC_ 4 
## Positive:  Sparc, Cntnap2, Eif4ebp1, Apoe, Id3, Slc2a3, Ung, Bmp7, Mt1, Ubash3b 
##     Gpm6a, Mapt, Tagln2, Irf2bpl, Epcam, Cdh1, Anxa2, Nefl, Hrk, Perp 
##     Fhl1, Gsta4, Kcna6, Ifitm3, Pou6f2, Hes1, Ccng1, Dscam, Gyltl1b, Pmm1 
## Negative:  Sox11, Slc1a2, Mfap4, Marcks, Ebf2, Foxp2, Prmt8, Neurod4, Elavl3, Rgs13 
##     Plagl1, Elavl2, Elavl4, Btg2, Celsr2, Lhx5, Kcnmb2, Tmem2, Nhlh2, Ube2c 
##     Igsf8, Angpt1, Gadd45g, Racgap1, 3110035E14Rik, St18, Akr1b8, Neurog2, Sox4, Ly6e 
## PC_ 5 
## Positive:  Chchd10, Ldha, Igfbp2, Csrp1, Tpi1, Aldoa, Adssl1, Eno1, Slc16a3, Tagln2 
##     Ndufa4l2, Smtnl2, Epcam, Dppa5a, Anxa2, Cdh1, Rasgrp2, Krt8, Ifitm1, Apoe 
##     Tcea3, Crip1, Igsf8, Krt18, Ndrg1, Ppp1r14a, Cox6b2, Perp, Cyba, Mfap4 
## Negative:  Meis2, Fez1, Sox21, Pdzrn4, Igdcc3, Cdc25b, RP23-59P20.3, Gata3, Frzb, Ddah1 
##     Hes5, Fhl1, Nrep, Prss23, Ptn, Sox11, Ptprz1, Rgma, Sox9, Pdpn 
##     Fndc3c1, Tcf7l2, Foxb1, Celf2, Marcks, Lmo4, Peg12, Ckb, Peg3, Pantr1

ElbowPlot(seurat_obj, ndims = 50)

More in detail analysis of PCs:

DimHeatmap(seurat_obj, dims = 1:15, cells = 500, balanced = TRUE)

We choose 25 as the number of sgnificant PC to use for the subsequent UMAP.

Clustering and UMAP

seurat_obj <- FindNeighbors(seurat_obj, dims = 1:25)

## Computing nearest neighbor graph

## Computing SNN

seurat_obj <- FindClusters(seurat_obj, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3477
## Number of edges: 125983
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8570
## Number of communities: 7
## Elapsed time: 0 seconds

# ----- UMAP -----
seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 11:24:38 UMAP embedding parameters a = 0.9922 b = 1.112

## 11:24:38 Read 3477 rows and found 25 numeric columns

## 11:24:38 Using Annoy for neighbor search, n_neighbors = 30

## 11:24:38 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 11:24:39 Writing NN index file to temp file /tmp/Rtmp37y3HI/file586559185041
## 11:24:39 Searching Annoy index using 1 thread, search_k = 3000
## 11:24:39 Annoy recall = 100%
## 11:24:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:24:40 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:24:40 Commencing optimization for 500 epochs, with 143368 positive edges
## 11:24:40 Using rng type: pcg
## 11:24:42 Optimization finished

#Number of clusters
levels(seurat_obj)

## [1] "0" "1" "2" "3" "4" "5" "6"

DimPlot(seurat_obj, reduction = "umap", group.by = "sample")

DimPlot(seurat_obj, reduction = "umap", label = TRUE)

Markers

We will save the top markers for each cluster in order to identify the different cell types present in our dataset.

markers <- FindAllMarkers(seurat_obj,
                          only.pos = TRUE,
                          min.pct = 0.25,
                          logfc.threshold = 0.25)

## Calculating cluster 0

## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

## Calculating cluster 6

top_markers <- markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
print(top_markers)

## # A tibble: 35 × 7
## # Groups:   cluster [7]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
##  1 7.91e- 76       2.17 0.45  0.169 2.27e- 71 0       Pdgfa    
##  2 1.37e- 58       2.11 0.371 0.133 3.93e- 54 0       Hes1     
##  3 5.06e- 47       2.33 0.259 0.079 1.45e- 42 0       Etl4     
##  4 1.23e- 35       2.17 0.287 0.123 3.53e- 31 0       Adcyap1r1
##  5 1.23e- 31       2.14 0.255 0.107 3.53e- 27 0       Kcnh7    
##  6 0               3.81 0.888 0.177 0         1       Tlx3     
##  7 1.30e-218       3.67 0.597 0.089 3.73e-214 1       Hepacam2 
##  8 1.45e-117       3.52 0.426 0.086 4.16e-113 1       Cldn5    
##  9 4.70e-105       4.19 0.257 0.023 1.35e-100 1       Eva1c    
## 10 6.55e- 98       3.91 0.254 0.026 1.88e- 93 1       Cdh22    
## # ℹ 25 more rows

sel <- dplyr::select(top_markers, cluster, gene)

sel$gene <- toupper(sel$gene)

Based on literature we set the following labels:

seurat_obj <- RenameIdents(seurat_obj,
                           `0` = "Early neural progenitors/Radial‑glia–like",
                           `1` = "Sensory/Dorsal interneurons Tlx3+",
                           `2` = "Dorsal interneurons/Sensory neuroblast",
                           `3` = "Mitotic/Proliferating progenitors",
                           `4` = "Mesenchymal-like/Intermediate Progenitors",
                           `5` = "Immature/Post‑mitotic glutamatergic neurons",
                           `6` = "Autonomic Neurons")


DimPlot(seurat_obj, reduction = "umap", label.size = 3) + 
  labs(title = "Cell type prediction") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),  
    legend.text = element_text(size = 8),                               
    legend.title = element_text(size = 10),
    aspect.ratio = 0.7                                                 
  )

NEUROG2 visualization

How is the gene’s expression distributed in our data?

FeaturePlot(seurat_obj, features = "Neurog2", label = TRUE, repel = T, label.size = 3)

 VlnPlot(seurat_obj, features = "Neurog2") +
  theme(
    plot.margin = margin(20, 20, 20, 20),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10)
  )

 FeaturePlot(seurat_obj, features = "Ascl1", label = TRUE, repel = T, label.size = 3)

 #VGLUT2 expression for the glutamatergic identity confirmation. 
  FeaturePlot(seurat_obj, features = "Slc17a6", label = TRUE, repel = T, label.size = 3)

Gene co-expression

Based on the results obtained from the ChIP-Seq significant genes were selected, the sequences where Neurog2 acts. Some of the most significant genes were selected, selecting the biggest values in the V5 column, corresponding to the -log(p-value).

chip<- read.table("CHIPIP1_summits_resized_annotation_mm10.tsv",
                 header = TRUE,
                 sep = "\t",
                 check.names = FALSE)

## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string

## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : number of items read is not a multiple of the number of columns

sum(duplicated(chip[,1]))

## [1] 0

head(chip)

##              V4        SYMBOL                                    GENENAME
## 1  IP1_peak_980 4930517M08Rik                  RIKEN cDNA 4930517M08 gene
## 2 IP1_peak_1027          Dpp9                       dipeptidylpeptidase 9
## 3 IP1_peak_2016         Prb1c     proline-rich protein BstNI subfamily 1C
## 4  IP1_peak_664          Enc1                  ectodermal-neural cortex 1
## 5  IP1_peak_305 4933427E13Rik                  RIKEN cDNA 4933427E13 gene
## 6  IP1_peak_776        Pou4f1 POU domain, class 4, transcription factor 1
##   seqnames     start       end width strand        V5
## 1    chr17   4626460   4626460     1      * 151.34766
## 2    chr17  56200054  56200054     1      *  72.42954
## 3     chr6 132463771 132463771     1      *  71.38757
## 4    chr13  97292182  97292182     1      *  70.13784
## 5    chr11  25100171  25100171     1      *  67.13946
## 6    chr14 104336989 104336989     1      *  66.92199
##                                                          annotation geneChr
## 1                                                 Distal Intergenic      17
## 2              Intron (ENSMUST00000223616.1/224897, intron 7 of 18)      17
## 3                                                 Distal Intergenic       6
## 4                                                 Distal Intergenic      13
## 5                                                 Distal Intergenic      11
## 6 Intron (ENSMUST00000228095.1/ENSMUST00000228095.1, intron 1 of 4)      14
##   geneStart   geneEnd geneLength geneStrand geneId         transcriptId
## 1   4634978   4637471       2494          1  75117 ENSMUST00000232624.1
## 2  56186807  56209208      22402          2 224897 ENSMUST00000223616.1
## 3 132361041 132364134       3094          2 667929 ENSMUST00000080849.5
## 4  97241105  97253034      11930          1  13803 ENSMUST00000041623.8
## 5  25326107  25367137      41031          1  71234 ENSMUST00000125234.7
## 6 104463973 104465365       1393          2  18996 ENSMUST00000139505.1
##   distanceToTSS            ENSEMBL
## 1         -8518 ENSMUSG00000116767
## 2          9154 ENSMUSG00000001229
## 3        -99637 ENSMUSG00000030143
## 4         51077 ENSMUSG00000041773
## 5       -225936 ENSMUSG00000033015
## 6        128376 ENSMUSG00000048349

Most significant in Single-Cell and ChIP-seq

Using the markers extracted from the Single-cell analysis, the ones with a significant p-value, and a logFC bigger than 0.25 (logfc.threshold = 0.25), we overlapped the significant genes from the ChIP-Seq analysis. We expect a coexpression of these genes with NeuroG2.

We expect them to be expressed in the same clusters (aprox.)

#Only saving the gene name and -log(p-value)
#For the gene name we will use the ENSEMBL ID, to make sure the same gene ID is contained in the chip and seurat object
chip_ids <- chip[, c("ENSEMBL", "V5")]
chip_ids <- chip_ids[!is.na(chip_ids$ENSEMBL), ]
library(biomaRt)

mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

mapping <- getBM(
    attributes = c("ensembl_gene_id", "mgi_symbol"),
    filters = "ensembl_gene_id",
    values = unique(chip_ids$ENSEMBL),
    mart = mart
)

chip_mapped <- merge(
    chip_ids,
    mapping,
    by.x = "ENSEMBL",
    by.y = "ensembl_gene_id",
    all.x = TRUE
)

mark<- markers[, c("p_val_adj", "gene", "avg_log2FC")]
#Once we ensured same gene notation we can overlap the significant markers with the chipseq results
#overlap <- chip_mapped[chip_mapped$mgi_symbol %in% markers$gene, ]
overlap <- merge(
  x = mark, 
  y = chip_mapped, 
  by.x = "gene", 
  by.y = "mgi_symbol",
  all = FALSE
)
overlap <- overlap[order(-overlap$V5), ]
head(overlap)

##       gene     p_val_adj avg_log2FC            ENSEMBL       V5
## 160   Enc1  2.866558e-24  1.2499249 ENSMUSG00000041773 70.13784
## 162   Enc1  1.048743e-17  0.9415913 ENSMUSG00000041773 70.13784
## 164   Enc1  9.255236e-20  0.8501110 ENSMUSG00000041773 70.13784
## 430 Pou4f1 4.300340e-163  2.6123279 ENSMUSG00000048349 66.92199
## 431 Pou4f1  4.838743e-65  1.2978100 ENSMUSG00000048349 66.92199
## 41  Btbd17  1.219519e-54  0.8959460 ENSMUSG00000000202 61.20669

highV5 <- quantile(overlap$V5, 0.75)          # top 25%
highFC <- quantile(overlap$avg_log2FC, 0.75)  # top 25%

strong_overlap <- overlap[
  overlap$V5 >= highV5 & overlap$avg_log2FC >= highFC, 
]

strong_overlap

##         gene     p_val_adj avg_log2FC            ENSEMBL       V5
## 160     Enc1  2.866558e-24   1.249925 ENSMUSG00000041773 70.13784
## 430   Pou4f1 4.300340e-163   2.612328 ENSMUSG00000048349 66.92199
## 431   Pou4f1  4.838743e-65   1.297810 ENSMUSG00000048349 66.92199
## 398    Olig3  1.792863e-89   3.325452 ENSMUSG00000045591 56.45586
## 227     Hes6 4.541142e-101   1.415463 ENSMUSG00000067071 44.41752
## 329    Mtus1  8.249217e-54   2.242453 ENSMUSG00000045636 34.98355
## 201  Gadd45g 4.039895e-121   2.354110 ENSMUSG00000021453 34.71003
## 526   Sncaip  2.639564e-34   1.436981 ENSMUSG00000024534 30.88683
## 497  Serinc5  2.419223e-13   1.542958 ENSMUSG00000021703 28.46338
## 498  Serinc5  5.950352e-04   1.570370 ENSMUSG00000021703 28.46338
## 499    Sesn1  2.345517e-16   1.172481 ENSMUSG00000038332 28.30834
## 387      Ntm  5.518425e-05   1.430251 ENSMUSG00000059974 15.53478
## 17       App  7.121729e-79   1.200616 ENSMUSG00000022892 15.40514
## 19       App  2.717924e-16   1.256128 ENSMUSG00000022892 15.40514
## 93     Crnde  2.547980e-07   1.228318 ENSMUSG00000031736 15.40514
## 403     Otx2  2.595010e-25   1.236738 ENSMUSG00000021848 15.40514
## 416    Phf24  3.291399e-61   1.672574 ENSMUSG00000036062 15.35059
## 417    Phf24 4.094788e-149   2.673707 ENSMUSG00000036062 15.35059
## 71     Celf4 5.295431e-185   2.713502 ENSMUSG00000024268 15.05017
## 75     Celf4 6.627962e-128   1.390621 ENSMUSG00000024268 15.05017
## 423    Plcb1  1.057820e-37   1.824285 ENSMUSG00000051177 14.75609
## 319    Mfap4  6.067849e-69   1.503434 ENSMUSG00000042436 14.67002
## 320    Mfap4 5.833026e-119   1.783510 ENSMUSG00000042436 14.67002
## 445 Rabgap1l  5.376378e-74   2.314538 ENSMUSG00000026721 14.46189
## 291    Lzts1 9.841750e-207   2.708025 ENSMUSG00000036306 11.67319
## 315    Mdga1  4.992229e-37   1.328536 ENSMUSG00000043557 11.67319
## 424    Plcb1  1.057820e-37   1.824285 ENSMUSG00000051177 11.67319
## 590     Uncx 5.193775e-142   1.997435 ENSMUSG00000029546 11.60869
## 592     Uncx 6.878583e-105   1.659642 ENSMUSG00000029546 11.60869
## 200  Gadd45g 4.039895e-121   2.354110 ENSMUSG00000021453 11.34462
## 455     Rcn1  1.175142e-40   1.328328 ENSMUSG00000005973 11.28587
## 539   Spock1  7.952841e-77   1.956003 ENSMUSG00000056222 11.24010
## 540   Spock1  9.587964e-39   2.861777 ENSMUSG00000056222 11.24010
## 126     Dok4  2.752426e-36   1.442075 ENSMUSG00000040631 10.77882
## 171    Epha5  6.196087e-97   1.749136 ENSMUSG00000029245 10.77882
## 172    Epha5 2.738633e-148   2.425795 ENSMUSG00000029245 10.77882
## 354     Nefl 7.392488e-154   2.493698 ENSMUSG00000022055 10.77882
## 356     Nefl  1.384559e-34   2.378868 ENSMUSG00000022055 10.77882

FeaturePlot(seurat_obj, features = "Enc1", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Pou4f1", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Btbd17", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Olig3", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Tead1", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Neurod2", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Celf4", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Phf24", label = TRUE, repel = T, label.size = 3)

FeaturePlot(seurat_obj, features = "Epha4", label = TRUE, repel = T, label.size = 3)