#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#install.packages("devtools")
#devtools::install_github('cole-trapnell-lab/monocle3')
#install.packages('Seurat')
#remotes::install_github('satijalab/seurat-wrappers')
library(Seurat)
## Warning: package 'Seurat' was built under R version 4.2.1
## Attaching SeuratObject
## Attaching sp
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.2.1
## Warning: package 'dplyr' was built under R version 4.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(monocle3)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:sp':
## 
##     %over%
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
## 
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-
library(SeuratWrappers)
library(ggplot2)
library(ggridges)

Read 10X dataset(feature barcode matrix) and create a Seurat Object

data <- Read10X(data.dir = "D:/ChIP seq and RNA seq analysis/Arlotta_lab scRNA/E17.5.filtered_feature_bc_matrix")
data <- CreateSeuratObject(counts = data, project='E17.5', min.cells = 3, min.features = 200)

##Check and Visualize QC metrics like mitochonrial RNA reads.

data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^MT-")
VlnPlot(data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.

Visualising the quality of reads per cell

FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Filtering and removing low quality cells

data <- subset(data, subset = nFeature_RNA >200 & nFeature_RNA <2500 & percent.mt <5)

Normalising data

data <- NormalizeData(data)

Identify highly variable features

data <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(data), 10)
top10
##  [1] "Fabp7"  "Hbb-bs" "Hba-a1" "Hba-a2" "Npy"    "Mt3"    "Sst"    "Hbb-bt"
##  [9] "Dbi"    "Apoe"

Plot variable features

plot1 <- VariableFeaturePlot(data)
LabelPoints(plot = plot1, points = top10, repel = T)
## When using repel, set xnudge and ynudge to 0 for optimal results
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 63 rows containing missing values (geom_point).

Scaling data

all.genes <- rownames(data)
data <- ScaleData(data, features = all.genes)

Perform Dimensionality Reduction

data <- RunPCA(data, features = VariableFeatures(object = data))
## PC_ 1 
## Positive:  Tubb3, Tmsb10, Tuba1a, Stmn2, Mllt11, Tubb2b, Cd24a, Nsg1, Mapt, Stmn1 
##     Map1b, Stmn3, Crmp1, Mef2c, Stmn4, Tubb5, Dpysl3, Actg1, Neurod6, Thra 
##     Ly6h, Mpped1, Nrep, mt-Cytb, Sox11, Serf1, Zeb2, Gpm6a, Ncam1, Actb 
## Negative:  Fabp7, Phgdh, Hes5, Dbi, Hmgb2, Mfge8, Mt3, Slc1a3, Aldoc, Ccnd2 
##     Ddah1, Pclaf, Pbk, Qk, Vim, Rgcc, Top2a, Mt1, Zfp36l1, Spc25 
##     Pax6, Cdca3, Birc5, Lockd, Cdk1, Ccna2, Ndrg2, Cdkn2c, Cks2, Sox9 
## PC_ 2 
## Positive:  Igfbpl1, Sox11, Sstr2, Zbtb20, Mir99ahg, Nnat, Unc5d, Meis2, Neurod1, Sema3c 
##     Mdk, Elavl4, Pou3f2, Hmgn1, Malat1, Mllt3, Nrn1, Pantr1, Gm17750, Cited2 
##     Eomes, Mfng, Epha5, Mfap4, Pcp4, Pam, Plekhf2, 2600014E21Rik, H1f0, Nrxn3 
## Negative:  Gap43, Sybu, Mef2c, Tubb2a, Dync1i1, Ntm, Nme2, Opcml, Foxp1, Rac3 
##     Osbpl1a, S100a10, Camkv, Map1b, Stmn2, Mapt, Bend6, Crip2, Caly, Cadm2 
##     Ldha, Thra, Grin2b, Rprm, Hmgcs1, Ppp1r1a, Fdps, Ppia, Ssbp2, Fabp3 
## PC_ 3 
## Positive:  Spc25, Top2a, Ube2c, Nusap1, Birc5, Ccnb1, Pbk, Ccna2, Cks2, Cdca3 
##     Cdk1, Cenpf, Stmn1, Prc1, Aurkb, Sgo1, Kif22, Cdca8, Tpx2, Pclaf 
##     Ckap2l, Cenpe, Cenpa, Cdc20, Spc24, Mis18bp1, Kif2c, Hmmr, Aurka, Ccnb2 
## Negative:  Aldoc, Apoe, Sparc, Ddah1, Mt1, Ndrg2, Mt3, Atp1a2, Tnc, Slc1a3 
##     Rgcc, Slc9a3r1, S100a1, Ttyh1, Pdpn, Pea15a, Mlc1, Trim47, Mfge8, Gm11627 
##     Fabp7, Mt2, Baalc, Ccn1, AW047730, Tst, Phgdh, S100a11, Vim, Ccdc80 
## PC_ 4 
## Positive:  Ptn, Pantr1, 9130024F11Rik, Neurod6, Neurod2, Ttc28, Dok5, Zbtb18, Pou3f2, Satb2 
##     Nfix, Tmsb4x, Hs6st2, Gria2, Tsc22d1, Abracl, Gpm6a, Eif1b, Igfbpl1, Id2 
##     Pou3f3, Limch1, Frmd4a, Tafa2, Mn1, Bhlhe22, Dab1, Mdk, Plxna4, Mir99ahg 
## Negative:  Meg3, Nrxn3, Rpp25, Maf, Lhx6, Gm14204, Dlx2, Npas1, Dlx5, Mafb 
##     Arx, Sst, Nxph1, Dlx1, Gad2, Cadm1, Neto1, Slc32a1, Elmo1, Bcl11b 
##     Akap7, Tshz3, Gria1, Atp1b1, Celf4, Junb, Sox2ot, Erbb4, Cbfa2t3, Rian 
## PC_ 5 
## Positive:  Nfib, Ppp1r1b, Meis2, Islr2, Pcp4, Sox5, Hs3st4, Tle4, Rprm, Wnt7b 
##     Nfia, Foxp2, Ppp2r2b, Nxph3, Nfix, Xpr1, Nfe2l3, Fezf2, Tmem108, Ramp3 
##     Ephb1, Kctd12, Sstr2, Id2, Kif26a, Lmo7, Nxph4, Syt6, Gpr22, Baiap2 
## Negative:  Rpp25, Maf, Nrxn3, Lhx6, Ptn, Gucy1a1, Gm14204, Nxph1, Npas1, Bzw2 
##     Tafa2, Arx, Mafb, Mif, Sst, Dlx2, Dok5, Dlx5, Dlx1, Mef2c 
##     Gpm6a, Inhba, Gad2, Runx1t1, Gria1, Limch1, Map1b, Sox2ot, Eif1b, Cux1
DimHeatmap(data, dims = 1:6, cells = 500, balanced = T)

Determine Dimensionality of the data

ElbowPlot(data)

Clustering data

data <- FindNeighbors(data, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN

Understanding the resolution of the clustering

data <- FindClusters(data, resolution = c(0.3, 0.5, 0.7, 1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8401
## Number of edges: 326259
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9365
## Number of communities: 11
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8401
## Number of edges: 326259
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9142
## Number of communities: 15
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8401
## Number of edges: 326259
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8934
## Number of communities: 16
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8401
## Number of edges: 326259
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8696
## Number of communities: 19
## Elapsed time: 1 seconds
head(data@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCAGACTG-1      E17.5       5022         2087          0
## AAACCTGAGCCAGAAC-1      E17.5       3563         1706          0
## AAACCTGAGCTACCGC-1      E17.5       3414         1543          0
## AAACCTGAGGCAATTA-1      E17.5       5643         2149          0
## AAACCTGAGTGGGATC-1      E17.5       2551         1298          0
## AAACCTGAGTGTCTCA-1      E17.5       3733         1678          0
##                    RNA_snn_res.0.3 RNA_snn_res.0.5 RNA_snn_res.0.7
## AAACCTGAGCAGACTG-1               0               0               0
## AAACCTGAGCCAGAAC-1               4               8               5
## AAACCTGAGCTACCGC-1               0               0               0
## AAACCTGAGGCAATTA-1               2               2               2
## AAACCTGAGTGGGATC-1               0               0               0
## AAACCTGAGTGTCTCA-1               5               3               3
##                    RNA_snn_res.1 seurat_clusters
## AAACCTGAGCAGACTG-1             1               1
## AAACCTGAGCCAGAAC-1             8               8
## AAACCTGAGCTACCGC-1             2               2
## AAACCTGAGGCAATTA-1             4               4
## AAACCTGAGTGGGATC-1             2               2
## AAACCTGAGTGTCTCA-1             3               3
p1 <- DimPlot(data, group.by = "RNA_snn_res.0.3", label = T)
p2 <- DimPlot(data, group.by = "RNA_snn_res.0.5", label = T)
p3 <- DimPlot(data, group.by = "RNA_snn_res.0.7", label = T)
p4 <- DimPlot(data, group.by = "RNA_snn_res.1", label = T)
p1 + p2 + p3 + p4

Setting Identity of Clusters

Idents(data) <- "RNA_snn_res.0.7"

Non-linear Dimensionality Reduction: UMAP

data <- RunUMAP(data, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:18:14 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:18:14 Read 8401 rows and found 20 numeric columns
## 14:18:14 Using Annoy for neighbor search, n_neighbors = 30
## 14:18:14 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:18:15 Writing NN index file to temp file C:\Users\Lenovo\AppData\Local\Temp\Rtmp2jaCs0\file126417c81007
## 14:18:15 Searching Annoy index using 1 thread, search_k = 3000
## 14:18:18 Annoy recall = 100%
## 14:18:19 Commencing smooth kNN distance calibration using 1 thread
## 14:18:20 Initializing from normalized Laplacian + noise
## 14:18:21 Commencing optimization for 500 epochs, with 356346 positive edges
## 14:18:44 Optimization finished
DimPlot(data, reduction = "umap")

Looking at Variable Features to identify cluster markers

VlnPlot(data, features = c("Pax6", "Rbfox1"), slot = "counts", log = TRUE)

Looking at cluster markers to identify UMAP clusters

FeaturePlot(data, features = c("Pax6",  "Eomes", "Aldh1l1",
                               "Tbr1",  "Olig2", "Sox2", "Cux2", "Neurog2"))

Rename Clusters

#new.cluster.ids <- c("A progenitors","B progenitors", "Cnn2+ Msx1+ population" , "B progenitors", "Putative early neuronal population", "Lmx1a+ population",
#                      "Intermediate Projenitors")
#names(new.cluster.ids) <- levels(data)
#data <- RenameIdents(data, new.cluster.ids)
#DimPlot(data, reduction = "umap", label = TRUE, pt.size = 0.5) 

Trajectory Analysis with Monocle3

Converting seuratobject to celldataset object for Monocle3

cds <- as.cell_data_set(data)
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object

Get cell metadata

head(colData(cds))
## DataFrame with 6 rows and 11 columns
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
##                      <factor>  <numeric>    <integer>  <numeric>
## AAACCTGAGCAGACTG-1      E17.5       5022         2087          0
## AAACCTGAGCCAGAAC-1      E17.5       3563         1706          0
## AAACCTGAGCTACCGC-1      E17.5       3414         1543          0
## AAACCTGAGGCAATTA-1      E17.5       5643         2149          0
## AAACCTGAGTGGGATC-1      E17.5       2551         1298          0
## AAACCTGAGTGTCTCA-1      E17.5       3733         1678          0
##                    RNA_snn_res.0.3 RNA_snn_res.0.5 RNA_snn_res.0.7
##                           <factor>        <factor>        <factor>
## AAACCTGAGCAGACTG-1               0               0               0
## AAACCTGAGCCAGAAC-1               4               8               5
## AAACCTGAGCTACCGC-1               0               0               0
## AAACCTGAGGCAATTA-1               2               2               2
## AAACCTGAGTGGGATC-1               0               0               0
## AAACCTGAGTGTCTCA-1               5               3               3
##                    RNA_snn_res.1 seurat_clusters    ident Size_Factor
##                         <factor>        <factor> <factor>   <numeric>
## AAACCTGAGCAGACTG-1             1               1        0        5022
## AAACCTGAGCCAGAAC-1             8               8        5        3563
## AAACCTGAGCTACCGC-1             2               2        0        3414
## AAACCTGAGGCAATTA-1             4               4        2        5643
## AAACCTGAGTGGGATC-1             2               2        0        2551
## AAACCTGAGTGTCTCA-1             3               3        3        3733

Get feature/gene metadata

fData(cds)
## DataFrame with 17008 rows and 0 columns
rownames(fData(cds))[1:10]
##  [1] "Xkr4"    "Gm19938" "Rp1"     "Sox17"   "Mrpl15"  "Lypla1"  "Tcea1"  
##  [8] "Rgs20"   "Atp6v1h" "Rb1cc1"
fData(cds)$gene_short_name <- rownames(fData(cds))
head(fData(cds))
## DataFrame with 6 rows and 1 column
##         gene_short_name
##             <character>
## Xkr4               Xkr4
## Gm19938         Gm19938
## Rp1                 Rp1
## Sox17             Sox17
## Mrpl15           Mrpl15
## Lypla1           Lypla1

Get counts

head(counts(cds))
##    [[ suppressing 8401 column names 'AAACCTGAGCAGACTG-1', 'AAACCTGAGCCAGAAC-1', 'AAACCTGAGCTACCGC-1' ... ]]

Retrieve clustering information from Surat object

1. Assign partitions

recreate.partitions <- c(rep(1, length(cds@colData@rownames)))
names(recreate.partitions) <- cds@colData@rownames
recreate.partitions <- as.factor(recreate.partitions)
recreate.partitions

cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partitions

Assign cluster information

list.cluster <- data@active.ident
cds@clusters@listData[["UMAP"]][["clusters"]] <- list.cluster

Assign UMAP coordinates

cds@int_colData@listData[["reducedDims"]]@listData[["UMAP"]] <- data@reductions$umap@cell.embeddings

Plot

cluster.before.traj <-plot_cells(cds, color_cells_by = "cluster", label_groups_by_cluster = F, 
           group_label_size = 5) + theme(legend.position = "right")
## No trajectory to plot. Has learn_graph() been called yet?
cluster.before.traj

Learn Trajectory

cds <- learn_graph(cds, use_partition = F)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Warning in igraph::graph.dfs(stree_ori, root = root_cell, neimode = "all", :
## Argument `neimode' is deprecated; use `mode' instead
plot_cells(cds, color_cells_by = "cluster", label_groups_by_cluster = F,
           label_branch_points = T, label_roots = T, label_leaves = F,
           group_label_size = 5)

Order cells in Pseudotime

cds <- order_cells(cds, reduction_method = "UMAP", root_cells = colnames(cds[, clusters(cds) == 5]))
plot_cells(cds, color_cells_by = "pseudotime", label_groups_by_cluster = T,
           label_branch_points = T, label_roots = F, label_leaves = F)
## Cells aren't colored in a way that allows them to be grouped.

Cells ordered by Monocle3 Pseudotime

head(pseudotime(cds), 10)
## AAACCTGAGCAGACTG-1 AAACCTGAGCCAGAAC-1 AAACCTGAGCTACCGC-1 AAACCTGAGGCAATTA-1 
##        24.50532906         0.04643948        22.93558701        33.49121137 
## AAACCTGAGTGGGATC-1 AAACCTGAGTGTCTCA-1 AAACCTGCAAAGAATC-1 AAACCTGCACATCCAA-1 
##        20.18428700        47.71342517        12.55716513        54.82340234 
## AAACCTGCACATGGGA-1 AAACCTGCACGAAATA-1 
##        23.75238886        26.69687864
cds$monocle3_pseudotime <- pseudotime(cds)
data.pseudo <- as.data.frame(colData(cds))

ggplot(data.pseudo, aes(monocle3_pseudotime, seurat_clusters, fill = seurat_clusters)) + geom_boxplot()

ggplot(data.pseudo, aes(monocle3_pseudotime, reorder(seurat_clusters, monocle3_pseudotime), fill = seurat_clusters)) + geom_boxplot()

Find genes that change as a function of pseudotime

deg <- graph_test(cds, neighbor_graph = "principal_graph")
deg %>% arrange(q_value) %>% filter(status == "OK") %>% head()
FeaturePlot(data, features = c("Rgs20", "Bend6", "Mcm3", "B3gat2"))

Add pseudotime values into the seuratobject

data$pseudotime <- pseudotime(cds)
FeaturePlot(data, features = "pseudotime")

RidgePlot(data, features = c("Sox2", "Eomes", "Neurod2"), sort = T, idents = c("5", "6", "0", "1", "7"))
## Picking joint bandwidth of 0.0446
## Picking joint bandwidth of 0.0551
## Picking joint bandwidth of 0.148

my_genes <- row.names(subset(fData(cds), gene_short_name %in% c("Sox2", "Eomes", "Neurod2"))) 
cds_subset <- cds[my_genes,]
plot_genes_in_pseudotime(cds_subset, color_cells_by = "monocle3_pseudotime" )
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: Removed 25203 rows containing missing values (geom_point).