library(Seurat)
library(dplyr)
library(scmap)
library(SingleCellExperiment)
library(pheatmap)
library(RColorBrewer)
library(rgl)
library(gam)
library(slingshot)
library(ggplot2)
setwd("/Users/archanabalan/Documents/Spring Semester/CSCB/CSCB_P5_2019 2/data")
# helper functions for P4
# these are from CellNet
gamFit<-function(expMat,
genes, # genes to test
celltime){
genes2 <- intersect(genes, rownames(expMat))
# could print out if any missing
ans <- apply(expMat[genes2,names(celltime)],1,function(z){
d <- data.frame(z=z, t=celltime)
tmp <- gam(z ~ lo(celltime), data=d)
p <- summary(tmp)[4][[1]][1,5]
p
})
ans
}
hm_ti<-function(
expDat,
genes,
grps, ## vector of cellnames -> grp label
cRow=FALSE,
cCol=FALSE,
limits=c(0,10),
toScale=FALSE,
fontsize_row=4){
allgenes<-rownames(expDat)
missingGenes<-setdiff(genes, allgenes)
if(length(missingGenes)>0){
cat("Missing genes: ", paste0(missingGenes, collapse=","), "\n")
genes<-intersect(genes, allgenes)
}
value<-expDat[genes,]
if(toScale){
if(class(value)[1]!='matrix'){
value <- t(scale(Matrix::t(value)))
}
else{
value <- t(scale(t(value)))
}
}
value[value < limits[1]] <- limits[1]
value[value > limits[2]] <- limits[2]
groupNames<-unique(grps)
cells<-names(grps)
##
## groupNames<-myGrpSort(grps)
##
xcol <- colorRampPalette(rev(brewer.pal(n = 12,name = "Paired")))(length(groupNames))
names(xcol) <- groupNames
anno_colors <- list(group = xcol)
xx<-data.frame(group=as.factor(grps))
rownames(xx)<-cells
val_col <- colorRampPalette(rev(brewer.pal(n = 12,name = "Spectral")))(25)
#val_col <- colorRampPalette(brewer.pal(n = 12,name = "Spectral"))(100)
pheatmap(value, cluster_rows = cRow, cluster_cols = cCol, color=val_col,
show_colnames = FALSE, annotation_names_row = FALSE,
## annotation_col = annTab,
annotation_col = xx,
annotation_names_col = FALSE, annotation_colors = anno_colors, fontsize_row=fontsize_row)
}
extractClassLabel<-function(
scmRes,
sce, #SingleCellExperiment
queryClusLabel="seurat_clusters"
){
mtDat = colData(sce)
ans<-scmRes$combined_labs
sClustersVect = as.vector(mtDat[,queryClusLabel])
sClusters = unique( sClustersVect )
for(sCluster in sClusters){
cat(sCluster, "\n")
xi = which( sClustersVect == sCluster )
# cat(xi,"\n")
tmpLab = names(which.max( table(ans[xi]) ) )
cat(tmpLab, "\n")
ans[xi] = tmpLab
}
ans
}
plot3d.SlingshotDataSet <- function(x,
type = NULL,
add = FALSE,
dims = seq_len(3),
aspect = 'iso',
...){
if (!requireNamespace("rgl", quietly = TRUE)) {
stop("Package 'rgl' is required for 3D plotting.",
call. = FALSE)
}
n <- nrow(reducedDim(x))
curves <- FALSE
lineages <- FALSE
if(is.null(type)){
if(length(slingCurves(x)) > 0){
type <- 'curves'
}else if(length(slingLineages(x)) > 0){
type <- 'lineages'
}else{
stop('No lineages or curves detected.')
}
}else{
type <- c('curves','lineages','both')[pmatch(type,c('curves','lineages',
'both'))]
if(is.na(type)){
stop('Unrecognized type argument.')
}
}
if(type %in% c('lineages','both')){
lineages <- TRUE
}
if(type %in% c('curves','both')){
curves <- TRUE
}
if(lineages & (length(slingLineages(x))==0)){
stop('No lineages detected.')
}
if(curves & (length(slingCurves(x))==0)){
stop('No curves detected.')
}
if(lineages){
X <- reducedDim(x)
clusterLabels <- clusterLabels(x)
connectivity <- slingAdjacency(x)
clusters <- rownames(connectivity)
nclus <- nrow(connectivity)
centers <- t(vapply(clusters,function(clID){
w <- clusterLabels[,clID]
return(apply(X, 2, weighted.mean, w = w))
}, rep(0,ncol(X))))
rownames(centers) <- clusters
X <- X[rowSums(clusterLabels) > 0, , drop = FALSE]
clusterLabels <- clusterLabels[rowSums(clusterLabels) > 0, ,
drop = FALSE]
}
if(!add){
xs <- NULL
ys <- NULL
zs <- NULL
if(lineages){
xs <- c(xs, centers[,dims[1]])
ys <- c(ys, centers[,dims[2]])
zs <- c(zs, centers[,dims[3]])
}
if(curves){
xs <- c(xs, as.numeric(vapply(slingCurves(x), function(c){
c$s[,dims[1]] }, rep(0,n))))
ys <- c(ys, as.numeric(vapply(slingCurves(x), function(c){
c$s[,dims[2]] }, rep(0,n))))
zs <- c(zs, as.numeric(vapply(slingCurves(x), function(c){
c$s[,dims[3]] }, rep(0,n))))
}
rgl::plot3d(x = NULL, y = NULL, z = NULL, aspect = aspect,
xlim = range(xs), ylim = range(ys), zlim = range(zs),
xlab = colnames(reducedDim(x))[dims[1]],
ylab = colnames(reducedDim(x))[dims[2]],
zlab = colnames(reducedDim(x))[dims[3]])
}
if(lineages){
for(i in seq_len(nclus-1)){
for(j in seq(i+1,nclus)){
if(connectivity[i,j]==1){
rgl::lines3d(x = centers[c(i,j),dims[1]],
y = centers[c(i,j),dims[2]],
z = centers[c(i,j),dims[3]], ...)
}
}
}
}
if(curves){
for(c in slingCurves(x)){ rgl::lines3d(c$s[c$ord,dims], ...) }
}
invisible(NULL)
}
#' print date
#'
#' print date
#' @return string
#'
#' @export
utils_myDate<-function
###
()
{
format(Sys.time(), "%b_%d_%Y");
}
#' loads an R object when you don't know the name
#'
#' loads an R object when you don't know the name
#' @param fname file
#'
#' @return variable
#'
#' @export
utils_loadObject<-function
(fname
### file name
){
x<-load(fname);
get(x);
}
#' @export
splitCommon<-function(sampTab, ncells, dLevel="cell_ontology_class"){
cts<-unique(as.vector(sampTab[,dLevel]))
trainingids<-vector()
for(ct in cts){
cat(ct,": ")
stX<-sampTab[sampTab[,dLevel]==ct,]
ccount<-nrow(stX)-3
ccount<-min(ccount, ncells)
cat(nrow(stX),"\n")
trainingids<-append(trainingids, sample(rownames(stX), ccount))
}
val_ids<-setdiff(rownames(sampTab), trainingids)
list(train=sampTab[trainingids,], val=sampTab[val_ids,])
}
#' weighted subtraction from mapped reades, applied to all
#'
#' Simulate expression profile of _total_ mapped reads
#' @param expRaw matrix of total mapped reads per gene/transcript
#' @param total numeric post transformation sum of read counts
#'
#' @return vector of downsampled read mapped to genes/transcripts
#'
#' @export
weighted_down<-function
(expDat,
total,
dThresh=0
){
if(class(expDat)[1]!='matrix'){
cSums <- Matrix::colSums(expDat)
props <- Matrix::t(expDat) / cSums
rrids <- cSums - total
tmpAns <- expDat - Matrix::t(props * rrids)
tmpAns[Matrix::which(tmpAns<dThresh)] <- 0
}
else{
cSums <- colSums(expDat)
props <- t(expDat) / cSums
rrids <- cSums - total
tmpAns <- expDat - t(props * rrids)
tmpAns[which(tmpAns<dThresh)] <- 0
}
tmpAns
}
#' @export
trans_prop<-function
(expDat,
xFact=1e5){
if(class(expDat)[1]!='matrix'){
cSums <- Matrix::colSums(expDat)
ans <- Matrix::t(log(1 + xFact * Matrix::t(expDat) / cSums))
}
else{
cSums <- colSums(expDat)
ans<- t(log(1 + xFact * t(expDat) / cSums))
}
ans
}
Metrics used for assessing performance:
Accuracy = Number of cells with matching labels(in test set & scmap cluster) / Total number of cells
Uassigned Cells = The lesser the number of unassigned cells the better the performance
# load bertie data
load('sce_bertie_Apr_22_2019.rda')
# To subsample the data set
# 80% of the data is used for training (obtaining cluster annotations)
# Remaning 20% is used for evaluating performance of scmap cluster
set.seed(7)
# Total indices
index_total = 1:dim(sce_bertie)[2]
train_set_index = sample(index_total, 0.8*length(index_total))
train_set = sce_bertie[, train_set_index]
test_set = sce_bertie[ , -train_set_index]
# Remove existing metadata (smap cluster index from train and test set)
metadata(train_set)=list()
metadata(test_set)= list()
train_set<- selectFeatures(train_set, suppress_plot = FALSE)
# Indexing the reference dataset
train_set <- indexCluster(train_set, cluster_col = 'celltype')
# Represent the cluseter indices using a heat map
heatmap(as.matrix(metadata(train_set)$scmap_cluster_index))
# Scmap clustering using default threshold value (0.7)
scmapCluster_results <- scmapCluster(
projection = test_set,
index_list = list(
yan= metadata(train_set)$scmap_cluster_index
)
)
# Computing performance measures:
original_labs = colData(test_set)[["celltype"]]
labs = scmapCluster_results$scmap_cluster_labs
correct_pred = length(which(labs == original_labs))
acc_default = correct_pred/dim(test_set)[2]
unassign_default = length(which(labs == "unassigned"))
print(c('Accuracy for threshold of 0.7 is ',acc_default) )
## [1] "Accuracy for threshold of 0.7 is " "0.847027027027027"
print(c('Number of unassigned cells for threshold of 0.7 is ',unassign_default))
## [1] "Number of unassigned cells for threshold of 0.7 is "
## [2] "4"
# Testing different threshold values ranging from 0.1 to 1
threshold_list = c(0.1,0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9)
acc_list = c()
unassign_list = c()
for (i in threshold_list){
# Cluster sce_query to index values computes using reference data
scmapCluster_results <- scmapCluster(
projection = test_set,
index_list = list(
yan= metadata(train_set)$scmap_cluster_index
),
threshold = i
)
# Comparing performance
labs = scmapCluster_results$scmap_cluster_labs
corr_pred = length(which(labs == original_labs))
acc_list = c(acc_list, (corr_pred/dim(test_set)[2]))
unassign_list= c(unassign_list, (length(which(labs == "unassigned"))))
}
thres_df = as.data.frame(cbind(threshold_list , acc_list, unassign_list))
print(thres_df)
## threshold_list acc_list unassign_list
## 1 0.1 0.8486486 1
## 2 0.2 0.8486486 1
## 3 0.3 0.8491892 1
## 4 0.4 0.8486486 1
## 5 0.5 0.8486486 1
## 6 0.6 0.8486486 1
## 7 0.7 0.8470270 4
## 8 0.8 0.8443243 16
## 9 0.9 0.6562162 445
acc_plot = ggplot(data=thres_df, aes(x=threshold_list, y=acc_list))+
geom_point(stat = "identity")+
labs(x="Threshold", y="Accuracy")+
ggtitle("Accuracy Plot ")
acc_plot
unassign_plot = ggplot(data=thres_df, aes(x=threshold_list, y= unassign_list))+
geom_point(stat = "identity")+
labs(x="Threshold", y="Number of unassigned cells")+
ggtitle("Plot for Unassigned Cells ")
unassign_plot
It is observed that the performance decreases with an increase in threshold. While it remains comparable(same) for lower thrshold values of 0.1 to 0.4 it decreases significantly for a high threshold value of 0.8 and 0.9.
# load Spangler data
load('seurat_spangler_Apr_22_2019.rda')
# 2. Select variably expressed genes (top 2000)
qDatX <- FindVariableFeatures(object = qDatX, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(x = VariableFeatures(object = qDatX), 10)
# 3. Scale and PCA
# Scale
all.genes <- rownames(x = qDatX)
qDatX <- ScaleData(object = qDatX, features = all.genes)
# PCA
qDatX <- RunPCA(object =qDatX, features = VariableFeatures(object = qDatX))
## PC_ 1
## Positive: Sox11, Tmsb4x, Crabp1, Meis2, Crabp2, Map1b, Tubb2b, Gm1673, Tuba1a, Hoxb3os
## Sox4, Nnat, Vim, Cd24a, Basp1, Mir124-2hg, Tcf12, Tubb3, Elavl4, Ccnd2
## Rtn1, Nhlh2, Rnf165, Fabp7, Tagln3, Nkain4, Ckb, Elavl3, Dpysl3, Map2
## Negative: Chchd10, Dppa5a, Mt2, Epcam, Mt1, Utf1, Apoe, Crip1, Eif4ebp1, Rplp2
## Aldoa, L1td1, Zfp42, Dnmt3b, Eno1, Apoc1, Klf2, Rpl23a, Tcea3, Rps21
## Lima1, Rpl36, Rpl37, Nhp2, Hsp90aa1, Bst2, Ncl, Fabp3, Mif, Tdgf1
## PC_ 2
## Positive: 2810417H13Rik, Hmgb1, Sfrp1, Hspe1, Ccnd2, Hmgb2, Mif, Ptn, Prtg, Rbp1
## Mest, Hmga2, Tceal8, Serpinh1, Ppp1r1a, Hes1, Fst, Rpl22l1, Txn1, Dut
## Bex4, Foxb1, Ldha, Dbi, Lig1, Krt8, Peg3, Pcna, T, Ncl
## Negative: Tubb3, Nhlh2, Ina, Elavl3, Dcx, Map2, Zfp42, Onecut2, Dppa5a, Ebf1
## Nefm, Elavl4, Robo3, D930028M14Rik, Nefl, Uncx, Chgb, Mllt11, Ebf2, Rtn1
## Gap43, Ebf3, Klf2, Sst, Pou2f2, Srrm3, Nkain4, Utf1, Igfbpl1, Nhlh1
## PC_ 3
## Positive: Krt8, Krt18, Lhx1, Emb, Trh, Stmn2, S100a10, T, Cer1, Tagln2
## Gsc, Tmsb10, Samd3, Dkk1, Cthrc1, Sox17, Flt1, Foxa2, Fgf5, Cldn6
## Tgfb2, Krt7, Slit2, Car4, Pifo, Cfc1, Chrd, Fgf8, Lhx1os, Rspo3
## Negative: Sox2, Hes5, Crabp2, Ccnd2, Sfrp2, Fabp7, Ptn, Ccnd1, Nes, Ildr2
## Fez1, Ckb, Igdcc3, Pax6, Mest, Pantr1, Cyp26b1, Rfx4, Rrm2, Zfp42
## Fgfbp3, Hoxb3os, Crabp1, Ly6h, Fam181b, Hmga2, Top2a, Hells, Id2, Ncl
## PC_ 4
## Positive: Pim2, Meg3, Dnmt3b, Snrpn, Arl4c, Pou3f1, Bst2, Gng3, 2810417H13Rik, Hmgb2
## Stmn2, Ldha, G3bp2, Tubb4b, Cdca8, Dut, Hmgn5, Myo10, Cenpf, Gja1
## L1td1, Cenpa, F2rl1, Cdk1, Rbpms2, Fabp5, Arl6ip1, Ndufa5, Crmp1, Nop10
## Negative: Ifitm1, Zfp42, Krt8, Dppa3, Cfc1, Tgfb2, Arl4a, Samd3, Spp1, Arid5b
## Dkk1, Tdh, Lhx1, Cer1, Morc1, Klf2, Sox17, Fam25c, Krt18, Flt1
## Ooep, Gstm1, Slit2, T, Pcdh7, Prrc1, Gtsf1l, Bmp2, Fn1, Dgkk
## PC_ 5
## Positive: Atp5j2, Cox5b, Cox7b, Cox7c, Rpl27, Txn1, Atp5h, Rps17, Rpl38, Cox7a2
## Dbi, Atpif1, Fabp5, Rpl37, Mif, Atp5k, Hsp90aa1, Ndufa7, Rpl37a, Ifitm2
## Ndufb6, Ndufa11, 2010107E04Rik, Usmg5, Sod1, Atp5g1, Uqcr10, Hspe1, Rplp2, Rpl22
## Negative: Malat1, Slc2a1, Slc2a3, Gja1, Dnmt3b, Cenpf, Plk1, Crb3, Iqgap1, Cdh1
## Tpx2, Aspm, Cenpe, Meg3, Hmmr, Klf6, Rif1, Epcam, Slc16a1, Incenp
## Pvrl2, Trh, Sox17, F2rl1, Krt18, Cdc20, Top2a, Otx2, Aurka, Klf9
# Printing out first 5 principal components
print(x = qDatX[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: Sox11, Tmsb4x, Crabp1, Meis2, Crabp2
## Negative: Chchd10, Dppa5a, Mt2, Epcam, Mt1
## PC_ 2
## Positive: 2810417H13Rik, Hmgb1, Sfrp1, Hspe1, Ccnd2
## Negative: Tubb3, Nhlh2, Ina, Elavl3, Dcx
## PC_ 3
## Positive: Krt8, Krt18, Lhx1, Emb, Trh
## Negative: Sox2, Hes5, Crabp2, Ccnd2, Sfrp2
## PC_ 4
## Positive: Pim2, Meg3, Dnmt3b, Snrpn, Arl4c
## Negative: Ifitm1, Zfp42, Krt8, Dppa3, Cfc1
## PC_ 5
## Positive: Atp5j2, Cox5b, Cox7b, Cox7c, Rpl27
## Negative: Malat1, Slc2a1, Slc2a3, Gja1, Dnmt3b
# Determing significant PCs using elbow plot
#ELBOW PLOT
ElbowPlot(object = qDatX, ndims = 35)
Significant PCs: It can be observed from the elbow plot that the first 30 PCs cover almost 90% of the varience in data. Thus only top 30 PCs are selected for further analysis.
###Visualization: PCA, Clustering and Differentially expressed genes
# Visualizing PCs
# PCA plot
# PC1 vs PC2
DimPlot(object = qDatX, reduction = "pca")
# PC1 vs PC3
DimPlot(object = qDatX, dims = c(1,3), reduction = "pca")
# Visualizing Clusters
# Choosing top 30 PCs as significant PCs
#Clustering
qDatX <- FindNeighbors(object = qDatX, dims = 1:30)
qDatX<- FindClusters(object = qDatX, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3841
## Number of edges: 143949
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8522
## Number of communities: 7
## Elapsed time: 0 seconds
# Visualizing the clusters
qDatX <- RunUMAP(object = qDatX, dims = 1:30)
DimPlot(object = qDatX, reduction = "umap")
# plot variable genes with and without labels
# Top 2000 variable genes with names of top 10 differentially expressed genes
plot1 <- VariableFeaturePlot(object = qDatX)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2
# Convert Seurat object to SingleCellExperiment object
qDatX_sce<- SingleCellExperiment(assays = list(logcounts = as.matrix(qDatX[["RNA"]]@data)), colData = qDatX@meta.data)
rowData(qDatX_sce)$feature_symbol <- rownames(qDatX_sce)
# Cluster qDatX_sce to index values computes using reference data
# Using threshold = 0.2 for optimal results
scmapCluster_results <- scmapCluster(
projection = qDatX_sce,
index_list = list(
yan= metadata(sce_bertie)$scmap_cluster_index
),
threshold = 0.2
)
plot(
getSankey(
colData(qDatX_sce)$seurat_clusters,
scmapCluster_results$scmap_cluster_labs[,'yan'],
plot_height = 400
)
)
qDatX@meta.data$pred_types = extractClassLabel(scmapCluster_results, qDatX_sce)
## 5
## Forebrain/Midbrain/Hindbrain
## 1
## Epiblast
## 2
## Forebrain/Midbrain/Hindbrain
## 4
## Def. endoderm
## 3
## Forebrain/Midbrain/Hindbrain
## 6
## Forebrain/Midbrain/Hindbrain
## 0
## Rostral neurectoderm
# Plot umap with the annotations
DimPlot(object = qDatX, reduction = "umap", group.by = 'pred_types')
The data is divided into 4 main clusters: Epiblast, Def endoderm, Rostral neurectoderm and Forebrain/Midbrain/Hindbrain
GetLineages function input: The starting cluster is given as epiblast and ending cluster is given as Forebrain/Midbrain/Hindbrain
# Convert Spangler data from Seurat object to SingleCellExperiment object
qDatX_sce= as.SingleCellExperiment(qDatX)
# Generate lineages for first 3 PCs
lin <- getLineages(reducedDims(qDatX_sce)$PCA[, 1:3], colData(qDatX_sce)[, "pred_types"], start.clus = 'Epiblast', end.clus = 'Forebrain/Midbrain/Hindbrain')
# Print out the lineages generated
lin
## class: SlingshotDataSet
##
## Samples Dimensions
## 3841 3
##
## lineages: 2
## Lineage1: Epiblast Rostral neurectoderm Forebrain/Midbrain/Hindbrain
## Lineage2: Epiblast Rostral neurectoderm Def. endoderm
##
## curves: 0
# Obtain curves from the lineages generated
crv <- getCurves(lin)
# Generate 2D plot of lineage curves
# PC1 vs PC2
rd = reducedDims(qDatX_sce)$PCA[, c(1,2)]
cl = colData(qDatX_sce)[, "seurat_clusters"]
plot(rd, col = brewer.pal(9,'Set1')[cl], asp=1, pch = 16)
lines(lin, lwd = 3)
# PC1 vs PC3
rd = reducedDims(qDatX_sce)$PCA[, c(1,3)]
cl = colData(qDatX_sce)[, "seurat_clusters"]
plot(rd, col = brewer.pal(9,'Set1')[cl], asp=1, pch = 16)
lines(lin, lwd = 3)
# Generate 3D plot to visualize all 3 PCs
xdata = as.matrix(qDatX@reductions$pca@cell.embeddings[,1:3])
reducedDim(qDatX_sce) <- xdata
mtData = qDatX@meta.data$pred_types
names(mtData) = rownames(qDatX@meta.data)
getPalette = colorRampPalette(brewer.pal(7, "Paired"))
plot3d.SlingshotDataSet(crv, lwd=4)
points3d(c(1,2,3), col = getPalette(length(unique(mtData)))[mtData], alpha=.75)
Slingshot defines two distinct lineages for the Spangler dataset:
Lineage1: Epiblast Rostral neurectoderm Forebrain/Midbrain/Hindbrain
Lineage2: Epiblast Rostral neurectoderm Def. endoderm
The clusters taken by each of the paths are:
Epiblast -> Rostral neurectoderm -> Forebrain/Midbrain/Hindbrain
Epiblast -> Rostral neurectoderm -> Def. endoderm
Embryonic Stages of Development: (Based on teh Theiler stages of mouse embryonic development)
Theiler Stage 7 -> Theiler Stage 9 -> Theiler Stage 19
Theiler Stage 7 -> Theiler Stage 9 -> Theiler Stage 7
# Compute psuedotime using slingshot
pst = slingPseudotime(crv)
# Finding differentially expressed TFs and heatmap for 1st path
pt1<-pst[which(!is.na(pst[,1])),]
t1 = pt1[,1]
expDat = as.matrix(qDatX[["RNA"]]@data)
#Loading mouse TFs
load("mmTFs.rda")
# Gamfit
system.time(gpC1 <-gamFit(expDat, mmTFs, t1))
## user system elapsed
## 147.856 34.549 186.223
# Picking the top 50 differentially expressed genes
yy = gpC1[order(gpC1)][1:50]
gene50_path1 = names(yy)
# Picking out cells attributed to path 1
path1_ind = which(qDatX_sce$pred_types=='Epiblast' | qDatX_sce$pred_types=='Rostral neurectoderm' | qDatX_sce$pred_types =='Forebrain/Midbrain/Hindbrain')
path1_sr = qDatX[,path1_ind]
cell_names_path1 = path1_sr$pred_types
# generating heat map
hm_ti(
expDat,
gene50_path1,
cell_names_path1, ## vector of cellnames -> grp label
cRow=FALSE,
cCol=FALSE,
limits=c(0,10),
toScale=FALSE,
fontsize_row=4)
Analysis of TF clusters:
TFs that are uniformly expressed throughout the trajectory:Npm1, Ncl, Ebf1, Uncx and Hmg2
TFs that are highly expressed at first and shut off later:Klf2, Sox 11, Zfp42
TFs that are shut off at first and highly expressed later: Hes5 , Hoxa2, Hmbg2
Discussion on role of select TFs: Zfp42: Expressed mainly in undifferentiated cells and plays a major role in maintaining pluripotency. It has an important role in mesoendoderm potential [1]
Hoxa2: It is a part of cluster A of the homeobox genes. It plays an important role in placing some segments of the hindbrain along the anterior-posterior axis. Thus it is more highly expressed later in the trajectory.[2]
Ebf1: Required for B-cell differentiation, and signalling pathways.
# Finding differentially expressed TFs and heatmap for 2nd path
pt2<-pst[which(!is.na(pst[,2])),]
t2 = pt1[,2]
# Gamfit
system.time(gpC2 <-gamFit(expDat, mmTFs, t2))
## user system elapsed
## 51.988 16.520 72.535
# Picking the top 50 differentially expressed genes
yy = gpC2[order(gpC2)][1:50]
gene50_path2 = names(yy)
# Picking out cells attributed to path 1
path2_ind = which(qDatX_sce$pred_types=='Epiblast' | qDatX_sce$pred_types=='Rostral neurectoderm' | qDatX_sce$pred_types =='Def. endoderm')
path2_sr = qDatX[,path2_ind]
cell_names_path2 = path2_sr$pred_types
# generating heat map
hm_ti(
expDat,
gene50_path2,
cell_names_path2, ## vector of cellnames -> grp label
cRow=FALSE,
cCol=FALSE,
limits=c(0,10),
toScale=FALSE,
fontsize_row=4)
Analysis of TF clusters:
TFs that are uniformly expressed throughout the trajectory:Hmbg1, Hnrnpa2b1, Nr0b1, Klf5
TFs that are highly expressed at first and shut off later:Klf2, Sox 11, Zfp42
TFs that are shut off at first and highly expressed later: Hes1, Zeb2 and Ash21
Discussion on role of select TFs: Zfp42: Expressed mainly in undifferentiated cells and plays a major role in maintaining pluripotency. It has an important role in mesoendoderm potential [1]
Zeb2: Crucial for cell and tissue development and is thus more highly expressed in the later part of the trajectory (endoderm)[3]
Nr0b1: Carries DAX1 protein that is crucial for development of endocrine glands.
[1]Son, Mi‐Young, et al. “Unveiling the critical role of REX1 in the regulation of human stem cell pluripotency.” Stem cells 31.11 (2013): 2374-2387.
[2] NCBI report: HOXA2 homeobox A2 [ Homo sapiens (human) ] [3]Genetics Home Reference: Zeb2 gene