CSCB_Project

Code Setup

library(Seurat)
library(dplyr)
library(scmap)
library(SingleCellExperiment)
library(pheatmap)
library(RColorBrewer)
library(rgl)
library(gam)
library(slingshot)
library(ggplot2)

setwd("/Users/archanabalan/Documents/Spring Semester/CSCB/CSCB_P5_2019 2/data")

Importing fuctions from helper code (provided along with the project)

# helper functions for P4
# these are from CellNet

gamFit<-function(expMat,
 genes, # genes to test
  celltime){

  genes2 <- intersect(genes, rownames(expMat))
  # could print out if any missing
  ans <- apply(expMat[genes2,names(celltime)],1,function(z){
     d <- data.frame(z=z, t=celltime)
     tmp <- gam(z ~ lo(celltime), data=d)
    p <- summary(tmp)[4][[1]][1,5]
    p
  })
  ans
}




hm_ti<-function(
  expDat,
  genes,
  grps, ## vector of cellnames -> grp label
  cRow=FALSE,
  cCol=FALSE,
  limits=c(0,10),
  toScale=FALSE,
  fontsize_row=4){
  allgenes<-rownames(expDat)
  missingGenes<-setdiff(genes, allgenes)
  if(length(missingGenes)>0){
    cat("Missing genes: ", paste0(missingGenes, collapse=","), "\n")
    genes<-intersect(genes, allgenes)
  }
  value<-expDat[genes,]
  if(toScale){
      if(class(value)[1]!='matrix'){
        value <- t(scale(Matrix::t(value)))
      }
      else{
        value <- t(scale(t(value)))
      }
    }
  value[value < limits[1]] <- limits[1]
  value[value > limits[2]] <- limits[2]
  groupNames<-unique(grps)
  cells<-names(grps)  
##
 ## groupNames<-myGrpSort(grps)
##

  xcol <- colorRampPalette(rev(brewer.pal(n = 12,name = "Paired")))(length(groupNames))
    names(xcol) <- groupNames
    anno_colors <- list(group = xcol)
    xx<-data.frame(group=as.factor(grps))
    rownames(xx)<-cells
   val_col <- colorRampPalette(rev(brewer.pal(n = 12,name = "Spectral")))(25)
   #val_col <- colorRampPalette(brewer.pal(n = 12,name = "Spectral"))(100)
  pheatmap(value, cluster_rows = cRow, cluster_cols = cCol, color=val_col,
        show_colnames = FALSE, annotation_names_row = FALSE,
##        annotation_col = annTab,
        annotation_col = xx,
        annotation_names_col = FALSE, annotation_colors = anno_colors, fontsize_row=fontsize_row)
}


extractClassLabel<-function(
  scmRes,
  sce, #SingleCellExperiment
  queryClusLabel="seurat_clusters"
){
  mtDat = colData(sce)
  ans<-scmRes$combined_labs


 
  sClustersVect = as.vector(mtDat[,queryClusLabel])
  sClusters = unique( sClustersVect )
  for(sCluster in sClusters){
    cat(sCluster, "\n")
    xi = which(  sClustersVect == sCluster )
 #   cat(xi,"\n")
    tmpLab = names(which.max( table(ans[xi]) ) )
    cat(tmpLab, "\n")  
    ans[xi] = tmpLab
  }
  ans
}


plot3d.SlingshotDataSet <- function(x, 


type = NULL, 

add = FALSE, 

dims = seq_len(3), 

aspect = 'iso', 

...){ 

if (!requireNamespace("rgl", quietly = TRUE)) { 

stop("Package 'rgl' is required for 3D plotting.", 

call. = FALSE) 

} 

n <- nrow(reducedDim(x)) 

curves <- FALSE

lineages <- FALSE

if(is.null(type)){ 

if(length(slingCurves(x)) > 0){ 

type <- 'curves'

}else if(length(slingLineages(x)) > 0){ 

type <- 'lineages'

}else{ 

stop('No lineages or curves detected.') 

} 

}else{ 

type <- c('curves','lineages','both')[pmatch(type,c('curves','lineages', 

'both'))] 

if(is.na(type)){ 

stop('Unrecognized type argument.') 

} 

} 



if(type %in% c('lineages','both')){ 

lineages <- TRUE

} 

if(type %in% c('curves','both')){ 

curves <- TRUE

} 



if(lineages & (length(slingLineages(x))==0)){ 

stop('No lineages detected.') 

} 

if(curves & (length(slingCurves(x))==0)){ 

stop('No curves detected.') 

} 



if(lineages){ 

X <- reducedDim(x) 

clusterLabels <- clusterLabels(x) 

connectivity <- slingAdjacency(x) 

clusters <- rownames(connectivity) 

nclus <- nrow(connectivity) 

centers <- t(vapply(clusters,function(clID){ 

w <- clusterLabels[,clID] 

return(apply(X, 2, weighted.mean, w = w)) 

}, rep(0,ncol(X)))) 

rownames(centers) <- clusters

X <- X[rowSums(clusterLabels) > 0, , drop = FALSE] 

clusterLabels <- clusterLabels[rowSums(clusterLabels) > 0, , 

drop = FALSE] 

} 



if(!add){ 

xs <- NULL

ys <- NULL

zs <- NULL

if(lineages){ 

xs <- c(xs, centers[,dims[1]]) 

ys <- c(ys, centers[,dims[2]]) 

zs <- c(zs, centers[,dims[3]]) 

} 

if(curves){ 

xs <- c(xs, as.numeric(vapply(slingCurves(x), function(c){ 

c$s[,dims[1]] }, rep(0,n)))) 

ys <- c(ys, as.numeric(vapply(slingCurves(x), function(c){ 

c$s[,dims[2]] }, rep(0,n)))) 

zs <- c(zs, as.numeric(vapply(slingCurves(x), function(c){ 

c$s[,dims[3]] }, rep(0,n)))) 

} 

rgl::plot3d(x = NULL, y = NULL, z = NULL, aspect = aspect, 

xlim = range(xs), ylim = range(ys), zlim = range(zs), 

xlab = colnames(reducedDim(x))[dims[1]], 

ylab = colnames(reducedDim(x))[dims[2]], 

zlab = colnames(reducedDim(x))[dims[3]]) 

} 



if(lineages){ 

for(i in seq_len(nclus-1)){ 

for(j in seq(i+1,nclus)){ 

if(connectivity[i,j]==1){ 

rgl::lines3d(x = centers[c(i,j),dims[1]], 

y = centers[c(i,j),dims[2]], 

z = centers[c(i,j),dims[3]], ...) 

} 

} 

} 

} 

if(curves){ 

for(c in slingCurves(x)){ rgl::lines3d(c$s[c$ord,dims], ...) } 

} 

invisible(NULL) 

}


#' print date
#'
#' print date
#' @return string
#'
#' @export
utils_myDate<-function
### 
()
{
  format(Sys.time(), "%b_%d_%Y");
}



#' loads an R object when you don't know the name
#'
#' loads an R object when you don't know the name
#' @param fname file
#'
#' @return variable
#'
#' @export
utils_loadObject<-function
(fname
 ### file name
){
  x<-load(fname);
  get(x);
}



#' @export
splitCommon<-function(sampTab, ncells, dLevel="cell_ontology_class"){
  cts<-unique(as.vector(sampTab[,dLevel]))
  trainingids<-vector()
  for(ct in cts){
    cat(ct,": ")
    stX<-sampTab[sampTab[,dLevel]==ct,]
    ccount<-nrow(stX)-3
    ccount<-min(ccount, ncells)
    cat(nrow(stX),"\n")
    trainingids<-append(trainingids, sample(rownames(stX), ccount))
  }
  val_ids<-setdiff(rownames(sampTab), trainingids)
  list(train=sampTab[trainingids,], val=sampTab[val_ids,])
}

#' weighted subtraction from mapped reades, applied to all
#'
#' Simulate expression profile of  _total_ mapped reads
#' @param expRaw matrix of total mapped reads per gene/transcript
#' @param total numeric post transformation sum of read counts
#'
#' @return vector of downsampled read mapped to genes/transcripts
#'
#' @export
weighted_down<-function
(expDat,
 total,
 dThresh=0
 ){
  if(class(expDat)[1]!='matrix'){
    cSums  <- Matrix::colSums(expDat)
    props <- Matrix::t(expDat) / cSums
    rrids  <- cSums - total
    tmpAns <- expDat - Matrix::t(props * rrids)
    tmpAns[Matrix::which(tmpAns<dThresh)] <- 0
  }
  else{
    cSums  <- colSums(expDat)
    props <- t(expDat) / cSums
    rrids  <- cSums - total
    tmpAns <- expDat - t(props * rrids)
    tmpAns[which(tmpAns<dThresh)] <- 0
  }
  
  tmpAns
}



#' @export
trans_prop<-function
(expDat,
  xFact=1e5){

  if(class(expDat)[1]!='matrix'){
    cSums  <- Matrix::colSums(expDat)
    ans <- Matrix::t(log(1 + xFact * Matrix::t(expDat) / cSums))
  }
  else{
    cSums  <- colSums(expDat)
    ans<-  t(log(1 + xFact * t(expDat) / cSums))
  }
  ans
 
}

Question 1 :Scmap clustering for sce_bertie data

Metrics used for assessing performance:
Accuracy = Number of cells with matching labels(in test set & scmap cluster) / Total number of cells
Uassigned Cells = The lesser the number of unassigned cells the better the performance

Testing performance using default thresold value (0.7)

# load bertie data
load('sce_bertie_Apr_22_2019.rda')

# To subsample the data set
# 80% of the data is used for training (obtaining cluster annotations)
# Remaning 20% is used for evaluating performance of scmap cluster

set.seed(7)
# Total indices 
index_total = 1:dim(sce_bertie)[2]
train_set_index = sample(index_total, 0.8*length(index_total)) 

train_set = sce_bertie[, train_set_index]
test_set = sce_bertie[ , -train_set_index]

# Remove existing metadata (smap cluster index from train and test set)
metadata(train_set)=list()
metadata(test_set)= list()


train_set<- selectFeatures(train_set, suppress_plot = FALSE)

# Indexing the reference dataset
train_set <- indexCluster(train_set, cluster_col = 'celltype')

# Represent the cluseter indices using a heat map
heatmap(as.matrix(metadata(train_set)$scmap_cluster_index))

# Scmap clustering using default threshold value (0.7)

scmapCluster_results <- scmapCluster(
  projection = test_set,
  index_list = list(
    yan= metadata(train_set)$scmap_cluster_index
  )
)

# Computing performance measures: 
original_labs = colData(test_set)[["celltype"]]
labs = scmapCluster_results$scmap_cluster_labs
correct_pred = length(which(labs == original_labs))
acc_default = correct_pred/dim(test_set)[2]
unassign_default = length(which(labs == "unassigned"))

print(c('Accuracy for threshold of 0.7 is ',acc_default) )

## [1] "Accuracy for threshold of 0.7 is " "0.847027027027027"

print(c('Number of unassigned cells for threshold of 0.7 is ',unassign_default))

## [1] "Number of unassigned cells for threshold of 0.7 is "
## [2] "4"

Testing performance for different threshold values.

# Testing different threshold values ranging from 0.1 to 1

threshold_list = c(0.1,0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9)
acc_list = c()
unassign_list = c()
for (i in  threshold_list){
# Cluster sce_query to index values computes using reference data
scmapCluster_results <- scmapCluster(
  projection = test_set,
  index_list = list(
    yan= metadata(train_set)$scmap_cluster_index
  ),
  threshold = i
)


# Comparing performance
labs = scmapCluster_results$scmap_cluster_labs

corr_pred = length(which(labs == original_labs))
acc_list = c(acc_list, (corr_pred/dim(test_set)[2]))
unassign_list= c(unassign_list, (length(which(labs == "unassigned"))))

}

thres_df = as.data.frame(cbind(threshold_list , acc_list, unassign_list))
print(thres_df)

##   threshold_list  acc_list unassign_list
## 1            0.1 0.8486486             1
## 2            0.2 0.8486486             1
## 3            0.3 0.8491892             1
## 4            0.4 0.8486486             1
## 5            0.5 0.8486486             1
## 6            0.6 0.8486486             1
## 7            0.7 0.8470270             4
## 8            0.8 0.8443243            16
## 9            0.9 0.6562162           445

 acc_plot = ggplot(data=thres_df, aes(x=threshold_list, y=acc_list))+
              geom_point(stat = "identity")+
                 labs(x="Threshold", y="Accuracy")+
                  ggtitle("Accuracy Plot ")
 acc_plot

 unassign_plot = ggplot(data=thres_df, aes(x=threshold_list, y= unassign_list))+
              geom_point(stat = "identity")+  
                labs(x="Threshold", y="Number of unassigned cells")+
                ggtitle("Plot for Unassigned Cells ")
   
   
  unassign_plot

It is observed that the performance decreases with an increase in threshold. While it remains comparable(same) for lower thrshold values of 0.1 to 0.4 it decreases significantly for a high threshold value of 0.8 and 0.9.

Question 2: Clustering for Spangler data

Feature selection and PCA

# load Spangler data
load('seurat_spangler_Apr_22_2019.rda')

# 2. Select variably expressed genes (top 2000)
qDatX <- FindVariableFeatures(object = qDatX, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(x = VariableFeatures(object = qDatX), 10)

# 3. Scale and PCA
# Scale
all.genes <- rownames(x = qDatX)
qDatX <- ScaleData(object = qDatX, features = all.genes)

# PCA
qDatX <- RunPCA(object =qDatX, features = VariableFeatures(object = qDatX))

## PC_ 1 
## Positive:  Sox11, Tmsb4x, Crabp1, Meis2, Crabp2, Map1b, Tubb2b, Gm1673, Tuba1a, Hoxb3os 
##     Sox4, Nnat, Vim, Cd24a, Basp1, Mir124-2hg, Tcf12, Tubb3, Elavl4, Ccnd2 
##     Rtn1, Nhlh2, Rnf165, Fabp7, Tagln3, Nkain4, Ckb, Elavl3, Dpysl3, Map2 
## Negative:  Chchd10, Dppa5a, Mt2, Epcam, Mt1, Utf1, Apoe, Crip1, Eif4ebp1, Rplp2 
##     Aldoa, L1td1, Zfp42, Dnmt3b, Eno1, Apoc1, Klf2, Rpl23a, Tcea3, Rps21 
##     Lima1, Rpl36, Rpl37, Nhp2, Hsp90aa1, Bst2, Ncl, Fabp3, Mif, Tdgf1 
## PC_ 2 
## Positive:  2810417H13Rik, Hmgb1, Sfrp1, Hspe1, Ccnd2, Hmgb2, Mif, Ptn, Prtg, Rbp1 
##     Mest, Hmga2, Tceal8, Serpinh1, Ppp1r1a, Hes1, Fst, Rpl22l1, Txn1, Dut 
##     Bex4, Foxb1, Ldha, Dbi, Lig1, Krt8, Peg3, Pcna, T, Ncl 
## Negative:  Tubb3, Nhlh2, Ina, Elavl3, Dcx, Map2, Zfp42, Onecut2, Dppa5a, Ebf1 
##     Nefm, Elavl4, Robo3, D930028M14Rik, Nefl, Uncx, Chgb, Mllt11, Ebf2, Rtn1 
##     Gap43, Ebf3, Klf2, Sst, Pou2f2, Srrm3, Nkain4, Utf1, Igfbpl1, Nhlh1 
## PC_ 3 
## Positive:  Krt8, Krt18, Lhx1, Emb, Trh, Stmn2, S100a10, T, Cer1, Tagln2 
##     Gsc, Tmsb10, Samd3, Dkk1, Cthrc1, Sox17, Flt1, Foxa2, Fgf5, Cldn6 
##     Tgfb2, Krt7, Slit2, Car4, Pifo, Cfc1, Chrd, Fgf8, Lhx1os, Rspo3 
## Negative:  Sox2, Hes5, Crabp2, Ccnd2, Sfrp2, Fabp7, Ptn, Ccnd1, Nes, Ildr2 
##     Fez1, Ckb, Igdcc3, Pax6, Mest, Pantr1, Cyp26b1, Rfx4, Rrm2, Zfp42 
##     Fgfbp3, Hoxb3os, Crabp1, Ly6h, Fam181b, Hmga2, Top2a, Hells, Id2, Ncl 
## PC_ 4 
## Positive:  Pim2, Meg3, Dnmt3b, Snrpn, Arl4c, Pou3f1, Bst2, Gng3, 2810417H13Rik, Hmgb2 
##     Stmn2, Ldha, G3bp2, Tubb4b, Cdca8, Dut, Hmgn5, Myo10, Cenpf, Gja1 
##     L1td1, Cenpa, F2rl1, Cdk1, Rbpms2, Fabp5, Arl6ip1, Ndufa5, Crmp1, Nop10 
## Negative:  Ifitm1, Zfp42, Krt8, Dppa3, Cfc1, Tgfb2, Arl4a, Samd3, Spp1, Arid5b 
##     Dkk1, Tdh, Lhx1, Cer1, Morc1, Klf2, Sox17, Fam25c, Krt18, Flt1 
##     Ooep, Gstm1, Slit2, T, Pcdh7, Prrc1, Gtsf1l, Bmp2, Fn1, Dgkk 
## PC_ 5 
## Positive:  Atp5j2, Cox5b, Cox7b, Cox7c, Rpl27, Txn1, Atp5h, Rps17, Rpl38, Cox7a2 
##     Dbi, Atpif1, Fabp5, Rpl37, Mif, Atp5k, Hsp90aa1, Ndufa7, Rpl37a, Ifitm2 
##     Ndufb6, Ndufa11, 2010107E04Rik, Usmg5, Sod1, Atp5g1, Uqcr10, Hspe1, Rplp2, Rpl22 
## Negative:  Malat1, Slc2a1, Slc2a3, Gja1, Dnmt3b, Cenpf, Plk1, Crb3, Iqgap1, Cdh1 
##     Tpx2, Aspm, Cenpe, Meg3, Hmmr, Klf6, Rif1, Epcam, Slc16a1, Incenp 
##     Pvrl2, Trh, Sox17, F2rl1, Krt18, Cdc20, Top2a, Otx2, Aurka, Klf9

# Printing out first 5 principal components
print(x = qDatX[["pca"]], dims = 1:5, nfeatures = 5)

## PC_ 1 
## Positive:  Sox11, Tmsb4x, Crabp1, Meis2, Crabp2 
## Negative:  Chchd10, Dppa5a, Mt2, Epcam, Mt1 
## PC_ 2 
## Positive:  2810417H13Rik, Hmgb1, Sfrp1, Hspe1, Ccnd2 
## Negative:  Tubb3, Nhlh2, Ina, Elavl3, Dcx 
## PC_ 3 
## Positive:  Krt8, Krt18, Lhx1, Emb, Trh 
## Negative:  Sox2, Hes5, Crabp2, Ccnd2, Sfrp2 
## PC_ 4 
## Positive:  Pim2, Meg3, Dnmt3b, Snrpn, Arl4c 
## Negative:  Ifitm1, Zfp42, Krt8, Dppa3, Cfc1 
## PC_ 5 
## Positive:  Atp5j2, Cox5b, Cox7b, Cox7c, Rpl27 
## Negative:  Malat1, Slc2a1, Slc2a3, Gja1, Dnmt3b

# Determing significant PCs using elbow plot
#ELBOW PLOT
ElbowPlot(object = qDatX, ndims = 35)

Significant PCs: It can be observed from the elbow plot that the first 30 PCs cover almost 90% of the varience in data. Thus only top 30 PCs are selected for further analysis.
###Visualization: PCA, Clustering and Differentially expressed genes

# Visualizing PCs
# PCA plot
# PC1 vs PC2
DimPlot(object = qDatX, reduction = "pca")

# PC1 vs PC3
DimPlot(object = qDatX, dims = c(1,3), reduction = "pca")

# Visualizing Clusters
# Choosing top 30 PCs as significant PCs
#Clustering
qDatX <- FindNeighbors(object = qDatX, dims = 1:30)
qDatX<- FindClusters(object = qDatX, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3841
## Number of edges: 143949
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8522
## Number of communities: 7
## Elapsed time: 0 seconds

# Visualizing the clusters
qDatX <- RunUMAP(object = qDatX, dims = 1:30)
DimPlot(object = qDatX, reduction = "umap")

# plot variable genes with and without labels
# Top 2000 variable genes with names of top 10 differentially expressed genes
plot1 <- VariableFeaturePlot(object = qDatX)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2

Question3: Cell typing for Spangler data

# Convert Seurat object to SingleCellExperiment object
qDatX_sce<- SingleCellExperiment(assays = list(logcounts = as.matrix(qDatX[["RNA"]]@data)), colData = qDatX@meta.data)
rowData(qDatX_sce)$feature_symbol <- rownames(qDatX_sce)

# Cluster qDatX_sce to index values computes using reference data
# Using threshold = 0.2 for optimal results
scmapCluster_results <- scmapCluster(
  projection = qDatX_sce,
  index_list = list(
    yan= metadata(sce_bertie)$scmap_cluster_index
  ),
  threshold = 0.2
)

plot(
  getSankey(
    colData(qDatX_sce)$seurat_clusters, 
    scmapCluster_results$scmap_cluster_labs[,'yan'],
    plot_height = 400
  )
)

qDatX@meta.data$pred_types = extractClassLabel(scmapCluster_results, qDatX_sce)

## 5 
## Forebrain/Midbrain/Hindbrain 
## 1 
## Epiblast 
## 2 
## Forebrain/Midbrain/Hindbrain 
## 4 
## Def. endoderm 
## 3 
## Forebrain/Midbrain/Hindbrain 
## 6 
## Forebrain/Midbrain/Hindbrain 
## 0 
## Rostral neurectoderm

# Plot umap with the annotations
DimPlot(object = qDatX, reduction = "umap", group.by = 'pred_types')

The data is divided into 4 main clusters: Epiblast, Def endoderm, Rostral neurectoderm and Forebrain/Midbrain/Hindbrain

Question 4: Trajectory Inference

GetLineages function input: The starting cluster is given as epiblast and ending cluster is given as Forebrain/Midbrain/Hindbrain

# Convert Spangler data from Seurat object to SingleCellExperiment object  
qDatX_sce=  as.SingleCellExperiment(qDatX)
# Generate lineages for first 3 PCs
lin <- getLineages(reducedDims(qDatX_sce)$PCA[, 1:3], colData(qDatX_sce)[, "pred_types"], start.clus = 'Epiblast', end.clus = 'Forebrain/Midbrain/Hindbrain')
# Print out the lineages generated
lin

## class: SlingshotDataSet 
## 
##  Samples Dimensions
##     3841          3
## 
## lineages: 2 
## Lineage1: Epiblast  Rostral neurectoderm  Forebrain/Midbrain/Hindbrain  
## Lineage2: Epiblast  Rostral neurectoderm  Def. endoderm  
## 
## curves: 0

# Obtain curves from the lineages generated
crv <- getCurves(lin)

# Generate 2D plot of lineage curves
# PC1 vs PC2
rd = reducedDims(qDatX_sce)$PCA[, c(1,2)]
cl = colData(qDatX_sce)[, "seurat_clusters"]
plot(rd, col = brewer.pal(9,'Set1')[cl], asp=1,  pch = 16)
lines(lin, lwd = 3)

# PC1 vs PC3
rd = reducedDims(qDatX_sce)$PCA[, c(1,3)]
cl = colData(qDatX_sce)[, "seurat_clusters"]
plot(rd, col = brewer.pal(9,'Set1')[cl], asp=1,  pch = 16)
lines(lin, lwd = 3)

# Generate 3D plot to visualize all 3 PCs
xdata = as.matrix(qDatX@reductions$pca@cell.embeddings[,1:3])
reducedDim(qDatX_sce) <- xdata
mtData  = qDatX@meta.data$pred_types
names(mtData) = rownames(qDatX@meta.data)
getPalette = colorRampPalette(brewer.pal(7, "Paired"))
plot3d.SlingshotDataSet(crv, lwd=4)
points3d(c(1,2,3), col = getPalette(length(unique(mtData)))[mtData], alpha=.75)

Slingshot defines two distinct lineages for the Spangler dataset:
Lineage1: Epiblast Rostral neurectoderm Forebrain/Midbrain/Hindbrain
Lineage2: Epiblast Rostral neurectoderm Def. endoderm

The clusters taken by each of the paths are:
Epiblast -> Rostral neurectoderm -> Forebrain/Midbrain/Hindbrain
Epiblast -> Rostral neurectoderm -> Def. endoderm

Embryonic Stages of Development: (Based on teh Theiler stages of mouse embryonic development)
Theiler Stage 7 -> Theiler Stage 9 -> Theiler Stage 19
Theiler Stage 7 -> Theiler Stage 9 -> Theiler Stage 7

Question 5:

# Compute psuedotime using slingshot
pst = slingPseudotime(crv)

# Finding differentially expressed TFs and heatmap for 1st path 
pt1<-pst[which(!is.na(pst[,1])),]
t1 = pt1[,1]
expDat = as.matrix(qDatX[["RNA"]]@data)
#Loading mouse TFs
load("mmTFs.rda")
# Gamfit
system.time(gpC1 <-gamFit(expDat, mmTFs, t1))

##    user  system elapsed 
## 147.856  34.549 186.223

# Picking the top 50 differentially expressed genes
yy = gpC1[order(gpC1)][1:50]
gene50_path1 = names(yy)

# Picking out cells attributed to path 1
path1_ind = which(qDatX_sce$pred_types=='Epiblast' | qDatX_sce$pred_types=='Rostral neurectoderm' | qDatX_sce$pred_types =='Forebrain/Midbrain/Hindbrain')
path1_sr = qDatX[,path1_ind]
cell_names_path1 = path1_sr$pred_types

# generating heat map
hm_ti(
  expDat,
  gene50_path1,
  cell_names_path1, ## vector of cellnames -> grp label
  cRow=FALSE,
  cCol=FALSE,
  limits=c(0,10),
  toScale=FALSE,
  fontsize_row=4)

Analysis of TF clusters:
TFs that are uniformly expressed throughout the trajectory:Npm1, Ncl, Ebf1, Uncx and Hmg2
TFs that are highly expressed at first and shut off later:Klf2, Sox 11, Zfp42
TFs that are shut off at first and highly expressed later: Hes5 , Hoxa2, Hmbg2

Discussion on role of select TFs: Zfp42: Expressed mainly in undifferentiated cells and plays a major role in maintaining pluripotency. It has an important role in mesoendoderm potential [1]
Hoxa2: It is a part of cluster A of the homeobox genes. It plays an important role in placing some segments of the hindbrain along the anterior-posterior axis. Thus it is more highly expressed later in the trajectory.[2]
Ebf1: Required for B-cell differentiation, and signalling pathways.

# Finding differentially expressed TFs and heatmap for 2nd path 
pt2<-pst[which(!is.na(pst[,2])),]
t2 = pt1[,2]

# Gamfit
system.time(gpC2 <-gamFit(expDat, mmTFs, t2))

##    user  system elapsed 
##  51.988  16.520  72.535

# Picking the top 50 differentially expressed genes
yy = gpC2[order(gpC2)][1:50]
gene50_path2 = names(yy)

# Picking out cells attributed to path 1
path2_ind = which(qDatX_sce$pred_types=='Epiblast' | qDatX_sce$pred_types=='Rostral neurectoderm' | qDatX_sce$pred_types =='Def. endoderm')
path2_sr = qDatX[,path2_ind]
cell_names_path2 = path2_sr$pred_types

# generating heat map
hm_ti(
  expDat,
  gene50_path2,
  cell_names_path2, ## vector of cellnames -> grp label
  cRow=FALSE,
  cCol=FALSE,
  limits=c(0,10),
  toScale=FALSE,
  fontsize_row=4)

Analysis of TF clusters:
TFs that are uniformly expressed throughout the trajectory:Hmbg1, Hnrnpa2b1, Nr0b1, Klf5
TFs that are highly expressed at first and shut off later:Klf2, Sox 11, Zfp42
TFs that are shut off at first and highly expressed later: Hes1, Zeb2 and Ash21

Discussion on role of select TFs: Zfp42: Expressed mainly in undifferentiated cells and plays a major role in maintaining pluripotency. It has an important role in mesoendoderm potential [1]
Zeb2: Crucial for cell and tissue development and is thus more highly expressed in the later part of the trajectory (endoderm)[3]
Nr0b1: Carries DAX1 protein that is crucial for development of endocrine glands.

[1]Son, Mi‐Young, et al. “Unveiling the critical role of REX1 in the regulation of human stem cell pluripotency.” Stem cells 31.11 (2013): 2374-2387.
[2] NCBI report: HOXA2 homeobox A2 [ Homo sapiens (human) ] [3]Genetics Home Reference: Zeb2 gene

CSCB_Project_5

ArchanaBalan (abalan2)