Nova-seq
- SevenBridges Run -> BD Rhapsodyâ„¢ Targeted Analysis Pipeline run - 06-10-22 08:01:34
Flowchart
Workflow Flowchart.
##########################################################################
####################### Load relevant libraries ##########################
##########################################################################
library(dplyr)
library(tidyr)
library(deMULTIplex)
library(ggplot2)
library(stringr)
library(ShortRead)
library(Seurat)
library(SeuratDisk)
library(SeuratObject)
library(SingleCellExperiment)
library(scater)
library(scran)
library(tibble)Read in SevenBridges datasets
- Read in count matrix from SevenBridges OR Read in an already transposed data-frame. Then format data to construct a Seurat or SCE (Single Cell Experiment) object. Most single-cell algorithms and packages expect matrices to be in genes x cells format (genes as rownames , cells as colnames) and SevenBridges outputs the data tables in cell x genes format.
- Read in Immune Classification file and SMK Calls file from SevenBridges run
##########################################################################
########################## SB data-frames ################################
##########################################################################
## read in files
# counts
df_sb <- read.csv("C:/Users/Cathal/Downloads/Combined_Nova-merged-LMO-filtered-OUT-Abseq-cell-calling-TRUE_RSEC_MolsPerCell.csv")
##1 - transpose in R and construct single-cell object
# transpose existing df
#counts_matrix_transposed_existing <- t(counts_matrix)
##2 - transpose matrix in Excel and read in
# write out
#write.csv(counts_matrix, file = "C:/Users/Cathal/Desktop/counts_mtx_untransposed.csv", row.names = F)
head(df_sb)[1:6,1:5]## Cell_Index CCR7.CCR7.AHS0273.pAbO CD11c.B.LY6.ITGAX.AHS0056.pAbO
## 1 748355 168668 111
## 2 255398 313 134
## 3 156129 5281 3678
## 4 703830 4869 3730
## 5 468645 4530 3421
## 6 240985 231 93
## CD127.IL7R.AHS0028.pAbO CD134.ACT35.TNFRSF4.AHS0013.pAbO
## 1 57 164
## 2 62 168
## 3 3488 7220
## 4 3133 7577
## 5 3525 6553
## 6 115 241
## - 2 read in the already transposed data-frame
#colnames(counts_matrix_new_tt) <- gsub("X", "", colnames(counts_matrix_new_tt))
# rownames(counts_matrix_new_tt) <- counts_matrix_new_tt$Cell_Index
#
# counts_matrix_new_tt$Cell_Index <- NULL
#
# counts_matrix_sc <- as.matrix(counts_matrix_new_tt)
####
## Nova-seq
## Immune class
df_Immune_class_Nova <- read.csv(file = "C:/Users/Cathal/Desktop/Nova_seq/Nova-cell_type_90k_experimental.csv")
## Calls file
df_smk_calls_Nova <- read.csv(file = "C:/Users/Cathal/Desktop/Nova_seq/Nova-90k_Sample_Tag_Calls.csv")
## merge them together
df_sev_bri_meta <- full_join(df_smk_calls_Nova, df_Immune_class_Nova, by = "Cell_Index")
head(df_sev_bri_meta)## Cell_Index Sample_Tag Sample_Name Cell_Type_Experimental
## 1 748355 SampleTag04_hs P3 Natural_killer
## 2 255398 Undetermined Undetermined T_CD4_memory
## 3 156129 SampleTag02_hs P1 T_CD4_naive
## 4 703830 Multiplet Multiplet B
## 5 468645 SampleTag04_hs P3 B
## 6 240985 Undetermined Undetermined T_CD8_memory
# counts_mat <- read.csv(file = "C:/Users/Cathal/Desktop/Nova_seq/sev_bri_90k_tt_onlin.csv")
#
# rownames(counts_mat) <- counts_mat$Cell_Index
#
# counts_mat$Cell_Index <- NULL
#
# counts_mat <- as.matrix(counts_mat)deMULTIplex workflow - Nova-seq Parsed FASTQs
# 1. Read the full FASTQs into R using ShortRead (command:
## R1
r1 <- readFastq("C:/Users/Cathal/Downloads/Nova_merged_R1_filtered.fastq.gz") #)
#and converted to dataframe containing the first 76 bases (command:
readTable_r1 <- as.data.frame(subseq(sread(r1), 1, 60))
## R2
r2 <- readFastq("C:/Users/Cathal/Downloads/Nova_merged_R1_filtered.fastq.gz") #)
#and converted to dataframe containing the first 76 bases (command:
readTable_r2 <- as.data.frame(subseq(sread(r2), 1, 42))## R1
###
#2. Parsed the R1 dataframe to include only 300K total reads to minimize compute and split into CellID and UMI sections. Command:
# readTable_r1_2 <- as.data.frame(matrix(0L, nrow=1e3, ncol=2))
# colnames(readTable_r1_2) <- c("cell","umi")
#
# #readTable_r1_2 <- as.character(readTable_r1_2)
#
# for (i in 1:1e3) {
# temp <- unlist(strsplit(readTable_r1$x[i], split=""))
# readTable_r1_2$cell[i] <- paste(temp[c(1:9,22:30,44:52)], collapse='')
# readTable_r1_2$umi[i] <- paste(temp[c(53:60)], collapse='')
# }
#
# head(readTable_r1_2)# 3. Identified the most abundant cell barcodes (total reads >= 10) and parsed both the R1_2 and R2 dataframes using this index. Command:
# temp <- table(readTable_r1_2$cell)
# temp <- names(temp)[which(temp >= 10)] ## 4640 cells
# ind <- which(readTable_r1_2$cell %in% temp)
#
# readTable_r1_3 <- readTable_r1_2[ind, ] ## dim = 62040 x 2
# readTable_r2_2_t <- readTable_r2$x[ind] ## dim = 62040 x 1
#
#
#
# ## R2
# temp <- table(readTable_r2_2$cell)
# temp <- names(temp)[which(temp >= 10)] ## 4640 cells
# ind <- which(readTable_r2_2$cell %in% temp)
#
# readTable_r2_3 <- readTable_r2_2[ind, ] ## dim = 62040 x 2
# readTable_r2_2 <- readTable_r2$x[ind] ## dim = 62040 x 1# 4. Identified reads in R2_2 dataframe including either MULTI-seq Bar1 or Bar2 sequences, combined indexed R1_3 and R2_2 into a final readTable object. Command:
# ind <- c(grep("GGAGAAGA", readTable_r2_2_t),grep("CCACAATG", readTable_r2_2_t))
# readTable_final <- as.data.frame(matrix(0L, nrow=length(ind), ncol=3)) ## dim = 3409 x 3
# colnames(readTable_final) <- c("Cell","UMI","Sample")
# readTable_final[,c("Cell","UMI")] <- readTable_r1_3[ind, ]
# temp <- readTable_r2_2_t[ind]
# readTable_final[grep("GGAGAAGA", temp),'Sample'] <- "GGAGAAGA"
# readTable_final[grep("CCACAATG", temp),'Sample'] <- "CCACAATG"# 5. Ran MULTIseq.Align on this object and visualized the results. Command:
#barTable_final <- MULTIseq.align(readTable_final, cellIDs=unique(readTable_final$Cell), ref = unique(readTable_final$Sample))
barTable_all <- read.csv("C:/Users/Cathal/Downloads/bar_table_full.csv")
barTable_all <- select(barTable_all,-contains("nUMI"))
barTable_all <- select(barTable_all,-contains("label"))
head(barTable_all)## Bar1 Bar2 Cell_Index
## 1 99 1266 816010
## 2 79 706 514137
## 3 322 15551 74134
## 4 295 10550 846872
## 5 22 813 679912
## 6 3 2 223365
p <- ggplot(barTable_all, aes(x=Bar1, y=Bar2)) + geom_point(size=1) + theme_classic()
pConstruct Seurat / SCE object
##########################################################################
######################## Make SC obj #####################################
##########################################################################
# SCE
#sce <- SingleCellExperiment(assays = list(counts = counts_mat))
seurat <- LoadH5Seurat("C:/Users/Cathal/Desktop/Nova_seq/seurat_object.h5seurat")
# Seurat
#seurat <- CreateSeuratObject(counts = counts_mat)
###
## View
#sce
seurat## An object of class Seurat
## 427 features across 90522 samples within 1 assay
## Active assay: RNA (427 features, 0 variable features)
## 3 dimensional reductions calculated: pca, tsne, umap
#head(rownames(sce))
head(rownames(seurat))## [1] "CCR7-CCR7-AHS0273-pAbO" "CD11c:B-LY6-ITGAX-AHS0056-pAbO"
## [3] "CD127-IL7R-AHS0028-pAbO" "CD134:ACT35-TNFRSF4-AHS0013-pAbO"
## [5] "CD137-TNFRSF9-AHS0003-pAbO" "CD14:MPHIP9-CD14-AHS0037-pAbO"
Metadata to Seurat object
# join sev bridges data with LMO counts matrix
df_meta_left <- left_join(df_sev_bri_meta, barTable_all, by="Cell_Index")
## Add metadata to seurat object
# seurat <- AddMetaData(object = seurat, metadata = df_meta_left)
#
# seurat@meta.data$Cell_Type_Experimental <- as.character(df_meta_left$Cell_Type_Experimental)
# seurat@meta.data$Sample_Name <- as.character(df_meta_left$Sample_Name)
# seurat@meta.data$Sample_Tag <- as.character(df_meta_left$Sample_Tag)
# seurat@meta.data$Cell_Index <- as.character(df_meta_left$Cell_Index)
# seurat@meta.data$Bar1 <- as.character(df_meta_left$Bar1)
# seurat@meta.data$Bar2 <- as.character(df_meta_left$Bar2)Run PCA and TSNE / UMAP and add cluster labels to metadata slot
##########################################################################
####################### Seurat Clustering ################################
##########################################################################
########### Set features as Abseqs for Clustering
Abseqs <- rownames(x = seurat[1:30])
Abseqs## [1] "CCR7-CCR7-AHS0273-pAbO" "CD11c:B-LY6-ITGAX-AHS0056-pAbO"
## [3] "CD127-IL7R-AHS0028-pAbO" "CD134:ACT35-TNFRSF4-AHS0013-pAbO"
## [5] "CD137-TNFRSF9-AHS0003-pAbO" "CD14:MPHIP9-CD14-AHS0037-pAbO"
## [7] "CD161:HP-3G10-KLRB1-AHS0205-pAbO" "CD16:3G8-FCGR3A-AHS0053-pAbO"
## [9] "CD183-CXCR3-AHS0031-pAbO" "CD196-CCR6-AHS0034-pAbO"
## [11] "CD19:SJ25C1-CD19-AHS0030-pAbO" "CD25:2A3-IL2RA-AHS0026-pAbO"
## [13] "CD272-BTLA-AHS0052-pAbO" "CD278-ICOS-AHS0012-pAbO"
## [15] "CD279:EH12-1-PDCD1-AHS0014-pAbO" "CD27:M-T271-CD27-AHS0025-pAbO"
## [17] "CD28:L293-CD28-AHS0138-pAbO" "CD3:UCHT1-CD3E-AHS0231-pAbO"
## [19] "CD45RA:HI100-PTPRC-AHS0009-pAbO" "CD4:SK3-CD4-AHS0032-pAbO"
## [21] "CD56:NCAM16.2-NCAM1-AHS0019-pAbO" "CD62L:DREG-56-SELL-AHS0049-pAbO"
## [23] "CD8:SK1-CD8A-AHS0228-pAbO" "CXCR5-CXCR5-AHS0039-pAbO"
## [25] "CXCR6-CXCR6-AHS0148-pAbO" "GITR-TNFRSF18-AHS0104-pAbO"
## [27] "HLA-DR-CD74-AHS0035-pAbO" "IgD-IGHD-AHS0058-pAbO"
## [29] "IgM-IGHM-AHS0198-pAbO" "Tim3-HAVCR2-AHS0016-pAbO"
#all_genes <- rownames(seurat)
# seurat <- ScaleData(seurat)
#
# seurat <- RunPCA(seurat, npcs = 8, features = Abseqs)
#
# seurat <- FindNeighbors(seurat, dims = 1:8)
#
# seurat <- RunTSNE(seurat, features = Abseqs, check_duplicates = FALSE)
#
# seurat <- RunUMAP(seurat, dims = 1:8)
#
# seurat <- FindClusters(seurat)
## examine clustering results
print(seurat[["pca"]], dims = 1:5, nfeatures = 5)## PC_ 1
## Positive: CD183-CXCR3-AHS0031-pAbO, CXCR6-CXCR6-AHS0148-pAbO, CD279:EH12-1-PDCD1-AHS0014-pAbO, CD134:ACT35-TNFRSF4-AHS0013-pAbO, CD137-TNFRSF9-AHS0003-pAbO
## Negative: CD45RA:HI100-PTPRC-AHS0009-pAbO, HLA-DR-CD74-AHS0035-pAbO, CXCR5-CXCR5-AHS0039-pAbO, CD19:SJ25C1-CD19-AHS0030-pAbO, CD4:SK3-CD4-AHS0032-pAbO
## PC_ 2
## Positive: CD27:M-T271-CD27-AHS0025-pAbO, CD3:UCHT1-CD3E-AHS0231-pAbO, CD8:SK1-CD8A-AHS0228-pAbO, CD134:ACT35-TNFRSF4-AHS0013-pAbO, CD137-TNFRSF9-AHS0003-pAbO
## Negative: CD196-CCR6-AHS0034-pAbO, Tim3-HAVCR2-AHS0016-pAbO, IgM-IGHM-AHS0198-pAbO, CD127-IL7R-AHS0028-pAbO, CXCR6-CXCR6-AHS0148-pAbO
## PC_ 3
## Positive: CD3:UCHT1-CD3E-AHS0231-pAbO, CD4:SK3-CD4-AHS0032-pAbO, CD62L:DREG-56-SELL-AHS0049-pAbO, CD27:M-T271-CD27-AHS0025-pAbO, CD278-ICOS-AHS0012-pAbO
## Negative: HLA-DR-CD74-AHS0035-pAbO, CD19:SJ25C1-CD19-AHS0030-pAbO, CD11c:B-LY6-ITGAX-AHS0056-pAbO, CXCR5-CXCR5-AHS0039-pAbO, CD137-TNFRSF9-AHS0003-pAbO
## PC_ 4
## Positive: CD11c:B-LY6-ITGAX-AHS0056-pAbO, IgD-IGHD-AHS0058-pAbO, CD14:MPHIP9-CD14-AHS0037-pAbO, CD16:3G8-FCGR3A-AHS0053-pAbO, CD56:NCAM16.2-NCAM1-AHS0019-pAbO
## Negative: CD19:SJ25C1-CD19-AHS0030-pAbO, CXCR5-CXCR5-AHS0039-pAbO, CD45RA:HI100-PTPRC-AHS0009-pAbO, CD62L:DREG-56-SELL-AHS0049-pAbO, CD27:M-T271-CD27-AHS0025-pAbO
## PC_ 5
## Positive: CD25:2A3-IL2RA-AHS0026-pAbO, CD11c:B-LY6-ITGAX-AHS0056-pAbO, HLA-DR-CD74-AHS0035-pAbO, CD279:EH12-1-PDCD1-AHS0014-pAbO, IgM-IGHM-AHS0198-pAbO
## Negative: CD56:NCAM16.2-NCAM1-AHS0019-pAbO, GITR-TNFRSF18-AHS0104-pAbO, CD45RA:HI100-PTPRC-AHS0009-pAbO, CXCR5-CXCR5-AHS0039-pAbO, IgD-IGHD-AHS0058-pAbO
VizDimLoadings(seurat, dims = 1, reduction = "pca")VizDimLoadings(seurat, dims = 2, reduction = "pca")##########################################################################
####################### SCE Clustering ###################################
##########################################################################
## convert Seurat to SCE
# This function can produce some strange behavior so be cautious using it
#sce_conv <- as.SingleCellExperiment(seurat)
## Perform calculations on SCE object
## As per vignette found here -- http://bioconductor.org/books/3.13/OSCA.intro/analysis-overview.html#quick-start-simple
############1
## Feature selection for clustering
# Abseqs
# Abseqs_sce <- rownames(x = sce_conv[1:30])
# Abseqs_sce
#
# ## all rows
# #all_genes <- rownames(seurat_sce)
#
#
# ############2
# ## Run PCA
# # subset_row is an important parameter here as it chooses what to cluster on.
# set.seed(1234)
# sce_conv <- runPCA(sce_conv, ncomponents = 8, subset_row=Abseqs_sce)
#
#
# #3
# # Clustering.
# library(bluster) # https://bioconductor.org/packages/devel/bioc/vignettes/bluster/inst/doc/clusterRows.html#introduction
# colLabels(sce_conv) <- clusterCells(sce_conv, use.dimred='PCA',
# BLUSPARAM=NNGraphParam(cluster.fun="louvain"))
#
#
# # 4
# # tSNE / UMAP
# sce_conv <- runTSNE(sce_conv, dimred = 'PCA')
# #sce <- runUMAP(sce, dimred = 'PCA')tSNE & UMAP. Immune classification file from SevenBridges labelling clusters.
# Use SevenBridges Immune Classification file as clusters
seurat <- SetIdent(seurat, value = 'Cell_Type_Experimental')
DimPlot(seurat, reduction = 'tsne')DimPlot(seurat, reduction = 'umap')Number of cells per cell type
table(seurat@meta.data$Cell_Type_Experimental)##
## B Dendritic Monocyte_classical
## 10054 143 2072
## Monocyte_nonclassical Natural_killer T_CD4_memory
## 651 10681 21909
## T_CD4_naive T_CD8_memory T_CD8_naive
## 15784 22059 6884
## T_gamma_delta
## 285
tSNE - Sub-setting based on SMK’s and LMO’s.
##########################################################################
####################### Subset SMK & LMO's and plot ######################
##########################################################################
##### Note: Use Seurat object for this part
## Sub-setting based on SMK and LMO and generating plots
## Method 1
#LMO_1_cells_test <- WhichCells(seurat, idents = Cell_Type_Experimental == 'B' & Sample_Name == 'P1')
## Mean value in Bar1 column = 13
## Mean value in Bar2 column = 16
## P1
#cond 1
LMO_1_cells_P1 <- WhichCells(seurat, expression = Bar1 >= 13 & Sample_Name == 'P1')
# cond 2
LMO_2_cells_P1 <- WhichCells(seurat, expression = Bar2 >= 16 & Sample_Name == 'P1')
DimPlot(seurat, reduction = "tsne",
cells.highlight = list(LMO_1_cells_P1, LMO_2_cells_P1),
cols.highlight = c("darkblue", "darkorange"), cols = "grey", sizes.highlight = 0.9) +
ggtitle("SMK 2 - P1") + theme_bw() + theme(legend.position = 'top')#
#
# ### P2
# cond 1
LMO_1_cells_P2 <- WhichCells(seurat, expression = Bar1 >= 13 & Sample_Name == 'P2')
# cond 2
LMO_2_cells_P2 <- WhichCells(seurat, expression = Bar2 >= 16 & Sample_Name == 'P2')
DimPlot(seurat, reduction = "tsne",
cells.highlight = list(LMO_1_cells_P2, LMO_2_cells_P2),
cols.highlight = c("darkblue", "darkorange"), cols = "grey", sizes.highlight = 0.9) +
ggtitle("SMK 3 - P2") + theme_bw() + theme(legend.position = 'top')#
#
#
#
# ### P3
# # cond 1
LMO_1_cells_P3 <- WhichCells(seurat, expression = Bar1 >= 13 & Sample_Name == 'P3')
# cond 2
LMO_2_cells_P3 <- WhichCells(seurat, expression = Bar2 >= 16 & Sample_Name == 'P3')
DimPlot(seurat, reduction = "tsne",
cells.highlight = list(LMO_1_cells_P3, LMO_2_cells_P3),
cols.highlight = c("darkblue", "darkorange"), cols = "grey", sizes.highlight = 0.9) +
ggtitle("SMK 4 - P3") + theme_bw() + theme(legend.position = 'top')#
#
#
#
### P4
# cond 1
LMO_1_cells_P4 <- WhichCells(seurat, expression = Bar1 >= 13 & Sample_Name == 'P4')
# cond 2
LMO_2_cells_P4 <- WhichCells(seurat, expression = Bar2 >= 16 & Sample_Name == 'P4')
DimPlot(seurat, reduction = "tsne",
cells.highlight = list(LMO_1_cells_P4, LMO_2_cells_P4),
cols.highlight = c("darkblue", "darkorange"), cols = "grey", sizes.highlight = 0.9) +
ggtitle("SMK 5 - P4") + theme_bw() + theme(legend.position = 'top')#
#
# ### P5
# # cond 1
LMO_1_cells_P5 <- WhichCells(seurat, expression = Bar1 >= 13 & Sample_Name == 'P5')
# cond 2
LMO_2_cells_P5 <- WhichCells(seurat, expression = Bar2 >= 16 & Sample_Name == 'P5')
DimPlot(seurat, reduction = "tsne",
cells.highlight = list(LMO_1_cells_P5, LMO_2_cells_P5),
cols.highlight = c("darkblue", "darkorange"), cols = "grey", sizes.highlight = 0.9) +
ggtitle("SMK 6 - P5") + theme_bw() + theme(legend.position = 'top')######################################################################################################
######################################################################################################Dotplots
##########################################################################
############################ Dotplots ####################################
##########################################################################
########## RNA Dotplot
rna_list <- c('CD3E',
'CD4',
'CD8A',
'MS4A1',
'CD14',
'FCGR3A',
'ITGAM',
'CD5',
'FCER2',
'IL2RA',
'CD69',
'CD22',
'CD79B')
DotPlot(seurat, features = rna_list, group.by = 'Cell_Type_Experimental') + RotatedAxis()######## Abseq dotplot
# Use same Idents from above on y-axis
DotPlot(seurat, features = Abseqs, group.by = 'Cell_Type_Experimental') + RotatedAxis()