#load in the libraries

library(Seurat)

## Attaching SeuratObject

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the NSCLC data set

nsclc.sparse.m <- Read10X_h5(filename = '20k_NSCLC_DTC_3p_nextgem_Multiplex_count_raw_feature_bc_matrix.h5' )

## Genome matrix has multiple modalities, returning a list of matrices for this genome

Getting gene expression from matrix for analysis

cts <- nsclc.sparse.m$`Gene Expression`

Create the Seurat object with raw data

nsclc.seurat.obj <- CreateSeuratObject(counts = cts, project = 'NSCLC', min.cells = 3, min.features = 200)

looking at Seurat object

str(nsclc.seurat.obj)

## Formal class 'Seurat' [package "SeuratObject"] with 13 slots
##   ..@ assays      :List of 1
##   .. ..$ RNA:Formal class 'Assay' [package "SeuratObject"] with 8 slots
##   .. .. .. ..@ counts       :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. .. .. .. ..@ i       : int [1:63710619] 59 64 76 77 89 98 142 204 270 312 ...
##   .. .. .. .. .. ..@ p       : int [1:42082] 0 789 1040 1401 1649 1950 8346 12231 12456 12884 ...
##   .. .. .. .. .. ..@ Dim     : int [1:2] 29552 42081
##   .. .. .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. .. .. ..$ : chr [1:29552] "AL627309.1" "AL627309.3" "AL627309.5" "AL627309.4" ...
##   .. .. .. .. .. .. ..$ : chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
##   .. .. .. .. .. ..@ x       : num [1:63710619] 1 2 1 2 1 1 1 1 1 1 ...
##   .. .. .. .. .. ..@ factors : list()
##   .. .. .. ..@ data         :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. .. .. .. ..@ i       : int [1:63710619] 59 64 76 77 89 98 142 204 270 312 ...
##   .. .. .. .. .. ..@ p       : int [1:42082] 0 789 1040 1401 1649 1950 8346 12231 12456 12884 ...
##   .. .. .. .. .. ..@ Dim     : int [1:2] 29552 42081
##   .. .. .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. .. .. ..$ : chr [1:29552] "AL627309.1" "AL627309.3" "AL627309.5" "AL627309.4" ...
##   .. .. .. .. .. .. ..$ : chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
##   .. .. .. .. .. ..@ x       : num [1:63710619] 1 2 1 2 1 1 1 1 1 1 ...
##   .. .. .. .. .. ..@ factors : list()
##   .. .. .. ..@ scale.data   : num[0 , 0 ] 
##   .. .. .. ..@ key          : chr "rna_"
##   .. .. .. ..@ assay.orig   : NULL
##   .. .. .. ..@ var.features : logi(0) 
##   .. .. .. ..@ meta.features:'data.frame':   29552 obs. of  0 variables
##   .. .. .. ..@ misc         : list()
##   ..@ meta.data   :'data.frame': 42081 obs. of  3 variables:
##   .. ..$ orig.ident  : Factor w/ 1 level "NSCLC": 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ nCount_RNA  : num [1:42081] 1333 295 470 296 364 ...
##   .. ..$ nFeature_RNA: int [1:42081] 789 251 361 248 301 6396 3885 225 428 215 ...
##   ..@ active.assay: chr "RNA"
##   ..@ active.ident: Factor w/ 1 level "NSCLC": 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "names")= chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
##   ..@ graphs      : list()
##   ..@ neighbors   : list()
##   ..@ reductions  : list()
##   ..@ images      : list()
##   ..@ project.name: chr "NSCLC"
##   ..@ misc        : list()
##   ..@ version     :Classes 'package_version', 'numeric_version'  hidden list of 1
##   .. ..$ : int [1:3] 4 1 3
##   ..@ commands    : list()
##   ..@ tools       : list()

Quality Control

% MT reads

#calculating the percentage of mitochondrial genes
#so all genes starting with mt 
#save it to another object within the metadata
nsclc.seurat.obj[["percent.mt"]] <- PercentageFeatureSet(nsclc.seurat.obj, pattern = "^MT-")

Visualzing the features

VlnPlot(nsclc.seurat.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

FeatureScatter(nsclc.seurat.obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_smooth(method = 'lm')

## `geom_smooth()` using formula = 'y ~ x'

# Filtering data

nsclc.seurat.obj <- subset(nsclc.seurat.obj, subset =  nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

Normalizing Data

nsclc.seurat.obj <- NormalizeData(nsclc.seurat.obj)

Identify Highly variable features

nsclc.seurat.obj <- FindVariableFeatures(nsclc.seurat.obj, selection.method = "vst", nfeatures = 2000)

Identify the 10 most highly variable features

top10 <- head(VariableFeatures(nsclc.seurat.obj), 10)

Plot variable features with and without labels

plot <- VariableFeaturePlot(nsclc.seurat.obj)
LabelPoints(plot = plot, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

## Warning: Transformation introduced infinite values in continuous x-axis

Scaling

all.genes <- rownames(nsclc.seurat.obj)
nsclc.seurat.obj <- ScaleData(nsclc.seurat.obj, features = all.genes)

## Centering and scaling data matrix

Perform Linear Dimensionality of the Data

nsclc.seurat.obj <- RunPCA(nsclc.seurat.obj, features = VariableFeatures(object = nsclc.seurat.obj))

## PC_ 1 
## Positive:  MTRNR2L12, CD69, CD7, CREM, IL32, LTB, CCL5, TRAC, BIRC3, TRBC2 
##     IL7R, S100A4, TXNIP, DUSP2, CLEC2B, TUBA4A, NR3C1, ITM2A, RORA, ZNF331 
##     CST7, DUSP4, TRBC1, NKG7, BCAS2, PIK3R1, GZMA, RGCC, KLRB1, PMAIP1 
## Negative:  FTL, IGKC, SPP1, IGHG3, APOE, CXCL8, IGHA1, IGLC2, COL3A1, COL1A2 
##     KRT19, CXCL1, SFTPB, SFTPC, CLDN4, APOC1, LYZ, WFDC2, COL1A1, ELF3 
##     SCGB3A1, KRT18, LUM, MDK, IGHG4, TCIM, CCL18, EPCAM, CXCL2, SCGB3A2 
## PC_ 2 
## Positive:  FCN1, AIF1, CD68, AC020656.1, S100A8, S100A9, TYROBP, FCER1G, C5AR1, IFI30 
##     SMIM25, CYBB, CD14, TREM1, PLAUR, CST3, CTSS, FTL, NCF2, FCGR2A 
##     EREG, VCAN, CLEC4E, PHACTR1, PLEK, SPI1, OLR1, LST1, IL1RN, FPR1 
## Negative:  IL32, TRAC, CD7, TRBC2, CCL5, TRBC1, ITM2A, RORA, GZMA, KLRB1 
##     IL7R, TIGIT, ICOS, NKG7, KLRK1, TRAT1, CST7, TUBA4A, GPR171, IFNG 
##     CTSW, SH2D1A, CD8A, GZMH, GZMK, CD69, GIMAP7, BATF, KLRD1, CD8B 
## PC_ 3 
## Positive:  MS4A1, BANK1, CD79A, VPREB3, TNFRSF13C, MEF2C, LY9, TNFRSF13B, SMIM14, RALGPS2 
##     LINC01781, CD79B, LINC01857, CD40, IRF8, HLA-DQA1, LINC00926, POU2F2, SPIB, COBLL1 
##     HLA-DQB1, FCRL5, LTB, FAM30A, CXCR5, CD70, TCF4, FCRL2, CD83, LINC02397 
## Negative:  NKG7, CCL5, CST7, GZMA, IL32, CD7, GZMH, KLRK1, KLRD1, CTSW 
##     ANXA1, CD8A, CLEC2B, IFNG, PRF1, TRGC2, LINC02446, TRBC1, CD8B, GNLY 
##     ID2, IL7R, S100A4, GZMB, XCL2, LINC01871, S100A11, GZMK, MT2A, GIMAP7 
## PC_ 4 
## Positive:  NKG7, KLRD1, KLRK1, CCL5, GZMH, CTSW, GZMB, LINC02446, CD8A, GZMA 
##     TRGC2, GNLY, PRF1, CD8B, XCL2, KLRC3, KLRC2, ZNF683, CST7, KLRC4 
##     XCL1, TRDC, IFNG, HOPX, CCL4, CRTAM, KLRC1, LINC01871, TRGC1, ITGA1 
## Negative:  TNFRSF4, CTLA4, BATF, ICA1, MAF, MAGEH1, ICOS, CXCL13, TNFRSF18, TSHZ2 
##     MAL, TBC1D4, TIGIT, LINC01943, IL7R, FOXP3, NR3C1, CD200, RORA, GK 
##     PASK, THADA, LTB, AC004585.1, CD27, SESN3, IL2RA, JMY, NMB, LAIR2 
## PC_ 5 
## Positive:  MS4A1, BANK1, VPREB3, TXNIP, TNFRSF13C, FCN1, LINC01781, CD69, TNFRSF13B, CCL5 
##     S100A8, LINC01857, CD70, C5AR1, ATP2B1, SCIMP, GZMK, SMIM25, AC020656.1, CD79A 
##     LINC00926, TRBC2, KLF2, IFI30, SMIM14, TREM1, AIM2, CD79B, GZMA, CST7 
## Negative:  DERL3, LILRA4, MZB1, SCT, JCHAIN, SMPD3, PTCRA, IL3RA, PLD4, EGLN3 
##     CLIC3, LRRC26, SHD, CLEC4C, SERPINF1, GZMB, ITM2C, LINC00996, IRF4, RRBP1 
##     TCF4, PTGDS, IRF7, LAMP5, GAS6, SLC32A1, SLC15A4, PHEX, DNASE1L3, ZFAT

visualize PCA results

print(nsclc.seurat.obj[["pca"]], dims = 1:5, nfeatures = 5)

## PC_ 1 
## Positive:  MTRNR2L12, CD69, CD7, CREM, IL32 
## Negative:  FTL, IGKC, SPP1, IGHG3, APOE 
## PC_ 2 
## Positive:  FCN1, AIF1, CD68, AC020656.1, S100A8 
## Negative:  IL32, TRAC, CD7, TRBC2, CCL5 
## PC_ 3 
## Positive:  MS4A1, BANK1, CD79A, VPREB3, TNFRSF13C 
## Negative:  NKG7, CCL5, CST7, GZMA, IL32 
## PC_ 4 
## Positive:  NKG7, KLRD1, KLRK1, CCL5, GZMH 
## Negative:  TNFRSF4, CTLA4, BATF, ICA1, MAF 
## PC_ 5 
## Positive:  MS4A1, BANK1, VPREB3, TXNIP, TNFRSF13C 
## Negative:  DERL3, LILRA4, MZB1, SCT, JCHAIN

DimHeatmap(nsclc.seurat.obj, dims = 1, cells = 500, balanced = TRUE)

# determine dimensionality of the data

ElbowPlot(nsclc.seurat.obj)

Clustering

nsclc.seurat.obj <- FindNeighbors(nsclc.seurat.obj, dims = 1:15)

## Computing nearest neighbor graph

## Computing SNN

Understanding resolution

nsclc.seurat.obj <- FindClusters(nsclc.seurat.obj, resolution = c(0.3, 0.5, 0.7, 1))

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24708
## Number of edges: 613303
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9058
## Number of communities: 12
## Elapsed time: 9 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24708
## Number of edges: 613303
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8552
## Number of communities: 13
## Elapsed time: 9 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24708
## Number of edges: 613303
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8220
## Number of communities: 15
## Elapsed time: 11 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24708
## Number of edges: 613303
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7860
## Number of communities: 20
## Elapsed time: 12 seconds

View(nsclc.seurat.obj@meta.data)

DimPlot(nsclc.seurat.obj, group.by = "RNA_snn_res.0.5", label = TRUE)

Analyzing Single Cell-RNA Sequence Data

Arhat Pradhan

2023-05-25

Load the NSCLC data set

Getting gene expression from matrix for analysis

Create the Seurat object with raw data

looking at Seurat object

Quality Control

% MT reads

Visualzing the features

Normalizing Data

Identify Highly variable features

Identify the 10 most highly variable features

Plot variable features with and without labels

Scaling

Perform Linear Dimensionality of the Data

visualize PCA results

Clustering

Understanding resolution