#load in the libraries
library(Seurat)
## Attaching SeuratObject
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
nsclc.sparse.m <- Read10X_h5(filename = '20k_NSCLC_DTC_3p_nextgem_Multiplex_count_raw_feature_bc_matrix.h5' )
## Genome matrix has multiple modalities, returning a list of matrices for this genome
cts <- nsclc.sparse.m$`Gene Expression`
nsclc.seurat.obj <- CreateSeuratObject(counts = cts, project = 'NSCLC', min.cells = 3, min.features = 200)
str(nsclc.seurat.obj)
## Formal class 'Seurat' [package "SeuratObject"] with 13 slots
## ..@ assays :List of 1
## .. ..$ RNA:Formal class 'Assay' [package "SeuratObject"] with 8 slots
## .. .. .. ..@ counts :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. .. .. .. .. ..@ i : int [1:63710619] 59 64 76 77 89 98 142 204 270 312 ...
## .. .. .. .. .. ..@ p : int [1:42082] 0 789 1040 1401 1649 1950 8346 12231 12456 12884 ...
## .. .. .. .. .. ..@ Dim : int [1:2] 29552 42081
## .. .. .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. .. .. ..$ : chr [1:29552] "AL627309.1" "AL627309.3" "AL627309.5" "AL627309.4" ...
## .. .. .. .. .. .. ..$ : chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
## .. .. .. .. .. ..@ x : num [1:63710619] 1 2 1 2 1 1 1 1 1 1 ...
## .. .. .. .. .. ..@ factors : list()
## .. .. .. ..@ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. .. .. .. .. ..@ i : int [1:63710619] 59 64 76 77 89 98 142 204 270 312 ...
## .. .. .. .. .. ..@ p : int [1:42082] 0 789 1040 1401 1649 1950 8346 12231 12456 12884 ...
## .. .. .. .. .. ..@ Dim : int [1:2] 29552 42081
## .. .. .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. .. .. ..$ : chr [1:29552] "AL627309.1" "AL627309.3" "AL627309.5" "AL627309.4" ...
## .. .. .. .. .. .. ..$ : chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
## .. .. .. .. .. ..@ x : num [1:63710619] 1 2 1 2 1 1 1 1 1 1 ...
## .. .. .. .. .. ..@ factors : list()
## .. .. .. ..@ scale.data : num[0 , 0 ]
## .. .. .. ..@ key : chr "rna_"
## .. .. .. ..@ assay.orig : NULL
## .. .. .. ..@ var.features : logi(0)
## .. .. .. ..@ meta.features:'data.frame': 29552 obs. of 0 variables
## .. .. .. ..@ misc : list()
## ..@ meta.data :'data.frame': 42081 obs. of 3 variables:
## .. ..$ orig.ident : Factor w/ 1 level "NSCLC": 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ nCount_RNA : num [1:42081] 1333 295 470 296 364 ...
## .. ..$ nFeature_RNA: int [1:42081] 789 251 361 248 301 6396 3885 225 428 215 ...
## ..@ active.assay: chr "RNA"
## ..@ active.ident: Factor w/ 1 level "NSCLC": 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "names")= chr [1:42081] "AAACCCAAGAATTTGG-1" "AAACCCAAGAGATTCA-1" "AAACCCAAGATTGACA-1" "AAACCCAAGCAAATCA-1" ...
## ..@ graphs : list()
## ..@ neighbors : list()
## ..@ reductions : list()
## ..@ images : list()
## ..@ project.name: chr "NSCLC"
## ..@ misc : list()
## ..@ version :Classes 'package_version', 'numeric_version' hidden list of 1
## .. ..$ : int [1:3] 4 1 3
## ..@ commands : list()
## ..@ tools : list()
#calculating the percentage of mitochondrial genes
#so all genes starting with mt
#save it to another object within the metadata
nsclc.seurat.obj[["percent.mt"]] <- PercentageFeatureSet(nsclc.seurat.obj, pattern = "^MT-")
VlnPlot(nsclc.seurat.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
FeatureScatter(nsclc.seurat.obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'
# Filtering data
nsclc.seurat.obj <- subset(nsclc.seurat.obj, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
nsclc.seurat.obj <- NormalizeData(nsclc.seurat.obj)
nsclc.seurat.obj <- FindVariableFeatures(nsclc.seurat.obj, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(nsclc.seurat.obj), 10)
plot <- VariableFeaturePlot(nsclc.seurat.obj)
LabelPoints(plot = plot, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
## Warning: Transformation introduced infinite values in continuous x-axis
all.genes <- rownames(nsclc.seurat.obj)
nsclc.seurat.obj <- ScaleData(nsclc.seurat.obj, features = all.genes)
## Centering and scaling data matrix
nsclc.seurat.obj <- RunPCA(nsclc.seurat.obj, features = VariableFeatures(object = nsclc.seurat.obj))
## PC_ 1
## Positive: MTRNR2L12, CD69, CD7, CREM, IL32, LTB, CCL5, TRAC, BIRC3, TRBC2
## IL7R, S100A4, TXNIP, DUSP2, CLEC2B, TUBA4A, NR3C1, ITM2A, RORA, ZNF331
## CST7, DUSP4, TRBC1, NKG7, BCAS2, PIK3R1, GZMA, RGCC, KLRB1, PMAIP1
## Negative: FTL, IGKC, SPP1, IGHG3, APOE, CXCL8, IGHA1, IGLC2, COL3A1, COL1A2
## KRT19, CXCL1, SFTPB, SFTPC, CLDN4, APOC1, LYZ, WFDC2, COL1A1, ELF3
## SCGB3A1, KRT18, LUM, MDK, IGHG4, TCIM, CCL18, EPCAM, CXCL2, SCGB3A2
## PC_ 2
## Positive: FCN1, AIF1, CD68, AC020656.1, S100A8, S100A9, TYROBP, FCER1G, C5AR1, IFI30
## SMIM25, CYBB, CD14, TREM1, PLAUR, CST3, CTSS, FTL, NCF2, FCGR2A
## EREG, VCAN, CLEC4E, PHACTR1, PLEK, SPI1, OLR1, LST1, IL1RN, FPR1
## Negative: IL32, TRAC, CD7, TRBC2, CCL5, TRBC1, ITM2A, RORA, GZMA, KLRB1
## IL7R, TIGIT, ICOS, NKG7, KLRK1, TRAT1, CST7, TUBA4A, GPR171, IFNG
## CTSW, SH2D1A, CD8A, GZMH, GZMK, CD69, GIMAP7, BATF, KLRD1, CD8B
## PC_ 3
## Positive: MS4A1, BANK1, CD79A, VPREB3, TNFRSF13C, MEF2C, LY9, TNFRSF13B, SMIM14, RALGPS2
## LINC01781, CD79B, LINC01857, CD40, IRF8, HLA-DQA1, LINC00926, POU2F2, SPIB, COBLL1
## HLA-DQB1, FCRL5, LTB, FAM30A, CXCR5, CD70, TCF4, FCRL2, CD83, LINC02397
## Negative: NKG7, CCL5, CST7, GZMA, IL32, CD7, GZMH, KLRK1, KLRD1, CTSW
## ANXA1, CD8A, CLEC2B, IFNG, PRF1, TRGC2, LINC02446, TRBC1, CD8B, GNLY
## ID2, IL7R, S100A4, GZMB, XCL2, LINC01871, S100A11, GZMK, MT2A, GIMAP7
## PC_ 4
## Positive: NKG7, KLRD1, KLRK1, CCL5, GZMH, CTSW, GZMB, LINC02446, CD8A, GZMA
## TRGC2, GNLY, PRF1, CD8B, XCL2, KLRC3, KLRC2, ZNF683, CST7, KLRC4
## XCL1, TRDC, IFNG, HOPX, CCL4, CRTAM, KLRC1, LINC01871, TRGC1, ITGA1
## Negative: TNFRSF4, CTLA4, BATF, ICA1, MAF, MAGEH1, ICOS, CXCL13, TNFRSF18, TSHZ2
## MAL, TBC1D4, TIGIT, LINC01943, IL7R, FOXP3, NR3C1, CD200, RORA, GK
## PASK, THADA, LTB, AC004585.1, CD27, SESN3, IL2RA, JMY, NMB, LAIR2
## PC_ 5
## Positive: MS4A1, BANK1, VPREB3, TXNIP, TNFRSF13C, FCN1, LINC01781, CD69, TNFRSF13B, CCL5
## S100A8, LINC01857, CD70, C5AR1, ATP2B1, SCIMP, GZMK, SMIM25, AC020656.1, CD79A
## LINC00926, TRBC2, KLF2, IFI30, SMIM14, TREM1, AIM2, CD79B, GZMA, CST7
## Negative: DERL3, LILRA4, MZB1, SCT, JCHAIN, SMPD3, PTCRA, IL3RA, PLD4, EGLN3
## CLIC3, LRRC26, SHD, CLEC4C, SERPINF1, GZMB, ITM2C, LINC00996, IRF4, RRBP1
## TCF4, PTGDS, IRF7, LAMP5, GAS6, SLC32A1, SLC15A4, PHEX, DNASE1L3, ZFAT
print(nsclc.seurat.obj[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: MTRNR2L12, CD69, CD7, CREM, IL32
## Negative: FTL, IGKC, SPP1, IGHG3, APOE
## PC_ 2
## Positive: FCN1, AIF1, CD68, AC020656.1, S100A8
## Negative: IL32, TRAC, CD7, TRBC2, CCL5
## PC_ 3
## Positive: MS4A1, BANK1, CD79A, VPREB3, TNFRSF13C
## Negative: NKG7, CCL5, CST7, GZMA, IL32
## PC_ 4
## Positive: NKG7, KLRD1, KLRK1, CCL5, GZMH
## Negative: TNFRSF4, CTLA4, BATF, ICA1, MAF
## PC_ 5
## Positive: MS4A1, BANK1, VPREB3, TXNIP, TNFRSF13C
## Negative: DERL3, LILRA4, MZB1, SCT, JCHAIN
DimHeatmap(nsclc.seurat.obj, dims = 1, cells = 500, balanced = TRUE)
# determine dimensionality of the data
ElbowPlot(nsclc.seurat.obj)
nsclc.seurat.obj <- FindNeighbors(nsclc.seurat.obj, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
nsclc.seurat.obj <- FindClusters(nsclc.seurat.obj, resolution = c(0.3, 0.5, 0.7, 1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24708
## Number of edges: 613303
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9058
## Number of communities: 12
## Elapsed time: 9 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24708
## Number of edges: 613303
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8552
## Number of communities: 13
## Elapsed time: 9 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24708
## Number of edges: 613303
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8220
## Number of communities: 15
## Elapsed time: 11 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24708
## Number of edges: 613303
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7860
## Number of communities: 20
## Elapsed time: 12 seconds
View(nsclc.seurat.obj@meta.data)
DimPlot(nsclc.seurat.obj, group.by = "RNA_snn_res.0.5", label = TRUE)