library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('Matrix')
library('Seurat')
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built with package 'Matrix' 1.7.4 but the current
## version is 1.7.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library('ggplot2')
data_dir <- "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data"
list.files(data_dir)
## [1] "GSE167036_barcodes.csv" "GSE167036_cell_counts_matrix.mtx"
## [3] "GSE167036_features.csv" "GSE167036_meta_all.csv"
# read the metadata
liu_metadata <- read.csv("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_meta_all.csv")
head(liu_metadata)
summary(liu_metadata)
## X cell_id orig.ident nCount_RNA
## Min. : 1 Length:118845 Length:118845 Min. : 411
## 1st Qu.: 29712 Class :character Class :character 1st Qu.: 2072
## Median : 59423 Mode :character Mode :character Median : 3203
## Mean : 59423 Mean : 4963
## 3rd Qu.: 89134 3rd Qu.: 4919
## Max. :118845 Max. :113083
## nFeature_RNA percent.mt qc scrublet_doublet
## Min. : 299 Min. : 0.000 Mode:logical Mode:logical
## 1st Qu.: 874 1st Qu.: 3.108 TRUE:118845 TRUE:118845
## Median :1203 Median : 4.459
## Mean :1456 Mean : 5.228
## 3rd Qu.:1638 3rd Qu.: 6.479
## Max. :9324 Max. :25.541
## sample_type patient_id leiden cell_label
## Length:118845 Length:118845 Min. : 0.000 Length:118845
## Class :character Class :character 1st Qu.: 1.000 Class :character
## Mode :character Mode :character Median : 3.000 Mode :character
## Mean : 3.825
## 3rd Qu.: 5.000
## Max. :16.000
## main_label ER_statu Her2_statu Grade
## Length:118845 Length:118845 Length:118845 Length:118845
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## umap_1 umap_2
## Min. :-10.912 Min. :-8.9592
## 1st Qu.: -2.381 1st Qu.:-0.6115
## Median : 3.726 Median : 3.3533
## Mean : 2.102 Mean : 2.3596
## 3rd Qu.: 6.005 3rd Qu.: 5.9103
## Max. : 11.925 Max. :14.0657
matrix_file <- file.path(data_dir, "GSE167036_cell_counts_matrix.mtx")
mtx <- readMM(matrix_file)
dim(mtx)
## [1] 25301 118845
mtx[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##
## [1,] . . . . .
## [2,] . . . . .
## [3,] . . . . .
## [4,] . . 1 . .
## [5,] . . . . .
barcodes_liu_project <- read.csv(file.path("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_barcodes.csv"),
header = TRUE, stringsAsFactors = FALSE)
dim(barcodes_liu_project)
## [1] 118845 2
features_liu_project <- read.csv(file.path("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_features.csv"),
header = TRUE, stringsAsFactors = FALSE)
dim(features_liu_project)
## [1] 25301 2
colnames(mtx) <- barcodes_liu_project[ ,1]
row.names(mtx) <- features_liu_project[ ,2]
dim(mtx)
## [1] 25301 118845
mtx[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
## 1 2 3 4 5
## AL627309.1 . . . . .
## AL669831.5 . . . . .
## FAM87B . . . . .
## LINC00115 . . 1 . .
## AL645608.7 . . . . .
We have the genes as the rows and barcodes as the columns making up the matrix
liu_project <- CreateSeuratObject(counts = mtx, project = 'liu_group_project', min.cells = 3, min.features = 200)
## Warning: Data is of class dgTMatrix. Coercing to dgCMatrix.
liu_project
## An object of class Seurat
## 24893 features across 118845 samples within 1 assay
## Active assay: RNA (24893 features, 0 variable features)
## 1 layer present: counts
# QC metrics
liu_project[['percent.mt']] <- PercentageFeatureSet(liu_project, pattern = '^MT-')
# visualize QC metrics
VlnPlot(liu_project, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
plot1 <- FeatureScatter(liu_project, feature1 = "nFeature_RNA", feature2 = "nCount_RNA", raster = FALSE)
plot2 <- FeatureScatter(liu_project, feature1 = "nCount_RNA", feature2 = "percent.mt", raster = FALSE)
plot1
plot2
# subset by removing unwanted cells
liu_project <- subset(liu_project, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt <20)
liu_project <- NormalizeData(liu_project)
## Normalizing layer: counts
liu_project <- FindVariableFeatures(liu_project, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# identify top ten variable features
top10 <- head(VariableFeatures(liu_project), 10)
# visualize the top ten highly variable features
plot1_var_features <- VariableFeaturePlot(liu_project)
plot2_var_features <- LabelPoints(plot = plot1_var_features, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1_var_features
plot2_var_features
### Scale the data
liu_project <- ScaleData(liu_project)
## Centering and scaling data matrix
liu_project <- RunPCA(liu_project, features = VariableFeatures(object = liu_project))
## PC_ 1
## Positive: IFITM3, IGFBP4, MGP, CST3, IGFBP7, SPARC, CAVIN1, CALD1, IFI27, GSN
## SERPING1, SPARCL1, TPM1, CD9, TIMP3, A2M, PPIC, TM4SF1, CYR61, EMP1
## TIMP1, NNMT, PRSS23, SELENOP, ADIRF, CEBPD, COL4A2, IER3, S100A16, NUPR1
## Negative: HIST1H4C, CD74, HLA-DQA1, HLA-DPB1, BCL2A1, HLA-DPA1, MALAT1, IGHM, IGKC, TRBV20-1
## CCL4, TNF, IRF4, STAP1, CRTAM, JCHAIN, HLA-DRA, AREG, GZMH, IFNG
## HLA-DRB1, MT1H, IGHG1, LTA, IGHA1, TRBV28, MIR155HG, GZMB, TRBV5-1, LINC02446
## PC_ 2
## Positive: KRT19, KRT18, ELF3, AZGP1, KRT8, CLDN4, SMIM22, MLPH, CLDN3, CLDN7
## SPINT2, TACSTD2, EPCAM, SPDEF, CRABP2, AGR3, ZG16B, FXYD3, LRRC26, MUC1
## RAB25, KRT7, CD24, ESR1, ERBB3, S100A14, AC093001.1, SLC39A6, AGR2, ANKRD30A
## Negative: IGFBP7, SPARC, SPARCL1, A2M, CAVIN1, COL4A2, GNG11, CAV1, HSPG2, COL4A1
## ESAM, ENG, BGN, COL15A1, HTRA1, COL6A2, AQP1, CD34, PLVAP, CAVIN3
## FSTL1, LHFPL6, CALD1, ADGRL4, SERPING1, VWF, CFH, EMCN, CLEC14A, DLC1
## PC_ 3
## Positive: PLVAP, AQP1, VWF, ADGRL4, RAMP2, EMCN, CLEC14A, ECSCR, CYYR1, EGFL7
## PECAM1, CD93, CDH5, TSPAN7, MMRN2, CD34, FLT1, CXorf36, RNASE1, CLDN5
## PTPRB, TIE1, CCL14, CALCRL, ESAM, ACKR1, RBP7, RAMP3, HYAL2, ADCY4
## Negative: LUM, COL1A2, DCN, COL1A1, COL3A1, COL6A3, C1S, CTSK, SERPINF1, FBLN1
## ISLR, PCOLCE, SFRP2, PRRX1, COL6A1, THBS2, LRP1, CCDC80, MFAP4, AEBP1
## MXRA8, COL6A2, COL5A2, C1R, DPT, RARRES2, COL5A1, C3, BGN, EMILIN1
## PC_ 4
## Positive: LYZ, FCER1G, C1QC, CD68, TYROBP, C1QA, C1QB, MS4A6A, OLR1, IGSF6
## APOC1, LILRB4, CLEC7A, C5AR1, CSF1R, MS4A7, FCGR2A, CD14, SPI1, MSR1
## AIF1, PILRA, IL1B, TREM2, CCL3, CPVL, FPR3, FCGR3A, FCGR1A, IFI30
## Negative: BGN, CALD1, MYL9, COL1A2, COL1A1, TAGLN, SPARC, COL3A1, PLAC9, COL18A1
## PDGFRB, MALAT1, COL6A2, IGFBP7, SOD3, PCOLCE, ACTA2, TPM2, PRRX1, CPE
## MFGE8, COL5A2, AEBP1, MAP1B, LHFPL6, THY1, COL14A1, FRZB, NOTCH3, EDNRA
## PC_ 5
## Positive: SLPI, MUCL1, CALML5, S100A8, S100A9, ALOX15B, PFN2, AKR1C2, KRT7, ST8SIA6-AS1
## ASS1, SERHL2, CTAG2, FABP7, GLYATL2, MDK, SAA1, CBR3, MAGEA4, MIEN1
## PSMD3, TFAP2B, HPGD, SPINK8, ALDH3B2, KRT15, GRB7, PNMT, DHRS2, S100P
## Negative: AGR3, MALAT1, ESR1, ANKRD30A, GFRA1, BMPR1B, TFF1, CA12, SCUBE2, MRPS30-DT
## SLC39A6, UGT2B4, CLEC3A, ECM1, DEGS2, SYT1, NAT1, SSFA2, UGDH, SCGB1D2
## CRABP2, CFB, TNNT1, C5orf38, MAPT, TRPS1, FSIP1, CST3, IGFBP5, RTN1
# check the first 5 PCAs
print(liu_project[['pca']], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: IFITM3, IGFBP4, MGP, CST3, IGFBP7
## Negative: HIST1H4C, CD74, HLA-DQA1, HLA-DPB1, BCL2A1
## PC_ 2
## Positive: KRT19, KRT18, ELF3, AZGP1, KRT8
## Negative: IGFBP7, SPARC, SPARCL1, A2M, CAVIN1
## PC_ 3
## Positive: PLVAP, AQP1, VWF, ADGRL4, RAMP2
## Negative: LUM, COL1A2, DCN, COL1A1, COL3A1
## PC_ 4
## Positive: LYZ, FCER1G, C1QC, CD68, TYROBP
## Negative: BGN, CALD1, MYL9, COL1A2, COL1A1
## PC_ 5
## Positive: SLPI, MUCL1, CALML5, S100A8, S100A9
## Negative: AGR3, MALAT1, ESR1, ANKRD30A, GFRA1
# visualize PC1 & PC2
VizDimLoadings(liu_project, dims = 1:2, reduction = 'pca')
# visualize the top 15 PCs
DimHeatmap(liu_project, dims = 1:10, cells = 500, balanced = TRUE)
### Determine the dimensionality of the dataset
ElbowPlot(liu_project)
Based on the elbow plot, the majority of true signals are in the first 15 PCs.
liu_project <- FindNeighbors(liu_project, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
liu_project <- FindClusters(liu_project)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 118280
## Number of edges: 3175301
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9095
## Number of communities: 31
## Elapsed time: 29 seconds
liu_project <- RunUMAP(liu_project, dims = 1:15)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:09:18 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:09:18 Read 118280 rows and found 15 numeric columns
## 11:09:18 Using Annoy for neighbor search, n_neighbors = 30
## 11:09:18 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:09:24 Writing NN index file to temp file /var/folders/h0/s0qb8mrj4qq_qfht_drqfnzw0000gn/T//RtmpP1kKgx/filecafa17c4bf4a
## 11:09:24 Searching Annoy index using 1 thread, search_k = 3000
## 11:09:44 Annoy recall = 100%
## 11:09:44 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:09:45 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:09:50 Commencing optimization for 200 epochs, with 5420916 positive edges
## 11:09:50 Using rng type: pcg
## 11:10:11 Optimization finished
# plot the UMAPs
DimPlot(liu_project, reduction = 'umap', raster = FALSE)
liu_project.markers <- FindAllMarkers(object = liu_project,only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
## Calculating cluster 25
## Calculating cluster 26
## Calculating cluster 27
## Calculating cluster 28
## Calculating cluster 29
## Calculating cluster 30
write.csv(liu_project.markers, file = "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/liu_project.markers.csv")
saveRDS(liu_project, "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/analysis/liu_project_markers.rds")
genes.to.plot <- c("EPCAM","CD24","KRT19", #Epithelial cells / Thymic epithelial cells
#"EPCAM", # TEC - Thymic epithelial cells
"DCN","LUM","COL1A2", #Fibroblasts
"COL4A1","TAGLN","CALD1", # Connective tissue cells
# "COL4A1" > Marker for endothelial cells and cancer-associated fibrobalsts.
"AQP1","VWF","KDR", # Endothelial
"IL7R","CD8A","CD3E","FOXP3", #Tcell
# "IL7R", # CD4 T cells
# "CD8A" # CD8 T cells
"NKG7","GZMB", #NK
"JCHAIN","HBB",#Plasma, Erythroid
"CD74", "LGALS1","CD69",
"CD14","CSF1R","LYZ", #Macrophages
"CD79A","CD19","MS4A1", #B cells
"CENPF","TOP2A","MKI67", #proliferative
#"MKI67", # K167 - defnitive proliferative
"CD163","AIF1", # DC
"PLA2G2A","APOD","POSTN", # CAFs - cancer associated fibroblasts
"RGS5","MYH11", # PVL - Perivascular-like cells
#"RGS5", #Pericytes # RGS5 > marker for pericytes and vascular smooth muscle cells
"CD45" # Mast cells
)
# create a dot plot
plot_x <- DotPlot(liu_project, features = genes.to.plot, group.by = 'seurat_clusters') + RotatedAxis()
## Warning: The following requested variables were not found: CD45
plot_y <- DimPlot(liu_project, reduction = 'umap', group.by = 'seurat_clusters', label = TRUE) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
plot_x
plot_y
ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/plot_x_dotplot.png", plot = plot_x, width = 17, height = 7)
ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/plot_y_dimplot.png", plot = plot_y, width = 17, height = 9)
Idents(liu_project) <- liu_project$seurat_clusters
#levels(liu_project)
cluster.ids <- c('T cells', 'NK', 'B cells','B cells', 'Endothelial', # clusters 0-4
'EpithelialCells', 'Plasma', 'T cells','EpithelialCells','T cells', # clusters 5 - 9
'Fibroblasts', 'EpithelialCells', 'ConnectiveTissue','EpithelialCells','Macrophages', # clusters 10 - 14
'EpithelialCells', 'Macrophages', 'EpithelialCells','NK','Proliferative', # clusters 15 - 19
'EpithelialCells', 'NK', 'B cells','EpithelialCells','T cells', # clusters 20 - 24
'ConnectiveTissue', 'Endothelial', 'NK','B cells','EpithelialCells','EpithelialCells') # clusters 25 - 30
names(cluster.ids) <- levels(liu_project)
liu_project <- RenameIdents(liu_project, cluster.ids)
DimPlot(liu_project, reduction = 'umap', label = TRUE, pt.size = 0.8) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
liu_clusters <- DimPlot(liu_project, reduction = 'umap', label = TRUE, pt.size = 0.8) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/liu_clusters.png", plot = liu_clusters, width = 15, height = 8)