library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('Matrix')
library('Seurat')
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built with package 'Matrix' 1.7.4 but the current
## version is 1.7.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library('ggplot2')

load the data

data_dir <- "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data"
list.files(data_dir)
## [1] "GSE167036_barcodes.csv"           "GSE167036_cell_counts_matrix.mtx"
## [3] "GSE167036_features.csv"           "GSE167036_meta_all.csv"
# read the metadata
liu_metadata <- read.csv("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_meta_all.csv")
head(liu_metadata)
summary(liu_metadata)
##        X            cell_id           orig.ident          nCount_RNA    
##  Min.   :     1   Length:118845      Length:118845      Min.   :   411  
##  1st Qu.: 29712   Class :character   Class :character   1st Qu.:  2072  
##  Median : 59423   Mode  :character   Mode  :character   Median :  3203  
##  Mean   : 59423                                         Mean   :  4963  
##  3rd Qu.: 89134                                         3rd Qu.:  4919  
##  Max.   :118845                                         Max.   :113083  
##   nFeature_RNA    percent.mt        qc          scrublet_doublet
##  Min.   : 299   Min.   : 0.000   Mode:logical   Mode:logical    
##  1st Qu.: 874   1st Qu.: 3.108   TRUE:118845    TRUE:118845     
##  Median :1203   Median : 4.459                                  
##  Mean   :1456   Mean   : 5.228                                  
##  3rd Qu.:1638   3rd Qu.: 6.479                                  
##  Max.   :9324   Max.   :25.541                                  
##  sample_type         patient_id            leiden        cell_label       
##  Length:118845      Length:118845      Min.   : 0.000   Length:118845     
##  Class :character   Class :character   1st Qu.: 1.000   Class :character  
##  Mode  :character   Mode  :character   Median : 3.000   Mode  :character  
##                                        Mean   : 3.825                     
##                                        3rd Qu.: 5.000                     
##                                        Max.   :16.000                     
##   main_label          ER_statu          Her2_statu           Grade          
##  Length:118845      Length:118845      Length:118845      Length:118845     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      umap_1            umap_2       
##  Min.   :-10.912   Min.   :-8.9592  
##  1st Qu.: -2.381   1st Qu.:-0.6115  
##  Median :  3.726   Median : 3.3533  
##  Mean   :  2.102   Mean   : 2.3596  
##  3rd Qu.:  6.005   3rd Qu.: 5.9103  
##  Max.   : 11.925   Max.   :14.0657

Read the matrix file

matrix_file <- file.path(data_dir, "GSE167036_cell_counts_matrix.mtx")
mtx <- readMM(matrix_file)
dim(mtx)
## [1]  25301 118845
mtx[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##               
## [1,] . . . . .
## [2,] . . . . .
## [3,] . . . . .
## [4,] . . 1 . .
## [5,] . . . . .

Read the barcodes

barcodes_liu_project <- read.csv(file.path("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_barcodes.csv"), 
                                 header = TRUE, stringsAsFactors = FALSE)
dim(barcodes_liu_project)
## [1] 118845      2

Read the features

features_liu_project <- read.csv(file.path("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/data/GSE167036_features.csv"),
                                 header = TRUE, stringsAsFactors = FALSE)
dim(features_liu_project)
## [1] 25301     2

create the matrix with the barcodes and features

colnames(mtx) <- barcodes_liu_project[ ,1]
row.names(mtx) <- features_liu_project[ ,2]
dim(mtx)
## [1]  25301 118845
mtx[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##            1 2 3 4 5
## AL627309.1 . . . . .
## AL669831.5 . . . . .
## FAM87B     . . . . .
## LINC00115  . . 1 . .
## AL645608.7 . . . . .

We have the genes as the rows and barcodes as the columns making up the matrix

Create a seurat object

liu_project <- CreateSeuratObject(counts = mtx, project = 'liu_group_project', min.cells = 3, min.features = 200)
## Warning: Data is of class dgTMatrix. Coercing to dgCMatrix.
liu_project
## An object of class Seurat 
## 24893 features across 118845 samples within 1 assay 
## Active assay: RNA (24893 features, 0 variable features)
##  1 layer present: counts

STANDARD PROCESSING STEPS

# QC metrics
liu_project[['percent.mt']] <- PercentageFeatureSet(liu_project, pattern = '^MT-')
# visualize QC metrics
VlnPlot(liu_project, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

Visualize feature to feature relationships

plot1 <- FeatureScatter(liu_project, feature1 = "nFeature_RNA", feature2 = "nCount_RNA", raster = FALSE)
plot2 <- FeatureScatter(liu_project, feature1 = "nCount_RNA", feature2 = "percent.mt", raster = FALSE)
plot1

plot2

# subset by removing unwanted cells
liu_project <- subset(liu_project, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt <20)

Data normlization

liu_project <- NormalizeData(liu_project)
## Normalizing layer: counts

Identification of highly variable features

liu_project <- FindVariableFeatures(liu_project, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# identify top ten variable features
top10 <- head(VariableFeatures(liu_project), 10)
# visualize the top ten highly variable features
plot1_var_features <- VariableFeaturePlot(liu_project)
plot2_var_features <- LabelPoints(plot = plot1_var_features, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1_var_features 

plot2_var_features

### Scale the data

liu_project <- ScaleData(liu_project)
## Centering and scaling data matrix

perform linear dimensional reduction

liu_project <- RunPCA(liu_project, features = VariableFeatures(object = liu_project))
## PC_ 1 
## Positive:  IFITM3, IGFBP4, MGP, CST3, IGFBP7, SPARC, CAVIN1, CALD1, IFI27, GSN 
##     SERPING1, SPARCL1, TPM1, CD9, TIMP3, A2M, PPIC, TM4SF1, CYR61, EMP1 
##     TIMP1, NNMT, PRSS23, SELENOP, ADIRF, CEBPD, COL4A2, IER3, S100A16, NUPR1 
## Negative:  HIST1H4C, CD74, HLA-DQA1, HLA-DPB1, BCL2A1, HLA-DPA1, MALAT1, IGHM, IGKC, TRBV20-1 
##     CCL4, TNF, IRF4, STAP1, CRTAM, JCHAIN, HLA-DRA, AREG, GZMH, IFNG 
##     HLA-DRB1, MT1H, IGHG1, LTA, IGHA1, TRBV28, MIR155HG, GZMB, TRBV5-1, LINC02446 
## PC_ 2 
## Positive:  KRT19, KRT18, ELF3, AZGP1, KRT8, CLDN4, SMIM22, MLPH, CLDN3, CLDN7 
##     SPINT2, TACSTD2, EPCAM, SPDEF, CRABP2, AGR3, ZG16B, FXYD3, LRRC26, MUC1 
##     RAB25, KRT7, CD24, ESR1, ERBB3, S100A14, AC093001.1, SLC39A6, AGR2, ANKRD30A 
## Negative:  IGFBP7, SPARC, SPARCL1, A2M, CAVIN1, COL4A2, GNG11, CAV1, HSPG2, COL4A1 
##     ESAM, ENG, BGN, COL15A1, HTRA1, COL6A2, AQP1, CD34, PLVAP, CAVIN3 
##     FSTL1, LHFPL6, CALD1, ADGRL4, SERPING1, VWF, CFH, EMCN, CLEC14A, DLC1 
## PC_ 3 
## Positive:  PLVAP, AQP1, VWF, ADGRL4, RAMP2, EMCN, CLEC14A, ECSCR, CYYR1, EGFL7 
##     PECAM1, CD93, CDH5, TSPAN7, MMRN2, CD34, FLT1, CXorf36, RNASE1, CLDN5 
##     PTPRB, TIE1, CCL14, CALCRL, ESAM, ACKR1, RBP7, RAMP3, HYAL2, ADCY4 
## Negative:  LUM, COL1A2, DCN, COL1A1, COL3A1, COL6A3, C1S, CTSK, SERPINF1, FBLN1 
##     ISLR, PCOLCE, SFRP2, PRRX1, COL6A1, THBS2, LRP1, CCDC80, MFAP4, AEBP1 
##     MXRA8, COL6A2, COL5A2, C1R, DPT, RARRES2, COL5A1, C3, BGN, EMILIN1 
## PC_ 4 
## Positive:  LYZ, FCER1G, C1QC, CD68, TYROBP, C1QA, C1QB, MS4A6A, OLR1, IGSF6 
##     APOC1, LILRB4, CLEC7A, C5AR1, CSF1R, MS4A7, FCGR2A, CD14, SPI1, MSR1 
##     AIF1, PILRA, IL1B, TREM2, CCL3, CPVL, FPR3, FCGR3A, FCGR1A, IFI30 
## Negative:  BGN, CALD1, MYL9, COL1A2, COL1A1, TAGLN, SPARC, COL3A1, PLAC9, COL18A1 
##     PDGFRB, MALAT1, COL6A2, IGFBP7, SOD3, PCOLCE, ACTA2, TPM2, PRRX1, CPE 
##     MFGE8, COL5A2, AEBP1, MAP1B, LHFPL6, THY1, COL14A1, FRZB, NOTCH3, EDNRA 
## PC_ 5 
## Positive:  SLPI, MUCL1, CALML5, S100A8, S100A9, ALOX15B, PFN2, AKR1C2, KRT7, ST8SIA6-AS1 
##     ASS1, SERHL2, CTAG2, FABP7, GLYATL2, MDK, SAA1, CBR3, MAGEA4, MIEN1 
##     PSMD3, TFAP2B, HPGD, SPINK8, ALDH3B2, KRT15, GRB7, PNMT, DHRS2, S100P 
## Negative:  AGR3, MALAT1, ESR1, ANKRD30A, GFRA1, BMPR1B, TFF1, CA12, SCUBE2, MRPS30-DT 
##     SLC39A6, UGT2B4, CLEC3A, ECM1, DEGS2, SYT1, NAT1, SSFA2, UGDH, SCGB1D2 
##     CRABP2, CFB, TNNT1, C5orf38, MAPT, TRPS1, FSIP1, CST3, IGFBP5, RTN1
# check the first 5 PCAs
print(liu_project[['pca']], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  IFITM3, IGFBP4, MGP, CST3, IGFBP7 
## Negative:  HIST1H4C, CD74, HLA-DQA1, HLA-DPB1, BCL2A1 
## PC_ 2 
## Positive:  KRT19, KRT18, ELF3, AZGP1, KRT8 
## Negative:  IGFBP7, SPARC, SPARCL1, A2M, CAVIN1 
## PC_ 3 
## Positive:  PLVAP, AQP1, VWF, ADGRL4, RAMP2 
## Negative:  LUM, COL1A2, DCN, COL1A1, COL3A1 
## PC_ 4 
## Positive:  LYZ, FCER1G, C1QC, CD68, TYROBP 
## Negative:  BGN, CALD1, MYL9, COL1A2, COL1A1 
## PC_ 5 
## Positive:  SLPI, MUCL1, CALML5, S100A8, S100A9 
## Negative:  AGR3, MALAT1, ESR1, ANKRD30A, GFRA1
# visualize PC1 & PC2
VizDimLoadings(liu_project, dims = 1:2, reduction = 'pca')

# visualize the top 15 PCs
DimHeatmap(liu_project, dims = 1:10, cells = 500, balanced = TRUE)

### Determine the dimensionality of the dataset

ElbowPlot(liu_project)

key findings

Based on the elbow plot, the majority of true signals are in the first 15 PCs.

Cluster the cells

liu_project <- FindNeighbors(liu_project, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
liu_project <- FindClusters(liu_project)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 118280
## Number of edges: 3175301
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9095
## Number of communities: 31
## Elapsed time: 29 seconds

Run non-linear dimensional reduction (UMAP)

liu_project <- RunUMAP(liu_project, dims = 1:15)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:09:18 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:09:18 Read 118280 rows and found 15 numeric columns
## 11:09:18 Using Annoy for neighbor search, n_neighbors = 30
## 11:09:18 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:09:24 Writing NN index file to temp file /var/folders/h0/s0qb8mrj4qq_qfht_drqfnzw0000gn/T//RtmpP1kKgx/filecafa17c4bf4a
## 11:09:24 Searching Annoy index using 1 thread, search_k = 3000
## 11:09:44 Annoy recall = 100%
## 11:09:44 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:09:45 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:09:50 Commencing optimization for 200 epochs, with 5420916 positive edges
## 11:09:50 Using rng type: pcg
## 11:10:11 Optimization finished
# plot the UMAPs
DimPlot(liu_project, reduction = 'umap', raster = FALSE)

Find differentially expressed features

liu_project.markers <- FindAllMarkers(object = liu_project,only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
## Calculating cluster 25
## Calculating cluster 26
## Calculating cluster 27
## Calculating cluster 28
## Calculating cluster 29
## Calculating cluster 30
write.csv(liu_project.markers, file = "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/liu_project.markers.csv")
saveRDS(liu_project, "/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/analysis/liu_project_markers.rds")
genes.to.plot <- c("EPCAM","CD24","KRT19", #Epithelial cells / Thymic epithelial cells
                   #"EPCAM", # TEC - Thymic epithelial cells
                   "DCN","LUM","COL1A2", #Fibroblasts
                   "COL4A1","TAGLN","CALD1", # Connective tissue cells
                   # "COL4A1" > Marker for endothelial cells and cancer-associated fibrobalsts.
                   "AQP1","VWF","KDR", # Endothelial
                   "IL7R","CD8A","CD3E","FOXP3", #Tcell
                    # "IL7R", # CD4 T cells
                    # "CD8A"    # CD8 T cells
                   "NKG7","GZMB", #NK
                   "JCHAIN","HBB",#Plasma, Erythroid
                   "CD74", "LGALS1","CD69",
                   "CD14","CSF1R","LYZ", #Macrophages
                   "CD79A","CD19","MS4A1", #B cells
                   "CENPF","TOP2A","MKI67", #proliferative
                   #"MKI67", # K167 - defnitive proliferative 
                   "CD163","AIF1", # DC
                   "PLA2G2A","APOD","POSTN", # CAFs - cancer associated fibroblasts
                   "RGS5","MYH11", # PVL - Perivascular-like cells
                   #"RGS5", #Pericytes # RGS5 > marker for pericytes and vascular smooth muscle cells
                   "CD45" # Mast cells
                   ) 
# create a dot plot 
plot_x <- DotPlot(liu_project, features = genes.to.plot, group.by = 'seurat_clusters') + RotatedAxis()
## Warning: The following requested variables were not found: CD45
plot_y <- DimPlot(liu_project, reduction = 'umap', group.by = 'seurat_clusters', label = TRUE) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
plot_x 

plot_y

ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/plot_x_dotplot.png", plot = plot_x, width = 17, height = 7)
ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/plot_y_dimplot.png", plot = plot_y, width = 17, height = 9)
Idents(liu_project) <- liu_project$seurat_clusters
#levels(liu_project)
cluster.ids <- c('T cells', 'NK', 'B cells','B cells', 'Endothelial', # clusters 0-4
                 'EpithelialCells', 'Plasma', 'T cells','EpithelialCells','T cells', # clusters 5 - 9 
                 'Fibroblasts', 'EpithelialCells', 'ConnectiveTissue','EpithelialCells','Macrophages', # clusters 10 - 14 
                 'EpithelialCells', 'Macrophages', 'EpithelialCells','NK','Proliferative', # clusters 15 - 19
                 'EpithelialCells', 'NK', 'B cells','EpithelialCells','T cells', # clusters 20 - 24
                 'ConnectiveTissue', 'Endothelial', 'NK','B cells','EpithelialCells','EpithelialCells') # clusters 25 - 30
names(cluster.ids) <- levels(liu_project)
liu_project <- RenameIdents(liu_project, cluster.ids)
DimPlot(liu_project, reduction = 'umap', label = TRUE, pt.size = 0.8) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

liu_clusters <- DimPlot(liu_project, reduction = 'umap', label = TRUE, pt.size = 0.8) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
ggsave("/Users/maryannemuchuki/Desktop/Transcriptomics_class/final_project/results/liu_clusters.png", plot = liu_clusters, width = 15, height = 8)