Load packages

suppressPackageStartupMessages({ 
  library(BiocFileCache)
  library(scran)
  library(scater)
  library(tidyverse)
  library(SingleCellExperiment)
  library(DropletUtils)
  library(ggplot2)
  library(cowplot)
  library(Seurat)
  library(monocle3)
  library(SeuratObject)
})

Read in all SNL Files

Reading in CCSP Control vs CXCR2i tumor cells and CMV CCSP Control vs CXCR2i tumor cells

####### SNL CCSP
data_dir <- "~/SNL MEX/17681X1/MEX"
list.files(data_dir)
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
SNL_CCSP <- Read10X(data_dir)
SNL_CCSP <- CreateSeuratObject(counts = SNL_CCSP,project="SNL_CCSP")
label<-"CCSP"  
tx<-"Control"
SNL_CCSP@meta.data$Cre<-label
SNL_CCSP@meta.data$tx<-tx
SNL_CCSP@meta.data$n<-"n=1"
SNL_CCSP@meta.data$gnomex<-"17681X1"
head(SNL_CCSP@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA  Cre      tx   n  gnomex
## AAACCCAAGTAGAATC-1   SNL_CCSP       1837          660 CCSP Control n=1 17681X1
## AAACCCATCAGGAACG-1   SNL_CCSP      30130         5068 CCSP Control n=1 17681X1
## AAACCCATCTCAAAGC-1   SNL_CCSP       4218          983 CCSP Control n=1 17681X1
## AAACCCATCTGCACCT-1   SNL_CCSP      10055         1569 CCSP Control n=1 17681X1
## AAACCCATCTTGCAGA-1   SNL_CCSP       8658          564 CCSP Control n=1 17681X1
## AAACGAAAGCCATCCG-1   SNL_CCSP      33399         5967 CCSP Control n=1 17681X1
####### SNL CCSP + CXCR2i ####
data_dir <- "~/SNL/MEX/17681X2 AZD CCSP"
list.files(data_dir)
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
SNL_CCSP_Tx <- Read10X(data_dir)
SNL_CCSP_Tx <- CreateSeuratObject(counts = SNL_CCSP_Tx,projec="SNL_CCSP_Tx")
label<-"CCSP"  
tx<-"CXCR2i"
SNL_CCSP_Tx@meta.data$Cre<-label
SNL_CCSP_Tx@meta.data$tx<-tx
SNL_CCSP_Tx@meta.data$n<-"n=1"
SNL_CCSP_Tx@meta.data$gnomex<-"17681X2"
head(SNL_CCSP_Tx@meta.data)
##                     orig.ident nCount_RNA nFeature_RNA  Cre     tx   n  gnomex
## AAACCCAAGCAGCACA-1 SNL_CCSP_Tx       9301         2534 CCSP CXCR2i n=1 17681X2
## AAACCCAAGGGATCAC-1 SNL_CCSP_Tx       2701          858 CCSP CXCR2i n=1 17681X2
## AAACCCACACATACGT-1 SNL_CCSP_Tx       8817         1630 CCSP CXCR2i n=1 17681X2
## AAACCCACATCATCCC-1 SNL_CCSP_Tx       1059          616 CCSP CXCR2i n=1 17681X2
## AAACCCAGTCCAGTTA-1 SNL_CCSP_Tx      14102         1933 CCSP CXCR2i n=1 17681X2
## AAACCCAGTCTTTCAT-1 SNL_CCSP_Tx       9579         1885 CCSP CXCR2i n=1 17681X2
####### SNL CMV 1 (15242X1) ####
data_dir <- "~/SNL MEX/15242X1/MEX"
list.files(data_dir)
## [1] "barcodes.tsv" "genes.tsv"    "matrix.mtx"
SNL_CMV1 <- Read10X(data_dir)
SNL_CMV1 <- CreateSeuratObject(counts = SNL_CMV1,project="SNL_CMV1")
label<-"CMV"                                
SNL_CMV1@meta.data$Cre<-label
SNL_CMV1@meta.data$tx<-"Control"
SNL_CMV1@meta.data$n<-"n=1"
SNL_CMV1@meta.data$gnomex<-"15242X1"
head(SNL_CMV1@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA Cre      tx   n  gnomex
## AAACCTGAGATAGCAT-1   SNL_CMV1       3247         1220 CMV Control n=1 15242X1
## AAACCTGCAATGGATA-1   SNL_CMV1      11893         3269 CMV Control n=1 15242X1
## AAACCTGCAATGTAAG-1   SNL_CMV1       6207         1908 CMV Control n=1 15242X1
## AAACCTGCAGAAGCAC-1   SNL_CMV1       9743         2638 CMV Control n=1 15242X1
## AAACCTGCAGCTCCGA-1   SNL_CMV1       4977         1701 CMV Control n=1 15242X1
## AAACCTGGTAGAGCTG-1   SNL_CMV1      12015         2934 CMV Control n=1 15242X1
####### SNL CMV CXCR2i (15242X2) ####
data_dir <- "~/SNL/SNL AZD treated 10x data"
list.files(data_dir)
## [1] "barcodes.tsv" "genes.tsv"    "matrix.mtx"
SNL_CMV_Tx <- Read10X(data_dir)
SNL_CMV_Tx <- CreateSeuratObject(counts = SNL_CMV_Tx,project="SNL_CMV_Tx")
label<-"CMV"                                
SNL_CMV_Tx@meta.data$Cre<-label
SNL_CMV_Tx@meta.data$tx<-"CXCR2i"
SNL_CMV_Tx@meta.data$n<-"n=1"
SNL_CMV_Tx@meta.data$gnomex<-"15242X2"
head(SNL_CMV_Tx@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA Cre     tx   n  gnomex
## AAACCTGAGACATAAC-1 SNL_CMV_Tx       3157         1654 CMV CXCR2i n=1 15242X2
## AAACCTGAGCGCTCCA-1 SNL_CMV_Tx       3730         1539 CMV CXCR2i n=1 15242X2
## AAACCTGCAAGCGCTC-1 SNL_CMV_Tx       4510         1714 CMV CXCR2i n=1 15242X2
## AAACCTGGTAGTACCT-1 SNL_CMV_Tx       7010         2052 CMV CXCR2i n=1 15242X2
## AAACCTGTCACCATAG-1 SNL_CMV_Tx       2421         1085 CMV CXCR2i n=1 15242X2
## AAACCTGTCAGTGTTG-1 SNL_CMV_Tx       6889         2242 CMV CXCR2i n=1 15242X2
####### SNL CMV 2 (15739X2) ####
data_dir <- "~/SNL MEX/15739X2/MEX"
list.files(data_dir)
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
SNL_CMV2 <- Read10X(data_dir)
SNL_CMV2 <- CreateSeuratObject(counts = SNL_CMV2,project="SNL_CMV2")
label<-"CMV"                                
SNL_CMV2@meta.data$Cre<-label
SNL_CMV2@meta.data$tx<-"Control"
SNL_CMV2@meta.data$n<-"n=2"
SNL_CMV2@meta.data$gnomex<-"15739X2"
head(SNL_CMV2@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA Cre      tx   n  gnomex
## AAACCTGCACATTTCT-1   SNL_CMV2       6886         1808 CMV Control n=2 15739X2
## AAACCTGGTACCAGTT-1   SNL_CMV2       6805         1746 CMV Control n=2 15739X2
## AAACCTGGTAGAGTGC-1   SNL_CMV2       8217         2530 CMV Control n=2 15739X2
## AAACCTGGTCGCTTCT-1   SNL_CMV2      17570         3915 CMV Control n=2 15739X2
## AAACCTGGTCTTCTCG-1   SNL_CMV2      15449         2881 CMV Control n=2 15739X2
## AAACCTGGTGGTACAG-1   SNL_CMV2        565          294 CMV Control n=2 15739X2
####### SNL CMV 3 (15739X3) ####
data_dir <- "~/SNL MEX/15739X3/MEX"
list.files(data_dir)
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
SNL_CMV3 <- Read10X(data_dir)
SNL_CMV3 <- CreateSeuratObject(counts = SNL_CMV3,project="SNL_CMV3")
label<-"CMV"                                
SNL_CMV3@meta.data$Cre<-label
SNL_CMV3@meta.data$tx<-"Control"
SNL_CMV3@meta.data$n<-"n=3"
SNL_CMV3@meta.data$gnomex<-"15739X3"
head(SNL_CMV3@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA Cre      tx   n  gnomex
## AAACCTGAGCTGTTCA-1   SNL_CMV3       4753         1500 CMV Control n=3 15739X3
## AAACCTGCAAGCGATG-1   SNL_CMV3      16781         3515 CMV Control n=3 15739X3
## AAACCTGGTTGGGACA-1   SNL_CMV3      13883         2515 CMV Control n=3 15739X3
## AAACGGGAGGCTAGGT-1   SNL_CMV3       9831         2377 CMV Control n=3 15739X3
## AAACGGGGTCAACTGT-1   SNL_CMV3       7756         1734 CMV Control n=3 15739X3
## AAAGATGAGAAACCGC-1   SNL_CMV3       9215         1991 CMV Control n=3 15739X3

Merge seurat objects into 1 object for inital QC

SNL_All<-merge(SNL_CCSP, c(SNL_CCSP_Tx, SNL_CMV1, SNL_CMV2, SNL_CMV3, SNL_CMV_Tx))
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
SNL_All
## An object of class Seurat 
## 32454 features across 24380 samples within 1 assay 
## Active assay: RNA (32454 features, 0 variable features)
# Check cell counts per cell of origin infection.
table(SNL_All@meta.data$Cre)
## 
##  CCSP   CMV 
## 11690 12690
# Check cell counts per treatment.
table(SNL_All@meta.data$tx)
## 
## Control  CXCR2i 
##   14635    9745

Perform QC on all collected cells

SNL_All[["percent.mt"]] <- PercentageFeatureSet(SNL_All, pattern = "^mt-")

VlnPlot(SNL_All, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,group.by="Cre", pt.size=0.2)

FeatureScatter(SNL_All, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by="Cre")

# Subsetting quality cells 
SNL_All<- subset(SNL_All, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 20)

# Check new cell counts post QC.
table(SNL_All@meta.data$Cre)
## 
##  CCSP   CMV 
##  8704 11548
table(SNL_All@meta.data$tx)
## 
## Control  CXCR2i 
##   11982    8270
SNL_All
## An object of class Seurat 
## 32454 features across 20252 samples within 1 assay 
## Active assay: RNA (32454 features, 0 variable features)

Subset just for CCSP-Cre initiated samples

SNL_CCSP_1<-subset(SNL_All, idents=c("SNL_CCSP","SNL_CCSP_Tx"))

# Check cell numbers now.
table(SNL_CCSP_1@meta.data$tx)
## 
## Control  CXCR2i 
##    3442    5262
#Ensure subsetting got rid of all CMV samples.
table(SNL_CCSP_1@meta.data$Cre)
## 
## CCSP 
## 8704

Normalize data

SNL_CCSP_1 <- NormalizeData(SNL_CCSP_1, normalization.method = "LogNormalize", scale.factor = 10000)

SNL_CCSP_1
## An object of class Seurat 
## 32454 features across 8704 samples within 1 assay 
## Active assay: RNA (32454 features, 0 variable features)

Identify Highly variable features

SNL_CCSP_1 <- FindVariableFeatures(SNL_CCSP_1, selection.method = "vst", nfeatures = 2000)

Identify the 10 most highly variable genes features that are highly expressed in some cells, and lowly expressed in others.. often carrying biological significance in R

top10 <- head(VariableFeatures(SNL_CCSP_1), 50)
top10
##  [1] "Sftpc"     "Sftpa1"    "Lyz2"      "Scd1"      "Sftpb"     "Cxcl15"   
##  [7] "Reg3g"     "Tff1"      "Chil1"     "Cd74"      "Slc34a2"   "Krt15"    
## [13] "Lcn2"      "S100a9"    "Chil4"     "Napsa"     "Npc2"      "Hp"       
## [19] "Dmbt1"     "S100a8"    "Muc5b"     "Hc"        "Tff2"      "Ppbp"     
## [25] "Krt14"     "Tm4sf1"    "AW112010"  "Crct1"     "Cxcl2"     "Gkn1"     
## [31] "Sprr2h"    "Krt17"     "Reg1"      "Apoe"      "Scgb3a1"   "Ifitm1"   
## [37] "Tppp3"     "Lyz1"      "Krt5"      "Lamp3"     "Agr2"      "Sprr1a"   
## [43] "Spp1"      "Krtap31-2" "Cxcl3"     "Cyp2a5"    "Itih4"     "Scgb1a1"  
## [49] "Fabp5"     "Lpcat1"

Scale data

Shifts the expression of each gene, so that the mean expression across cells is 0. Scales the expression of each gene, so that the variance across cells is 1. This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate.

all.genes <- rownames(SNL_CCSP_1)
SNL_CCSP_1 <- ScaleData(SNL_CCSP_1, features = all.genes)
## Centering and scaling data matrix

Perform linear dimension reduction/ PCA

Here we can see top genes driving principal components/data variability in first 5 PCs.

SNL_CCSP_1 <- RunPCA(SNL_CCSP_1, features = VariableFeatures(object = SNL_CCSP_1))
## PC_ 1 
## Positive:  2300002M23Rik, Glrx, Cnfn, Gadd45b, Isg15, Csta1, Slc6a14, Fam84a, Ndrg1, Cysrt1 
##     Ifit1bl1, Arg1, Klk13, Duoxa2, S100a10, Upk1b, Il1a, Gbp4, Jdp2, Spink5 
##     Herpud1, Plat, Hist1h1c, Il1r2, Pbp2, Prss22, Gm44275, Calml3, Hist1h2ac, Dusp14 
## Negative:  Ptma, Cd63, Krt8, Rpsa, Rpl3, Ppia, Rps18, Dbi, Eef1a1, Fabp5 
##     Rps2, Atp1a1, Tagln2, Lgals4, Rps8, Rpl12, Oat, Prdx1, Rpl5, Rps19 
##     Rpl8, Rack1, Rplp1, Prom1, Rpl13, Rpl14, Rps21, Hsp90ab1, Rpl36a, Rps3a1 
## PC_ 2 
## Positive:  Slc34a2, Lamp3, Napsa, Hc, Sftpb, Cxcl15, Chil1, Lpcat1, Dram1, Cd74 
##     Scd1, Sftpa1, Lgi3, H2-Aa, Lyz2, Lrp2, Ager, Sfta2, Egfl6, H2-Eb1 
##     H2-Ab1, Itih4, Ptgs1, Mme, Mlc1, Lrrk2, Fasn, S100g, Sftpc, Ppp1r14c 
## Negative:  Lgals4, Oit1, Clu, Ctse, Prom1, Agr2, Anxa10, Tff2, Dpcr1, Wfdc1 
##     Vsig1, Rgs17, Il18, Paqr5, Tff1, 2210407C18Rik, Krt19, Fermt1, Mgst2, Tmsb10 
##     Prss32, Ces1f, Gsta4, Krt8, Basp1, Fut2, Muc16, Scara3, Pla2g10, Gm3336 
## PC_ 3 
## Positive:  Tpm2, Cldn3, Tnfrsf11b, Cldn4, Tacstd2, Srd5a1, Gapdh, Anxa8, Nupr1, Plat 
##     Serpinb6b, Akr1b8, Cwh43, Basp1, Srgn, Tnip3, H2afz, Marcksl1, Hk2, Arg1 
##     Ctsl, Myo1b, Hilpda, Junb, Aldh1a3, Msn, Stom, Calm1, Phlda1, Atox1 
## Negative:  Tff2, Ern2, Selenop, Agr2, Ces1f, Prom1, Lgals4, Xist, Ctse, Cfi 
##     Vsig1, 5330417C22Rik, Galnt6, Muc16, Zfp704, Paqr5, Fabp5, Tff1, Smim6, Reg3g 
##     Ptprn2, Glrx, Gstm1, Dmbt1, Oat, Rgs17, Sult1c2, Scara3, Car2, Itih2 
## PC_ 4 
## Positive:  Napsa, Lamp3, Slc34a2, Cldn18, Hc, Lgi3, Fabp5, S100g, Ctse, Tff1 
##     Lrp2, Cxcl15, Lyz2, Dpcr1, Scd1, Itih4, Il33, Chil1, Egfl6, Ager 
##     Apoe, Mlc1, Anxa10, Clu, Mme, Dram1, Fasn, Sftpb, Lpcat1, Sgpp2 
## Negative:  Ccdc153, Tppp3, Tmem212, Enkur, 1110017D15Rik, Rsph1, Gm19935, Dnah12, Mlf1, Tuba1a 
##     Cfap206, Cfap126, Mapk15, Mns1, Dnah9, Sec14l3, Gm867, Ccdc113, Lrrc10b, Foxj1 
##     Ccdc39, Iqcg, 1700007K13Rik, Riiad1, Dnah6, Fam92b, Ccdc17, 4833427G06Rik, Fam183b, Drc3 
## PC_ 5 
## Positive:  Hdc, Chchd10, Cldn3, Emb, Sftpd, Ppp1r14c, Sftpb, Cbr2, Slc34a2, Chil1 
##     Napsa, Hc, Nupr1, Lamp3, Vpreb3, Wfdc2, Aldh1a1, Sec14l3, Dram1, Tmem212 
##     Cxcl15, Arg1, Acot1, Krt19, Fasn, Lgi3, Cfap126, Lrp2, Capsl, Lrrc10b 
## Negative:  Calcrl, Egfl7, Cldn5, Ptprb, Pecam1, Ramp2, Tmem100, Ace, Cav1, Itga1 
##     Cyyr1, Clec14a, Ctla2a, Acvrl1, Hpgd, Scn7a, Cd36, Cavin2, Cdh5, BC028528 
##     Aqp1, Thbd, Adgrl4, Gpihbp1, Cd93, S1pr1, Clec1a, Icam2, Tspan7, Klf2

PCA plot of PC1, PC2

DimPlot(SNL_CCSP_1, reduction = "pca", group.by='tx')+ggtitle("PCA of all cells captured")

Here we see some outlying cells that are likely non-tumor.

Determine the dimensionality for clustering

Visualize dimensions and choose elbow that captures majority of variance.

ElbowPlot(SNL_CCSP_1, ndims=50)

Choosing 25 dimensions

SNL_CCSP_1 <- FindNeighbors(SNL_CCSP_1, dims = 1:25)
## Computing nearest neighbor graph
## Computing SNN
SNL_CCSP_1<- FindClusters(SNL_CCSP_1, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8704
## Number of edges: 282351
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8395
## Number of communities: 10
## Elapsed time: 1 seconds

Run Non-Linear Dimension reduction (UMAP)

SNL_CCSP_1 <- RunUMAP(SNL_CCSP_1, dims = 1:25)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:37:46 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:37:46 Read 8704 rows and found 25 numeric columns
## 16:37:46 Using Annoy for neighbor search, n_neighbors = 30
## 16:37:46 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:37:48 Writing NN index file to temp file /tmp/Rtmpyyhzg4/filea5cc4a2a9832
## 16:37:48 Searching Annoy index using 1 thread, search_k = 3000
## 16:37:51 Annoy recall = 100%
## 16:37:52 Commencing smooth kNN distance calibration using 1 thread
## 16:37:54 Initializing from normalized Laplacian + noise
## 16:37:54 Commencing optimization for 500 epochs, with 367956 positive edges
## 16:38:26 Optimization finished

Plot UMAP

x<-DimPlot(SNL_CCSP_1, reduction = "umap",label=TRUE,pt.size=.1)+ggtitle("All cells by Cluster ID")
y<-DimPlot(SNL_CCSP_1, reduction = "umap", group.by = "tx",label=FALSE,pt.size=0.1)+ggtitle("All cells by treatment")
multiplot(x,y, cols=2)

Looks like we have some outlying populations, I am going to check cell types with our markers.

Assign cell types per cluster using Dotplots of common cell markers

more_types<-c("GFP","Sox2", "Nkx2-1", "Trp63", "Krt5","Krt13","Krt4",
              "Myc","Yap1","Sox9","Runx2","Sox6","Sp7",
              "Vcam1","Fgf10","Dcn","Clec3b","Fbln1",  # rpma
              "Col14a1", # matrix fibroblast/osteoclastic
              
              "Acta2","Myh11","Tagln","Mustn1", #myofibroblast
              "Lpl","Lipa","Pparg","Plin2","Ear1","Fabp1","Spp1", #lipofibroblast/osteoblastic
              
              "Mertk","Marco","Mrc1","Ly75","Adgre1","Itgax","Cd68","Csf1r","Mafb","Msr1","Arg1","Adgre4","Clec4a1", #Macs/Myeloid
              "Cx3cr1","Itgam","Cd14", #Monocytes
              "S100a9","S100a8","Mmp9","Csf3r","Cxcr2","Ly6g", #Neuts
              "Batf3","Xcr1","Clec9a","Ccl17","Ccl22", #DC
              
              
              "Cd3d","Cd3e","Cd3g","Cd28","Cd8a","Cd4","Foxp3", # Tcell
              
              "Gzma","Ncr1","Gzmb", #NK
              
              "Fcmr","Cd19","Fcer2a","Pax5","Cd22","Cd79b","Cd79a", #B cells
              "Slamf7", "Prdm1","Mzb1", #Plasma
              
              "Mcam","Pecam1","Icam2","Cd36","Cd93", #Endothelial
              
              "Scd1","Lamp3","Sftpb","Sftpa1","Aqp5", #Alveolar
              "Scgb1a1","Cyp2f2","Scgb3a2","Scgb3a1","Lypd2", #club
              "Foxj1","Rfx2","Rfx3","Trp73", #ciliated
              "Muc5b","Muc5ac", #goblet
              "Krt17","Perp", #basal
              "Pou2f3","Ascl2","Rgs13","Il25","Tslp", #tuft
              "Foxi1","Cftr","Ascl3", #ionocyte
              "Epgn","Slc16a14","Grhl1", #Eosinophils
              "Cpa3","Fcer1a","Ms4a2", #Basophils
              "Cma1","Tpsb2", #Mast
              "Cd200r3" #Combined with Mast, Baso, Eosinophil
)
DotPlot(SNL_CCSP_1,features = more_types, group.by="seurat_clusters")+theme(axis.text.x=element_text(angle=90, hjust=1,size=8), axis.text.y=element_text(hjust=1,size=10))

Looks like cluster 8 is AT2/Club cells/healthy lung. Cluster 9 is enriched for endothelial cell markers like PECAM1, MCAM.

Re-cluster tumor cells without 8 or 9

Subset, re-scale

SNL_CCSP_tumor<-subset(SNL_CCSP_1, idents=c("0","1","2","3","4","5","6", "7"))

table(SNL_CCSP_tumor@meta.data$tx)
## 
## Control  CXCR2i 
##    3349    5201
all.genes <- rownames(SNL_CCSP_tumor)
SNL_CCSP_tumor <- ScaleData(SNL_CCSP_tumor, features = all.genes)
## Centering and scaling data matrix

New PCA Clustering

SNL_CCSP_tumor <- RunPCA(SNL_CCSP_tumor, features = VariableFeatures(object = SNL_CCSP_tumor))
## PC_ 1 
## Positive:  2300002M23Rik, Glrx, Cnfn, Gadd45b, Isg15, Csta1, Slc6a14, Fam84a, Ndrg1, S100a10 
##     Cysrt1, Klk13, Ifit1bl1, Arg1, Duoxa2, Upk1b, Gbp4, Spink5, Il1a, Jdp2 
##     Herpud1, Il1r2, Pbp2, Hist1h1c, Gm44275, Plat, Prss22, Calml3, Hist1h2ac, Dusp14 
## Negative:  Krt8, Ptma, Cd63, Rpsa, Rpl3, Ppia, Lgals4, Rps18, Eef1a1, Dbi 
##     Fabp5, Prom1, Atp1a1, Rps2, Tagln2, Rpl12, Oat, Rps8, Agr2, Prdx1 
##     Rpl5, Rack1, Rpl8, Rplp1, Hsp90ab1, Cxcl17, Rps19, Rpl13, Rpl14, Ctse 
## PC_ 2 
## Positive:  Tff2, Selenop, Ern2, Agr2, Prom1, Ces1f, Lgals4, Ctse, Xist, Vsig1 
##     Cfi, Galnt6, Muc16, Paqr5, 5330417C22Rik, Zfp704, Tff1, Gstm1, Fabp5, Reg3g 
##     Ptprn2, Smim6, Kitl, Glrx, Tcf4, Rgs17, Dmbt1, Sult1c2, Scara3, Oat 
## Negative:  Tpm2, Cldn3, Tnfrsf11b, Tacstd2, Cldn4, Srd5a1, Gapdh, Anxa8, Nupr1, Plat 
##     Serpinb6b, Akr1b8, Cwh43, H2afz, Tnip3, Hk2, Srgn, Basp1, Arg1, Marcksl1 
##     Msn, Ctsl, Myo1b, Junb, Hilpda, Aldh1a3, Atox1, Mif, Calm1, Stom 
## PC_ 3 
## Positive:  Ccdc153, Sec14l3, Tmem212, Enkur, 1110017D15Rik, Lrrc10b, Rsph1, Dnah12, Gm19935, Cfap126 
##     Cfap206, Mlf1, Mns1, Dnah9, Mapk15, Ccdc113, Tppp3, Gm867, Foxj1, Ccdc39 
##     Iqcg, 1700007K13Rik, Dynlrb2, Riiad1, Dnah6, 4833427G06Rik, Fam92b, Fam183b, Drc3, Ccdc17 
## Negative:  Tff1, Ctse, Dpcr1, Anxa10, Lgals4, Clu, Rgs17, Vsig1, Fabp5, Cldn18 
##     Fut2, Vsig2, Ptprn2, Phgr1, Prss32, Ces1f, Tesc, Paqr5, Oit1, Pla2g10 
##     Ldhd, Tff2, Ern2, Wtip, Kit, Sytl2, B4galt6, Zfpm1, Lypd8, Gm3336 
## PC_ 4 
## Positive:  Cnfn, Cysrt1, Krt4, Klk13, Gadd45b, Crct1, Krt14, Klk10, Krt13, Csta1 
##     Ly6g6c, S100a10, Klk14, Dmkn, S100a7a, 2300002M23Rik, 2610528A11Rik, Krt5, Dusp14, Rplp0 
##     Upk1b, Glrx, Pbp2, Lgals7, Gm44275, Aqp3, Eef1b2, Dapl1, Ndrg1, Cyp2f2 
## Negative:  Srd5a1, Muc4, Myo1b, Basp1, Tnfrsf11b, Plat, Btg2, Marcksl1, Vtcn1, Aplp2 
##     Cldn3, Hk2, Clu, Cxcl3, Ifit3, Ifit3b, Cd14, Cxcl5, Slc26a4, Akr1b8 
##     Sorl1, Scin, Cdh1, Tnip3, Cwh43, Fam46a, Sprr2f, Prss22, Ces1f, Nupr1 
## PC_ 5 
## Positive:  Cbr2, Ces1d, Tmem176b, Mgst1, Tmem176a, Cyp2f2, Chad, Hp, Ldhb, Cp 
##     Lrg1, Gsta3, 5330417C22Rik, Selenbp1, Ifitm3, Rassf9, Car8, Gas6, Gabrp, Wnt4 
##     Dcxr, Aldh2, Them5, Adh7, Acsl1, Ifitm1, C3, Anpep, Foxp2, Lbp 
## Negative:  Dpcr1, Gldc, Tff1, Cldn18, Il33, Ndnf, Cftr, B4galt6, Fst, Avil 
##     Areg, Kit, Anxa10, Bok, Emp3, Ereg, Bmyc, Krt20, Gcnt1, Car9 
##     Arhgdig, Nedd9, Asns, Cela1, Samd5, Otop3, Cldn2, Prss12, Nptx2, Pkib
DimPlot(SNL_CCSP_tumor, reduction = "pca", group.by='tx')+ggtitle("PCA of Tumor Cells Only")

Looks much better, more continous changes.

New Dimensionality Reduction

ElbowPlot(SNL_CCSP_tumor, ndims=50)

Choosing ~15 dims which captures majority of variance

New UMAP

SNL_CCSP_tumor <- FindNeighbors(SNL_CCSP_tumor, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
SNL_CCSP_tumor<- FindClusters(SNL_CCSP_tumor, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8550
## Number of edges: 269355
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8397
## Number of communities: 9
## Elapsed time: 1 seconds
SNL_CCSP_tumor <- RunUMAP(SNL_CCSP_tumor, dims = 1:15)
## 16:39:09 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:39:09 Read 8550 rows and found 15 numeric columns
## 16:39:09 Using Annoy for neighbor search, n_neighbors = 30
## 16:39:09 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:39:12 Writing NN index file to temp file /tmp/Rtmpyyhzg4/filea5cc7b626d52
## 16:39:12 Searching Annoy index using 1 thread, search_k = 3000
## 16:39:15 Annoy recall = 100%
## 16:39:16 Commencing smooth kNN distance calibration using 1 thread
## 16:39:18 Initializing from normalized Laplacian + noise
## 16:39:18 Commencing optimization for 500 epochs, with 348698 positive edges
## 16:39:49 Optimization finished
x<-DimPlot(SNL_CCSP_tumor, reduction = "umap",label=TRUE,pt.size=.1)+ggtitle("Tumor cells by Cluster ID")
y<-DimPlot(SNL_CCSP_tumor, reduction = "umap", group.by = "tx",label=FALSE,pt.size=0.1)+ggtitle("Tumor cells by treatment")

multiplot(x, y, cols=2)

Double check cell types with same dotplot

DotPlot(SNL_CCSP_tumor,features = more_types, group.by="seurat_clusters")+ theme(axis.text.x=element_text(angle=90, hjust=1,size=8), axis.text.y=element_text(hjust=1,size=10))

Looks good for now. Moving forward.

Assign cell cycle phase. Check to see if clustering is being determined by cell cycle phase.

library(hciR)
library(hciRdata)
s.genes<-cc.genes$s.genes
g2m.genes<-cc.genes$g2m.genes
s.genes.list<-subset(mouse94, mouse94$human_homolog %in% s.genes)
s.genes.mouse<-(s.genes.list$gene_name)
s.genes.mouse
##  [1] "Cdc45"    "Uhrf1"    "Mcm2"     "Slbp"     "Mcm5"     "Pola1"   
##  [7] "Gmnn"     "Cdc6"     "Rrm2"     "Atad2"    "Dscc1"    "Mcm4"    
## [13] "Chaf1b"   "Rfc2"     "Msh2"     "Fen1"     "Hells"    "Prim1"   
## [19] "Tyms"     "Mcm6"     "Wdr76"    "Rad51"    "Pcna"     "Ccne2"   
## [25] "Casp8ap2" "Usp1"     "Nasp"     "Rpa2"     "Ung"      "Rad51ap1"
## [31] "Blm"      "Pold3"    "Rrm1"     "Gins2"    "Tipin"    "Brip1"   
## [37] "Dtl"      "Exo1"     "Ubr7"     "Clspn"    "E2f8"     "Cdca7"
g2m.genes.list<-subset(mouse94, mouse94$human_homolog %in% g2m.genes)
g2m.genes.mouse<-(g2m.genes.list$gene_name)
g2m.genes.mouse
##  [1] "Ube2c"   "Lbr"     "Ctcf"    "Cdc20"   "Cbx5"    "Kif11"   "Anp32e" 
##  [8] "Birc5"   "Cdk1"    "Tmpo"    "Hmmr"    "Aurkb"   "Top2a"   "Gtse1"  
## [15] "Rangap1" "Cdca3"   "Ndc80"   "Kif20b"  "Cenpf"   "Nek2"    "Nuf2"   
## [22] "Nusap1"  "Bub1"    "Tpx2"    "Aurka"   "Ect2"    "Cks1b"   "Kif2c"  
## [29] "Cdca8"   "Cenpa"   "Mki67"   "Ccnb2"   "Kif23"   "Smc4"    "G2e3"   
## [36] "Tubb4b"  "Anln"    "Tacc3"   "Dlgap5"  "Ckap2"   "Ncapd2"  "Ttk"    
## [43] "Ckap5"   "Cdc25c"  "Hjurp"   "Cenpe"   "Ckap2l"  "Cdca2"   "Hmgb2"  
## [50] "Cks2"    "Psrc1"   "Gas2l3"
SNL_CCSP_tumor <- CellCycleScoring(SNL_CCSP_tumor, s.features = s.genes.mouse, g2m.features = g2m.genes.mouse)

DimPlot(SNL_CCSP_tumor, reduction = "umap", group.by = "Phase", pt.size=.2)+ggtitle("Tumor cells by Cell Cycle Phase")

Does not look like it. Let’s take a more quantitative look…

What % of cells in each phase occupy each seurat cluster?

proportions <- as.data.frame(100*prop.table(x = table(SNL_CCSP_tumor$seurat_clusters, SNL_CCSP_tumor$Phase), margin = 2))

colnames(proportions)<-c("Cluster", "Sample", "Frequency")

library(ggpubr)

ggbarplot(proportions, x="Sample", y="Frequency", fill = "Sample", color="black",
          
          group = "Sample", ylab = "Frequency (percent)", xlab="", 
          
          # palette = sample_cols
          
)+
  
  theme_bw()+
  
  facet_wrap(facets = "Cluster", scales="free_y", ncol =4)+
  
  theme(axis.text.y = element_text(size=12)) +
  
  rotate_x_text(angle = 45)

Clusters aren’t being determined by cell cycle phase. Moving on to DEG analyses.

What % of cells in each treatment group occupy each seurat cluster?

proportions <- as.data.frame(100*prop.table(x = table(SNL_CCSP_tumor$seurat_clusters, SNL_CCSP_tumor$tx), margin = 2))

colnames(proportions)<-c("Cluster", "Sample", "Frequency")

library(ggpubr)

ggbarplot(proportions, x="Sample", y="Frequency", fill = "Sample", color="black",
          
          group = "Sample", ylab = "Frequency (percent)", xlab="", 
          
          # palette = sample_cols
          
)+
  
  theme_bw()+
  
  facet_wrap(facets = "Cluster", scales="free_y", ncol =4)+
  
  theme(axis.text.y = element_text(size=12)) +
  
  rotate_x_text(angle = 45)

DEG Analyses

Between seurat clusters

Downsampling to 100 cells per cluster, and top 50 genes per cluster. To see all, see excel file.

Cluster.Markers <- FindAllMarkers(SNL_CCSP_tumor, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

test<-as.data.frame(Cluster.Markers)
write.csv(test, "~/SNL CCSP Tumor Cluster Marker Genes.csv")


top100 <- Cluster.Markers  %>% group_by(cluster) %>% top_n(n = 50, wt = avg_log2FC)

top100_names<-top100$gene
DoHeatmap(subset(SNL_CCSP_tumor, downsample = 100), features = top100_names, size = 3) + NoLegend()

Table of top 50 marker genes per cluster.

DT::datatable(
  Cluster.Markers %>%
    mutate(gene=rownames(.)) %>%
    group_by(cluster) %>%
    top_n(n = 50, wt = avg_log2FC) %>%
    select(gene, everything())
)

Differentially expressed genes between Control and CXCR2i tumor cells

Downsampling to 2000 cells per cluster, and top 100 genes per cluster. To see all, see excel file.

Idents(SNL_CCSP_tumor)<-"tx"
Tx.Markers <- FindAllMarkers(SNL_CCSP_tumor, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster Control
## Calculating cluster CXCR2i
test<-as.data.frame(Tx.Markers)
write.csv(test, "~/Tx.Markers SNL CCSP Tumor cells.csv")


top100 <- Tx.Markers  %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

top100_names<-top100$gene

DoHeatmap(subset(SNL_CCSP_tumor, downsample = 2000), features = top100_names, size = 3, group.colors=c("orchid4","orange"))

Table of top 100 marker genes per treatment cluster.

DT::datatable(
  Tx.Markers %>%
    mutate(gene=rownames(.)) %>%
    group_by(cluster) %>%
    top_n(n = 100, wt = avg_log2FC) %>%
    select(gene, everything())
)

Save seurat object as an rds for quick restart.

saveRDS(SNL_CCSP_tumor,“/scratch/Aireland/SNL for R01 Analysis/full_processed_SNL_CCSP_tumor.rds”)