load in 2 samples of each condition so that this code can run on my computer

data_dir ="/Users/rorihoover/Desktop/BME494HW/Eutopic/sample8/outs/filtered_feature_bc_matrix/"
data <- Read10X(data.dir = data_dir)
Eutopic_seurat = CreateSeuratObject(counts = data)

data_dir ="/Users/rorihoover/Desktop/BME494HW/Eutopic/sample24/outs/filtered_feature_bc_matrix/" 
data <- Read10X(data.dir = data_dir)
seurat_object = CreateSeuratObject(counts = data)
Eutopic_seurat<- merge(Eutopic_seurat, y = seurat_object)
## Warning: Some cell names are duplicated across objects provided. Renaming to
## enforce unique cell names.
Eutopic_seurat<- merge(Eutopic_seurat, y = seurat_object)
rm(seurat_object)
rm(data)

#saveRDS(Eutopic_seurat, file = "Eutopic_seurat.rds")
Length<- length(Cells(Eutopic_seurat))
condition <- list(rep("Eutopic_Endometrium", Length))
meta_data <- data.frame(condition)
rownames(meta_data)<- Cells(Eutopic_seurat)

Eutopic_seurat<- AddMetaData(
  object = Eutopic_seurat,
  metadata = meta_data,
  col.name = 'condition'
)
head(x = Eutopic_seurat[[]])
##                         orig.ident nCount_RNA nFeature_RNA           condition
## AAACCCAAGCGTGCCT-1_1 SeuratProject       1443          551 Eutopic_Endometrium
## AAACCCAAGTTACGTC-1_1 SeuratProject       3404          950 Eutopic_Endometrium
## AAACCCACAACCCTCT-1_1 SeuratProject       1064          408 Eutopic_Endometrium
## AAACCCACATCACGGC-1_1 SeuratProject       4377         1301 Eutopic_Endometrium
## AAACCCAGTAAGTTGA-1_1 SeuratProject       1481          552 Eutopic_Endometrium
## AAACCCAGTATTCCGA-1_1 SeuratProject       9648         2430 Eutopic_Endometrium
data_dir ="/Users/rorihoover/Desktop/BME494HW/endo/sample5/outs/filtered_feature_bc_matrix/" 
data <- Read10X(data.dir = data_dir)
Endo_seurat = CreateSeuratObject(counts = data)

data_dir ="/Users/rorihoover/Desktop/BME494HW/endo/sample16/outs/filtered_feature_bc_matrix/" 
data <- Read10X(data.dir = data_dir)
seurat_object = CreateSeuratObject(counts = data)
Endo_seurat<- merge(Endo_seurat, y = seurat_object)
## Warning: Some cell names are duplicated across objects provided. Renaming to
## enforce unique cell names.
rm(seurat_object)
rm(data)

#saveRDS(Endo_seurat, file = "Endo_seurat.rds")
Length<- length(Cells(Endo_seurat))

condition <- list(rep("Endometriosis", Length))
meta_data <- data.frame(condition)
rownames(meta_data)<- Cells(Endo_seurat)

Endo_seurat<- AddMetaData(
  object = Endo_seurat,
  metadata = meta_data,
  col.name = 'condition'
)
rm(meta_data)
rm(Length)
rm(condition)
seurat<- merge(Endo_seurat, y = Eutopic_seurat)
## Warning: Some cell names are duplicated across objects provided. Renaming to
## enforce unique cell names.
rm(Endo_seurat)
rm(Eutopic_seurat)
head(x = seurat[[]])
##                           orig.ident nCount_RNA nFeature_RNA     condition
## AAACCCAAGGATATGT-1_1_1 SeuratProject       5091         2108 Endometriosis
## AAACCCACAGTCCCGA-1_1_1 SeuratProject       3688          862 Endometriosis
## AAACGAACAGAGGAAA-1_1_1 SeuratProject       4693         1321 Endometriosis
## AAACGAACATACCGTA-1_1_1 SeuratProject        611          353 Endometriosis
## AAACGAATCACAAGAA-1_1_1 SeuratProject       1264          359 Endometriosis
## AAACGCTCACTGGAAG-1_1_1 SeuratProject       3379         1067 Endometriosis
seurat[["percent.mt"]] <- PercentageFeatureSet(seurat, pattern = "^MT-")
VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")+
  geom_hline(yintercept = 5, linetype = "dashed", color = "black")

FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")+
  geom_hline(yintercept = 2500, linetype = "dashed", color = "black")+
  geom_hline(yintercept = 200, linetype = "dashed", color = "black")

seurat <- subset(seurat, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
seurat[["percent.mt"]] <- PercentageFeatureSet(seurat, pattern = "^MT-")
VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

seurat <- RunAzimuth(seurat, reference = "pbmcref")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from HUMAN with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Normalizing query using reference SCT model
## Warning: No layers found matching search pattern provided
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
##  Found 3392 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 21:43:46 Read 6303 rows
## 21:43:46 Processing block 1 of 1
## 21:43:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 21:43:46 Initializing by weighted average of neighbor coordinates using 1 thread
## 21:43:46 Commencing optimization for 67 epochs, with 126060 positive edges
## 21:43:48 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
##     Finding neighbors of original query cells
##     Finding neighbors of transformed query cells
##     Computing query SNN
##     Determining bandwidth and computing transition probabilities
## Total elapsed time: 6.29784488677979
seurat
## An object of class Seurat 
## 33633 features across 6303 samples within 4 assays 
## Active assay: RNA (33538 features, 0 variable features)
##  5 layers present: counts.1.1, counts.1.1.2, counts.2.1, counts.2.2, counts.2.1.2
##  3 other assays present: prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
##  2 dimensional reductions calculated: integrated_dr, ref.umap
seurat <- NormalizeData(seurat)
## Normalizing layer: counts.1.1
## Normalizing layer: counts.1.1.2
## Normalizing layer: counts.2.1
## Normalizing layer: counts.2.2
## Normalizing layer: counts.2.1.2
seurat <- FindVariableFeatures(seurat)
## Finding variable features for layer counts.1.1
## Finding variable features for layer counts.1.1.2
## Finding variable features for layer counts.2.1
## Finding variable features for layer counts.2.2
## Finding variable features for layer counts.2.1.2
seurat <- ScaleData(seurat)
## Centering and scaling data matrix
seurat <- RunPCA(seurat)
## PC_ 1 
## Positive:  DCN, COL1A2, COL6A2, COL1A1, C1S, MGP, C1R, CCDC80, SERPINF1, LUM 
##     COL14A1, CALD1, COL6A1, SPARCL1, SERPING1, COL6A3, COL3A1, SPARC, OGN, FBLN1 
##     MMP2, C3, AEBP1, RARRES2, GSN, SELENOP, MFAP4, LRP1, TIMP2, IGFBP5 
## Negative:  PTPRC, SRGN, HCST, CD48, CD52, LAPTM5, LCP1, CORO1A, CXCR4, CD37 
##     IL32, CD3D, CD2, SH3BGRL3, GMFG, CD3E, TRBC2, TRAC, CYBA, CD53 
##     CCL5, LTB, SAMSN1, CYTIP, FYB1, EVI2B, CD69, ITGB2, IL7R, GZMA 
## PC_ 2 
## Positive:  IL32, CD3D, TRAC, CD3E, CD2, TRBC2, HBB, CCL5, HBA1, HBA2 
##     CD3G, LTB, GZMA, TRBC1, GZMM, CD52, IL7R, GZMK, CST7, CD247 
##     CTSW, KLRB1, SLPI, SPOCK2, CD7, SKAP1, NKG7, IKZF3, WFDC2, TBC1D10C 
## Negative:  LYZ, FCN1, S100A9, IL1B, S100A8, CXCL8, EREG, MNDA, BCL2A1, S100A12 
##     CCL20, CLEC4E, PLAUR, FCER1G, IL1A, AIF1, CTSS, CYBB, C5AR1, CLEC7A 
##     TYROBP, CFP, OLR1, PHACTR1, CD14, KYNU, TNFAIP6, SERPINB2, LST1, AQP9 
## PC_ 3 
## Positive:  MALAT1, PTPRC, CXCR4, IL32, ZFP36L2, CD3D, CD2, TSC22D3, CD3E, SRSF7 
##     NEAT1, CCL5, CST7, CD69, GZMA, CD52, TRBC2, TRAC, MCL1, CYTIP 
##     CREM, CEMIP2, ARL4C, HSP90AA1, HCST, SRGN, IL7R, CD48, SPOCK2, GZMM 
## Negative:  WFDC2, SLPI, TFF3, FTL, DEFB1, SCGB2A1, HBA1, HBA2, AGR2, HBB 
##     KRT18, PIGR, CAPS, FXYD3, SCGB1D2, ASRGL1, C19orf33, HLA-DRA, SMIM22, KRT8 
##     AIF1, LYZ, C1QA, FTH1, CD68, CD24, CRIP2, C1QB, PRR15, SOX17 
## PC_ 4 
## Positive:  SFRP2, S100A4, CCDC80, IGFBP6, FBLN2, GPNMB, DPT, FBLN1, CHRDL1, MFAP4 
##     ADH1B, CFD, C16orf89, CPE, PCOLCE2, MGST1, SRPX, PDGFRL, CST3, LUM 
##     PI16, MFAP5, CTHRC1, C1QTNF3, PRRX1, SCARA5, SLIT3, GSN, LMO3, CYBA 
## Negative:  JUNB, ZFP36, FOSB, SERPINE1, NR4A1, CLDN5, PTPRB, VWF, ADGRL4, LIFR 
##     C11orf96, MCL1, AQP1, EPAS1, CAV1, EMCN, LMNA, SEMA6A, EMP1, TSC22D1 
##     CDH5, SOCS3, MIR222HG, CAVIN2, TM4SF1, COL4A1, CLEC14A, ADGRF5, EGFL7, MYC 
## PC_ 5 
## Positive:  PTPRB, ADGRL4, CLDN5, AQP1, CDH5, EMCN, VWF, CLEC14A, ADGRF5, EGFL7 
##     PECAM1, ROBO4, A2M, TM4SF18, CAVIN2, ECSCR, RAMP2, CXorf36, ARHGAP29, CALCRL 
##     MMRN2, CYYR1, TXNIP, PODXL, RAPGEF5, TM4SF1, FLT1, EPAS1, ESAM, S1PR1 
## Negative:  FOSB, SERPINE1, JUNB, EGR1, ZFP36, IER3, CEBPB, NR4A1, PPP1R15A, HBA2 
##     C11orf96, HBB, HBA1, CCL2, FOS, ERRFI1, IER2, ATF3, KRT19, SOD2 
##     PTGDS, SLPI, WFDC2, KDM6B, SOCS3, EGR3, TNFRSF12A, KLF10, GEM, CXCL2
VizDimLoadings(seurat, dims = 1:2, reduction = "pca")

DimPlot(seurat, reduction = "pca", group.by = "condition")

ElbowPlot(seurat)

seurat <- FindNeighbors(seurat, dims = 1:10, reduction = "pca")
## Computing nearest neighbor graph
## Computing SNN
seurat<- FindClusters(seurat, resolution = 0.1, cluster.name = "unintegrated_clusters")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6303
## Number of edges: 203479
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9691
## Number of communities: 9
## Elapsed time: 0 seconds
seurat <- RunUMAP(seurat, dims = 1:10, reduction = "pca", reduction.name = "umap.unintegrated")
## 21:44:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:44:26 Read 6303 rows and found 10 numeric columns
## 21:44:26 Using Annoy for neighbor search, n_neighbors = 30
## 21:44:26 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:44:27 Writing NN index file to temp file /var/folders/hv/xklkw9n15xgbktqz58g7gtbw0000gn/T//RtmpFgmubG/filed878290cbb04
## 21:44:27 Searching Annoy index using 1 thread, search_k = 3000
## 21:44:29 Annoy recall = 100%
## 21:44:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 21:44:32 Initializing from normalized Laplacian + noise (using RSpectra)
## 21:44:32 Commencing optimization for 500 epochs, with 251694 positive edges
## 21:44:45 Optimization finished
DimPlot(
  seurat,
  reduction = "umap.unintegrated",
  group.by = "condition",
  combine = FALSE, label.size = 2
)
## [[1]]

DimPlot(seurat, reduction = "umap.unintegrated")

# visualize by batch and cell type annotation
# cell type annotations were previously added by Azimuth
DimPlot(seurat, reduction = "umap.unintegrated", group.by = "predicted.celltype.l2")

#this will take a long time 
seurat <- IntegrateLayers(object = seurat, method = CCAIntegration, new.reduction = "integrated.cca", verbose = F)
seurat <- FindNeighbors(seurat, reduction = "integrated.cca", dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
seurat <- FindClusters(seurat, resolution = 0.1, cluster.name = "integrated_clusters")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6303
## Number of edges: 212648
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9604
## Number of communities: 6
## Elapsed time: 0 seconds
seurat <- RunUMAP(seurat, reduction = "integrated.cca", dims = 1:10, reduction.name = "umap.cca")
## 21:46:23 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:46:23 Read 6303 rows and found 10 numeric columns
## 21:46:23 Using Annoy for neighbor search, n_neighbors = 30
## 21:46:23 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:46:24 Writing NN index file to temp file /var/folders/hv/xklkw9n15xgbktqz58g7gtbw0000gn/T//RtmpFgmubG/filed87854af8917
## 21:46:24 Searching Annoy index using 1 thread, search_k = 3000
## 21:46:27 Annoy recall = 100%
## 21:46:28 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 21:46:30 Initializing from normalized Laplacian + noise (using RSpectra)
## 21:46:30 Commencing optimization for 500 epochs, with 257932 positive edges
## 21:46:43 Optimization finished
DimPlot(
  seurat,
  reduction = "umap.cca",
  group.by = "condition",
  combine = FALSE, label.size = 2
)
## [[1]]

DimPlot(
  seurat,
  reduction = "umap.cca",
  group.by = "predicted.celltype.l2",
  combine = FALSE, label.size = 2
)
## [[1]]

DimPlot(seurat, reduction = "umap.cca")

# re-join layers after integration
seurat[["RNA"]] <- JoinLayers(seurat[["RNA"]])
seurat
## An object of class Seurat 
## 33633 features across 6303 samples within 4 assays 
## Active assay: RNA (33538 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  3 other assays present: prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
##  6 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap.unintegrated, integrated.cca, umap.cca
markers0 <- FindMarkers(
  object = seurat,   
  ident.1 = "0",    
  ident.2 = NULL, 
  only.pos = TRUE
)

markers1 <- FindMarkers(
  object = seurat,   
  ident.1 = "1",    
  ident.2 = NULL, 
  only.pos = TRUE
)

markers2 <- FindMarkers(
  object = seurat, 
  ident.1 = "2",      
  ident.2 = NULL, 
  only.pos = TRUE
)

markers3 <- FindMarkers(
  object = seurat,   
  ident.1 = "3",     
  ident.2 = NULL, 
  only.pos = TRUE
)

markers4 <- FindMarkers(
  object = seurat,    
  ident.1 = "4",     
  ident.2 = NULL, 
  only.pos = TRUE
)

markers5 <- FindMarkers(
  object = seurat,    
  ident.1 = "5",      
  ident.2 = "0", 
  only.pos = TRUE
)
seurat$cond_celltype <- paste(seurat$condition, seurat$integrated_clusters, sep = "_")
Idents(seurat) <- "cond_celltype"
table(seurat$cond_celltype)
## 
##       Endometriosis_0       Endometriosis_1       Endometriosis_2 
##                  1051                  1205                   399 
##       Endometriosis_3       Endometriosis_4       Endometriosis_5 
##                    88                    98                    20 
## Eutopic_Endometrium_0 Eutopic_Endometrium_1 Eutopic_Endometrium_2 
##                  1367                  1053                   300 
## Eutopic_Endometrium_3 Eutopic_Endometrium_4 Eutopic_Endometrium_5 
##                   371                   301                    50
cond_celltype_table <- table(seurat$cond_celltype)
cond_celltype_df <- as.data.frame(cond_celltype_table)
colnames(cond_celltype_df) <- c("Cluster", "Count")

cond_celltype_df <- cond_celltype_df %>%
  mutate(Condition = ifelse(grepl("Endometriosis", Cluster, ignore.case = TRUE),
                            "Endometriosis",
                            "Eutopic_Endometrium"))
cond_celltype_df$Cluster <- sub("^[^0-9]+", "", cond_celltype_df$Cluster)

print(cond_celltype_df)
##    Cluster Count           Condition
## 1        0  1051       Endometriosis
## 2        1  1205       Endometriosis
## 3        2   399       Endometriosis
## 4        3    88       Endometriosis
## 5        4    98       Endometriosis
## 6        5    20       Endometriosis
## 7        0  1367 Eutopic_Endometrium
## 8        1  1053 Eutopic_Endometrium
## 9        2   300 Eutopic_Endometrium
## 10       3   371 Eutopic_Endometrium
## 11       4   301 Eutopic_Endometrium
## 12       5    50 Eutopic_Endometrium
ggplot(cond_celltype_df, aes(x = Cluster, y = Count, fill = Condition)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Cells Present in Each Cluster", x = "Cluster", y = "Count") 

perts <- cond_celltype_df %>%
  group_by(Condition) %>%
  mutate(Total = sum(Count), Percentage = (Count / Total) * 100) %>%
  select(Cluster, Count, Condition, Percentage)

head(perts, 12)
## # A tibble: 12 × 4
## # Groups:   Condition [2]
##    Cluster Count Condition           Percentage
##    <chr>   <int> <chr>                    <dbl>
##  1 0        1051 Endometriosis           36.7  
##  2 1        1205 Endometriosis           42.1  
##  3 2         399 Endometriosis           13.9  
##  4 3          88 Endometriosis            3.08 
##  5 4          98 Endometriosis            3.43 
##  6 5          20 Endometriosis            0.699
##  7 0        1367 Eutopic_Endometrium     39.7  
##  8 1        1053 Eutopic_Endometrium     30.6  
##  9 2         300 Eutopic_Endometrium      8.72 
## 10 3         371 Eutopic_Endometrium     10.8  
## 11 4         301 Eutopic_Endometrium      8.74 
## 12 5          50 Eutopic_Endometrium      1.45
cond_markers0 <- FindMarkers(seurat, ident.1 = "Endometriosis_0", ident.2 = "Eutopic_Endometrium_0", verbose = FALSE)

cond_markers1 <- FindMarkers(seurat, ident.1 = "Endometriosis_1", ident.2 = "Eutopic_Endometrium_1", verbose = FALSE)


cond_markers2 <- FindMarkers(seurat, ident.1 = "Endometriosis_2", ident.2 = "Eutopic_Endometrium_2", verbose = FALSE)

cond_markers3 <- FindMarkers(seurat, ident.1 = "Endometriosis_3", ident.2 = "Eutopic_Endometrium_3", verbose = FALSE)

cond_markers4 <- FindMarkers(seurat, ident.1 = "Endometriosis_4", ident.2 = "Eutopic_Endometrium_4", verbose = FALSE)

cond_markers5 <- FindMarkers(seurat, ident.1 = "Endometriosis_5", ident.2 = "Eutopic_Endometrium_5", verbose = FALSE)
library("knitr")
top_and_bot <- function(ldf, bot = TRUE) {
 
  ldf <- ldf[order(ldf$avg_log2FC, decreasing = TRUE), ]
    if (bot) {
        top_3 <- head(ldf, 3)
        result <- top_3
    }      
  top_5 <- head(ldf, 5)
  result <- top_5

  if (bot) {
    bottom_3 <- tail(ldf, 3)
    result <- rbind(result, bottom_3)
  }
  print(knitr::kable(result, format = "simple"))

  return(result)
}

cluster 0

top_and_bot(markers0, bot = FALSE)
## 
## 
##               p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## -----------  ------  -----------  ------  ------  ----------
## CILP2             0     9.394701   0.036   0.000           0
## FREM1             0     9.047897   0.043   0.000           0
## AC007920.2        0     8.773420   0.020   0.000           0
## MEG8              0     8.688816   0.108   0.001           0
## COL6A6            0     8.428849   0.020   0.000           0
##                   p_val avg_log2FC pct.1 pct.2    p_val_adj
## CILP2      4.907295e-33   9.394701 0.036 0.000 1.645808e-28
## FREM1      1.832568e-38   9.047897 0.043 0.000 6.146067e-34
## AC007920.2 1.200183e-18   8.773420 0.020 0.000 4.025173e-14
## MEG8       7.029721e-96   8.688816 0.108 0.001 2.357628e-91
## COL6A6     5.255316e-19   8.428849 0.020 0.000 1.762528e-14
#genes from find marker genes cluster 0 B- cells IGLC2, IGKC, IGHG1 ???
FeaturePlot(seurat, features = c("CILP2","FREM1"), reduction = "umap.cca", split.by = "condition")

top_and_bot(cond_markers0)
## 
## 
##              p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ----------  ------  -----------  ------  ------  ----------
## PROK1            0    10.448529   0.157   0.000       0e+00
## SPRR2F           0    10.113882   0.102   0.000       0e+00
## GSTA1            0     9.768453   0.037   0.000       0e+00
## GATA4            0     8.569486   0.070   0.000       0e+00
## LINC00473        0     8.270741   0.059   0.000       0e+00
## SCGB2A1          0    -9.261560   0.000   0.045       1e-07
## MUC5B            0    -9.351723   0.000   0.048       0e+00
## SCGB3A1          0   -11.124677   0.000   0.143       0e+00
##                  p_val avg_log2FC pct.1 pct.2    p_val_adj
## PROK1     6.437929e-52  10.448529 0.157 0.000 2.159153e-47
## SPRR2F    1.709378e-33  10.113882 0.102 0.000 5.732913e-29
## GSTA1     7.052018e-13   9.768453 0.037 0.000 2.365106e-08
## GATA4     2.267648e-23   8.569486 0.070 0.000 7.605238e-19
## LINC00473 9.508400e-20   8.270741 0.059 0.000 3.188927e-15
## SCGB2A1   4.089799e-12  -9.261560 0.000 0.045 1.371637e-07
## MUC5B     5.192497e-13  -9.351723 0.000 0.048 1.741460e-08
## SCGB3A1   3.025745e-37 -11.124677 0.000 0.143 1.014775e-32
#proliferative phase marker genes 
FeaturePlot(seurat, features = c("PGR", "ESR1"), reduction = "umap.cca",split.by = "condition")

#proliferative phase marker genes 
FeaturePlot(seurat, features = c("IGF1", "MMP11"), reduction = "umap.cca",split.by = "condition")

#proliferative phase marker genes 
FeaturePlot(seurat, features = c("CRABP2", "ECM1"), reduction = "umap.cca", split.by = "condition")

Cluster 1

top_and_bot(markers1, bot = FALSE)
## 
## 
##              p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ----------  ------  -----------  ------  ------  ----------
## SIRPG            0    10.210382   0.089       0           0
## LINC02273        0     9.847718   0.068       0           0
## TRBV28           0     9.731967   0.034       0           0
## UBASH3A          0     9.634712   0.061       0           0
## C15orf53         0     9.378756   0.047       0           0
##                  p_val avg_log2FC pct.1 pct.2    p_val_adj
## SIRPG     2.032893e-82  10.210382 0.089     0 6.817918e-78
## LINC02273 5.020953e-63   9.847719 0.068     0 1.683927e-58
## TRBV28    8.144850e-32   9.731967 0.034     0 2.731620e-27
## UBASH3A   1.742459e-56   9.634712 0.061     0 5.843860e-52
## C15orf53  2.655284e-44   9.378756 0.047     0 8.905293e-40
#genes from find marker genes cluster 1 T- cell CD3D, TRAC, CD3E, TRBC2
FeaturePlot(seurat, features = c("CD3D","TRAC"), reduction = "umap.cca", split.by = "condition")

# PAEP and SPP1 (secretory phase genes) are up in endometriosis compared to Eutopic t-cells
# also up is COLGALT2 wich is overexressed in ovarian cancer 
top_and_bot(cond_markers1)
## 
## 
##                 p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ---------  ----------  -----------  ------  ------  ----------
## IGLC3       0.0000000    11.316918   0.045   0.000   0.0000001
## SPP1        0.0000000     8.158166   0.060   0.000   0.0000000
## RHEX        0.0002818     7.491035   0.012   0.000   1.0000000
## COLGALT2    0.0000169     6.296506   0.017   0.000   0.5659832
## CLDN11      0.0007269     6.256431   0.011   0.000   1.0000000
## WFDC2       0.0000000    -8.493679   0.002   0.249   0.0000000
## SCGB3A1     0.0000000    -8.528897   0.001   0.203   0.0000000
## SLPI        0.0000000    -9.403687   0.002   0.417   0.0000000
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## IGLC3     3.635213e-12  11.316918 0.045 0.000  1.219178e-07
## SPP1      7.733389e-16   8.158165 0.060 0.000  2.593624e-11
## RHEX      2.817724e-04   7.491035 0.012 0.000  1.000000e+00
## COLGALT2  1.687588e-05   6.296506 0.017 0.000  5.659832e-01
## CLDN11    7.269314e-04   6.256431 0.011 0.000  1.000000e+00
## WFDC2     5.349628e-74  -8.493679 0.002 0.249  1.794158e-69
## SCGB3A1   6.406948e-60  -8.528897 0.001 0.203  2.148762e-55
## SLPI     4.123295e-134  -9.403687 0.002 0.417 1.382871e-129
FeaturePlot(seurat, features = c("GATA4","PROK1"), reduction = "umap.cca", split.by = "condition")
## Warning: All cells have the same value (0) of "GATA4"
## Warning: All cells have the same value (0) of "PROK1"

# up in endo is genes associated with fetal growth PROK1 and GATA4, and also Hylaruon synthase HAS1
# down in endo is cell growth inhibitor SCGB3A1, IGHM and MUC5B

Cluster 2

top_and_bot(markers2, bot = FALSE)
## 
## 
##            p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## --------  ------  -----------  ------  ------  ----------
## MCEMP1         0     11.27867   0.094       0           0
## LILRA5         0     10.42753   0.070       0           0
## ACOD1          0     10.19312   0.050       0           0
## CD300LB        0     10.16492   0.057       0           0
## CD300E         0     10.10868   0.120       0           0
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## MCEMP1  2.784776e-118   11.27867 0.094     0 9.339581e-114
## LILRA5   4.419314e-88   10.42753 0.070     0  1.482149e-83
## ACOD1    2.595595e-63   10.19312 0.050     0  8.705108e-59
## CD300LB  3.783834e-72   10.16492 0.057     0  1.269022e-67
## CD300E  2.646639e-148   10.10868 0.120     0 8.876297e-144
#genes from find marker genes cluster 2 macrophage 

FeaturePlot(seurat, features = c("CD14","CD68"), reduction = "umap.cca", split.by = "condition")

top_and_bot(cond_markers2)
## 
## 
##                   p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## -----------  ----------  -----------  ------  ------  ----------
## CHIT1         0.0000000    11.593705   0.140   0.000   0.0000005
## CCL18         0.0000002     9.920340   0.085   0.000   0.0075477
## CHI3L1        0.0000000     9.900358   0.098   0.000   0.0008778
## CCL13         0.0037831     8.704779   0.028   0.000   1.0000000
## AC015660.2    0.0000103     8.523929   0.063   0.000   0.3458789
## MUC5B         0.0000568    -7.290253   0.000   0.040   1.0000000
## SCGB3A1       0.0000000    -8.876590   0.000   0.087   0.0000718
## HBG2          0.0456419    -9.463383   0.000   0.010   1.0000000
##                   p_val avg_log2FC pct.1 pct.2    p_val_adj
## CHIT1      1.453863e-11  11.593705 0.140 0.000 4.875965e-07
## CCL18      2.250497e-07   9.920340 0.085 0.000 7.547718e-03
## CHI3L1     2.617369e-08   9.900358 0.098 0.000 8.778132e-04
## CCL13      3.783077e-03   8.704779 0.028 0.000 1.000000e+00
## AC015660.2 1.031305e-05   8.523929 0.063 0.000 3.458789e-01
## MUC5B      5.677898e-05  -7.290253 0.000 0.040 1.000000e+00
## SCGB3A1    2.141507e-09  -8.876590 0.000 0.087 7.182186e-05
## HBG2       4.564194e-02  -9.463383 0.000 0.010 1.000000e+00
#no large diff
FeaturePlot(seurat, features = c("CCL18","HBG2"), reduction = "umap.cca", split.by = "condition")

Cluster 3

top_and_bot(markers3, bot = FALSE)
## 
## 
##              p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ----------  ------  -----------  ------  ------  ----------
## LINC01320        0     12.12859   0.081       0           0
## WDR38            0     11.34371   0.037       0           0
## TUBA4B           0     10.71207   0.028       0           0
## C9orf152         0     10.52318   0.041       0           0
## ANKRD66          0     10.48615   0.022       0           0
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## LINC01320 4.817969e-105   12.12859 0.081     0 1.615850e-100
## WDR38      4.100697e-49   11.34371 0.037     0  1.375292e-44
## TUBA4B     6.033155e-38   10.71207 0.028     0  2.023400e-33
## C9orf152   1.068082e-54   10.52318 0.041     0  3.582133e-50
## ANKRD66    1.445551e-29   10.48615 0.022     0  4.848091e-25
#genes from find marker genes cluster 3
FeaturePlot(seurat, features = c("LINC01320","WDR38"), reduction = "umap.cca", split.by = "condition")

top_and_bot(cond_markers2)
## 
## 
##                   p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## -----------  ----------  -----------  ------  ------  ----------
## CHIT1         0.0000000    11.593705   0.140   0.000   0.0000005
## CCL18         0.0000002     9.920340   0.085   0.000   0.0075477
## CHI3L1        0.0000000     9.900358   0.098   0.000   0.0008778
## CCL13         0.0037831     8.704779   0.028   0.000   1.0000000
## AC015660.2    0.0000103     8.523929   0.063   0.000   0.3458789
## MUC5B         0.0000568    -7.290253   0.000   0.040   1.0000000
## SCGB3A1       0.0000000    -8.876590   0.000   0.087   0.0000718
## HBG2          0.0456419    -9.463383   0.000   0.010   1.0000000
##                   p_val avg_log2FC pct.1 pct.2    p_val_adj
## CHIT1      1.453863e-11  11.593705 0.140 0.000 4.875965e-07
## CCL18      2.250497e-07   9.920340 0.085 0.000 7.547718e-03
## CHI3L1     2.617369e-08   9.900358 0.098 0.000 8.778132e-04
## CCL13      3.783077e-03   8.704779 0.028 0.000 1.000000e+00
## AC015660.2 1.031305e-05   8.523929 0.063 0.000 3.458789e-01
## MUC5B      5.677898e-05  -7.290253 0.000 0.040 1.000000e+00
## SCGB3A1    2.141507e-09  -8.876590 0.000 0.087 7.182186e-05
## HBG2       4.564194e-02  -9.463383 0.000 0.010 1.000000e+00
#big diff
FeaturePlot(seurat, features = c("MUC5B","PAEP"), reduction = "umap.cca", split.by = "condition")
## Warning: All cells have the same value (0) of "MUC5B"

#big diff
FeaturePlot(seurat, features = c("SCGB3A1","TFF3"), reduction = "umap.cca", split.by = "condition")

#secretory phase marker genes 
FeaturePlot(seurat, features = c("PAEP","FOXO1"), reduction = "umap.cca", split.by = "condition")

Cluster 4

top_and_bot(markers4, bot = FALSE)
## 
## 
##              p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ----------  ------  -----------  ------  ------  ----------
## GPIHBP1          0     9.468329   0.100       0           0
## HID1-AS1         0     9.123262   0.013       0           0
## LINC00636        0     9.064349   0.020       0           0
## FOXC2            0     8.931556   0.013       0           0
## CAPN11           0     8.913801   0.015       0           0
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## GPIHBP1   1.256795e-124   9.468329 0.100     0 4.215038e-120
## HID1-AS1   7.705891e-18   9.123262 0.013     0  2.584402e-13
## LINC00636  1.345849e-27   9.064349 0.020     0  4.513708e-23
## FOXC2      7.705891e-18   8.931556 0.013     0  2.584402e-13
## CAPN11     4.264719e-21   8.913801 0.015     0  1.430301e-16
#cluster 4
FeaturePlot(seurat, features = c("GPIHBP1","HID1-AS1"), reduction = "umap.cca", split.by = "condition")

top_and_bot(markers4)
## 
## 
##                  p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## ----------  ----------  -----------  ------  ------  ----------
## GPIHBP1      0.0000000    9.4683292   0.100   0.000           0
## HID1-AS1     0.0000000    9.1232622   0.013   0.000           0
## LINC00636    0.0000000    9.0643493   0.020   0.000           0
## FOXC2        0.0000000    8.9315560   0.013   0.000           0
## CAPN11       0.0000000    8.9138007   0.015   0.000           0
## MIDN         0.0317405    0.1001424   0.105   0.152           1
## RPL18A       0.3427599    0.1000908   0.952   0.949           1
## DESI1        0.0262904    0.1000488   0.033   0.061           1
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## GPIHBP1   1.256795e-124  9.4683292 0.100 0.000 4.215038e-120
## HID1-AS1   7.705891e-18  9.1232622 0.013 0.000  2.584402e-13
## LINC00636  1.345849e-27  9.0643493 0.020 0.000  4.513708e-23
## FOXC2      7.705891e-18  8.9315560 0.013 0.000  2.584402e-13
## CAPN11     4.264719e-21  8.9138007 0.015 0.000  1.430301e-16
## MIDN       3.174049e-02  0.1001424 0.105 0.152  1.000000e+00
## RPL18A     3.427599e-01  0.1000908 0.952 0.949  1.000000e+00
## DESI1      2.629036e-02  0.1000488 0.033 0.061  1.000000e+00
#cluster 4
FeaturePlot(seurat, features = c("HMOX1","PLA2G2A"), reduction = "umap.cca", split.by = "condition")

Cluster 5

top_and_bot(markers5, bot = FALSE)
## 
## 
##           p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## -------  ------  -----------  ------  ------  ----------
## SCN3A         0    10.729229   0.143       0           0
## HRH2          0    10.433841   0.157       0           0
## OR51E2        0    10.077810   0.100       0           0
## DGKG          0    10.073699   0.100       0           0
## ANO3          0     9.769803   0.071       0           0
##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## SCN3A  2.270273e-77  10.729229 0.143     0 7.614042e-73
## HRH2   5.942715e-85  10.433842 0.157     0 1.993068e-80
## OR51E2 1.205328e-54  10.077810 0.100     0 4.042427e-50
## DGKG   1.205328e-54  10.073699 0.100     0 4.042427e-50
## ANO3   1.666858e-39   9.769803 0.071     0 5.590308e-35
#cluster 5
FeaturePlot(seurat, features = c("SCN3A","HRH2"), reduction = "umap.cca", split.by = "condition")
## Warning: All cells have the same value (0) of "SCN3A"

top_and_bot(cond_markers5)
## 
## 
##                p_val   avg_log2FC   pct.1   pct.2   p_val_adj
## --------  ----------  -----------  ------  ------  ----------
## C7         0.0000005     7.893226    0.45    0.00   0.0175086
## MEG3       0.0000626     7.509323    0.30    0.00   1.0000000
## LAMB1      0.0000005     7.340226    0.45    0.00   0.0175086
## IER3       0.0000132     6.775525    0.35    0.00   0.4410753
## COLEC11    0.0000132     6.526857    0.35    0.00   0.4410753
## HBB        0.0314106    -5.997063    0.20    0.44   1.0000000
## HBA2       0.0051733    -6.064125    0.15    0.48   1.0000000
## RERGL      0.0000020    -8.381696    0.00    0.68   0.0686126
##                p_val avg_log2FC pct.1 pct.2  p_val_adj
## C7      5.220539e-07   7.893226  0.45  0.00 0.01750864
## MEG3    6.259361e-05   7.509323  0.30  0.00 1.00000000
## LAMB1   5.220539e-07   7.340226  0.45  0.00 0.01750864
## IER3    1.315151e-05   6.775525  0.35  0.00 0.44107526
## COLEC11 1.315151e-05   6.526857  0.35  0.00 0.44107526
## HBB     3.141060e-02  -5.997063  0.20  0.44 1.00000000
## HBA2    5.173346e-03  -6.064124  0.15  0.48 1.00000000
## RERGL   2.045818e-06  -8.381696  0.00  0.68 0.06861264
#cluster 5
FeaturePlot(seurat, features = c("C7","RERGL"), reduction = "umap.cca", split.by = "condition")