#Workspace setup

#Import data ##Load filtered objects

U1 <- Read10X_h5("Filtered h5 objects/UTI-1_filtered_feature_bc_matrix.h5")
U2 <- Read10X_h5("Filtered h5 objects/UTI-2_filtered_feature_bc_matrix.h5")
U3 <- Read10X_h5("Filtered h5 objects/UTI-3_filtered_feature_bc_matrix.h5")
C1 <- Read10X_h5("Filtered h5 objects/Control-1_filtered_feature_bc_matrix.h5")
C2 <- Read10X_h5("Filtered h5 objects/Control-2_filtered_feature_bc_matrix.h5")
C3 <- Read10X_h5("Filtered h5 objects/Control-3_filtered_feature_bc_matrix.h5")

##Create seurat objects

#Turn into seurat objects
U1 <- CreateSeuratObject(counts = U1)
U2 <- CreateSeuratObject(counts = U2)
U3 <- CreateSeuratObject(counts = U3)
C1 <- CreateSeuratObject(counts = C1)
C2 <- CreateSeuratObject(counts = C2)
C3 <- CreateSeuratObject(counts = C3)

##Merge, labeling cell.id in metadata

#merge into single seurat object, layers still separated
merged_all <- merge(U1, y = c(U2, U3, C1, C2, C3),
                add.cell.ids = c("U1_UTI", "U2_UTI", "U3_UTI", "C1_Control", "C2_Control", "C3_Control"),
                project = "RecUTI")
#Adjust metadata
merged_all$sample <- rownames(merged_all@meta.data)
merged_all@meta.data <- separate(merged_all@meta.data, col = 'sample', into = c('sample', 'condition', 'barcode'),
                             sep = '_')

#label percent mitochondrial DNA
merged_all$percent.mt <- PercentageFeatureSet(merged_all, pattern = "^mt-")

#add number of genes per UMI for each cell to metadata
#log10genesPerUMI expected is ~0.8-0.9 for scRNA data, >1.0 is suspected doublet
merged_all$log10GenesPerUMI <- log10(merged_all$nFeature_RNA) / log10(merged_all$nCount_RNA)
merged_all$GenesPerUMI <- (merged_all$nFeature_RNA / merged_all$nCount_RNA)

#QC ##log10genesperUMI

#Visualize log10genesPerUMI PRIOR to subsetting
VlnPlot(merged_all, features = "log10GenesPerUMI") +
  geom_hline(yintercept = 0.95, linetype = "dashed", color = "red") + 
  geom_hline(yintercept = 0.8, linetype = "dashed", color = "red")

#plot log10UMI vs nCount_RNA
merged_all@meta.data %>% 
    ggplot(aes(color=condition, x=nCount_RNA, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    theme_classic() +
    ylab("log10 cell density") +
    geom_vline(xintercept = 500)

##qc feature plots (before subsetting)

#plot nCount vs nFeature
qcfeat1 <- FeatureScatter(merged_all, feature1 = 'nCount_RNA', feature2 = 'nFeature_RNA', group.by = 'sample', pt.size = 0.05) 
qcfeat2 <- FeatureScatter(merged_all, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = 'sample', pt.size = 0.05)
qcfeat3 <- FeatureScatter(merged_all, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = 'sample', pt.size = 0.05)

qcfeat1 + qcfeat2 + qcfeat3

##qc vln plots (pre-subsetting)

qc_vln <- VlnPlot(merged_all, features = c('nCount_RNA', 'nFeature_RNA', 'percent.mt'), alpha = 0.1, group.by = 'sample')
qc_vln[[2]] <- qc_vln[[2]] + geom_hline(yintercept = 500) + geom_hline(yintercept = 5000)
qc_vln[[3]] <- qc_vln[[3]] + geom_hline(yintercept = 10) 
qc_vln

##Subset data - percent.mt<10, nFeatures 500-5000, log10genesperUMI>0.8

merged_all <- subset(merged_all, subset = percent.mt<10 &
                                nFeature_RNA>500 &
                                nFeature_RNA<5000 &
                                log10GenesPerUMI >=0.8)

##repeat QC vln plots

qc_vln <- VlnPlot(merged_all, features = c('nCount_RNA', 'nFeature_RNA', 'percent.mt'), alpha = 0.1, group.by = 'sample')
qc_vln[[2]] <- qc_vln[[2]] + geom_hline(yintercept = 500) + geom_hline(yintercept = 5000)
qc_vln[[3]] <- qc_vln[[3]] + geom_hline(yintercept = 10) 
qc_vln

##cell density by nFeature_RNA (post-subsetting)

#plot log10UMI vs nFeature_RNA
merged_all@meta.data %>% 
    ggplot(aes(color=condition, x=nFeature_RNA, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    theme_classic() +
    ylab("log10 cell density") +
    geom_vline(xintercept = 500)

##cell density by percent.mt

#plot log10UMI vs percent.mt
merged_all@meta.data %>% 
    ggplot(aes(color=condition, x=percent.mt, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    theme_classic() +
    ylab("log10 cell density") 

##Filter out low quality cells (features expressed in <10 cells)

#subset out features expressed in <10 cells (Suzstak used 10)
counts <- LayerData(merged_all, assay = "RNA", layer = "counts")
keep_genes <- rowSums(counts>0) >=10
merged_all <- subset(merged_all, features = names(keep_genes[keep_genes]))

merged_all #features 33696 -> 17500 genes with sufficient expression data
An object of class Seurat 
17535 features across 27248 samples within 1 assay 
Active assay: RNA (17535 features, 0 variable features)
 6 layers present: counts.1, counts.2, counts.3, counts.4, counts.5, counts.6

#Processing ##Join layers and preprocessing workflow

#Join layers
#Perform standard PREprocessing workflow steps to figure out if we have significant batch effects

merged_all <- JoinLayers(merged_all)
merged_all <- NormalizeData(merged_all)
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
merged_all <- FindVariableFeatures(merged_all, nfeatures = 3000)
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
merged_all <- ScaleData(merged_all)

  |                                                                                                                           
  |                                                                                                                     |   0%
  |                                                                                                                           
  |=======================================                                                                              |  33%
  |                                                                                                                           
  |==============================================================================                                       |  67%
  |                                                                                                                           
  |=====================================================================================================================| 100%
top_var_genes_3000 <- head(VariableFeatures(merged_all), 20)
print(top_var_genes_3000)
 [1] "Jchain"   "Igkc"     "Igha"     "Hba-a1"   "Hbb-bs"   "Slc12a3"  "Slc26a4"  "Abca13"   "Atp6v1g3" "Calb1"    "Ngp"     
[12] "Mzb1"     "Lcn2"     "Ccl4"     "Robo2"    "Camp"     "Nrxn3"    "Retnlg"   "S100g"    "Cxcl10"  

##Visualize most variable features

#Visualize most variable features
VariableFeaturePlot(merged_all)

LabelPoints(points = top_var_genes_3000, plot = last_plot(), repel = TRUE)

##decide number of PCAs to use for analysis: npcs=20

#decide number of PCAs to use for analysis
merged_all <- RunPCA(merged_all, npcs = 50)
ElbowPlot_50PCAs <- ElbowPlot(merged_all, ndims = 50) + 
  scale_y_continuous(n.breaks = 20) + 
  geom_hline(yintercept = 2) +
  geom_vline(xintercept = 20)
ElbowPlot_50PCAs 

##Standard processing and clustering, 0.6 resolution

#Standard processing and clustering
merged_all <- FindNeighbors(merged_all, dims = 1:20, verbose = F)
merged_all <- FindClusters(merged_all, resolution = 0.6)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 27248
Number of edges: 954660

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9489
Number of communities: 27
Elapsed time: 3 seconds
merged_all <- RunUMAP(merged_all, dims = 1:20)
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|

##Calculate number of cells in each cluster

print(cells_per_cluster)

print(cells_per_cluster_cond)

#UMAP visualization pre-doublet removal

Idents(merged_all) <- "seurat_clusters"
DimPlot(merged_all, reduction = "umap", label = T, repel = T) + ggtitle("Seurat Clusters")

#Remove doublets

##Remove doublets
#install and load scDblFinder package

#BiocManager::install("scDblFinder")
library(scDblFinder)
library(SingleCellExperiment)

##convert to single cell experiment and run scDblFinder

#convert to single cell experiment and run scDblFinder
sce <- as.SingleCellExperiment(merged_all)
sce <- scDblFinder::scDblFinder(sce, samples = 'sample')

  |                                                                                                                           
  |                                                                                                                     |   0%
  |                                                                                                                           
  |====================                                                                                                 |  17%
  |                                                                                                                           
  |=======================================                                                                              |  33%
  |                                                                                                                           
  |==========================================================                                                           |  50%
  |                                                                                                                           
  |==============================================================================                                       |  67%
  |                                                                                                                           
  |==================================================================================================                   |  83%
  |                                                                                                                           
  |=====================================================================================================================| 100%

##add doublet results back into metadata and visualize on UMAP

#add results back into seurat metadata
merged_all$scDblFinder.class <- colData(sce)$scDblFinder.class

#visualize
sing_vs_doub_before_subset <- DimPlot(merged_all,split.by = 'condition', reduction = "umap", group.by = "scDblFinder.class", pt.size = 0.5) + ggtitle("scDblFinder: Singlets vs Doublets")
sing_vs_doub_before_subset

##subset to only singlets

##rename merged_all2 and findneighbors and clusters again?

merged_all2 <- merged_all

merged_all2 <- FindNeighbors(merged_all2, dims = 1:20, verbose = F)
merged_all2 <- FindClusters(merged_all2, resolution = 0.6)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9492
Number of communities: 25
Elapsed time: 2 seconds
merged_all2 <- RunUMAP(merged_all2, dims = 1:20)
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
A <- DimPlot(merged_all, label = T, repel = T) + NoLegend() + ggtitle("Old Clusters After \nDoublet Removal")
B <- DimPlot(merged_all2, label = T, repel = T) + NoLegend() + ggtitle("Reclustering After \nDoublet Removal")
A+B

##calculate cells in each cluster after doublet removal and reclustering


cells_per_cluster2 <- merged_all2@meta.data %>%
  group_by(seurat_clusters) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters)

cells_per_cluster_cond2 <- merged_all2@meta.data %>%
  group_by(seurat_clusters, condition) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters, condition)

View(cells_per_cluster2)
View(cells_per_cluster_cond2)

print(cells_per_cluster2)
print(cells_per_cluster_cond2)

#Visualization ##By condition

#Group by condition
DimPlot(merged_all2, reduction = "umap", group.by = "condition", pt.size = 0.1, alpha = 0.3) 

##by sample, split by condition

##grouped by cluster, split by condition

##Bar graph number of cells per cluster from each individual sample

#visualize percent of each cluster from individual samples
meta_data <- merged_all2@meta.data

# Count cells per cluster per sample
cluster_counts <- meta_data %>%
  group_by(seurat_clusters, sample) %>%
  summarise(count = n(), .groups = 'drop')

# Calculate total cells per sample
total_per_cluster <- cluster_counts %>%
  group_by(seurat_clusters) %>%
  summarise(total = sum(count), .groups = 'drop')

# Merge and calculate percentage
cluster_percentages <- cluster_counts %>%
  left_join(total_per_cluster, by = "seurat_clusters") %>%
  mutate(percentage = (count / total) * 100)


# Plot
ggplot(cluster_percentages, aes(x = seurat_clusters, y = percentage, fill = sample)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Cluster Composition By Sample",
       x = "Cluster",
       y = "Percentage of Cluster",
       fill = "Sample") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

NA
NA

##percent of each cluster from each condition

#visualize percent of each cluster from each condition

# Count cells per cluster per condition
cluster_counts_cond <- meta_data %>%
  group_by(seurat_clusters, condition) %>%
  summarise(count = n(), .groups = 'drop')

# Calculate total cells per sample
total_cond_per_cluster <- cluster_counts_cond %>%
  group_by(seurat_clusters) %>%
  summarise(total = sum(count), .groups = 'drop')

# Merge and calculate percentage
cluster_cond_percentages <- cluster_counts_cond %>%
  left_join(total_cond_per_cluster, by = "seurat_clusters") %>%
  mutate(percentage = (count / total) * 100)


# Plot
ggplot(cluster_cond_percentages, aes(x = seurat_clusters, y = percentage, fill = condition)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Cluster Composition by Condition",
       x = "Cluster",
       y = "Percentage of Cluster",
       fill = "Sample") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##visualize percent.mt per cluster

#add metadata column for cluster_condition
merged_all2$cluster_condition <- paste0(merged_all2$seurat_clusters,"_",merged_all2$condition)
Idents(merged_all2) <- merged_all2$cluster_condition

#check that metadata column is added
colnames(merged_all2@meta.data)
 [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"      "sample"            "condition"         "barcode"          
 [7] "percent.mt"        "log10GenesPerUMI"  "GenesPerUMI"       "RNA_snn_res.0.6"   "seurat_clusters"   "scDblFinder.class"
[13] "cluster_condition"
# Create summary data frame
percent_mt_summary <- merged_all2@meta.data %>%
  group_by(seurat_clusters) %>%
    summarise(
    mean_percent_mt = mean(percent.mt, na.rm = TRUE),
    median_percent_mt = median(percent.mt, na.rm = TRUE),
        n_cells = n()
  )

print(percent_mt_summary)

ggplot(percent_mt_summary, aes(x = factor(seurat_clusters), y = mean_percent_mt)) +
  geom_bar(stat = "identity", fill = "#2c7fb8") +
  labs(
    title = "Mean Mitochondrial Percentage by Cluster",
    x = "Cluster",
    y = "Mean % Mitochondrial Genes"
  ) +
  theme_minimal()

clusters_table <- table(merged_all2$seurat_clusters, merged_all2$sample)
clusters_table_cond <- table(merged_all2$seurat_clusters, merged_all2$condition)
clusters_table
    
      C1  C2  C3  U1  U2  U3
  0  209 414 535 325 612 820
  1  124 379 872 234 585 670
  2  139 332 596 231 775 780
  3  168 283 371 656 304 405
  4  123 174 507 516 256 454
  5  111 339 448 126 433 543
  6   67 232 335 148 332 385
  7   50 103 441 135 285 266
  8  134 127 123 640  50 144
  9   51  82  68 855  65  91
  10  99 280 141 106 194 194
  11  20  39  46 414  33 100
  12   4   4  30  65 197 335
  13  15  41 190  12 113 143
  14  29  88  83  48 133 114
  15  40  36 113  58  88 140
  16  11  40 102  51 121 116
  17   4   8  74  17  98 178
  18  20  31  51  43  77  90
  19  43  23  60  54  40  89
  20   5  45  59  23  66  56
  21   3  34  46  12  46  85
  22  28   8  64  60  52  12
  23   1   9  13   4  51  47
  24   7   7  36  18  28  20
clusters_table_cond
    
     Control  UTI
  0     1158 1757
  1     1375 1489
  2     1067 1786
  3      822 1365
  4      804 1226
  5      898 1102
  6      634  865
  7      594  686
  8      384  834
  9      201 1011
  10     520  494
  11     105  547
  12      38  597
  13     246  268
  14     200  295
  15     189  286
  16     153  288
  17      86  293
  18     102  210
  19     126  183
  20     109  145
  21      83  143
  22     100  124
  23      23  102
  24      50   66

#Create cluster tree

# Clustree visualization
install.packages("clustree")
Error in install.packages : Updating loaded packages
library(clustree)

merged_tree <- merged_all2

#Add multiple resolutions
merged_tree <- FindClusters(merged_tree, resolution = c(0.1, 0.2, 0.4, 0.6, 0.8))
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9871
Number of communities: 14
Elapsed time: 3 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9777
Number of communities: 17
Elapsed time: 2 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9621
Number of communities: 24
Elapsed time: 2 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9492
Number of communities: 25
Elapsed time: 2 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 26229
Number of edges: 923064

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9369
Number of communities: 28
Elapsed time: 3 seconds
install.packages("clustree")
Warning in install.packages :
  package ‘clustree’ is in use and will not be installed
cluster.tree <- clustree(
  merged_tree, "RNA_snn_res.") + ggtitle("Cluster tree")

ggsave("resolution cluster tree.svg", plot = cluster.tree, width = 12)

cluster.tree 

##silhouette plot

#Evaluate conserved marker genes, use to consolidate smaller clusters into larger ones

Idents(merged_all2) <- 'seurat_clusters'
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters

library(presto)
library(SeuratWrappers)

# Initialize the list
conserved_markers <- list()

# Loop through the clusters in the specified order
for (i in 0:24) {
  aux <- SeuratWrappers::RunPresto(
    merged_all2,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers[[paste0("cluster_", i)]] <- aux
  }
}

#filter
for(i in names(conserved_markers)){
  aux <- conserved_markers[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers[[i]] <- aux
}

##export conserved markers as excel file

#Consolidate clusters


#copy seurat_clusters to new metadate column
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters

#set identity as new column, 'consolidated_clusters'
Idents(merged_all2) <- 'consolidated_clusters'

# Make new column a character vector instead of factor
merged_all2@meta.data$consolidated_clusters <- 
  as.character(merged_all2@meta.data$consolidated_clusters)

#define 'current clusters' for mapping later
current_clusters <- as.character(merged_all2$seurat_clusters)


#Consolidations (unnamed clusters go unchanged)
new_cluster_map <- c(
  "24"  = "12",
  "23"  = "1",
  "21" = "14"
)

##5,10,14,18,21 are all myeloid (CD68+)
   #14/21 = DC
##from cluster tree: 5 and 10 combine, 14, 18, and 21 combine
#24 -> 12 (gamma delta T?) -> 7 (also expressing Trgv in top genes)

#Apply mapping back to metadata
merged_all2$consolidated_clusters <- ifelse(
  current_clusters %in% names(new_cluster_map),
  new_cluster_map[current_clusters],
  current_clusters
)

# Convert back to factor (optional, but usually cleaner for Idents)
merged_all2@meta.data$consolidated_clusters <- 
  factor(merged_all2@meta.data$consolidated_clusters, levels = c(0:20,22))

# Set active identities correctly
Idents(merged_all2) <- 'consolidated_clusters'

#Naming clusters (preliminary, manual). In new object ‘merged_annot’

merged_annot <- merged_all2
merged_annot$annot <- merged_annot$consolidated_clusters


#set identity as new column, 'consolidated_clusters'
Idents(merged_annot) <- 'annot'

# Make new column a character vector instead of factor
merged_annot@meta.data$annot <- 
  as.character(merged_annot@meta.data$annot)

#define 'current clusters' for mapping later
current_clusters_annot <- as.character(merged_annot$annot)


#Consolidations (unnamed clusters go unchanged)
new_cluster_map_annot <- c(
  "0" = "Cytotoxic NK-T",
  "1" = "B cell",
  "2" = "Naive T_TCM",
  "3" = "Neutrophil", 
  "4" = "PT group 1",
  "5" = "Activated macrophage",
  "6" = "Immature T_group 2",
  "7" = "Th1",
  "8" = "PT group 2",
  "9" = "PT group 3",
  "10" = "Myeloid",
  "11" = "PT group 4",
  "12" = "T cell_gamma_delta",
  "13" = "TAL_DCT",
  "14" = "DC",
  "15" = "DCT_CNT",
  "16" = "TAL_DCT",
  "17" = "TAL",
  "18" = "Mono_macrophage",
  "19" = "Tissue-resident NK-T",
  "20" = "Endothelial",
  "22" = "Intercalated cell")



#Apply mapping back to metadata
merged_annot$annot <- ifelse(
  current_clusters_annot %in% names(new_cluster_map_annot),
  new_cluster_map_annot[current_clusters_annot],
  current_clusters_annot
)

# Convert back to factor (optional, but usually cleaner for Idents)
merged_annot@meta.data$annot <- 
  factor(merged_annot@meta.data$annot)

# Set active identities correctly
Idents(merged_annot) <- 'annot'

#Visualize umap with annotations

##split by condition

Idents(merged_annot) <- 'annot'

annotations <- list()
annotations <- unique(Idents(merged_annot))

# Initialize the list
conserved_markers_annot <- list()

# Loop through the clusters in the specified order
for (i in unique(annotations)) {
  aux <- SeuratWrappers::RunPresto(
    merged_annot,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers_annot[[i]] <- aux
  }
}

#filter
for(i in names(conserved_markers_annot)){
  aux <- conserved_markers_annot[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers_annot[[i]] <- aux
}
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters
Idents(merged_all2) <- 'consolidated_clusters'

##24 into 12, T cell population with Trg 
merged_all2@meta.data$consolidated_clusters <- 
  ifelse(
    merged_all2@meta.data$consolidated_clusters == "24",
    12,
    merged_all2@meta.data$consolidated_clusters
  )

#23 into 1, B cells - lots of immunoglobulins in 23 and jchain, 1 is cd19-high b cell group
merged_all2@meta.data$consolidated_clusters <- 
  ifelse(
    merged_all2@meta.data$consolidated_clusters == "23",
    1,
    merged_all2@meta.data$consolidated_clusters
  )

unique(merged_all2$consolidated_clusters)

++++ #Evaluate conserved and differentially expressed genes

Idents(merged_all2) <- 'seurat_clusters'

# Define the custom order
cluster_order_0.1 <- c(0, 1, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6, 7, 8, 9)

# Initialize the list
conserved_markers_res0.1 <- list()

# Loop through the clusters in the specified order
for (i in cluster_order_0.1) {
  aux <- SeuratWrappers::RunPresto(
    merged_all_cons,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers_res0.1[[paste0("cluster_", i)]] <- aux
  }
}
#filter
for(i in names(conserved_markers_res0.1)){
  aux <- conserved_markers_res0.1[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers_res0.1[[i]] <- aux
}
# take top 15 genes from each cluster to do annotations
cluster_annot_df_0.1 <- conserved_markers_res0.1 %>%
  map(~ head(.x, 10)) %>%  # Take first 10 rows of each data frame
  bind_rows()              # Combine into one data frame
rio::export(conserved_markers_res0.1, "all pos conserved genes per cluster.xlsx")
rio::export(cluster_annot_df_0.1, "top 15 conserved genes per cluster.xlsx")

#Put in cluster annotations

l1_annotations_res0.1 <- c(
  "PT",           # cluster 0
  "T cell",                    # cluster 1
  "Myeloid",        # cluster 2
  "B Cell",                         # cluster 3
  "NK-T",      # cluster 4
  "T cell", # cluster 5
  "PMN",                    # cluster 6
  "Fibro_Stromal",     # cluster 7
  "Podocyte",                      # cluster 8
  "Macrophage",              # cluster 9
  "T cell",          # cluster 10
  "TAL",           # cluster 11
  "pDC",           # cluster 12
  "DCT",       # cluster 13
  "IC-B",      # cluster 14
  "ILC2/Myeloid"    # cluster 15
)

Idents(merged_all_cons) <- 'RNA_snn_res.0.1'

unique(Idents(merged_all_cons))
cluster.tree
Idents(merged_all_cons) <- 'seurat_clusters'
DefaultAssay(merged_all_cons)

merged_0.05 <- FindClusters(merged_0.05, resolution = 0.01)
  
cluster.tree.8dim <- clustree(merged_0.05, prefix = "RNA_snn_res.")
cluster.tree.8dim
# Make sure cluster IDs are numeric and 0-based
clusters <- as.numeric(as.character(merged_all_cons$RNA_snn_res.0.1))

# Map cluster IDs to manual annotations
# Create a named vector for easy lookup
names(l1_annotations_res0.1) <- as.character(0:15)

# Add annotation metadata by matching cluster IDs
merged_all_cons@meta.data$l1_annotations_res0.1 <- l1_annotations_res0.1[as.character(clusters)]
merged_all_cons@meta.data
Idents(merged_all_cons) <- 'annotation'
DimPlot(merged_all_cons, label = T, repel = T)
Idents(merged_all_cons) <- 'RNA_snn_res.0.1'
DimPlot(merged_all_cons, reduction = "tsne")+ ggtitle("unbiased clustering")
Idents(obj_annot) <- 'grouping'
DimPlot(obj_annot, reduction = "tsne", split.by = 'condition')
DimPlot(merged_all_cons, label = T, repel = T, split.by = 'condition') + NoLegend()
clusters_0.1_table <- table(merged_all_cons$RNA_snn_res.0.1, merged_all_cons$sample)
clusters_0.1_table_cond <- table(merged_all_cons$RNA_snn_res.0.1, merged_all_cons$condition)
clusters_0.1_table
clusters_0.1_table_cond
clusters_0.01_table <- table(merged_0.05$RNA_snn_res.0.01, merged_0.05$sample)
clusters_0.01_table_cond <- table(merged_0.05$RNA_snn_res.0.01, merged_0.05$condition)
clusters_0.01_table
clusters_0.01_table_cond

clusters_0.01_df <- as.data.frame(clusters_0.01_table)
clusters_0.01_df

clusters_0.01_df$annot <- c("lymphocyte", "myeloid", "PT", "B cell", "NK", "PMN", "TAL_Distal tubule", "PT_EMT_Fibro", "DCT")
merged_all_cons@meta.data$l1_annotations_res0.1 <- 
  ifelse(
    merged_all_cons@meta.data$l1_annotations_res0.1 == "Fibroblast/Stroma",
    "Fibro_Stromal",
    merged_all_cons@meta.data$l1_annotations_res0.1
  )
Idents(merged_all_cons) <- 'RNA_snn_res.0.8'

DE_0.8_10v19 <- SeuratWrappers::RunPresto(merged_all_cons, ident.1 = 10, ident.2 = 19)
gene <- rownames(DE_0.8_10v19)
DE_0.8_10v19$gene <- gene

DE_0.8_10v19 <- DE_0.8_10v19[DE_0.8_10v19$p_val_adj < 0.05,]
DE_0.8_10v19 <- order(DE_0.8_10v19$avg_log2FC, decreasing = T)  
#compare DE gene expression between UTI/controls in each cluster
#defined as deg_cons (list)
colnames(merged_all_cons@meta.data)

merged_all_cons@meta.data$l1_condition <- paste0(merged_all_cons@meta.data$l1_annotations_res0.1,"_",merged_all_cons$condition)

Idents(merged_all_cons) <- 'l1_condition'
unique(Idents(merged_all_cons))

DEg_res_0.1_UTI <-  list()

for(i in unique(merged_all_cons$l1_annotations_res0.1)){
  aux <- SeuratWrappers::RunPresto(merged_all_cons,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_res_0.1_UTI[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_res_0.1_UTI)){
  aux <- DEg_res_0.1_UTI[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_res_0.1_UTI[[i]] <- aux
}

names(DEg_res_0.1_UTI)[names(DEg_res_0.1_UTI) %in% "ILC2/Myeloid_UTI"] <- "ILC2_UTI"

#export as excel file
rio::export(DEg_res_0.1_UTI, "res 0.1_DE genes.xlsx")

#save RDS and create loupe


saveRDS(merged_all_cons, "merged_obj.RDS")
save.image("res 0.5_PCA 25_percentmt10.RData")
setwd("C:/Users/evrajadh/OneDrive - Indiana University/Research/Recurrent UTI GFR/ScRNA/Rec_UTI")
create_loupe_from_seurat(merged_all_cons, output_dir = "cloupes/", output_name = "manual annot_res_0.5")
#Split object by condition and analyze conditions separately
merged_split <- SplitObject(merged_all_cons, split.by = "condition")

#eval conserved markers by condition

merged_split$UTI

merged_UTI <- merged_split$UTI

#Add multiple resolutions
merged_UTI <- RunPCA(merged_UTI, npcs=30)
merged_UTI <- FindNeighbors(merged_UTI, dims = 1:20, verbose = F)
merged_UTI <- FindClusters(merged_UTI, resolution = c(0.1, 0.2, 0.4, 0.6, 0.8))

merged_UTI <- RunUMAP(merged_UTI, dims = 1:20)

DimPlot(merged_UTI)

cluster.tree_UTI <- clustree(
  merged_UTI, "RNA_snn_res.") + ggtitle("Cluster tree")

cluster.tree_UTI
#haven't removed doublets, only preliminary
Idents
Idents(merged_UTI) <- 'l1_annotations_res0.1'

DEg_UTI_LOW_res <- list()
for(i in unique(merged_0.05$RNA_snn_res.0.01)){
  aux <- SeuratWrappers::RunPresto(merged_0.05,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_UTI_LOW_res[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_UTI_LOW_res)){
  aux <- DEg_UTI_LOW_res[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_UTI_LOW_res[[i]] <- aux
}

rio::export(DEg_UTI_LOW_res, "LOW resolution DEgenes_UTI.xlsx")
Idents(merged_0.05) <- 'RNA_snn_res.0.01'
DimPlot(merged_0.05, reduction = "tsne")
DimPlot(merged_0.05, reduction = "pca", label = T, repel = T)
Idents(merged_0.05) <- 'l1_annotations_res0.1'
DimPlot(merged_0.05, reduction = "tsne", label = T)
DimPlot(merged_0.05, reduction = "pca", label = T) + NoLegend()
obj_annot <- merged_all_cons
obj_annot$grouping <- NA

obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(1,5,10,15,4,3)] <- "Lymphocyte"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(2,12,9,6)] <- "Myeloid"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(0)] <- "PT"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(7)] <- "PT_EMT"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(14,11,8,13)] <- "TAL_distal tubule"
Idents(obj_annot) <- 'grouping'
unique(Idents(obj_annot))
conserved_genes_lowres_grouping <- list()
Idents(obj_annot) <- 'grouping'

# Loop through the clusters in the specified order
conserved_genes_lowres_grouping <- SeuratWrappers::RunPrestoAll(obj_annot,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4)
conserved_genes_lowres_grouping$gene <- rownames(conserved_genes_lowres_grouping)

rio::export(conserved_genes_lowres_grouping, "Cluster_GROUPING_conserved genes.xlsx")
    
obj_annot$group_cond <- paste0(obj_annot$grouping,'_',obj_annot$condition)
Idents(obj_annot) <- 'group_cond'

DEg_UTI_LOW_res_annot <- list()
for(i in unique(obj_annot$grouping)){
  aux <- SeuratWrappers::RunPresto(obj_annot,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_UTI_LOW_res_annot[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_UTI_LOW_res_annot)){
  aux <- DEg_UTI_LOW_res_annot[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_UTI_LOW_res_annot[[i]] <- aux
}

rio::export(DEg_UTI_LOW_res_annot, "DEgenes cluster_GROUPING_UTIvControl.xlsx")
rio::export(DEg_UTI_LOW_res, "LOW resolution DEgenes_UTI.xlsx")

Monocling

Idents(merged_all_cons) <- 'RNA_snn_res.0.1'
pt_cells <- subset(merged_all_cons, idents = c('0', '7'))
library(monocle3)

cds <- SeuratWrappers::as.cell_data_set(pt_cells)
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
plot_cells(cds, color_cells_by = "cluster", label_cell_groups = T)
Idents(obj_annot) <- 'l1_annotations_res0.1'
unique(Idents(obj_annot))

obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "Fibro_Stromal"] <- "PT_EMT"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "ILC2"] <- "ILC"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "Podocyte"] <- "TAL_DCT"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "pDC"] <- "DC"
cds <- SeuratWrappers::as.cell_data_set(obj_annot)
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
# Plot to choose root node visually
plot_cells(cds, color_cells_by = "cluster", label_cell_groups = T, label_roots = T) 

get_earliest_principal_node <- function(cds) {
  pr_graph_test <- principal_graph(cds)[[1]]
  names(which.min(igraph::degree(pr_graph_test)))
}

# Once you identify the root cell/cluster:
cds <- order_cells(cds, root_pr_nodes = get_earliest_principal_node(cds))
plot_cells(cds, color_cells_by = "pseudotime", label_groups_by_cluster = FALSE)
DimPlot(obj_annot)
BiocManager::install("slingshot")
library(slingshot)

Idents(obj_annot) <- 'l1_annotations_res0.1'
DimPlot(obj_annot, reduction = 'tsne') + DimPlot(obj_annot, reduction = 'pca')

sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot(sce, clusterLabels = obj_annot$l1_annotations_res0.1, reducedDim = 'UMAP')

#Pseudotime analysis with slingshot

sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot::slingshot(sce, clusterLabels = sce$l1_annotations_res0.1, reducedDim = 'UMAP')
#plot pseudotime
library(scater)

plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 16)
plot(reducedDim(sce, "TSNE"), col = sce$slingPseudotime_1 , pch = 16)

plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 16)

clusters <- sce$l1_annotations_res0.1
centers <- t(sapply(unique(clusters),function(cl){
  colMeans(reducedDim(sce, "UMAP")[clusters == cl,])
}))
text(centers, labels = unique(clusters),col = "black", cex = 0.8)
sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot::slingshot(sce, clusterLabels = sce$l1_annotations_res0.1, reducedDim = 'UMAP')
plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 10)

clusters <- sce$l1_annotations_res0.1
centers <- t(sapply(unique(clusters),function(cl){
  colMeans(reducedDim(sce, "UMAP")[clusters == cl,])
}))
text(centers, labels = unique(clusters),col = "black", cex = 0.8)
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_df <- as.data.frame(reducedDim(sce, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_df$cluster <- sce$l1_annotations_res0.1
umap_df$pseudotime <- sce$slingPseudotime_1


# Calculate cluster centroids for labeling
label_df <- umap_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")
sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot::slingshot(sce, clusterLabels = sce$RNA_snn_res.0.4, reducedDim = 'UMAP')
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_df <- as.data.frame(reducedDim(sce, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_df$cluster <- sce$l1_annotations_res0.1
umap_df$pseudotime <- sce$slingPseudotime_1


# Calculate cluster centroids for labeling
label_df <- umap_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")
Idents(obj_annot) <- 'grouping'
unique(Idents(obj_annot))

#already subsetted PT and PT_EMT to pt_cells
myeloid_cells <- subset(obj_annot, idents = "Myeloid")
lympho_cells <- subset(obj_annot, idents = "Lymphocyte")
TAL_DCT_cells <- subset(obj_annot, idents = "TAL_distal tubule")

#myeloid slingshot TSNE

sce_myeloid <- as.SingleCellExperiment(myeloid_cells)
sce_myeloid <- slingshot::slingshot(sce_myeloid, clusterLabels = sce_myeloid$RNA_snn_res.0.4, reducedDim = 'TSNE')
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(umap_myeloid_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
umap_myeloid_df$cluster <- sce_myeloid$l1_annotations_res0.1
umap_myeloid_df$pseudotime <- sce_myeloid$slingPseudotime_1


# Calculate cluster centroids for labeling
label_myeloid_df <- umap_myeloid_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(umap_myeloid_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_myeloid_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [Myeloid Cells] with Cluster Labels")

#myeloid slingshot UMAP

sce_myeloid <- as.SingleCellExperiment(myeloid_cells)
sce_myeloid <- slingshot::slingshot(sce_myeloid, clusterLabels = sce_myeloid$RNA_snn_res.0.4, reducedDim = 'UMAP')
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "UMAP"))
colnames(umap_myeloid_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_myeloid_df$cluster <- sce_myeloid$l1_annotations_res0.1
umap_myeloid_df$pseudotime <- sce_myeloid$slingPseudotime_1


# Calculate cluster centroids for labeling
label_myeloid_df <- umap_myeloid_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_myeloid_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_myeloid_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

#lympho slingshot

sce_lymph <- as.SingleCellExperiment(lympho_cells)
sce_lymph <- slingshot::slingshot(sce_lymph, clusterLabels = sce_lymph$RNA_snn_res.0.4, reducedDim = 'TSNE')
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_lymph_df <- as.data.frame(reducedDim(sce_lymph, "TSNE"))
colnames(umap_lymph_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
umap_lymph_df$cluster <- sce_lymph$l1_annotations_res0.1
umap_lymph_df$pseudotime <- sce_lymph$slingPseudotime_1


# Calculate cluster centroids for labeling
label_lymph_df <- umap_lymph_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(umap_lymph_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_lymph_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

#TAL_DCT cell slingshot

sce_tub <- as.SingleCellExperiment(TAL_DCT_cells)
sce_tub <- slingshot::slingshot(sce_tub, clusterLabels = sce_tub$RNA_snn_res.0.4, reducedDim = 'TSNE')

# Extract data from SingleCellExperiment
tub_df <- as.data.frame(reducedDim(sce_tub, "TSNE"))
colnames(tub_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
tub_df$cluster <- sce_tub$l1_annotations_res0.1
tub_df$pseudotime <- sce_tub$slingPseudotime_1


# Calculate cluster centroids for labeling
label_tub_df <- tub_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(tub_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_tub_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")
Idents(obj_annot) <- 'RNA_snn_res.0.1'
DimPlot(obj_annot, reduction = "tsne", label = T,repel=T,  label.size = 5) + ggtitle("0.1 resolution clustering")
obj_annot <- readRDS("C:/Users/evrajadh/OneDrive - Indiana University/Research/Recurrent UTI GFR/ScRNA/Rec_UTI/res 0.5_PCA 25_7-29/annotation object.RDS")

Idents(obj_annot) <- 'l1_annotations_res0.1'
DimPlot(obj_annot, reduction = "tsne", label = T,repel=T,  label.size = 5)
Idents(obj_annot) <- 'grouping'
DimPlot(obj_annot, reduction = "tsne", label = T, label.size = 4, pt.size = 0.1) + NoLegend() + ggtitle('Clusters')
Idents(obj_annot) <- 'grouping'
DoHeatmap(obj_annot, features = c("Nrep", "Map3k7cl", "Angptl7", "Hdc", "Slc6a18", "Tmigd1", "Slc22a12", "Slc22a6"), cells = "PT")
VlnPlot(obj_annot, features = c("Nrep", "Map3k7cl", "Angptl7", "Hdc", "Slc6a18", "Tmigd1", "Slc22a12", "Slc22a6"), raster = F, split.by = 'condition')
FeaturePlot(obj_annot, features = "Nrep", reduction = "tsne", split.by = 'condition', pt.size = 0.8)
FeaturePlot(obj_annot, features = "Map3k7cl", reduction = "tsne", split.by = 'condition', pt.size = 0.8)
VlnPlot(obj_annot, features = "Nrep", raster = F, split.by = 'condition')
VlnPlot(obj_annot, features = "Map3k7cl", raster = F, split.by = 'condition')
VlnPlot(obj_annot, features = "B2m", raster = F, split.by = 'condition')
feat_scCKDatlas <- c("Cd79a", "Cd3d", "Prf1", "Cpa3", "Folr2", "Sema3g", "Sost", "Prox1", "Vcam1", "Plvap", "Rergl", "Cox4l2", "Dcn", "Postn", "Slc22a6", "Prune2", "Nos1", "Umod", "Calb1", "Slc4a1", "Upk3b", "Wt1", "Gfr3a")

DoHeatmap(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"))
FeaturePlot(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"), reduction = "tsne", split.by = 'condition', pt.size = 0.8)
VlnPlot(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"), raster = F, split.by = 'condition')

feat_PTinjury <- c("Slc5a12", "Slc34a1", "Slc22a30", "Col27a1", "Plin2", "Ypel2", "Bcat1", "Slc7a12", "Hsp90aa1", "Top2a", "Kcnip4", "Vcam1", "Havcr1")
#Healthy: Slc5a12, 45a1, 22a30, 7a13
#type 1 injured S1/2: Col27a1, Plin2
#Type 1 injured S3: Plin2, Havcr1
#type2 injured S1/2: Ypel2
#Type2 injured S3: ypel2, Bcat1, Slc7a12
#acute injury: Hsp90aa1
#Repairing: Top2a
#Failed repair: Ypel2, Kcnip4,Vcam1, Havcr1

FeaturePlot(obj_annot, features = feat_PTinjury, reduction = "tsne", split.by = 'condition', pt.size = 0.8)
VlnPlot(obj_annot, features = feat_PTinjury, raster = F, split.by = 'condition')
#see which PT clusters are where on tsne (clusters 5,9, and 11 on res0.5)
Idents(obj_annot) <- 'RNA_snn_res.0.5'
DimPlot(obj_annot, reduction = "tsne", label = T)
sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot(sce, clusterLabels = sce$l1_con, reducedDim = "TSNE")
curves <- slingCurves(sce)

  
  df <- as.data.frame(reducedDim(sce, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")
df$pseudotime <- sce$slingPseudotime_1
df$cluster <- sce$l1_annotations_res0.1 

  p <- ggplot(df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  theme_minimal()
  
  for (i in seq_along(curves)) {
  curve_df <- as.data.frame(curves[[i]]$s)
  colnames(curve_df) <- c("TSNE_1", "TSNE_2")
  
  # Add arrows along the curve
  p <- p + geom_path(data = curve_df, aes(x = TSNE_1, y = TSNE_2),
                     arrow = arrow(type = "open", length = unit(0.15, "inches")),
                     color = "black", size = 0.8)
  }
  
  p

#PT cells also with condition separation

sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot(sce, clusterLabels = sce$l1_condition, reducedDim = "TSNE")
curves <- slingCurves(sce)

  
  df <- as.data.frame(reducedDim(sce, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")
df$pseudotime <- sce$slingPseudotime_1
df$cluster <- sce$l1_condition

  p <- ggplot(df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  theme_minimal()
  
  for (i in seq_along(curves)) {
  curve_df <- as.data.frame(curves[[i]]$s)
  colnames(curve_df) <- c("TSNE_1", "TSNE_2")
  
  # Add arrows along the curve
  p <- p + geom_path(data = curve_df, aes(x = TSNE_1, y = TSNE_2),
                     arrow = arrow(type = "open", length = unit(0.15, "inches")),
                     color = "black", size = 0.8)
  }
  
  p
saveRDS(obj_annot, "annotation object.RDS")
unique(merged_all_cons@meta.data$l1_annotations_res0.1)
Idents(merged_all_cons) <- 'l1_annotations_res0.1'
DimPlot(merged_all_cons, reduction = "umap")
markers_test <- FindMarkers(merged_all_cons, grouping.var = "l1_annotations_res0.1", ident.1 = "Podocyte")
markers_test
DimPlot(merged_all_cons, label = T, repel = T)

+++++++++++

myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(myeloid_df) <- c("TSNE_1", "TSNE_2")

myeloid_df$lineage <- is.na(sce_myeloid$slingPseudotime_1)
ggplot(myeloid_df, aes(x = TSNE_1, y = TSNE_2, color = lineage)) +
  geom_point(size = 0.5) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "gray")) +
  theme_minimal() +
  labs(title = "Cells Assigned to Slingshot Lineage 1",
       color = "Assigned?")
sce_all <- as.SingleCellExperiment(obj_annot)
sce_all <- slingshot::slingshot(sce_all, clusterLabels = sce_all$RNA_snn_res.0.4, reducedDim = 'TSNE')
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
tsne_all_df <- as.data.frame(reducedDim(sce_all, "TSNE"))
colnames(tsne_all_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
tsne_all_df$cluster <- sce_all$l1_annotations_res0.1
tsne_all_df$pseudotime <- sce_all$slingPseudotime_1


# Calculate cluster centroids for labeling
label_all_tsne_df <- tsne_all_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(tsne_all_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_all_tsne_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [All Cells] with Cluster Labels")
tsne_all_df <- as.data.frame(reducedDim(sce_all, "TSNE"))
colnames(tsne_all_df) <- c("TSNE_1", "TSNE_2")

tsne_all_df$lineage <- is.na(sce_all$slingPseudotime_1)
ggplot(tsne_all_df, aes(x = TSNE_1, y = TSNE_2, color = lineage)) +
  geom_point(size = 0.5) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "gray")) +
  theme_minimal() +
  labs(title = "Cells Assigned to Slingshot Lineage 1",
       color = "Assigned?")
sce_myeloid$lineage <- is.na(sce_myeloid$slingPseudotime_1)
ggplot(..., color = lineage)  # TRUE = unassigned = gray

df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")

df$lineage <- is.na(sce_myeloid$slingPseudotime_1)
plot(reducedDim(sce), col = sce$cluster, type = 'lineages')

sce$li
plot_cells(cds, color_cells_by = "cluster")
plot_cells(cds, genes = c("Cdh2"), show_trajectory_graph = TRUE)
+++++++++++

ggsave("resolution cluster tree.svg", plot = cluster.tree, width = 12)

cluster.tree 


UTI_0.5_markers_clust_annot <- SeuratWrappers::RunPrestoAll(merged_0.5_splitlist$UTI)

UTI_0.5_markers_list <- split(UTI_0.5_markers_clust_annot, UTI_0.5_markers_clust_annot[['cluster']])

for(i in names(UTI_0.5_markers_list)){
  aux <- UTI_0.5_markers_list[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  UTI_all_markers_list[[i]] <- aux
}

#DE genes by condition at res 0.01

merged_0.05$l0_condition <- paste0(merged_0.05$RNA_snn_res.0.01, '_', merged_0.05$condition)
Idents(merged_0.05) <- 'l0_condition'

DE_0.05_resol_condition <- SeuratWrappers::RunPresto(merged_0.05)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

---
title: "August 2025 analysis for Takashi"
output: html_notebook
---


#Workspace setup
```{r}
#load necessary packages
library(cellranger)
library(hdf5r)
library(Seurat)
library(dplyr)
library(patchwork)
library(glmGamPoi)
library(sctransform)
library(loupeR)

library(tidyverse)

#update.packages(list = c(cellranger, hdf5r, Seurat, dplyr, patchwork, glmGamPoi, sctransform, loupeR, tidyverse))

setwd("C:/Users/evrajadh/OneDrive - Indiana University/Research/Recurrent UTI GFR/ScRNA/Rec_UTI")
rm(list = ls())
```

#Import data
##Load filtered objects
```{r}
U1 <- Read10X_h5("Filtered h5 objects/UTI-1_filtered_feature_bc_matrix.h5")
U2 <- Read10X_h5("Filtered h5 objects/UTI-2_filtered_feature_bc_matrix.h5")
U3 <- Read10X_h5("Filtered h5 objects/UTI-3_filtered_feature_bc_matrix.h5")
C1 <- Read10X_h5("Filtered h5 objects/Control-1_filtered_feature_bc_matrix.h5")
C2 <- Read10X_h5("Filtered h5 objects/Control-2_filtered_feature_bc_matrix.h5")
C3 <- Read10X_h5("Filtered h5 objects/Control-3_filtered_feature_bc_matrix.h5")
```

##Create seurat objects 
```{r}
#Turn into seurat objects
U1 <- CreateSeuratObject(counts = U1)
U2 <- CreateSeuratObject(counts = U2)
U3 <- CreateSeuratObject(counts = U3)
C1 <- CreateSeuratObject(counts = C1)
C2 <- CreateSeuratObject(counts = C2)
C3 <- CreateSeuratObject(counts = C3)
```

##Merge, labeling cell.id in metadata
```{r}
#merge into single seurat object, layers still separated
merged_all <- merge(U1, y = c(U2, U3, C1, C2, C3),
                add.cell.ids = c("U1_UTI", "U2_UTI", "U3_UTI", "C1_Control", "C2_Control", "C3_Control"),
                project = "RecUTI")
```

```{r}
#Adjust metadata
merged_all$sample <- rownames(merged_all@meta.data)
merged_all@meta.data <- separate(merged_all@meta.data, col = 'sample', into = c('sample', 'condition', 'barcode'),
                             sep = '_')

#label percent mitochondrial DNA
merged_all$percent.mt <- PercentageFeatureSet(merged_all, pattern = "^mt-")

#add number of genes per UMI for each cell to metadata
#log10genesPerUMI expected is ~0.8-0.9 for scRNA data, >1.0 is suspected doublet
merged_all$log10GenesPerUMI <- log10(merged_all$nFeature_RNA) / log10(merged_all$nCount_RNA)
merged_all$GenesPerUMI <- (merged_all$nFeature_RNA / merged_all$nCount_RNA)
```

#QC
##log10genesperUMI
```{r}
#Visualize log10genesPerUMI PRIOR to subsetting
VlnPlot(merged_all, features = "log10GenesPerUMI") +
  geom_hline(yintercept = 0.95, linetype = "dashed", color = "red") + 
  geom_hline(yintercept = 0.8, linetype = "dashed", color = "red")
```

```{r}
#plot log10UMI vs nCount_RNA
merged_all@meta.data %>% 
  	ggplot(aes(color=condition, x=nCount_RNA, fill= sample)) + 
  	geom_density(alpha = 0.2) + 
  	scale_x_log10() + 
  	theme_classic() +
  	ylab("log10 cell density") +
  	geom_vline(xintercept = 500)
```
##qc feature plots (before subsetting)
```{r, fig.width=10}
#plot nCount vs nFeature
qcfeat1 <- FeatureScatter(merged_all, feature1 = 'nCount_RNA', feature2 = 'nFeature_RNA', group.by = 'sample', pt.size = 0.05) 
qcfeat2 <- FeatureScatter(merged_all, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = 'sample', pt.size = 0.05)
qcfeat3 <- FeatureScatter(merged_all, feature1 = "nFeature_RNA", feature2 = "percent.mt", group.by = 'sample', pt.size = 0.05)

qcfeat1 + qcfeat2 + qcfeat3
```
##qc vln plots (pre-subsetting)
```{r, fig.width=12}
qc_vln <- VlnPlot(merged_all, features = c('nCount_RNA', 'nFeature_RNA', 'percent.mt'), alpha = 0.1, group.by = 'sample')
qc_vln[[2]] <- qc_vln[[2]] + geom_hline(yintercept = 500) + geom_hline(yintercept = 5000)
qc_vln[[3]] <- qc_vln[[3]] + geom_hline(yintercept = 10) 
qc_vln
```
##Subset data - percent.mt<10, nFeatures 500-5000, log10genesperUMI>0.8
```{r}
merged_all <- subset(merged_all, subset = percent.mt<10 &
                                nFeature_RNA>500 &
                                nFeature_RNA<5000 &
                                log10GenesPerUMI >=0.8)

```

##repeat QC vln plots
```{r}
qc_vln <- VlnPlot(merged_all, features = c('nCount_RNA', 'nFeature_RNA', 'percent.mt'), alpha = 0.1, group.by = 'sample')
qc_vln[[2]] <- qc_vln[[2]] + geom_hline(yintercept = 500) + geom_hline(yintercept = 5000)
qc_vln[[3]] <- qc_vln[[3]] + geom_hline(yintercept = 10) 
qc_vln
```
```{r}
VlnPlot(merged_all, features = "log10GenesPerUMI") 
```
##cell density by nFeature_RNA (post-subsetting)
```{r}
#plot log10UMI vs nFeature_RNA
merged_all@meta.data %>% 
  	ggplot(aes(color=condition, x=nFeature_RNA, fill= sample)) + 
  	geom_density(alpha = 0.2) + 
  	scale_x_log10() + 
  	theme_classic() +
  	ylab("log10 cell density") +
  	geom_vline(xintercept = 500)
```
##cell density by percent.mt
```{r}
#plot log10UMI vs percent.mt
merged_all@meta.data %>% 
  	ggplot(aes(color=condition, x=percent.mt, fill= sample)) + 
  	geom_density(alpha = 0.2) + 
  	theme_classic() +
  	ylab("log10 cell density") 
```
##Filter out low quality cells (features expressed in <10 cells)
```{r}
#subset out features expressed in <10 cells (Suzstak used 10)
counts <- LayerData(merged_all, assay = "RNA", layer = "counts")
keep_genes <- rowSums(counts>0) >=10
merged_all <- subset(merged_all, features = names(keep_genes[keep_genes]))

merged_all #features 33696 -> 17535 genes with sufficient expression data
```

#Processing
##Join layers and preprocessing workflow
```{r}
#Join layers
#Perform standard PREprocessing workflow steps to figure out if we have significant batch effects

merged_all <- JoinLayers(merged_all)
merged_all <- NormalizeData(merged_all)

merged_all <- FindVariableFeatures(merged_all, nfeatures = 3000)
merged_all <- ScaleData(merged_all)

top_var_genes_3000 <- head(VariableFeatures(merged_all), 20)
print(top_var_genes_3000)
```

##Visualize most variable features
```{r}
VariableFeaturePlot(merged_all)
LabelPoints(points = top_var_genes_3000, plot = last_plot(), repel = TRUE)

```

##decide number of PCAs to use for analysis: npcs=20
```{r}
merged_all <- RunPCA(merged_all, npcs = 50)
ElbowPlot_50PCAs <- ElbowPlot(merged_all, ndims = 50) + 
  scale_y_continuous(n.breaks = 20) + 
  geom_hline(yintercept = 2) +
  geom_vline(xintercept = 20)
ElbowPlot_50PCAs 
```

##Standard processing and clustering, 0.6 resolution
```{r}
#Standard processing and clustering
merged_all <- FindNeighbors(merged_all, dims = 1:20, verbose = F)
merged_all <- FindClusters(merged_all, resolution = 0.6)
merged_all <- RunUMAP(merged_all, dims = 1:20)
```

##Calculate number of cells in each cluster
```{r}

 cells_per_cluster <- merged_all@meta.data %>%
  group_by(seurat_clusters) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters)

 cells_per_cluster_cond <- merged_all@meta.data %>%
  group_by(seurat_clusters, condition) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters, condition)

View(cells_per_cluster)
View(cells_per_cluster_cond)

print(cells_per_cluster)
print(cells_per_cluster_cond)
```

#UMAP visualization pre-doublet removal
```{r}
Idents(merged_all) <- "seurat_clusters"
DimPlot(merged_all, reduction = "umap", label = T, repel = T) + ggtitle("Seurat Clusters")
```

#Remove doublets
```{r}
#install and load scDblFinder package

#BiocManager::install("scDblFinder")
library(scDblFinder)
library(SingleCellExperiment)
```
##convert to single cell experiment and run scDblFinder
```{r}
#convert to single cell experiment and run scDblFinder
sce <- as.SingleCellExperiment(merged_all)
sce <- scDblFinder::scDblFinder(sce, samples = 'sample')

```
##add doublet results back into metadata and visualize on UMAP
```{r}
#add results back into seurat metadata
merged_all$scDblFinder.class <- colData(sce)$scDblFinder.class

#visualize
sing_vs_doub_before_subset <- DimPlot(merged_all,split.by = 'condition', reduction = "umap", group.by = "scDblFinder.class", pt.size = 0.5) + ggtitle("scDblFinder: Singlets vs Doublets")
sing_vs_doub_before_subset
```

##subset to only singlets
```{r}
merged_all <- subset(merged_all, subset = scDblFinder.class == "singlet")
```

##rename merged_all2 and findneighbors and clusters again?
```{r}
merged_all2 <- merged_all

merged_all2 <- FindNeighbors(merged_all2, dims = 1:20, verbose = F)
merged_all2 <- FindClusters(merged_all2, resolution = 0.6)

merged_all2 <- RunUMAP(merged_all2, dims = 1:20)
```
```{r}
A <- DimPlot(merged_all, label = T, repel = T) + NoLegend() + ggtitle("Old Clusters After \nDoublet Removal")
B <- DimPlot(merged_all2, label = T, repel = T) + NoLegend() + ggtitle("Reclustering After \nDoublet Removal")
A+B
```
##calculate cells in each cluster after doublet removal and reclustering
```{r}

cells_per_cluster2 <- merged_all2@meta.data %>%
  group_by(seurat_clusters) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters)

cells_per_cluster_cond2 <- merged_all2@meta.data %>%
  group_by(seurat_clusters, condition) %>%
  summarise(cell_count = n()) %>%
  arrange(seurat_clusters, condition)

View(cells_per_cluster2)
View(cells_per_cluster_cond2)

print(cells_per_cluster2)
print(cells_per_cluster_cond2)
```

#Visualization
##By condition
```{r}
#Group by condition
DimPlot(merged_all2, reduction = "umap", group.by = "condition", pt.size = 0.1, alpha = 0.3) 
```
##by sample, split by condition
```{r, fig.width=9}
DimPlot(merged_all2, reduction = "umap", group.by = 'sample', split.by = "condition", alpha = 0.3)
```
##grouped by cluster, split by condition
```{r, fig.width=10}
DimPlot(merged_all2, reduction = "umap", split.by = "condition", label = T, repel = F, label.size = 5) + NoLegend()

```
##Bar graph number of cells per cluster from each individual sample
```{r}
#visualize percent of each cluster from individual samples
meta_data <- merged_all2@meta.data

# Count cells per cluster per sample
cluster_counts <- meta_data %>%
  group_by(seurat_clusters, sample) %>%
  summarise(count = n(), .groups = 'drop')

# Calculate total cells per sample
total_per_cluster <- cluster_counts %>%
  group_by(seurat_clusters) %>%
  summarise(total = sum(count), .groups = 'drop')

# Merge and calculate percentage
cluster_percentages <- cluster_counts %>%
  left_join(total_per_cluster, by = "seurat_clusters") %>%
  mutate(percentage = (count / total) * 100)


# Plot
ggplot(cluster_percentages, aes(x = seurat_clusters, y = percentage, fill = sample)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Cluster Composition By Sample",
       x = "Cluster",
       y = "Percentage of Cluster",
       fill = "Sample") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


```
##percent of each cluster from each condition
```{r}
#visualize percent of each cluster from each condition

# Count cells per cluster per condition
cluster_counts_cond <- meta_data %>%
  group_by(seurat_clusters, condition) %>%
  summarise(count = n(), .groups = 'drop')

# Calculate total cells per sample
total_cond_per_cluster <- cluster_counts_cond %>%
  group_by(seurat_clusters) %>%
  summarise(total = sum(count), .groups = 'drop')

# Merge and calculate percentage
cluster_cond_percentages <- cluster_counts_cond %>%
  left_join(total_cond_per_cluster, by = "seurat_clusters") %>%
  mutate(percentage = (count / total) * 100)


# Plot
ggplot(cluster_cond_percentages, aes(x = seurat_clusters, y = percentage, fill = condition)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Cluster Composition by Condition",
       x = "Cluster",
       y = "Percentage of Cluster",
       fill = "Sample") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
##visualize percent.mt per cluster
```{r}
#add metadata column for cluster_condition
merged_all2$cluster_condition <- paste0(merged_all2$seurat_clusters,"_",merged_all2$condition)
Idents(merged_all2) <- merged_all2$cluster_condition

#check that metadata column is added
colnames(merged_all2@meta.data)

# Create summary data frame
percent_mt_summary <- merged_all2@meta.data %>%
  group_by(seurat_clusters) %>%
    summarise(
    mean_percent_mt = mean(percent.mt, na.rm = TRUE),
    median_percent_mt = median(percent.mt, na.rm = TRUE),
        n_cells = n()
  )

print(percent_mt_summary)

ggplot(percent_mt_summary, aes(x = factor(seurat_clusters), y = mean_percent_mt)) +
  geom_bar(stat = "identity", fill = "#2c7fb8") +
  labs(
    title = "Mean Mitochondrial Percentage by Cluster",
    x = "Cluster",
    y = "Mean % Mitochondrial Genes"
  ) +
  theme_minimal()
```

```{r}
clusters_table <- table(merged_all2$seurat_clusters, merged_all2$sample)
clusters_table_cond <- table(merged_all2$seurat_clusters, merged_all2$condition)
clusters_table
clusters_table_cond

```
#Create cluster tree 
```{r}
# Clustree visualization
install.packages("clustree")
library(clustree)

merged_tree <- merged_all2

#Add multiple resolutions
merged_tree <- FindClusters(merged_tree, resolution = c(0.1, 0.2, 0.4, 0.6, 0.8))


cluster.tree <- clustree(
  merged_tree, "RNA_snn_res.") + ggtitle("Cluster tree")

ggsave("resolution cluster tree.svg", plot = cluster.tree, width = 12)

cluster.tree 
```
##silhouette plot
```{r, fig.width = 4, fig.height=12}
library(cluster)
library(purrr)

# Convert clusters to numeric
clusters <- as.numeric(as.character(merged_all2$seurat_clusters))

# Compute distance on PCA embeddings
d <- dist(merged_all2@reductions$pca@cell.embeddings)

# Run silhouette
sil <- cluster::silhouette(clusters, d)

# 4. Convert to data frame
sil_df <- as.data.frame(sil)
sil_df$cell <- rownames(sil_df) 
colnames(sil_df) <- c("cluster", "neighbor", "sil_width", "cell")
sil_df

# Make cluster a factor for plotting
sil_df$cluster <- as.factor(sil_df$cluster)

# 5. ggplot silhouette plot
ggplot(sil_df, aes(x = reorder(cell, sil_width), y = sil_width, fill = cluster)) +
  geom_bar(stat = "identity", width = 1, show.legend = FALSE) +
  facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
  coord_flip() +
  theme_minimal() +
  labs(x = "Cells (ordered by silhouette width)",
       y = "Silhouette width",
       title = "Silhouette plot of Seurat clusters") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())
```


#Evaluate conserved marker genes, use to consolidate smaller clusters into larger ones
```{r}
Idents(merged_all2) <- 'seurat_clusters'
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters

library(presto)
library(SeuratWrappers)

# Initialize the list
conserved_markers <- list()

# Loop through the clusters in the specified order
for (i in 0:24) {
  aux <- SeuratWrappers::RunPresto(
    merged_all2,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers[[paste0("cluster_", i)]] <- aux
  }
}

#filter
for(i in names(conserved_markers)){
  aux <- conserved_markers[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers[[i]] <- aux
}

```
##export conserved markers as excel file
```{r}
library(rio)
rio::export(conserved_markers, "conserved markers all clusters.xlsx")
```

#Consolidate clusters
```{r}

#copy seurat_clusters to new metadate column
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters

#set identity as new column, 'consolidated_clusters'
Idents(merged_all2) <- 'consolidated_clusters'

# Make new column a character vector instead of factor
merged_all2@meta.data$consolidated_clusters <- 
  as.character(merged_all2@meta.data$consolidated_clusters)

#define 'current clusters' for mapping later
current_clusters <- as.character(merged_all2$seurat_clusters)


#Consolidations (unnamed clusters go unchanged)
new_cluster_map <- c(
  "24"  = "12",
  "23"  = "1",
  "21" = "14"
)

##5,10,14,18,21 are all myeloid (CD68+)
   #14/21 = DC
##from cluster tree: 5 and 10 combine, 14, 18, and 21 combine
#24 -> 12 (gamma delta T?) -> 7 (also expressing Trgv in top genes)

#Apply mapping back to metadata
merged_all2$consolidated_clusters <- ifelse(
  current_clusters %in% names(new_cluster_map),
  new_cluster_map[current_clusters],
  current_clusters
)

# Convert back to factor (optional, but usually cleaner for Idents)
merged_all2@meta.data$consolidated_clusters <- 
  factor(merged_all2@meta.data$consolidated_clusters, levels = c(0:20,22))

# Set active identities correctly
Idents(merged_all2) <- 'consolidated_clusters'
```

#Naming clusters (preliminary, manual). In new object 'merged_annot'
```{r}
merged_annot <- merged_all2
merged_annot$annot <- merged_annot$consolidated_clusters


#set identity as new column, 'consolidated_clusters'
Idents(merged_annot) <- 'annot'

# Make new column a character vector instead of factor
merged_annot@meta.data$annot <- 
  as.character(merged_annot@meta.data$annot)

#define 'current clusters' for mapping later
current_clusters_annot <- as.character(merged_annot$annot)


#Consolidations (unnamed clusters go unchanged)
new_cluster_map_annot <- c(
  "0" = "Cytotoxic NK-T",
  "1" = "B cell",
  "2" = "Naive T_TCM",
  "3" = "Neutrophil", 
  "4" = "PT group 1",
  "5" = "Activated macrophage",
  "6" = "Immature T_group 2",
  "7" = "Th1",
  "8" = "PT group 2",
  "9" = "PT group 3",
  "10" = "Myeloid",
  "11" = "PT group 4",
  "12" = "T cell_gamma_delta",
  "13" = "TAL_DCT",
  "14" = "DC",
  "15" = "DCT_CNT",
  "16" = "TAL_DCT",
  "17" = "TAL",
  "18" = "Mono_macrophage",
  "19" = "Tissue-resident NK-T",
  "20" = "Endothelial",
  "22" = "Intercalated cell")



#Apply mapping back to metadata
merged_annot$annot <- ifelse(
  current_clusters_annot %in% names(new_cluster_map_annot),
  new_cluster_map_annot[current_clusters_annot],
  current_clusters_annot
)

# Convert back to factor (optional, but usually cleaner for Idents)
merged_annot@meta.data$annot <- 
  factor(merged_annot@meta.data$annot)

# Set active identities correctly
Idents(merged_annot) <- 'annot'


```

#Visualize umap with annotations
```{r, fig.height=7, fig.width=7}
DimPlot(merged_annot, label=T) + NoLegend()
```
##split by condition
```{r, fig.height=7, fig.width=13}
DimPlot(merged_annot, label=T, split.by = 'condition') + NoLegend()
```

```{r}
saveRDS(merged_annot, "Evan_seurat_obj.RDS")
```

```{r}
Idents(merged_annot) <- 'annot'

annotations <- list()
annotations <- unique(Idents(merged_annot))

# Initialize the list
conserved_markers_annot <- list()

# Loop through the clusters in the specified order
for (i in unique(annotations)) {
  aux <- SeuratWrappers::RunPresto(
    merged_annot,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers_annot[[i]] <- aux
  }
}

#filter
for(i in names(conserved_markers_annot)){
  aux <- conserved_markers_annot[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers_annot[[i]] <- aux
}

```


```{r}
merged_all2$consolidated_clusters <- merged_all2$seurat_clusters
Idents(merged_all2) <- 'consolidated_clusters'

##24 into 12, T cell population with Trg 
merged_all2@meta.data$consolidated_clusters <- 
  ifelse(
    merged_all2@meta.data$consolidated_clusters == "24",
    12,
    merged_all2@meta.data$consolidated_clusters
  )

#23 into 1, B cells - lots of immunoglobulins in 23 and jchain, 1 is cd19-high b cell group
merged_all2@meta.data$consolidated_clusters <- 
  ifelse(
    merged_all2@meta.data$consolidated_clusters == "23",
    1,
    merged_all2@meta.data$consolidated_clusters
  )

unique(merged_all2$consolidated_clusters)
```







++++
#Evaluate conserved and differentially expressed genes
```{r}
Idents(merged_all2) <- 'seurat_clusters'

# Define the custom order
cluster_order_0.1 <- c(0, 1, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6, 7, 8, 9)

# Initialize the list
conserved_markers_res0.1 <- list()

# Loop through the clusters in the specified order
for (i in cluster_order_0.1) {
  aux <- SeuratWrappers::RunPresto(
    merged_all_cons,
    ident.1 = i,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4
  )
  
  if (!is.null(aux) && nrow(aux) > 0) {
    aux$gene <- rownames(aux)
    aux$cluster <- i
    conserved_markers_res0.1[[paste0("cluster_", i)]] <- aux
  }
}
```


```{r}
#filter
for(i in names(conserved_markers_res0.1)){
  aux <- conserved_markers_res0.1[[i]]
  aux$cluster <- i
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  conserved_markers_res0.1[[i]] <- aux
}
```

```{r}
# take top 15 genes from each cluster to do annotations
cluster_annot_df_0.1 <- conserved_markers_res0.1 %>%
  map(~ head(.x, 10)) %>%  # Take first 10 rows of each data frame
  bind_rows()              # Combine into one data frame
```


```{r}
rio::export(conserved_markers_res0.1, "all pos conserved genes per cluster.xlsx")
rio::export(cluster_annot_df_0.1, "top 15 conserved genes per cluster.xlsx")
```

#Put in cluster annotations
```{r}
l1_annotations_res0.1 <- c(
  "PT",           # cluster 0
  "T cell",                    # cluster 1
  "Myeloid",        # cluster 2
  "B Cell",                         # cluster 3
  "NK-T",      # cluster 4
  "T cell", # cluster 5
  "PMN",                    # cluster 6
  "Fibro_Stromal",     # cluster 7
  "Podocyte",                      # cluster 8
  "Macrophage",              # cluster 9
  "T cell",          # cluster 10
  "TAL",           # cluster 11
  "pDC",           # cluster 12
  "DCT",       # cluster 13
  "IC-B",      # cluster 14
  "ILC2/Myeloid"    # cluster 15
)

Idents(merged_all_cons) <- 'RNA_snn_res.0.1'

unique(Idents(merged_all_cons))
```
```{r, fig.width=10}
cluster.tree
```
```{r, fig.height=8}
Idents(merged_all_cons) <- 'seurat_clusters'
DefaultAssay(merged_all_cons)

merged_0.05 <- FindClusters(merged_0.05, resolution = 0.01)
  
cluster.tree.8dim <- clustree(merged_0.05, prefix = "RNA_snn_res.")
cluster.tree.8dim
```


```{r}
# Make sure cluster IDs are numeric and 0-based
clusters <- as.numeric(as.character(merged_all_cons$RNA_snn_res.0.1))

# Map cluster IDs to manual annotations
# Create a named vector for easy lookup
names(l1_annotations_res0.1) <- as.character(0:15)

# Add annotation metadata by matching cluster IDs
merged_all_cons@meta.data$l1_annotations_res0.1 <- l1_annotations_res0.1[as.character(clusters)]

```

```{r}
merged_all_cons@meta.data
```

```{r}
Idents(merged_all_cons) <- 'annotation'
DimPlot(merged_all_cons, label = T, repel = T)
```
```{r}
Idents(merged_all_cons) <- 'RNA_snn_res.0.1'
DimPlot(merged_all_cons, reduction = "tsne")+ ggtitle("unbiased clustering")
```
```{r}
Idents(obj_annot) <- 'grouping'
DimPlot(obj_annot, reduction = "tsne", split.by = 'condition')
```

```{r, fig.width=9}
DimPlot(merged_all_cons, label = T, repel = T, split.by = 'condition') + NoLegend()
```

```{r}
clusters_0.1_table <- table(merged_all_cons$RNA_snn_res.0.1, merged_all_cons$sample)
clusters_0.1_table_cond <- table(merged_all_cons$RNA_snn_res.0.1, merged_all_cons$condition)
clusters_0.1_table
clusters_0.1_table_cond
```
```{r}
clusters_0.01_table <- table(merged_0.05$RNA_snn_res.0.01, merged_0.05$sample)
clusters_0.01_table_cond <- table(merged_0.05$RNA_snn_res.0.01, merged_0.05$condition)
clusters_0.01_table
clusters_0.01_table_cond

clusters_0.01_df <- as.data.frame(clusters_0.01_table)
clusters_0.01_df

clusters_0.01_df$annot <- c("lymphocyte", "myeloid", "PT", "B cell", "NK", "PMN", "TAL_Distal tubule", "PT_EMT_Fibro", "DCT")

```


```{r}
merged_all_cons@meta.data$l1_annotations_res0.1 <- 
  ifelse(
    merged_all_cons@meta.data$l1_annotations_res0.1 == "Fibroblast/Stroma",
    "Fibro_Stromal",
    merged_all_cons@meta.data$l1_annotations_res0.1
  )
```

```{r}
Idents(merged_all_cons) <- 'RNA_snn_res.0.8'

DE_0.8_10v19 <- SeuratWrappers::RunPresto(merged_all_cons, ident.1 = 10, ident.2 = 19)
gene <- rownames(DE_0.8_10v19)
DE_0.8_10v19$gene <- gene

DE_0.8_10v19 <- DE_0.8_10v19[DE_0.8_10v19$p_val_adj < 0.05,]
DE_0.8_10v19 <- order(DE_0.8_10v19$avg_log2FC, decreasing = T)  




```


```{r}
#compare DE gene expression between UTI/controls in each cluster
#defined as deg_cons (list)
colnames(merged_all_cons@meta.data)

merged_all_cons@meta.data$l1_condition <- paste0(merged_all_cons@meta.data$l1_annotations_res0.1,"_",merged_all_cons$condition)

Idents(merged_all_cons) <- 'l1_condition'
unique(Idents(merged_all_cons))

DEg_res_0.1_UTI <-  list()

for(i in unique(merged_all_cons$l1_annotations_res0.1)){
  aux <- SeuratWrappers::RunPresto(merged_all_cons,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_res_0.1_UTI[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_res_0.1_UTI)){
  aux <- DEg_res_0.1_UTI[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_res_0.1_UTI[[i]] <- aux
}
```

```{r}

names(DEg_res_0.1_UTI)[names(DEg_res_0.1_UTI) %in% "ILC2/Myeloid_UTI"] <- "ILC2_UTI"

#export as excel file
rio::export(DEg_res_0.1_UTI, "res 0.1_DE genes.xlsx")

```

#save RDS and create loupe
```{r}

saveRDS(merged_all_cons, "merged_obj.RDS")

```

```{r}
save.image("res 0.5_PCA 25_percentmt10.RData")

```


```{r}
setwd("C:/Users/evrajadh/OneDrive - Indiana University/Research/Recurrent UTI GFR/ScRNA/Rec_UTI")
create_loupe_from_seurat(merged_all_cons, output_dir = "cloupes/", output_name = "manual annot_res_0.5")
```



```{r}
#Split object by condition and analyze conditions separately
merged_split <- SplitObject(merged_all_cons, split.by = "condition")

#eval conserved markers by condition

merged_split$UTI

merged_UTI <- merged_split$UTI

#Add multiple resolutions
merged_UTI <- RunPCA(merged_UTI, npcs=30)
merged_UTI <- FindNeighbors(merged_UTI, dims = 1:20, verbose = F)
merged_UTI <- FindClusters(merged_UTI, resolution = c(0.1, 0.2, 0.4, 0.6, 0.8))

merged_UTI <- RunUMAP(merged_UTI, dims = 1:20)

DimPlot(merged_UTI)

cluster.tree_UTI <- clustree(
  merged_UTI, "RNA_snn_res.") + ggtitle("Cluster tree")

cluster.tree_UTI
#haven't removed doublets, only preliminary



```
```{r}
Idents
```

```{r}
Idents(merged_UTI) <- 'l1_annotations_res0.1'

DEg_UTI_LOW_res <- list()
for(i in unique(merged_0.05$RNA_snn_res.0.01)){
  aux <- SeuratWrappers::RunPresto(merged_0.05,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_UTI_LOW_res[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_UTI_LOW_res)){
  aux <- DEg_UTI_LOW_res[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_UTI_LOW_res[[i]] <- aux
}

rio::export(DEg_UTI_LOW_res, "LOW resolution DEgenes_UTI.xlsx")


```

```{r}
Idents(merged_0.05) <- 'RNA_snn_res.0.01'
DimPlot(merged_0.05, reduction = "tsne")
```
```{r}
DimPlot(merged_0.05, reduction = "pca", label = T, repel = T)
```
```{r}
Idents(merged_0.05) <- 'l1_annotations_res0.1'
DimPlot(merged_0.05, reduction = "tsne", label = T)
```
```{r, fig.width=6, fig.height=6}
DimPlot(merged_0.05, reduction = "pca", label = T) + NoLegend()
```

```{r}
obj_annot <- merged_all_cons
```


```{r}
obj_annot$grouping <- NA

obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(1,5,10,15,4,3)] <- "Lymphocyte"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(2,12,9,6)] <- "Myeloid"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(0)] <- "PT"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(7)] <- "PT_EMT"
obj_annot$grouping[obj_annot$RNA_snn_res.0.1 %in% c(14,11,8,13)] <- "TAL_distal tubule"
```

```{r}
Idents(obj_annot) <- 'grouping'
unique(Idents(obj_annot))
```

```{r}
conserved_genes_lowres_grouping <- list()
Idents(obj_annot) <- 'grouping'

# Loop through the clusters in the specified order
conserved_genes_lowres_grouping <- SeuratWrappers::RunPrestoAll(obj_annot,
    only.pos = TRUE,
    logfc.threshold = 0.5,
    min.pct = 0.4)
conserved_genes_lowres_grouping$gene <- rownames(conserved_genes_lowres_grouping)

rio::export(conserved_genes_lowres_grouping, "Cluster_GROUPING_conserved genes.xlsx")
    
```
```{r}
obj_annot$group_cond <- paste0(obj_annot$grouping,'_',obj_annot$condition)
Idents(obj_annot) <- 'group_cond'

DEg_UTI_LOW_res_annot <- list()
for(i in unique(obj_annot$grouping)){
  aux <- SeuratWrappers::RunPresto(obj_annot,ident.1 = paste0(i,"_UTI"), ident.2 = paste0(i,"_Control"))  
  aux$gene <- rownames(aux)
  DEg_UTI_LOW_res_annot[[paste0(i,"_UTI")]] <- aux
}

#Filter for easier analysis
for(i in names(DEg_UTI_LOW_res_annot)){
  aux <- DEg_UTI_LOW_res_annot[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  DEg_UTI_LOW_res_annot[[i]] <- aux
}

rio::export(DEg_UTI_LOW_res_annot, "DEgenes cluster_GROUPING_UTIvControl.xlsx")
```


```{r}
rio::export(DEg_UTI_LOW_res, "LOW resolution DEgenes_UTI.xlsx")


```





Monocling

```{r}
Idents(merged_all_cons) <- 'RNA_snn_res.0.1'
pt_cells <- subset(merged_all_cons, idents = c('0', '7'))
library(monocle3)
```
```{r}

cds <- SeuratWrappers::as.cell_data_set(pt_cells)
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
```
```{r}
plot_cells(cds, color_cells_by = "cluster", label_cell_groups = T)
```

```{r}
Idents(obj_annot) <- 'l1_annotations_res0.1'
unique(Idents(obj_annot))

obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "Fibro_Stromal"] <- "PT_EMT"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "ILC2"] <- "ILC"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "Podocyte"] <- "TAL_DCT"
obj_annot$l1_annotations_res0.1[obj_annot$l1_annotations_res0.1 %in% "pDC"] <- "DC"

```
```{r}
cds <- SeuratWrappers::as.cell_data_set(obj_annot)
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
```


```{r}
# Plot to choose root node visually
plot_cells(cds, color_cells_by = "cluster", label_cell_groups = T, label_roots = T) 

```
```{r}

get_earliest_principal_node <- function(cds) {
  pr_graph_test <- principal_graph(cds)[[1]]
  names(which.min(igraph::degree(pr_graph_test)))
}

# Once you identify the root cell/cluster:
cds <- order_cells(cds, root_pr_nodes = get_earliest_principal_node(cds))
```

```{r}
plot_cells(cds, color_cells_by = "pseudotime", label_groups_by_cluster = FALSE)

```
```{r}
DimPlot(obj_annot)
```

```{r}
BiocManager::install("slingshot")
```

```{r}
library(slingshot)

Idents(obj_annot) <- 'l1_annotations_res0.1'
DimPlot(obj_annot, reduction = 'tsne') + DimPlot(obj_annot, reduction = 'pca')

sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot(sce, clusterLabels = obj_annot$l1_annotations_res0.1, reducedDim = 'UMAP')
```
#Pseudotime analysis with slingshot
```{r}
sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot::slingshot(sce, clusterLabels = sce$l1_annotations_res0.1, reducedDim = 'UMAP')

```

```{r}
#plot pseudotime
library(scater)

plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 16)
```

```{r}
plot(reducedDim(sce, "TSNE"), col = sce$slingPseudotime_1 , pch = 16)
```

```{r}

plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 16)

clusters <- sce$l1_annotations_res0.1
centers <- t(sapply(unique(clusters),function(cl){
  colMeans(reducedDim(sce, "UMAP")[clusters == cl,])
}))
text(centers, labels = unique(clusters),col = "black", cex = 0.8)
```

```{r}
sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot::slingshot(sce, clusterLabels = sce$l1_annotations_res0.1, reducedDim = 'UMAP')
```

```{r}
plot(reducedDim(sce, "UMAP"), col = sce$slingPseudotime_1 , pch = 10)

clusters <- sce$l1_annotations_res0.1
centers <- t(sapply(unique(clusters),function(cl){
  colMeans(reducedDim(sce, "UMAP")[clusters == cl,])
}))
text(centers, labels = unique(clusters),col = "black", cex = 0.8)
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_df <- as.data.frame(reducedDim(sce, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_df$cluster <- sce$l1_annotations_res0.1
umap_df$pseudotime <- sce$slingPseudotime_1


# Calculate cluster centroids for labeling
label_df <- umap_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")


```
```{r}
sce <- as.SingleCellExperiment(obj_annot)
sce <- slingshot::slingshot(sce, clusterLabels = sce$RNA_snn_res.0.4, reducedDim = 'UMAP')
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_df <- as.data.frame(reducedDim(sce, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_df$cluster <- sce$l1_annotations_res0.1
umap_df$pseudotime <- sce$slingPseudotime_1


# Calculate cluster centroids for labeling
label_df <- umap_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

```

```{r}
Idents(obj_annot) <- 'grouping'
unique(Idents(obj_annot))

#already subsetted PT and PT_EMT to pt_cells
myeloid_cells <- subset(obj_annot, idents = "Myeloid")
lympho_cells <- subset(obj_annot, idents = "Lymphocyte")
TAL_DCT_cells <- subset(obj_annot, idents = "TAL_distal tubule")

```

#myeloid slingshot TSNE

```{r}
sce_myeloid <- as.SingleCellExperiment(myeloid_cells)
sce_myeloid <- slingshot::slingshot(sce_myeloid, clusterLabels = sce_myeloid$RNA_snn_res.0.4, reducedDim = 'TSNE')
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(umap_myeloid_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
umap_myeloid_df$cluster <- sce_myeloid$l1_annotations_res0.1
umap_myeloid_df$pseudotime <- sce_myeloid$slingPseudotime_1


# Calculate cluster centroids for labeling
label_myeloid_df <- umap_myeloid_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(umap_myeloid_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_myeloid_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [Myeloid Cells] with Cluster Labels")

```

#myeloid slingshot UMAP
```{r}
sce_myeloid <- as.SingleCellExperiment(myeloid_cells)
sce_myeloid <- slingshot::slingshot(sce_myeloid, clusterLabels = sce_myeloid$RNA_snn_res.0.4, reducedDim = 'UMAP')
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "UMAP"))
colnames(umap_myeloid_df) <- c("UMAP_1", "UMAP_2")  # Rename to readable names
umap_myeloid_df$cluster <- sce_myeloid$l1_annotations_res0.1
umap_myeloid_df$pseudotime <- sce_myeloid$slingPseudotime_1


# Calculate cluster centroids for labeling
label_myeloid_df <- umap_myeloid_df %>%
  group_by(cluster) %>%
  summarize(UMAP_1 = mean(UMAP_1), UMAP_2 = mean(UMAP_2))


# Plot with ggplot2
ggplot(umap_myeloid_df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_myeloid_df, aes(x = UMAP_1, y = UMAP_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

```

#lympho slingshot
```{r}
sce_lymph <- as.SingleCellExperiment(lympho_cells)
sce_lymph <- slingshot::slingshot(sce_lymph, clusterLabels = sce_lymph$RNA_snn_res.0.4, reducedDim = 'TSNE')
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
umap_lymph_df <- as.data.frame(reducedDim(sce_lymph, "TSNE"))
colnames(umap_lymph_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
umap_lymph_df$cluster <- sce_lymph$l1_annotations_res0.1
umap_lymph_df$pseudotime <- sce_lymph$slingPseudotime_1


# Calculate cluster centroids for labeling
label_lymph_df <- umap_lymph_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(umap_lymph_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_lymph_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

```
#TAL_DCT cell slingshot
```{r}
sce_tub <- as.SingleCellExperiment(TAL_DCT_cells)
sce_tub <- slingshot::slingshot(sce_tub, clusterLabels = sce_tub$RNA_snn_res.0.4, reducedDim = 'TSNE')
```

```{r}

# Extract data from SingleCellExperiment
tub_df <- as.data.frame(reducedDim(sce_tub, "TSNE"))
colnames(tub_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
tub_df$cluster <- sce_tub$l1_annotations_res0.1
tub_df$pseudotime <- sce_tub$slingPseudotime_1


# Calculate cluster centroids for labeling
label_tub_df <- tub_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(tub_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_tub_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [PT Cells] with Cluster Labels")

```

```{r}
Idents(obj_annot) <- 'RNA_snn_res.0.1'
DimPlot(obj_annot, reduction = "tsne", label = T,repel=T,  label.size = 5) + ggtitle("0.1 resolution clustering")
```


```{r}
obj_annot <- readRDS("C:/Users/evrajadh/OneDrive - Indiana University/Research/Recurrent UTI GFR/ScRNA/Rec_UTI/res 0.5_PCA 25_7-29/annotation object.RDS")

Idents(obj_annot) <- 'l1_annotations_res0.1'
DimPlot(obj_annot, reduction = "tsne", label = T,repel=T,  label.size = 5)
```
```{r, fig.width=3, fig.height=3}
Idents(obj_annot) <- 'grouping'
DimPlot(obj_annot, reduction = "tsne", label = T, label.size = 4, pt.size = 0.1) + NoLegend() + ggtitle('Clusters')
```
```{r}
Idents(obj_annot) <- 'grouping'
DoHeatmap(obj_annot, features = c("Nrep", "Map3k7cl", "Angptl7", "Hdc", "Slc6a18", "Tmigd1", "Slc22a12", "Slc22a6"), cells = "PT")
```
```{r, fig.height=12}
VlnPlot(obj_annot, features = c("Nrep", "Map3k7cl", "Angptl7", "Hdc", "Slc6a18", "Tmigd1", "Slc22a12", "Slc22a6"), raster = F, split.by = 'condition')
```
```{r, fig.width=6, fig.height=3}
FeaturePlot(obj_annot, features = "Nrep", reduction = "tsne", split.by = 'condition', pt.size = 0.8)
FeaturePlot(obj_annot, features = "Map3k7cl", reduction = "tsne", split.by = 'condition', pt.size = 0.8)
```

```{r, fig.width=5, fig.height=4}
VlnPlot(obj_annot, features = "Nrep", raster = F, split.by = 'condition')
VlnPlot(obj_annot, features = "Map3k7cl", raster = F, split.by = 'condition')
VlnPlot(obj_annot, features = "B2m", raster = F, split.by = 'condition')
```
```{r}
feat_scCKDatlas <- c("Cd79a", "Cd3d", "Prf1", "Cpa3", "Folr2", "Sema3g", "Sost", "Prox1", "Vcam1", "Plvap", "Rergl", "Cox4l2", "Dcn", "Postn", "Slc22a6", "Prune2", "Nos1", "Umod", "Calb1", "Slc4a1", "Upk3b", "Wt1", "Gfr3a")

DoHeatmap(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"))
```
```{r, fig.height=22}
FeaturePlot(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"), reduction = "tsne", split.by = 'condition', pt.size = 0.8)
VlnPlot(obj_annot, features = c("Cryab", "Sema5a", "Pdgfrb", "Dcdc2a", "Cp", "Nkg7", "Xcr1", "S100a9", "Lrp2", "Slc12a1", "Slc12a3"), raster = F, split.by = 'condition')
```
```{r, fig.height=25}

feat_PTinjury <- c("Slc5a12", "Slc34a1", "Slc22a30", "Col27a1", "Plin2", "Ypel2", "Bcat1", "Slc7a12", "Hsp90aa1", "Top2a", "Kcnip4", "Vcam1", "Havcr1")
#Healthy: Slc5a12, 45a1, 22a30, 7a13
#type 1 injured S1/2: Col27a1, Plin2
#Type 1 injured S3: Plin2, Havcr1
#type2 injured S1/2: Ypel2
#Type2 injured S3: ypel2, Bcat1, Slc7a12
#acute injury: Hsp90aa1
#Repairing: Top2a
#Failed repair: Ypel2, Kcnip4,Vcam1, Havcr1

FeaturePlot(obj_annot, features = feat_PTinjury, reduction = "tsne", split.by = 'condition', pt.size = 0.8)
VlnPlot(obj_annot, features = feat_PTinjury, raster = F, split.by = 'condition')

```


```{r}
#see which PT clusters are where on tsne (clusters 5,9, and 11 on res0.5)
Idents(obj_annot) <- 'RNA_snn_res.0.5'
DimPlot(obj_annot, reduction = "tsne", label = T)
```


```{r}
sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot(sce, clusterLabels = sce$l1_con, reducedDim = "TSNE")
curves <- slingCurves(sce)
```

```{r}

  
  df <- as.data.frame(reducedDim(sce, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")
df$pseudotime <- sce$slingPseudotime_1
df$cluster <- sce$l1_annotations_res0.1 

  p <- ggplot(df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  theme_minimal()
  
  for (i in seq_along(curves)) {
  curve_df <- as.data.frame(curves[[i]]$s)
  colnames(curve_df) <- c("TSNE_1", "TSNE_2")
  
  # Add arrows along the curve
  p <- p + geom_path(data = curve_df, aes(x = TSNE_1, y = TSNE_2),
                     arrow = arrow(type = "open", length = unit(0.15, "inches")),
                     color = "black", size = 0.8)
  }
  
  p
```
#PT cells also with condition separation
```{r}
sce <- as.SingleCellExperiment(pt_cells)
sce <- slingshot(sce, clusterLabels = sce$l1_condition, reducedDim = "TSNE")
curves <- slingCurves(sce)
```

```{r}

  
  df <- as.data.frame(reducedDim(sce, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")
df$pseudotime <- sce$slingPseudotime_1
df$cluster <- sce$l1_condition

  p <- ggplot(df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  theme_minimal()
  
  for (i in seq_along(curves)) {
  curve_df <- as.data.frame(curves[[i]]$s)
  colnames(curve_df) <- c("TSNE_1", "TSNE_2")
  
  # Add arrows along the curve
  p <- p + geom_path(data = curve_df, aes(x = TSNE_1, y = TSNE_2),
                     arrow = arrow(type = "open", length = unit(0.15, "inches")),
                     color = "black", size = 0.8)
  }
  
  p
```



```{r}
saveRDS(obj_annot, "annotation object.RDS")
```


```{r}
unique(merged_all_cons@meta.data$l1_annotations_res0.1)
```

```{r}
Idents(merged_all_cons) <- 'l1_annotations_res0.1'
```

```{r}
DimPlot(merged_all_cons, reduction = "umap")
```
```{r}
markers_test <- FindMarkers(merged_all_cons, grouping.var = "l1_annotations_res0.1", ident.1 = "Podocyte")
```

```{r}
markers_test
```

```{r}
DimPlot(merged_all_cons, label = T, repel = T)
```
















+++++++++++
```{r}
myeloid_df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(myeloid_df) <- c("TSNE_1", "TSNE_2")

myeloid_df$lineage <- is.na(sce_myeloid$slingPseudotime_1)

```

```{r}
ggplot(myeloid_df, aes(x = TSNE_1, y = TSNE_2, color = lineage)) +
  geom_point(size = 0.5) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "gray")) +
  theme_minimal() +
  labs(title = "Cells Assigned to Slingshot Lineage 1",
       color = "Assigned?")

```

```{r}
sce_all <- as.SingleCellExperiment(obj_annot)
sce_all <- slingshot::slingshot(sce_all, clusterLabels = sce_all$RNA_snn_res.0.4, reducedDim = 'TSNE')
```

```{r}
#For just PT cells and PT_EMT

# Extract data from SingleCellExperiment
tsne_all_df <- as.data.frame(reducedDim(sce_all, "TSNE"))
colnames(tsne_all_df) <- c("TSNE_1", "TSNE_2")  # Rename to readable names
tsne_all_df$cluster <- sce_all$l1_annotations_res0.1
tsne_all_df$pseudotime <- sce_all$slingPseudotime_1


# Calculate cluster centroids for labeling
label_all_tsne_df <- tsne_all_df %>%
  group_by(cluster) %>%
  summarize(TSNE_1 = mean(TSNE_1), TSNE_2 = mean(TSNE_2))


# Plot with ggplot2
ggplot(tsne_all_df, aes(x = TSNE_1, y = TSNE_2, color = pseudotime)) +
  geom_point(size = 0.5) +
  scale_color_viridis_c() +
  geom_text(data = label_all_tsne_df, aes(x = TSNE_1, y = TSNE_2, label = cluster), 
            color = "black", size = 4) +
  theme_minimal() +
  labs(x = "TSNE1", y = "TSNE 2", title = "Slingshot Pseudotime [All Cells] with Cluster Labels")

```
```{r}
tsne_all_df <- as.data.frame(reducedDim(sce_all, "TSNE"))
colnames(tsne_all_df) <- c("TSNE_1", "TSNE_2")

tsne_all_df$lineage <- is.na(sce_all$slingPseudotime_1)

```

```{r}
ggplot(tsne_all_df, aes(x = TSNE_1, y = TSNE_2, color = lineage)) +
  geom_point(size = 0.5) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "gray")) +
  theme_minimal() +
  labs(title = "Cells Assigned to Slingshot Lineage 1",
       color = "Assigned?")

```










```{r}
sce_myeloid$lineage <- is.na(sce_myeloid$slingPseudotime_1)
ggplot(..., color = lineage)  # TRUE = unassigned = gray

df <- as.data.frame(reducedDim(sce_myeloid, "TSNE"))
colnames(df) <- c("TSNE_1", "TSNE_2")

df$lineage <- is.na(sce_myeloid$slingPseudotime_1)


```







```{r}
plot(reducedDim(sce), col = sce$cluster, type = 'lineages')

sce$li
```























```{r}
plot_cells(cds, color_cells_by = "cluster")
plot_cells(cds, genes = c("Cdh2"), show_trajectory_graph = TRUE)
```










```{r}
+++++++++++

ggsave("resolution cluster tree.svg", plot = cluster.tree, width = 12)

cluster.tree 


UTI_0.5_markers_clust_annot <- SeuratWrappers::RunPrestoAll(merged_0.5_splitlist$UTI)

UTI_0.5_markers_list <- split(UTI_0.5_markers_clust_annot, UTI_0.5_markers_clust_annot[['cluster']])

for(i in names(UTI_0.5_markers_list)){
  aux <- UTI_0.5_markers_list[[i]]
  aux <- aux[aux$p_val_adj < 0.05,]
  aux <- aux[order(aux$avg_log2FC, decreasing=T),]
  UTI_all_markers_list[[i]] <- aux
}
```

#DE genes by condition at res 0.01
```{r}
merged_0.05$l0_condition <- paste0(merged_0.05$RNA_snn_res.0.01, '_', merged_0.05$condition)
Idents(merged_0.05) <- 'l0_condition'

DE_0.05_resol_condition <- SeuratWrappers::RunPresto(merged_0.05)
```




Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
