#human---------------------------------------------------------------
#r 3.6
setwd("~/Desktop/Math/Bioinformatics/final_project/filtered/human/")

##import the count data
hum = read.table("human_final.csv",header=TRUE,sep=",")
hum_final = data.frame(hum[,-1])
hum_final = data.frame(hum_final[,-1], row.names=hum_final[,1])

#reorder dataframes
order_hum = hum_final[ order(row.names(hum_final)), ]

#remove genes in less than 30 cells
rm_genes = which(rowSums(order_hum > 0) <30)
r2 = order_hum [-rm_genes,]

library(scran)
#create sce object
sce = SingleCellExperiment(list(counts = data.matrix(r2)))

#do clustering to reduce heterogeneity
clusters = quickCluster(sce, min.size=100)
## Warning: Setting 'use.ranks=TRUE' for the old defaults.
## Set 'use.ranks=FALSE' for the new defaults.
sce = computeSumFactors (sce, cluster = clusters)

#normalize, don't return log2
sce = normalize (sce, return_log = FALSE)

library(Seurat)

#create Seurat object using scran object
s_obj= CreateSeuratObject(counts = log(counts(sce) +1))

#find variable features to reduce run time
s_obj=FindVariableFeatures(s_obj)

#regress out batch
s_obj_nobatch = ScaleData(s_obj)
## Centering and scaling data matrix
#add metadata
s_obj_nobatch@meta.data[, "protocol"] <- "human"

#s_obj= ScaleData(s_obj, vars.to.regress = "orig.ident")
s_obj = RunPCA(s_obj_nobatch, features = VariableFeatures(s_obj))
## PC_ 1
## Positive:  285175, 1159, 51761, 3382, 8745, 6326, 6323, 4133, 22854, 2561
##     2904, 115827, 9699, 9568, 491, 4897, 65009, 6812, 55342, 2554
##     254531, 2903, 9118, 84940, 10745, 80144, 3786, 4628, 6616, 2977
## Negative:  51148, 4340, 9725, 4336, 5129, 55314, 115584, 4099, 57571, 60484
##     933, 745, 9705, 2065, 1298, 1307, 94015, 2628, 11240, 84735
##     1287, 9444, 347, 214, 5376, 5354, 6035, 8537, 3766, 2934
## PC_ 2
## Positive:  2572, 3815, 2571, 80309, 594855, 5649, 3479, 23236, 6092, 2890
##     10769, 1012, 132204, 11122, 57689, 129684, 3356, 84457, 56937, 2897
##     53826, 57624, 26050, 116443, 10752, 400120, 53335, 30819, 148, 6529
## Negative:  79789, 6474, 128553, 57084, 153579, 1286, 6934, 2185, 6696, 1135
##     1285, 85301, 113146, 151835, 10842, 131096, 6330, 5774, 8496, 10335
##     57451, 25780, 29091, 5979, 10777, 9899, 6900, 65983, 4744, 308
## PC_ 3
## Positive:  9638, 3925, 10314, 5168, 10085, 1410, 4155, 85445, 4336, 4099
##     4340, 5860, 5129, 57471, 57571, 7368, 8871, 7018, 60484, 115584
##     23446, 9705, 9771, 5305, 84735, 3920, 4124, 745, 6900, 54443
## Negative:  477, 64344, 6263, 5803, 677, 5625, 2697, 361, 3371, 54739
##     2261, 254295, 7026, 10659, 50509, 5015, 6563, 2152, 9628, 80162
##     718, 493, 6538, 165, 59352, 85301, 3687, 9699, 6886, 3280
## PC_ 4
## Positive:  64919, 24141, 5318, 7164, 3398, 1404, 4781, 388121, 885, 1268
##     11249, 27145, 10368, 3937, 56892, 2786, 30010, 2290, 121256, 1745
##     2066, 9547, 6656, 1749, 58494, 170302, 758, 94122, 6925, 1128
## Negative:  5015, 8403, 4212, 23414, 2625, 148281, 7026, 127343, 3321, 121551
##     613212, 4880, 55515, 90139, 56934, 56243, 646658, 9628, 493, 6096
##     9723, 100131897, 846, 5617, 401498, 120114, 3356, 25861, 1008, 80059
## PC_ 5
## Positive:  8829, 151647, 8671, 5803, 26280, 10100, 4430, 654502, 7903, 56956
##     2917, 5122, 1910, 407738, 57713, 83857, 845, 6480, 590, 3738
##     55075, 10611, 1463, 2915, 133584, 254887, 9452, 338596, 1075, 23328
## Negative:  22987, 7402, 153579, 10512, 5865, 440279, 130271, 1846, 117177, 9196
##     81832, 2195, 88, 345557, 3936, 92126, 10123, 131096, 55294, 6696
##     2903, 302, 2572, 56937, 3479, 3815, 80309, 85301, 5979, 594855
#s_obj_nobatch = RunPCA (s_obj_nobatch, features=VariableFeatures(s_obj_nobatch))

s_obj = FindNeighbors(s_obj)
## Computing nearest neighbor graph
## Computing SNN
s_obj = FindClusters(s_obj, resolution = 0.15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1576
## Number of edges: 43555
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9501
## Number of communities: 7
## Elapsed time: 0 seconds
table(Idents(s_obj))
##
##   0   1   2   3   4   5   6
## 700 217 185 141 123 121  89
s_obj = RunTSNE(s_obj)
DimPlot (s_obj, reduction = "tsne", label = TRUE)

#label clusters based on marker genes
current.cluster.ids = c(0, 1, 2, 3, 4, 5, 6)
new.cluster.ids = c("Glutamatergic neuron", "Glutamatergic neuron", "Oligodendrocyte", "GABAergic neuron",
                    "Glutamatergic neuron", "Astrocyte", "GABAergic neuron")
names(x = new.cluster.ids) <- levels(x = s_obj)
pbmc <- RenameIdents(object = s_obj, new.cluster.ids)
DimPlot(object = pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()

DimPlot(object = pbmc, reduction = "tsne", group.by = "protocol", pt.size = 0.5) + NoLegend()

pbmc_human = pbmc

#mouse---------------------------------------------------------------
mus = read.table("mouse_final.csv",header=TRUE,sep=",")
mus = mus[2:1998]
mus_final = data.frame(mus[-1997],row.names=mus[,1997])

#reorder dataframes
order_mus = mus_final[ order(row.names(mus_final)), ]

#remove genes in less than 30 cells
rm_genes = which(rowSums(order_mus > 0) <30)
r2 = order_mus [-rm_genes,]

library(scran)
#create sce object
sce = SingleCellExperiment(list(counts = data.matrix(r2)))

#do clustering to reduce heterogeneity
clusters = quickCluster(sce, min.size=100)
## Warning: Setting 'use.ranks=TRUE' for the old defaults.
## Set 'use.ranks=FALSE' for the new defaults.
sce = computeSumFactors (sce, cluster = clusters)

#normalize, don't return log2
sce = normalize (sce, return_log = FALSE)

library(Seurat)

#create Seurat object using scran object
s_obj= CreateSeuratObject(counts = log(counts(sce) +1))

#find variable features to reduce run time
s_obj=FindVariableFeatures(s_obj)

#regress out batch
s_obj_nobatch = ScaleData(s_obj)
## Centering and scaling data matrix
#add metadata
s_obj_nobatch@meta.data[, "protocol"] <- "mouse"

#s_obj= ScaleData(s_obj, vars.to.regress = "orig.ident")
s_obj = RunPCA(s_obj_nobatch, features = VariableFeatures(s_obj))
## PC_ 1
## Positive:  25797, 967, 6529, 10396, 65108, 214, 221692, 357, 2628, 338645
##     9839, 55619, 116173, 140576, 116448, 3766, 928, 8577, 349149, 6663
##     2571, 84457, 9516, 256987, 8537, 1410, 26137, 29968, 2791, 6284
## Negative:  2904, 7070, 11113, 54704, 64753, 6327, 57084, 104, 51299, 2563
##     5774, 10125, 51655, 8851, 259217, 4254, 9806, 491, 5775, 885
##     5582, 57030, 3747, 2557, 781, 6474, 22854, 326624, 171024, 5580
## PC_ 2
## Positive:  5730, 51555, 5629, 57480, 79789, 83700, 81615, 6446, 9516, 140576
##     85461, 349149, 9957, 928, 3766, 79152, 116448, 6663, 116984, 2791
##     23302, 79929, 8537, 81619, 11145, 116173, 1410, 6934, 114795, 5010
## Negative:  56853, 222865, 138046, 55364, 55531, 534, 10857, 140679, 50863, 2572
##     644353, 140767, 11130, 51617, 7857, 2571, 54913, 642968, 63908, 2823
##     50632, 56255, 25791, 7991, 5464, 4345, 5015, 11180, 2915, 348980
## PC_ 3
## Positive:  493, 256130, 347731, 112609, 30010, 140689, 8001, 4821, 5618, 345557
##     80309, 6014, 134121, 6860, 794, 3358, 266812, 53826, 6252, 83988
##     5100, 55638, 6751, 6752, 9079, 147381, 6001, 2066, 1756, 100127206
## Negative:  440279, 10186, 285025, 6096, 10590, 3479, 146760, 3708, 51176, 7138
##     9734, 127343, 160851, 2570, 1142, 594855, 8973, 401498, 54769, 389941
##     10418, 79623, 220441, 6792, 79698, 3983, 4081, 92211, 9066, 64881
## PC_ 4
## Positive:  10382, 5010, 4340, 94274, 443, 128218, 23632, 283298, 8560, 2705
##     11226, 9725, 94015, 93377, 3732, 57471, 348938, 55353, 7135, 79899
##     7130, 347, 9705, 81551, 4336, 51031, 5764, 51097, 9717, 4099
## Negative:  6496, 6252, 30010, 6860, 493, 6751, 112609, 53826, 3358, 140689
##     9951, 6001, 256130, 6014, 5618, 200150, 5100, 407738, 894, 5176
##     116372, 2918, 80309, 147381, 1993, 345557, 5013, 347731, 1756, 134121
## PC_ 5
## Positive:  443, 57471, 94274, 2861, 3925, 283298, 9725, 23632, 4336, 60484
##     128218, 11226, 93377, 1902, 79899, 1992, 7135, 7018, 11076, 256472
##     9717, 90865, 1949, 219699, 79739, 118611, 348938, 54739, 3732, 7111
## Negative:  129807, 1464, 55273, 10882, 5156, 9283, 84632, 857, 55553, 2840
##     26032, 11341, 10203, 29995, 1462, 2173, 129080, 55765, 4166, 84879
##     89958, 11098, 1265, 10991, 221749, 6017, 9865, 28959, 55365, 477
#s_obj_nobatch = RunPCA (s_obj_nobatch, features=VariableFeatures(s_obj_nobatch))

s_obj = FindNeighbors(s_obj)
## Computing nearest neighbor graph
## Computing SNN
s_obj = FindClusters(s_obj, resolution = 0.15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1996
## Number of edges: 57766
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9489
## Number of communities: 7
## Elapsed time: 0 seconds
table(Idents(s_obj))
##
##   0   1   2   3   4   5   6
## 689 436 435 216  92  85  43
s_obj = RunTSNE(s_obj)
DimPlot (s_obj, reduction = "tsne", label = TRUE)

#label clusters based on marker genes
current.cluster.ids = c(0, 1, 2, 3, 4, 5, 6)
new.cluster.ids = c("Glutamatergic neuron", "GABAergic neuron", "Glutamatergic neuron", "GABAergic neuron",
                    "Glutamatergic neuron","Oligodendrocyte","GABAergic neuron")
names(x = new.cluster.ids) <- levels(x = s_obj)
pbmc <- RenameIdents(object = s_obj, new.cluster.ids)
DimPlot(object = pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()

pbmc_mouse = pbmc

#integration---------------------------------------------------------------
pbmc.combined = merge(pbmc_mouse, y = pbmc_human, add.cell.ids = c("mouse", "human"), project = "protocol")
species.list <- SplitObject(pbmc.combined, split.by = "protocol")
reference.list <- species.list[c("human", "mouse")]
species.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
## Computing 2000 integration features
## No variable features found for object1 in the object.list. Running FindVariableFeatures ...
## No variable features found for object2 in the object.list. Running FindVariableFeatures ...
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4954 anchors
## Filtering anchors
##  Retained 1909 anchors
## Extracting within-dataset neighbors
species.integrated <- IntegrateData(anchorset = species.anchors, dims = 1:30)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
DefaultAssay(species.integrated) <- "integrated"

# Run the standard workflow for visualization and clustering
library(cowplot)
species.integrated <- ScaleData(species.integrated, verbose = FALSE)
species.integrated <- RunPCA(species.integrated, npcs = 30, verbose = FALSE)
species.integrated <- RunTSNE(species.integrated, dims = 1:30)
p1 <- DimPlot(species.integrated, reduction = "tsne", group.by = "protocol")
plot(p1)

p2 <- DimPlot(species.integrated, reduction = "tsne", group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
  NoLegend()
plot(p2)

plot_grid(p1, p2)

#annotate based on markers---------------------------------------------------
#gabaergic
FeaturePlot(object = species.integrated, features= c("2572", "2571"), min.cutoff = "q9", pt.size = 0.5)

#oligodendrocyte
FeaturePlot(object = species.integrated, features = c("4155", "4340","6507"), min.cutoff = "q9", pt.size = 0.5)

#glutametergic
FeaturePlot(object = species.integrated, features = c("57084", "2904","2744"), min.cutoff = "q9", pt.size = 0.5)

#Astrocyte
FeaturePlot(object = species.integrated, features = c("361"), min.cutoff = "q9", pt.size = 0.5)

#
FeaturePlot(object = species.integrated, features = c("5156"), min.cutoff = "q9", pt.size = 0.5)

new.ident <- c("Glutamatergic neuron", "GABAergic neuron", "Oligodendrocyte", "GABAergic neuron", "?", "Oligodendrocyte", "GABAergic neuron")

levels(x = species.integrated)
## [1] "Astrocyte"            "GABAergic neuron"     "Glutamatergic neuron"
## [4] "Oligodendrocyte"
p2 <- DimPlot(species.integrated, reduction = "tsne", label = TRUE, repel = TRUE) +
  NoLegend()
plot(p2)

plot_grid(p1, p2)

freq_table <- prop.table(x = table(species.integrated@active.ident, species.integrated@meta.data[, "protocol"]),
                         margin = 2)
barplot(height = freq_table)

freq_table
##
##                             human      mouse
##   Astrocyte            0.07677665 0.00000000
##   GABAergic neuron     0.14593909 0.34819639
##   Glutamatergic neuron 0.65989848 0.60921844
##   Oligodendrocyte      0.11738579 0.04258517