a lot of the code comes from https://github.com/jessicachung/endo_model_paper/blob/master/src/GSE141549_analysis.Rmd

because that is the code that was used to analyze GSE141549 for the paper. So the goal is to handle the data in the same way and then use the Endest R package. In the linked github the data is normalized and the the molecular time is determined using the get_molecular_time function. However Endest is supposed to do that same thing all in one function estimate_cycle_time.

the code for endest: https://github.com/jessicachung/endest

In the paper the model is evaluated on the GSE141549 data set to generate figure 4. So I thought I would be able to use Endest to recreate those results. But it is not working out.

https://www.nature.com/articles/s41467-023-41979-z#Fig4

get data

code from GSE141549_analysis.Rmd

gse_xlsx_file <- file.path("/Users/rorihoover/Desktop/Rprojects/GSE141549_batchCorrectednormalizedArrayscombined.xlsx")
gse_data <- read_excel(gse_xlsx_file) %>% janitor::clean_names()
all_probe_annotation <- illuminaHumanv3fullReannotation() %>%
  dplyr::select(IlluminaID, ProbeQuality, SymbolReannotated, EnsemblReannotated,
                EntrezReannotated) %>% janitor::clean_names()
gse_1 <- read_delim(file.path("/Users/rorihoover/Desktop/Rprojects/GSE141549-GPL10558_series_matrix.txt"), delim="\t", comment="!Series") %>%
  janitor::clean_names()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 34 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (73): !Sample_title, SAMPLE 332 PE, SAMPLE 333 DiEIn, SAMPLE 334 PE, SAM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gse_2 <- read_delim(file.path("/Users/rorihoover/Desktop/Rprojects/GSE141549-GPL13376_series_matrix.txt"), delim="\t", comment="!Series") %>%
  janitor::clean_names()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 34 Columns: 337
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (337): !Sample_title, SAMPLE 1 OMA, SAMPLE 2 PP, SAMPLE 3 PE, SAMPLE 4 D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stopifnot(gse_1$sample_title == gse_2$sample_title)
gse_phenotype_combined <- cbind(gse_1,  gse_2[,-1])
gse_phenotype <- gse_phenotype_combined %>%
  filter(sample_title %in% c("!Sample_geo_accession", "!Sample_source_name_ch1",
                          "!Sample_description", "!Sample_characteristics_ch1"))
rownames(gse_phenotype) <- c("geo_accession", "source", "tissue", "age",
                             "disease_stage", "cycle_phase", "description")

gse_phenotype <- gse_phenotype %>% t %>% data.frame(stringsAsFactors=FALSE)
gse_phenotype <- gse_phenotype[-1,] %>% tibble::rownames_to_column("sample_id")

gse_phenotype <- gse_phenotype %>%
  mutate(age=as.numeric(str_remove(age, "age: ")),
         disease_stage=str_remove(disease_stage, "disease stage: "),
         cycle_phase=str_remove(cycle_phase, "cycle phase: "),
         tissue=str_remove(tissue, "tissue: "))

expression data

code from GSE141549_analysis.Rmd

all_gse_matrix <- gse_data %>% select(-gene_symbol, -probe_id) %>% 
  as.matrix
rownames(all_gse_matrix) <- gse_data$probe_id
probe_info <- all_probe_annotation %>%
  filter(illumina_id %in% gse_data$probe_id,
         ! is.na(ensembl_reannotated),
         str_detect(probe_quality, "Good|Perfect"))

tmp <- probe_info %>% group_by(ensembl_reannotated) %>% summarise(n=n())
table(tmp$n)
## 
##     1     2     3     4     5     6     8    10 
## 12782  2858   692   168    28    12     1     1
single_ensembl <- tmp %>% filter(n == 1) %>% pull(ensembl_reannotated)
multiple_ensembl <- tmp %>% filter(n > 1) %>% pull(ensembl_reannotated)

m <- match(single_ensembl, probe_info$ensembl_reannotated)
single_probes <- probe_info[m, "illumina_id"]
single_exprs <- all_gse_matrix[single_probes,]
rownames(single_exprs) <- single_ensembl

# Consolidate ensembl IDs that have multiple probes
consolidated_exprs <- list()
for (ensembl_id in multiple_ensembl) {
  illumina_ids <- probe_info %>% 
    filter(ensembl_reannotated == ensembl_id) %>% pull(illumina_id)
  consolidated_exprs[[ensembl_id]] <- all_gse_matrix[illumina_ids,] %>% colMeans
}

# Combine single and multiple
gse_matrix <- rbind(
  do.call(rbind, consolidated_exprs),
  single_exprs)

gse_matrix[1:10,1:5]
##                 sample_332_pe sample_333_di_e_in sample_334_pe sample_335_pe_lb
## ENSG00000000003      9.948560           9.497258      9.470034         9.263269
## ENSG00000000457      7.746377           7.588266      7.650068         7.999423
## ENSG00000000938      8.783053           8.822535      9.602908         8.749572
## ENSG00000000971      8.694983          10.534457      8.702493         9.433396
## ENSG00000001084      8.208586           8.117399      8.096838         8.232088
## ENSG00000001167      7.328995           7.461156      7.548491         7.298389
## ENSG00000002330      8.750289           8.187478      8.187828         8.429237
## ENSG00000002745      6.508301           6.510290      6.761333         6.607068
## ENSG00000003147      9.129841           7.964991      9.440453         8.931206
## ENSG00000003436      8.464065           9.056110      9.172818         8.622319
##                 sample_336_pe
## ENSG00000000003      9.415012
## ENSG00000000457      7.645652
## ENSG00000000938      9.453189
## ENSG00000000971      8.045737
## ENSG00000001084      8.249323
## ENSG00000001167      7.810778
## ENSG00000002330      8.158730
## ENSG00000002745      6.680754
## ENSG00000003147      9.345195
## ENSG00000003436      9.413529

Applying Endest

Samples are dated 0-99:

0 ~start of menstruation 

~8 start of proliferative phase

~58 start of secretory phase

results <- estimate_cycle_time(exprs=gse_matrix)
## Using ensembl identifiers from rownames.
## Using 12674 / 16542 observations from the input expression matrix.
endest_value <- results$estimated_time
gse_phenotype$estimated_time <-endest_value
endometrium_samples <- gse_phenotype %>% 
  filter(source == "Endometrium") %>%
  select(sample_id, cycle_phase, estimated_time, description, source, tissue,
         age, disease_stage)
endometrium_samples %>% filter(cycle_phase %in% c("menstruation", "proliferative", "secretory")) %>%
  ggplot(aes(x=estimated_time)) +
  geom_histogram(binwidth = 2)

It seems skewed? in the paper the smallest proportion of samples are less than 10 but here i have most my data around 10.

  ggplot(endometrium_samples, aes(x=estimated_time, y=cycle_phase)) +
  geom_point()

it seems that medication samples are creating a lot of bulk on the lower end

I would expect to see a grouping for secretory between 58-99, menstruation 0-8 and proliferative 8-58. However that is not happening.

PCA of gene expression with author labeled cycle phase

dat <- endometrium_samples %>% filter(cycle_phase %in% c("menstruation", "proliferative", "secretory"))
pca <- prcomp(t(gse_matrix[,dat$sample_id]), scale.=FALSE)$x[,1:4]
pca <- as.data.frame(pca) %>% tibble::rownames_to_column("sample_id") %>%
  merge(dat, by="sample_id")

ggplot(pca, aes(x=PC1, y=PC2, color=cycle_phase)) +
  geom_point()

PCA of gene expression with endest labeled cycle time

ggplot(pca, aes(x = PC1, y = PC2, color = estimated_time)) +
  geom_point(size = 2) +
  scale_colour_gradientn(limits=c(0,100), colors=rainbow(20)[2:19])+
  ggtitle("recreating figure 4")

this looks nothing like figure 4 of the paper

Retry Endest but filter matrix first

It could be the case that i should not run the medication and uknown samples through Endest because those tend give low values

filtered_gse_matrix <- gse_matrix[, colnames(gse_matrix) %in% dat$sample_id]
#matrix contains only endometrium sample with cycle phase menstraul, proliferative or secretory 
results <- estimate_cycle_time(exprs=filtered_gse_matrix)
## Using ensembl identifiers from rownames.
## Using 12674 / 16542 observations from the input expression matrix.
endest_value2 <- results$estimated_time
dat$estimated_time2 <- endest_value2

ggplot(dat,aes(x=estimated_time2)) +
  geom_histogram(binwidth = 2)

ggplot(dat, aes(x=estimated_time2, y=cycle_phase)) +
  geom_point()

Still no expected grouping

# same pca as earlier just hear to look at it again
pca <- prcomp(t(filtered_gse_matrix[,dat$sample_id]), scale.=FALSE)$x[,1:4]
pca <- as.data.frame(pca) %>% tibble::rownames_to_column("sample_id") %>%
  merge(dat, by="sample_id")

ggplot(pca, aes(x=PC1, y=PC2, color=cycle_phase)) +
  geom_point()

ggplot(pca, aes(x = PC1, y = PC2, color = estimated_time2)) +
  geom_point(size = 2) +
  scale_colour_gradientn(limits=c(0,100), colors=rainbow(20)[2:19])+
  ggtitle("recreating figure 4 2nd try")

it still looks bad?

I could try imputing the original illumina probes into estimate_cycle_time instead of filtering for good probes and converting to ensembl. But i feel like that should not cause this drastic of an issue and furthermore filtering for good probes and converting to ensembl is what was done in the paper to analyze GSE141549.

Dr.Bartelle mentioned I should run Endest on the data used to train it and see what happens.