Project Goals

What is a Geoduck?

Load Geoduck Transcriptome

salmonMatrix<-read.table(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"

goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')

### <b> 
pgenerosa_simple_goslims<-salmonMatrix %>% 
  left_join(goslims, by=c("gene"))
### </b>

knitr::kable(head(pgenerosa_simple_goslims[1:5, 1:5]), "simple")
target_id gene ctenidia gonad heart
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000
PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000
PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272
#kable(head(pgenerosa_simple_goslims, n = 5))
#kable(pgenerosa_simple_goslims[1:5,], caption = "Pgenerosa Transcriptome")
##         target_id            gene ctenidia    gonad    heart juvenile   larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612
##         target_id            gene   tissue      tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179
P.generosa long format
target_id gene tissue tpm binary accessions
PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000 0 Q86IC9;Q552T5
PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000 0 P04177
PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494 1 NA
PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506 1 NA
PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251 1 Q8L840;O04092;Q9FT71
PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179 1 Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7

Break cells by semi-colon and return only distinct values

#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
  ### <b>
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";")))  %>% distinct(Term_unique, .keep_all = TRUE) %>%
  mutate(GOslim_unique = lapply(strsplit(as.character(GOslim),";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>% 
  distinct(GOslim_unique, .keep_all = TRUE)
### </b>

knitr::kable(head(distinct_goslims[1:5, 1:5]), "simple")
target_id gene ctenidia gonad heart
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272
PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000
PGEN_.00g000080 PGEN_.00g000080 0.000000 0.000000 0.119488

Get GOterm counts

#count top GOterms
tissue_goslims <- pgenerosa_long %>%
    group_by(tissue) %>% 
  ### <b>
  filter(binary == 1) %>%
  ### </b>
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>%
  separate_rows(Term_unique, sep = ";") %>%
  mutate(Term_unique = trimws(Term_unique)) %>%
  count(Term_unique, tissue, sort = TRUE)

kable(tissue_goslims[1:5,], caption="GOterms by tissue")
GOterms by tissue
tissue Term_unique n
juvenile anatomical structure development 1817
larvae anatomical structure development 1743
heart anatomical structure development 1678
ctenidia anatomical structure development 1661
juvenile signaling 1572

SPECIES

gene Scaffold Mercenaria_gene V3 V4
PGEN_.00g000040 Scaffold_01:55792-67546 XM_045314054.2 84.706 170
PGEN_.00g000050 Scaffold_01:67586-69113 XM_053546612.1 83.056 602
PGEN_.00g000060 Scaffold_01:70713-81099 XM_053526799.1 75.616 406
PGEN_.00g000270 Scaffold_01:344820-379444 XM_053526799.1 76.190 609
PGEN_.00g000280 Scaffold_01:402378-436712 XR_008368031.1 83.088 136

ex) Annotated Mercenaria Transcriptome

#join clam transcriptome with goslims
Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
  left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
### <b>
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099
### </b>
knitr::kable(head(Pgen_Mercenaria_goslims[1:5, 1:5]), "simple",  caption="annotated Mercenaria Transcriptome")
annotated Mercenaria Transcriptome
gene Scaffold Mercenaria_gene V3 V4
PGEN_.00g000040 Scaffold_01:55792-67546 XM_045314054.2 84.706 170
PGEN_.00g000050 Scaffold_01:67586-69113 XM_053546612.1 83.056 602
PGEN_.00g000060 Scaffold_01:70713-81099 XM_053526799.1 75.616 406
PGEN_.00g000270 Scaffold_01:344820-379444 XM_053526799.1 76.190 609
PGEN_.00g000280 Scaffold_01:402378-436712 XR_008368031.1 83.088 136

View GOterm counts by Species

P. generosa
Term_unique n
anatomical structure development 1853
signaling 1603
cell differentiation 1281
DNA-templated transcription 741
regulation of DNA-templated transcription 678

GOterm counts for all Pgenerosa genes

knitr::kable(term_count_best[1:10,], caption="GOterms count, removing duplicates")
GOterms count, removing duplicates
transcriptome Term_unique n
Pgenerosa anatomical structure development 1853
Pgenerosa signaling 1603
Pgenerosa cell differentiation 1281
Pgenerosa DNA-templated transcription 741
Pgenerosa regulation of DNA-templated transcription 678
Pgenerosa immune system process 670
Pgenerosa programmed cell death 600
Pgenerosa cell motility 595
Pgenerosa reproductive process 536
Pgenerosa cytoskeleton organization 516
Total Terms per Species
transcriptome total_n
Manila 1235
Manila_r 1390
Mercenaria 2198
Mercenaria_r 7027
Pgenerosa 17611
NA NA
NA NA
Species Normalized by Pgenerosa Transcriptome
transcriptome Term_unique n_species n_divided n_Pgen
Manila anatomical structure development 129 0.1044534 1853
Manila signaling 113 0.0914980 1603
Manila cell differentiation 86 0.0696356 1281
Manila cytoskeleton organization 65 0.0526316 516
Manila DNA-templated transcription 51 0.0412955 741

Normalize Species by Pgenerosa transcriptome

### <b>
filtered_data1 <- term_count_best %>%
  filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
  mutate(n_divided = case_when(
                               transcriptome == "Manila" ~ n/1235,
                               transcriptome == "Manila_r" ~ n/1390,
                               transcriptome == "Mercenaria" ~ n/2198,
                               transcriptome == "Mercenaria_r" ~ n/7027))
### </b>

knitr::kable(species_normal[1:5, 1:5],caption="Species Normalized by Pgenerosa Transcriptome")
Species Normalized by Pgenerosa Transcriptome
transcriptome Term_unique n_species n_divided n_Pgen
Manila anatomical structure development 129 0.1044534 1853
Manila signaling 113 0.0914980 1603
Manila cell differentiation 86 0.0696356 1281
Manila cytoskeleton organization 65 0.0526316 516
Manila DNA-templated transcription 51 0.0412955 741

Download Clam Genomes

load genomes from blast output

Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)

Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)

Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)

Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)

Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)

get summary data for genomes

#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"

Mercenaria_summary <- Mercenaria_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"

Quadrangularis_summary <- Quadrangularis_new %>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Manila 
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"

Manila_summary <- Manila_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"

Marissincia_summary <- Marissincia_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"

Solida_summary <- Solida_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

knitr::kable(head(Solida_summary[1:5, 1:5]), "simple")
gene Scaffold V2 V3 V4
PGEN_.00g000060 Scaffold_01:70713-81099 OX365947.1 74.279 416
PGEN_.00g000270 Scaffold_01:344820-379444 OX365949.1 75.163 306
PGEN_.00g000320 Scaffold_01:571666-599482 OX365950.1 79.545 220
PGEN_.00g000670 Scaffold_01:1389639-1400328 OX365948.1 87.857 140
PGEN_.00g001410 Scaffold_01:2961919-2993436 OX365952.1 79.126 206

make tables for clam genomes

clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")

knitr::kable(head(clam_genomes[1:5, 1:5]), "simple")
clam gene Scaffold V2 V3
Solida_new PGEN_.00g000060 Scaffold_01:70713-81099 OX365947.1 74.279
Solida_new PGEN_.00g000270 Scaffold_01:344820-379444 OX365949.1 75.163
Solida_new PGEN_.00g000300 Scaffold_01:503707-557915 OX365947.1 83.648
Solida_new PGEN_.00g000320 Scaffold_01:571666-599482 OX365950.1 79.545
Solida_new PGEN_.00g000670 Scaffold_01:1389639-1400328 OX365948.1 87.857

TISSUES

  • gonad
  • ctenidia
  • heart
  • juvenile
  • larvae
Total Terms per Tissue(s)
tissue total_n
ctenidia 15911
gonad 14715
heart 16021
juvenile 17277
larvae 16632
NA NA

Normalized Tissue Data

filtered_data <- tissue_goslims %>%
  filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
  ### <b>
  mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
                               tissue == "heart" ~ n/16021, 
                               tissue == "gonad" ~ n/14715, 
                               tissue == "larvae" ~ n/16632,
                               tissue == "juvenile" ~ n/17277)) 
### </b>
kable(Tis_normal[1:7,])
tissue Term_unique n_tissue n_divided n_Pgen Pgen_normalized normalized log_scale
juvenile anatomical structure development 1817 0.1051687 1853 0.1052183 0.9995285 -0.0004716
larvae anatomical structure development 1743 0.1047980 1853 0.1052183 0.9960050 -0.0040030
heart anatomical structure development 1678 0.1047375 1853 0.1052183 0.9954305 -0.0045800
ctenidia anatomical structure development 1661 0.1043932 1853 0.1052183 0.9921578 -0.0078731
juvenile signaling 1572 0.0909880 1603 0.0910227 0.9996195 -0.0003806
gonad anatomical structure development 1524 0.1035678 1853 0.1052183 0.9843132 -0.0158112
larvae signaling 1501 0.0902477 1603 0.0910227 0.9914863 -0.0085502

Species Heatmap normalized by P. generosa transcriptopme where white = 0.9

Tissue Heatmap normalized by P. generosa transcriptome where white = 0.9

Plan for the next couple weeks:

Week 1 - presented research at National Shellfish Association Meeting

Week 2 - Got COVID, started downloading clam transcriptomes

Week 3 - gather genes from each tissue type, build sh files for blasting manilla clam and mercenaria against pgen in Mox, Run stand blasts with Manilla, Pgen and Mercenaria build database for Manilla, Pgen and Mercenaria, start writing methods

Week 4 - dowload to raven the genomic and transctiptomic information for 6 clam species, find transcriptomes for introduction, break goslims into seperate columns, join blast tables with annotation tables

Week 5 - added new normalizing gene count column for heatmapping, finished writing draft 1 of results, working on methods, finish introduction

Week 6 - finish writing introduction/discussion for geoduck paper

Week 7 - address edits from Steven for CS work, run blast against other clam transcriptomes (step 8), make table for compairative clam analysis, update methods and results in geoduck paper

Week 8 -

Week 9 -

Week 10 - hopefully nothing, prepare for graduation!