Zenodo release: DOI

1 Comparative Genomic and Proteomic Analysis of Geoduck clam (Panopea generosa)

  • Geoduck Genome already assembled in Putnum et al. 2022
  • Geoduck Transcritome – genome linked transcriptome link here
  • Comparative Transcriptome Analysis
    1. gonad: reproductive genes
    2. heart: circulatory genes
    3. ctenidia: respiratory genes
  • Reproductive Transcriptome Analysis
    1. larvae
    2. juvenile
    3. Dheilly et al. 2012 Pacific oyster reproductive genes (female gonad)
  • Comparative Species Transcriptome Analysis
    1. Mercenaria mercenaria
    2. Manila Clam (Ruditapes philippinarum)
  • Comparative Species Genome Analysis
    1. Mercenaria mercenaria
    2. Manila Clam (Ruditapes philippinarum)
    3. Archivesica marissinica
    4. Spinsula solida
    5. Mactra quadrangularis

2 LOAD DATA

2.1 load new genome linked transcriptome PGEN

salmonMatrix<-read.csv(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"
data.table(salmonMatrix)
##              target_id            gene ctenidia    gonad    heart juvenile
##     1: PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851
##     2: PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164
##     3: PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598
##     4: PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364
##     5: PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728
##    ---                                                                    
## 34943: PGEN_.00g349430 PGEN_.00g349430 0.000000 0.000000 0.000000 0.000000
## 34944: PGEN_.00g349440 PGEN_.00g349440 1.955226 1.489901 1.621033 0.000000
## 34945: PGEN_.00g349450 PGEN_.00g349450 0.000000 0.000000 0.159121 0.000000
## 34946: PGEN_.00g349460 PGEN_.00g349460 0.000000 0.000000 0.000000 0.000000
## 34947: PGEN_.00g349470 PGEN_.00g349470 0.000000 0.000000 0.000000 0.000000
##          larvae
##     1: 3.423667
##     2: 0.370266
##     3: 0.052545
##     4: 1.131114
##     5: 1.117655
##    ---         
## 34943: 0.000000
## 34944: 2.222424
## 34945: 0.000000
## 34946: 0.000000
## 34947: 0.000000
#34,947

2.2 load goslim mapping from Swiss Prott

link here

#double check to make sure this is reading the data correctly
goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')
#14,672
presence_count <- sum(!is.na(goslims[, 11]) & goslims[, 11] != "") #10,903

2.3 download other transcriptome from clam species

#from Mox blasting Manilla/Mercenaria on Pgen
Manilla_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Manilla_Pgenenerosa_blastx.tab")
Mercenaria_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Mercenaria_Pgenenerosa_blastx.tab")

2.4 load genomes from blast output

Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)

Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)

Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)

Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)

Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)

kable(Solida_genome[1:10,1:5])
V1 V2 V3 V4 V5
PGEN_.00g000060::Scaffold_01:70713-81099 OX365947.1 74.279 416 84
PGEN_.00g000270::Scaffold_01:344820-379444 OX365949.1 75.163 306 58
PGEN_.00g000300::Scaffold_01:503707-557915 OX365947.1 83.648 159 16
PGEN_.00g000320::Scaffold_01:571666-599482 OX365950.1 79.545 220 35
PGEN_.00g000670::Scaffold_01:1389639-1400328 OX365948.1 87.857 140 17
PGEN_.00g001140::Scaffold_01:2257307-2316262 OX365948.1 78.487 423 82
PGEN_.00g001170::Scaffold_01:2366888-2417042 OX365948.1 82.443 131 23
PGEN_.00g001350::Scaffold_01:2794647-2838542 OX365948.1 76.093 343 78
PGEN_.00g001380::Scaffold_01:2878758-2888130 OX365948.1 78.431 204 43
PGEN_.00g001410::Scaffold_01:2961919-2993436 OX365952.1 79.126 206 35

2.4.1 get summary data for genomes

#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"

Mercenaria_summary <- Mercenaria_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"

Quadrangularis_summary <- Quadrangularis_new %>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Manila 
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"

Manila_summary <- Manila_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"

Marissincia_summary <- Marissincia_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"

Solida_summary <- Solida_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

2.5 make tables for clam genomes

clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")
kable(clam_genomes[1:5,1:5])
clam gene Scaffold V2 V3
Solida_new PGEN_.00g000060 Scaffold_01:70713-81099 OX365947.1 74.279
Solida_new PGEN_.00g000270 Scaffold_01:344820-379444 OX365949.1 75.163
Solida_new PGEN_.00g000300 Scaffold_01:503707-557915 OX365947.1 83.648
Solida_new PGEN_.00g000320 Scaffold_01:571666-599482 OX365950.1 79.545
Solida_new PGEN_.00g000670 Scaffold_01:1389639-1400328 OX365948.1 87.857

3 JOIN DATA

3.1 combine all clams for scatterplot by e-value

#all clam transcriptomes
#Reminder: Merc->Pgen = 6521, Manila->Pgen = 761, Pgen -> Merc = 5,099, Pgen -> Manila = 657
#always join large -> small
compairative_clam_transcripts_right <- left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene") 

compairative_clam_transcripts_left <- left_join(Pgen_Mercenaria_clean, Pgen_Manila_clean, by = "gene") 

4 join clam transcriptome with goslims

Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
  left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099

5 long format

tidy_data<-salmonMatrix
head(tidy_data)
##         target_id            gene ctenidia    gonad    heart juvenile   larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612
tidy_data_long<-tidy_data %>%
  gather(key="tissue", value="tpm", ctenidia:larvae) #name key as time, name value as tpm, gather columns heart:larvae, converts wide to long format
names(tidy_data_long)[2]<-"gene"
head(tidy_data_long)
##         target_id            gene   tissue      tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179
tidy_data_long$binary=ifelse(tidy_data_long$tpm > 0, 1, 0)

#add goslims below

6 combine datasets into one gene annotation file

pgenerosa_simple_goslims<-salmonMatrix %>% #best pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims<-salmonMatrix %>%
  left_join(goslims, by=c("gene"))
pgenerosa_long<-tidy_data_long %>% #best long pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims_long<-tidy_data_long %>%
  left_join(pgenerosa_simple_goslims, by=c("gene"))
kable(pgenerosa_simple_goslims[1:10, 1:10])
target_id gene ctenidia gonad heart juvenile larvae accessions gene_id gene_name
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667 Q86IC9;Q552T5 8620183 omt5
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266 P04177 25085 Th
PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545 NA NA NA
PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114 NA NA NA
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655 Q8L840;O04092;Q9FT71 837636 RECQL4A
PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612 Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 18080 Nin
PGEN_.00g000070 PGEN_.00g000070 0.000000 0.000000 0.000000 0.025013 1.103535 NA NA NA
PGEN_.00g000080 PGEN_.00g000080 0.000000 0.000000 0.119488 0.229570 0.000000 A1E2V0 489433 BIRC3
PGEN_.00g000090 PGEN_.00g000090 0.187374 0.000000 0.000000 0.173312 0.000000 P34456 186266
PGEN_.00g000100 PGEN_.00g000100 0.000000 0.000000 0.000000 0.304322 0.000000 NA NA NA

6.1 break semi-colon in Term column and count unique values only

6.1.1 distinct goslims

# check for duplicate values within each cell using strsplit() and anyDuplicated()
duplicates <- sapply(strsplit(pgenerosa_simple_goslims$gene, ";"), anyDuplicated)

# print the row indices of cells with duplicate values
if (any(duplicates)) {
  print(which(duplicates > 0))
} else {
  print("No duplicates found")
}
## [1] "No duplicates found"
#accession numbers = no duplicates
#gene_ID = no duplicates
#allGoID duplicates = column 24458
#BP_GO_ID duplicates = column 24458
#CC_GO_ID duplicates = no duplicates
#MF_GO_ID duplicates = columns 4561  8040 13044 16138
#--------------------------------
#GOslim duplicates lots and lots 
#Terms duplicates lots and lots 

#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>%
  mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>% 
  distinct(GOslim_unique, .keep_all = TRUE)

kable(distinct_goslims[1:10,1:10])
target_id gene ctenidia gonad heart juvenile larvae accessions gene_id gene_name
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667 Q86IC9;Q552T5 8620183 omt5
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266 P04177 25085 Th
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655 Q8L840;O04092;Q9FT71 837636 RECQL4A
PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612 Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 18080 Nin
PGEN_.00g000080 PGEN_.00g000080 0.000000 0.000000 0.119488 0.229570 0.000000 A1E2V0 489433 BIRC3
PGEN_.00g000090 PGEN_.00g000090 0.187374 0.000000 0.000000 0.173312 0.000000 P34456 186266
PGEN_.00g000210 PGEN_.00g000210 2.810603 0.000000 2.775616 3.495122 0.898959 O00463;B4DIS9;B4E0A2;Q6FHY1 7188 TRAF5
PGEN_.00g000300 PGEN_.00g000300 4.091597 2.739254 3.785783 5.053620 1.087948 Q5REG4 100171717 DTX3
PGEN_.00g000380 PGEN_.00g000380 0.475060 0.000000 0.000000 0.315384 0.000000 Q8QG60;Q8QG52;Q8QGQ5 374092 CRY2
PGEN_.00g000440 PGEN_.00g000440 11.483098 5.728598 3.123012 6.873106 10.243535 Q9H583;Q5T3Q8;Q6P197;Q9NW23 55127 HEATR1

7 SPECIES

7.1 Separate the Term columb by semi-colons to count occurances for table

# For Pgenerosa alone (wide format) 
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen <- distinct_goslims %>% #make sure you have removed duplicate values beforehand
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen)
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows
# For Pgenerosa using pgenerosa_long (versus wide)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen_long <- pgenerosa_long %>%  # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen_long)
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows
#exactly the same as Pgen_wide


#for Manila Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_manila <- Manila_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila)
## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            129
##  2 signaling                                   113
##  3 cell differentiation                         86
##  4 cytoskeleton organization                    65
##  5 DNA-templated transcription                  51
##  6 reproductive process                         48
##  7 regulation of DNA-templated transcription    47
##  8 cell motility                                40
##  9 programmed cell death                        40
## 10 protein-containing complex assembly          39
## # ℹ 55 more rows
# Manila Reverse
term_count_manila_r <- Pgen_Manila_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila_r)
## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            149
##  2 signaling                                   126
##  3 cell differentiation                         96
##  4 cytoskeleton organization                    68
##  5 DNA-templated transcription                  60
##  6 regulation of DNA-templated transcription    55
##  7 reproductive process                         55
##  8 programmed cell death                        44
##  9 immune system process                        42
## 10 protein-containing complex assembly          42
## # ℹ 55 more rows
#for Mercenaria Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria <- Mercenaria_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria)
## # A tibble: 69 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            247
##  2 signaling                                   189
##  3 cell differentiation                        170
##  4 DNA-templated transcription                  88
##  5 cytoskeleton organization                    87
##  6 cell motility                                83
##  7 regulation of DNA-templated transcription    82
##  8 reproductive process                         71
##  9 immune system process                        66
## 10 programmed cell death                        63
## # ℹ 59 more rows
#for Mercenaria Clam reverse
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria_r <- Pgen_Mercenaria_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria_r) #over 3x more terms than Mercenaria not reversed...wonder why that is....
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            768
##  2 signaling                                   635
##  3 cell differentiation                        517
##  4 DNA-templated transcription                 267
##  5 regulation of DNA-templated transcription   245
##  6 cell motility                               241
##  7 cytoskeleton organization                   237
##  8 immune system process                       232
##  9 programmed cell death                       229
## 10 vesicle-mediated transport                  218
## # ℹ 61 more rows
#combine all 5 datasets
term_count_all <- bind_rows(term_count_Pgen, term_count_manila, term_count_manila_r, term_count_Mercenaria, term_count_Mercenaria_r, .id = "transcriptome") 

term_count_best <- term_count_all %>% 
  mutate(transcriptome = ifelse(transcriptome == 1, "Pgenerosa", 
                    ifelse(transcriptome == 2, "Manila",
                           ifelse(transcriptome == 3, "Manila_r", 
                                  ifelse(transcriptome == 4, "Mercenaria", 
                                         ifelse(transcriptome == 5, "Mercenaria_r", NA))))))

7.2 NORMALIZE

#normalize by Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

normal_counts<- term_count_best %>%
  group_by(transcriptome) %>%
  summarise(total_n = sum(n))
#Pgen = 17,611
#Manila = 1235
#Manila_r = 1390
#Mercenaria = 2198
#Mercenaria_r = 7027


filtered_data1 <- term_count_best %>%
  filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
  mutate(n_divided = case_when(
                               transcriptome == "Manila" ~ n/1235,
                               transcriptome == "Manila_r" ~ n/1390,
                               transcriptome == "Mercenaria" ~ n/2198,
                               transcriptome == "Mercenaria_r" ~ n/7027))

grouped_data1<-left_join(filtered_data1, Pgen_normalized, by="Term_unique")

species_normal<-grouped_data1 %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_species = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(species_normal[1:5,1:5])
transcriptome Term_unique n_species n_divided n_Pgen
Manila anatomical structure development 129 0.1044534 1853
Manila signaling 113 0.0914980 1603
Manila cell differentiation 86 0.0696356 1281
Manila cytoskeleton organization 65 0.0526316 516
Manila DNA-templated transcription 51 0.0412955 741

7.3 count GOterms unique to species

#####make new variables with binary data

#wide format data

Manila_2<-Manila_Pgen_goslims %>%
  select(-c(3:11, 13:23))

Mercenaria_2<-Mercenaria_Pgen_goslims %>%
   select(-c(3:11, 13:23))

species_joined<-left_join(Mercenaria_2, Manila_2, by="gene")
## Warning in left_join(Mercenaria_2, Manila_2, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#Mercenaria
species_joined$`e-value.x`=ifelse(species_joined$`e-value.x` > 0, 1, 0)
#Manila
species_joined$`e-value.y`=ifelse(species_joined$`e-value.y` > 0, 1, 0)

species_matrix<-species_joined %>% select(gene, `e-value.x`, `e-value.y`) %>%
  rename(Mercenaria_evalue = `e-value.x`)%>%
  rename(Manila_evalue = `e-value.y`)%>%
  distinct(gene, .keep_all = TRUE)%>%
  mutate(Manila_evalue = ifelse(is.na(Manila_evalue), 0, Manila_evalue))%>%
  mutate(Pgen_evalue = 1)

species_matrix<-tibble::column_to_rownames(species_matrix, var="gene")

species_matrix<-as.matrix(species_matrix) #count matrix for UpSet plots

matrix_species = make_comb_mat(species_matrix, mode="distinct")
matrix_species_intersect = make_comb_mat(species_matrix, mode="intersect")
matrix_species_union = make_comb_mat(species_matrix, mode="union")
set_size(matrix_species)
## Mercenaria_evalue     Manila_evalue       Pgen_evalue 
##               980               210              1029
comb_size(matrix_species)
## 111 101 011 001 
## 202 778   8  41
# create sample data frame
species_count<- as.data.frame(species_matrix) %>%
  mutate(Mercenaria_Manila = NA) %>%
  mutate(Manila_Pgen = NA) %>%
  mutate(Pgen_Mercenaria = NA) %>%
  mutate(all=NA)%>%
  mutate(Pgen= NA)

# iterate through rows
########################
#Mercenaria_Manila does not include Pgen genome or else will return all zeros
species_count$Mercenaria_Manila <-ifelse(species_count$Mercenaria_evalue ==1 & species_count$Manila_evalue == 1, 1, 0)

#Manila + Pgenerosa
species_count$Manila_Pgen <-ifelse(species_count$Mercenaria_evalue ==0 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria + Pgenerosa
species_count$Pgen_Mercenaria <-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria +Pgenerosa + Manila
species_count$all<-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Pgen alone
species_count$Pgen <-ifelse(species_count$Mercenaria_evalue == 0 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#return column names when species == 1
# identify row indices where Pgen_Mercenaria column equals 1
row_indices_a<- which(species_count$Mercenaria_Manila == 1)
row_indices_b<- which(species_count$Manila_Pgen == 1)
row_indices_c<- which(species_count$Pgen_Mercenaria == 1)
row_indices_d<- which(species_count$all == 1)
row_indices_e<- which(species_count$Pgen == 1)

# extract corresponding row names
genes_Mer_Man <- as.data.frame(rownames(species_count[row_indices_a, ])) 
names(genes_Mer_Man)[1]<-"gene"

genes_Man_Pgen <- as.data.frame(rownames(species_count[row_indices_b, ])) 
names(genes_Man_Pgen)[1]<-"gene"

genes_Mer_Pgen <- as.data.frame(rownames(species_count[row_indices_c, ])) 
names(genes_Mer_Pgen)[1]<-"gene"

genes_all <- as.data.frame(rownames(species_count[row_indices_d, ])) 
names(genes_all)[1]<-"gene"

genes_Pgen <- as.data.frame(rownames(species_count[row_indices_e, ])) 
names(genes_Pgen)[1]<-"gene"

#goslims
goslims_genes_MM<-left_join(genes_Mer_Man, goslims, by="gene")
goslims_gene_ManPgen<-left_join(genes_Man_Pgen, goslims, by ="gene")
goslims_genes_MerPgen<-left_join(genes_Mer_Pgen, goslims, by="gene")
goslims_genes_all<-left_join(genes_all, goslims, by="gene")
goslims_genes_Pgen<-left_join(genes_Pgen, goslims, by="gene")

#Unique GOterms only
#Mercenaria + Manila
MM_count_unique <- goslims_genes_MM %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Manila")

Man_Pgen_count_unique <- goslims_gene_ManPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Manila-Pgen")

Mer_Pgen_count_unique <- goslims_genes_MerPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Pgen")

all_count_unique <- goslims_genes_all %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "all-3")

Pgen_count_unique <- goslims_genes_Pgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Pgen_alone")

species_unique<-bind_rows(Mer_Pgen_count_unique, Man_Pgen_count_unique, MM_count_unique, Pgen_count_unique, all_count_unique)

kable(species_unique[1:4, 1:4])
Term_unique n count_unique species
NA 261 68 Mercenaria-Pgen
anatomical structure development 203 68 Mercenaria-Pgen
signaling 181 68 Mercenaria-Pgen
cell differentiation 134 68 Mercenaria-Pgen

8 Oyster female gonad comparison

8.1 load data from Dheilly et al. 2012 from Supplemental Table 3 and filter for genebank accession numbers

oyster_gonad<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")
oyster_gonad_raw<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")

oyster_gonad<- oyster_gonad %>%
  group_by(Cluster) %>%
  select(Genebank, Cluster, Tissue.specificity, Description)

kable(oyster_gonad[1:10, 1:4])
Genebank Cluster Tissue.specificity Description
BQ427044 1 (sp:P34714) SPARC OS=Caenorhabditis elegans GN=ost-1 PE=1 SV=1
CU683682 1 0
DV736722 1 0
BQ426244 1 0
AM854135 1 0
CU999082 1 0
FP010274 1 0
CU682512 1 0
ES789495 1 0
CU683161 1 adductor muscle (sp:A2ASS6) Titin OS=Mus musculus GN=Ttn PE=1 SV=1

8.2 load results from BLASTn from Dheilly 2012 paper fasta against Pgen transcriptome

used Panopea-generosa-v2-db as transcriptome

#blast results = cgigas_gonad_pgen
cgigas_gonad_pgen<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/cgigas_gonad_pgen.tab", header=FALSE, sep = '\t')
cgigas_gonad_pgen<- separate(cgigas_gonad_pgen, V2, into = c("Pgen_gene", "Scaffold"), sep = "::")
names(cgigas_gonad_pgen)[1]<-"Genebank"
names(cgigas_gonad_pgen)[2]<-"gene"
names(cgigas_gonad_pgen)[12]<-"e-value"

gonad_genes<-left_join(cgigas_gonad_pgen, oyster_gonad, by="Genebank")
kable(gonad_genes[1:10, 14:16])
Cluster Tissue.specificity Description
NA NA NA
NA NA NA
3 (sp:Q5R416) Catenin alpha-2 OS=Pongo abelii GN=CTNNA2 PE=2 SV=3
NA NA NA
NA NA NA
NA NA NA
NA NA NA
NA NA NA
NA NA NA
NA NA NA

8.3 Filter by genes present in female gonad

return list of gonad genes in C. gigas and P. generosa

gonad_genes<-gonad_genes %>%
  filter(!is.na(Cluster)) 

kable(gonad_genes[1:10, 14:16])
Cluster Tissue.specificity Description
3 (sp:Q5R416) Catenin alpha-2 OS=Pongo abelii GN=CTNNA2 PE=2 SV=3
6 (sp:Q498H0) Sister chromatid cohesion protein PDS5 homolog B-A OS=Xenopus laevis GN=pds5b-A PE=1 SV=1
9 (sp:P83917) Chromobox protein homolog 1 OS=Mus musculus GN=Cbx1 PE=1 SV=1
3 female gonad (sp:Q5ZJI0) Protein-tyrosine sulfotransferase 2 OS=Gallus gallus GN=TPST2 PE=2 SV=1
3 female gonad (sp:Q8CG46) Structural maintenance of chromosomes protein 5 OS=Mus musculus GN=Smc5 PE=2 SV=1
7 (sp:P67963) Casein kinase I isoform alpha OS=Xenopus laevis GN=csnk1a1 PE=2 SV=1
5 (sp:Q9JLI7) Sperm-associated antigen 6 OS=Mus musculus GN=Spag6 PE=1 SV=1
5 (sp:A3KMX7) Putative malate dehydrogenase 1B OS=Bos taurus GN=MDH1B PE=2 SV=1
3 (sp:Q6NSM8) Serine/threonine-protein kinase SIK3 homolog OS=Danio rerio GN=zgc:66101 PE=2 SV=1
2 female gonad (sp:Q8TBZ3) WD repeat-containing protein 20 OS=Homo sapiens GN=WDR20 PE=1 SV=2
gonad_genes<-left_join(gonad_genes, goslims, by="gene")

gonad_genes4 <- gonad_genes %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";")))

library(dplyr)
library(stringr)

gonad_genes4 <- gonad_genes %>%
  mutate(Term_unique = sapply(strsplit(as.character(Term), ";"), function(x) str_c(unique(x), collapse = ";")),
         GOslim_unique = sapply(strsplit(as.character(GOslim), ";"), function(x) str_c(unique(x), collapse = ";")))

write.csv(gonad_genes, file = "/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/tables/reproductive_genes.csv")

write.csv(gonad_genes4, file = "/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/tables/reproductive_genes2.csv")

gonad_list <- gonad_genes %>%
  group_by(Cluster) %>%
  count(Cluster, sort = TRUE)

kable(gonad_list[1:10, 1:2])
Cluster n
2 14
3 13
7 13
5 8
1 7
9 7
4 5
6 4
10 2
8 1

9 TISSUE

9.1 Count Goslims by tissue type

#count top GOterms
tissue_goslims <- pgenerosa_long %>%
    group_by(tissue) %>% 
  filter(binary == 1) %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>%
  separate_rows(Term_unique, sep = ";") %>%
  mutate(Term_unique = trimws(Term_unique)) %>%
  count(Term_unique, tissue, sort = TRUE)

10 NORMALIZE

10.1 normalize goterm count by % in geoduck transcriptome by % tissue goterm counts by tissue transcriptome and return another % which is the normalized goterm count

#add new column to variable term_count_Pgen

#normalize data by % hit in Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

#normalize tissue data
normal_counts<- tissue_goslims %>%
  group_by(tissue) %>%
  summarise(total_n = sum(n))
#heart = 16,021
#ctenidia = 15,911
#gonad = 14,715
#juv = 17,277
#larvae = 16,632

filtered_data <- tissue_goslims %>%
  filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
  mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
                               tissue == "heart" ~ n/16021,
                               tissue == "gonad" ~ n/14715,
                               tissue == "larvae" ~ n/16632,
                               tissue == "juvenile" ~ n/17277))

#combine Pgen_normalized with filtered_data and add new column normalized where tissue_n/geoduck_n to produce new normalized value

grouped_data<-left_join(filtered_data, Pgen_normalized, by="Term_unique")

Tis_normal<-grouped_data %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_tissue = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(Tis_normal[1:5, 1:8])
tissue Term_unique n_tissue n_divided n_Pgen Pgen_normalized normalized log_scale
juvenile anatomical structure development 1817 0.1051687 1853 0.1052183 0.9995285 -0.0004716
larvae anatomical structure development 1743 0.1047980 1853 0.1052183 0.9960050 -0.0040030
heart anatomical structure development 1678 0.1047375 1853 0.1052183 0.9954305 -0.0045800
ctenidia anatomical structure development 1661 0.1043932 1853 0.1052183 0.9921578 -0.0078731
juvenile signaling 1572 0.0909880 1603 0.0910227 0.9996195 -0.0003806

10.2 filter by tissue type

not actually unique goslims, only unique to tissue types

tidy1<-pgenerosa_goslims_long %>%
  group_by(tissue) %>%
  filter(tpm > 0) 

ctenidia<-pgenerosa_goslims %>%
  filter(ctenidia > 0 & gonad == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(ctenidia)
##     n
## 1 340
#340 obs (unique to ctenidia)
write.csv(ctenidia, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/ctenidia_unique.csv")

gonad<-pgenerosa_goslims %>%
  filter(gonad > 0 & ctenidia == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(gonad)
##     n
## 1 119
#119 obs (unique to gonad)
write.csv(gonad, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/gonad_unique.csv")

heart<-pgenerosa_goslims %>%
  filter(heart > 0 & ctenidia == 0 & gonad == 0 & juvenile == 0 & larvae == 0)
count(heart)
##     n
## 1 371
#371 obs (unique to heart)
write.csv(heart, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/heart_unique.csv")

juvenile<-pgenerosa_goslims %>%
  filter(juvenile > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & larvae == 0)
count(juvenile)
##      n
## 1 2180
#2,180 obs (unique to juvenile alone)
write.csv(juvenile, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/juvenile_unique.csv")

larvae<-pgenerosa_goslims %>%
  filter(larvae > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & juvenile == 0)
count(larvae)
##     n
## 1 868
#868 obs (unique to larvae alone)
write.csv(larvae, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/larvae_unique.csv")

juvorlarv<-pgenerosa_goslims %>%
  filter(ctenidia == 0 & gonad == 0 & heart == 0) %>%
  filter(juvenile > 0 & larvae > 0)
#5,022 obs unique to both juveniles or larvae
#1,974 obs unique to juveniles and larvae together

all_unique<-pgenerosa_goslims %>%
   filter(ctenidia > 0 & gonad > 0 & heart > 0 & juvenile > 0 & larvae > 0)

10.3 count GOterms for individual tissues

gonad larvae heart ctenidia juvenile

gonad_count_unique <- gonad %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "gonad")

larvae_count_unique <- larvae %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "larvae")

heart_count_unique <- heart %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "heart")

ctenidia_count_unique <- ctenidia %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "ctenidia")

juvenile_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "juvenile")

tissue_all_unique_genes_GOterms<-rbind(heart_count_unique, gonad_count_unique, ctenidia_count_unique, larvae_count_unique, juvenile_count_unique)

kable(tissue_all_unique_genes_GOterms[1:4, 1:4])
Term_unique n count_unique tissue
NA 342 41 heart
signaling 13 41 heart
anatomical structure development 11 41 heart
cell differentiation 9 41 heart

11 organize data for UpSet plot

library(dplyr)
#####make new variables with binary data
tidy_data$heart=ifelse(tidy_data$heart > 0, 1, 0)
tidy_data$gonad=ifelse(tidy_data$gonad >0, 1, 0)
tidy_data$ctenidia=ifelse(tidy_data$ctenidia > 0, 1, 0)
tidy_data$larvae=ifelse(tidy_data$larvae > 0, 1, 0)
tidy_data$juvenile=ifelse(tidy_data$juvenile > 0, 1, 0)
new_data<-tidy_data %>% select(target_id,heart, gonad, ctenidia, larvae, juvenile)
new_data<-tibble::column_to_rownames(new_data, var="target_id")
new_data<-as.matrix(new_data) #count matrix for UpSet plots

12 Make UpSet plot for visualization of overlapping data:

link to UpSet R page

m1 = make_comb_mat(new_data) #only distinct intersections
m2=make_comb_mat(new_data, mode="intersect")
set_size(m1)
##    heart    gonad ctenidia   larvae juvenile 
##    17602    13682    17479    19449    23417
comb_size(m1)
## 11111 11110 11101 11011 10111 01111 11100 11010 11001 10110 10101 10011 01110 
## 11294    24   692   397  2147   337    54    13   122    51   658  1007     8 
## 01101 01011 00111 11000 10100 10010 10001 01100 01010 01001 00110 00101 00011 
##   100   258   886    43   124    60   545    20    46   155    79   665  1974 
## 10000 01000 00100 00010 00001 00000 
##   371   119   340   868  2180  9310
comb_data<-as.data.frame(comb_size(m1))

p1<-UpSet(m1 [comb_degree(m1)<=2], column_title ="All intersections of 2 or less") #intersections 2 or less
p2<-UpSet(m1 [comb_degree(m1)>=3], column_title= "All intersection of 3 or more") #intersections 3 or more
p3<-UpSet(m1, top_annotation = HeatmapAnnotation(
    degree = as.character(comb_degree(m1)),
    "Intersection\nsize" = anno_barplot(comb_size(m1), 
        border = FALSE, 
        gp = gpar(fill = "black"), 
        height = unit(2, "cm")
    ), 
    annotation_name_side = "left", 
    annotation_name_rot = 0), column_title="p. generosa transcriptome distinct intersections") #distinct mode

p5<-UpSet(m2 [comb_degree(m2)<=2], column_title ="All intersections of 2 or less")
p6<-UpSet(m2 [comb_degree(m2)>=3], column_title ="All intersections of 3 or more")
#####testing comb_set sizes
x2<-comb_size(m1)
x3<-as.data.frame(x2)
sum(x3$x2) #34,947 --distinct mode
## [1] 34947
x4<-comb_size(m2)
x5<-as.data.frame(x4)
sum(x5$x4) #450,489 --intersect mode
## [1] 450489

13 UPset Plots

for looking at unique genes per RNA-seq tissue library

p1

p2

p3

14 Make gene list for unique genes per tissue type

# create sample data frame
count_df<- as.data.frame(new_data) %>%
  mutate(heart_gonad = NA) %>%
  mutate(heart_ctenidia = NA) %>%
  mutate(ctenidia_gonad = NA)


# iterate through rows
########################
#heart + gonad
# create a new column for shared data between heart and gonad columns
count_df$heart_gonad <- ifelse(count_df$heart == 1 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)

#heart + ctenidia
count_df$heart_ctenidia <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#ctenidia + gonad
count_df$ctenidia_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#larvae + juvenile
count_df$larv_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 1, 1, 0)

#ctenidia + juvenile 
count_df$ctenidia_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#heart + juvenile
count_df$heart_juv <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#gonad + juvenile
count_df$gonad_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#ctenidia + larvae
count_df$larv_ctenidia <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#heart + larvae
count_df$larv_heart <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#gonad + larvae
count_df$larv_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#return column names when tissues == 1
# identify row indices where heart_gonad column equals 1
row_indices  <- which(count_df$heart_gonad == 1)
row_indices_2<- which(count_df$heart_ctenidia == 1)
row_indices_3<- which(count_df$ctenidia_gonad == 1)
row_indices_4<- which(count_df$larv_juv == 1)
row_indices_5<- which(count_df$ctenidia_juv == 1)
row_indices_6<- which(count_df$heart_juv == 1)
row_indices_7<- which(count_df$gonad_juv == 1)
row_indices_8<- which(count_df$larv_ctenidia == 1)
row_indices_9<- which(count_df$larv_heart == 1)
row_indices_10<- which(count_df$larv_gonad == 1)


# extract corresponding row names
genes_heart_gonad <- as.data.frame(rownames(count_df[row_indices, ])) 
names(genes_heart_gonad)[1]<-"gene"

genes_heart_ctenidia <- as.data.frame(rownames(count_df[row_indices_2,]))
names(genes_heart_ctenidia)[1]<-"gene"

genes_ctenidia_gonad <- as.data.frame(rownames(count_df[row_indices_3, ])) 
names(genes_ctenidia_gonad)[1]<-"gene"

genes_larv_juv <- as.data.frame(rownames(count_df[row_indices_4, ])) 
names(genes_larv_juv)[1]<-"gene"

genes_ctenidia_juv <-as.data.frame(rownames(count_df[row_indices_5, ])) 
names(genes_ctenidia_juv)[1]<-"gene"

genes_heart_juv <- as.data.frame(rownames(count_df[row_indices_6, ])) 
names(genes_heart_juv)[1]<-"gene"

genes_gonad_juv <- as.data.frame(rownames(count_df[row_indices_7, ])) 
names(genes_gonad_juv)[1]<-"gene"

genes_larv_ctenidia <- as.data.frame(rownames(count_df[row_indices_8, ])) 
names(genes_larv_ctenidia)[1]<-"gene"

genes_larv_heart <- as.data.frame(rownames(count_df[row_indices_9, ])) 
names(genes_larv_heart)[1]<-"gene"

genes_larv_gonad <- as.data.frame(rownames(count_df[row_indices_10, ])) 
names(genes_larv_gonad)[1]<-"gene"

#goslims
goslims_genes_hg<-left_join(genes_heart_gonad, goslims, by="gene")
goslims_gene_hc<-left_join(genes_heart_ctenidia, goslims, by ="gene")
goslims_genes_cg<-left_join(genes_ctenidia_gonad, goslims, by="gene")
goslims_gene_lj<-left_join(genes_larv_juv, goslims, by="gene")
goslims_genes_cj<-left_join(genes_ctenidia_juv, goslims, by="gene")
goslims_genes_hj<-left_join(genes_heart_juv, goslims, by="gene")
goslims_genes_gj<-left_join(genes_gonad_juv, goslims, by="gene")
goslims_genes_cl<-left_join(genes_larv_ctenidia, goslims, by="gene")
goslims_genes_hl<-left_join(genes_larv_heart, goslims, by="gene")
goslims_genes_gl<-left_join(genes_larv_gonad, goslims, by="gene")


#Unique GOterms only
#heart-gonad
hg_count_unique <- goslims_genes_hg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-gonad")

#heart-ctenidia
hc_count_unique <- goslims_gene_hc %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>%# Count the occurrences and sort by descending count
  mutate(count_unique = n()-1, tissues= "heart-ctenidia")

#ctenidia-gonad
cg_count_unique <- goslims_genes_cg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-gonad")

#larvae-juvenile
lj_count_unique <- goslims_gene_lj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "larvae-juvenile") #count genes with GOslims and remove 1 (for NAs)

#ctenidia-juvenile
cj_count_unique <- goslims_genes_cj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart-juvenile
hj_count_unique <- goslims_genes_hj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-juvenile") #count genes with GOslims and remove 1 (for NAs)

#gonad-juvenile
gj_count_unique <- goslims_genes_gj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart + larvae
hl_count_unique <- goslims_genes_hl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-larvae") #count genes with GOslims and remove 1 (for NAs)

#ctenidia + larvae
cl_count_unique <- goslims_genes_cl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-larvae") #count genes with GOslims and remove 1 (for NAs)

#gonad + larvae
gl_count_unique <- goslims_genes_gl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-larvae") #count genes with GOslims and remove 1 (for NAs)

#term count Juv only
juv_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "juvenile") #count genes with GOslims and remove 1 (for NAs)

write.csv(juv_count_unique, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/data/juv_count_unique.csv")

#combine into one DF
unique_overlapping_genes<-bind_rows(hg_count_unique, hc_count_unique, cg_count_unique, lj_count_unique, cj_count_unique, hj_count_unique, gj_count_unique, cl_count_unique, hl_count_unique, gl_count_unique)

15 Make stacked bar-plot for visualizing Unique Terms

#filter Terms down a bit 
species_normal_filtered<- species_normal%>%
  group_by(Term_unique) %>%
  filter(n_species >= 10) %>%
  filter(!is.na("Terms_unique"))

terms_filtered <- term_count_best %>%
  filter(transcriptome == "Pgenerosa" | transcriptome == "Manila_r" | transcriptome == "Mercenaria_r") %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

terms_filtered2 <- term_count_best %>%
  filter(transcriptome == "Manila_r" | transcriptome == "Mercenaria_r") %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

#load top 20 hits from Github
top20terms<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/output/tables/top20terms.csv")

compairative_transcriptomes<-top20terms %>%
  filter(transcriptome != "Pgenerosa") %>%
  filter(n >= 15)

cols = rainbow(25, s=.6, v=.9)[sample(1:25,25)]
cols2 = rainbow(69, s=.6, v=.9)[sample(1:69,69)]

stacked_barplot<-ggplot(top20terms, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()+scale_fill_manual(values=cols)+theme(axis.text.x = element_text(angle = 45, hjust = 1, size=10, face="bold"))+theme(axis.text.y=element_text(size=10))+labs(fill="Biological GO Terms")

stacked_barplot2<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity", position="fill")+theme_bw()+scale_fill_manual(values=cols)+theme(axis.text.x = element_text(angle = 45, hjust = 1, size=10, face="bold"))+theme(axis.text.y=element_text(size=10))+labs(fill="Biological GO Terms")

stacked_barplot3<-ggplot(terms_filtered, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(position="fill", stat = "identity")+theme_bw()+scale_fill_manual(values=cols2)+labs(x="Transcriptome")+labs(fill="Biological GO Terms")

stacked_barplot4<-ggplot(terms_filtered, aes(x=Term_unique, y=n, fill=transcriptome))+geom_bar(position="stack", stat="identity")+theme_bw()+theme(axis.text.x = element_text(hjust = 1, size=10))+coord_flip()+labs(x="Biological GO Terms")+labs(fill="Transcriptome")

stacked_barplot5<-ggplot(terms_filtered2, aes(x=Term_unique, y=n, fill=transcriptome))+geom_bar(position="stack", stat="identity")+theme_bw()+theme(axis.text.x = element_text (hjust = 1, size=10))+coord_flip()+labs(x="Biological GO Terms")+labs(fill="Transcriptome")
stacked_barplot 

stacked_barplot2

stacked_barplot4

stacked_barplot5

16 Heatmaps

heatmap7

heatmap8

heatmap9

heatmap10

17 pie charts

Functional classification of P generosa contigs by RNA-Seq library

# Filter out NA values in the column n.Pgenerosa
na_zero <- replace(top20_long, is.na(top20_long), 0)

# Plot the pie chart using the filtered data
pie_geoduck<-ggplot(na_zero, aes(x="", y=n.Pgenerosa, fill=Term_unique))+geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +theme_void() #do not like the rainbow

#Plot the pie using base R: not a very good plot
pie_geoduck_R<-pie(na_zero$n.Pgenerosa, labels = na_zero$Term_unique, main="Geoduck Gene Library w/ Biological GO Terms")

18 Next Steps

gather e-value data from Genomes and make scatter plots with individual RNA-seq libraries

scatter<-ggplot(distinct_goslims, aes(x=gonad, y=juvenile))+geom_point()+theme_bw()

scatter

scatter2<-ggplot(distinct_goslims, aes(x=heart, y=juvenile))+geom_point()+theme_bw()

scatter2

#compairative_clam_transcripts_right

scatter4<-ggplot(compairative_clam_transcripts_right, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast db")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#removed 3394 rows containing missing values

scatter5<-ggplot(compairative_clam_transcripts_left, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast query")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#Removed 4686 rows containing missing values (`geom_point()`).