Zenodo release:

1 Comparative Genomic and Proteomic Analysis of Geoduck clam (Panopea generosa)

Geoduck Genome already assembled in Putnum et al. 2022
Geoduck Transcritome – genome linked transcriptome link here
Comparative Transcriptome Analysis
1. gonad: reproductive genes
2. heart: circulatory genes
3. ctenidia: respiratory genes
Reproductive Transcriptome Analysis
1. larvae
2. juvenile
3. Dheilly et al. 2012 Pacific oyster reproductive genes (female gonad)
Comparative Species Transcriptome Analysis
1. Mercenaria mercenaria
2. Manila Clam (Ruditapes philippinarum)
Comparative Species Genome Analysis
1. Mercenaria mercenaria
2. Manila Clam (Ruditapes philippinarum)
3. Archivesica marissinica
4. Spinsula solida
5. Mactra quadrangularis

2 LOAD DATA

2.1 load new genome linked transcriptome PGEN

salmonMatrix<-read.csv(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"
data.table(salmonMatrix)

##              target_id            gene ctenidia    gonad    heart juvenile
##     1: PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851
##     2: PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164
##     3: PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598
##     4: PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364
##     5: PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728
##    ---                                                                    
## 34943: PGEN_.00g349430 PGEN_.00g349430 0.000000 0.000000 0.000000 0.000000
## 34944: PGEN_.00g349440 PGEN_.00g349440 1.955226 1.489901 1.621033 0.000000
## 34945: PGEN_.00g349450 PGEN_.00g349450 0.000000 0.000000 0.159121 0.000000
## 34946: PGEN_.00g349460 PGEN_.00g349460 0.000000 0.000000 0.000000 0.000000
## 34947: PGEN_.00g349470 PGEN_.00g349470 0.000000 0.000000 0.000000 0.000000
##          larvae
##     1: 3.423667
##     2: 0.370266
##     3: 0.052545
##     4: 1.131114
##     5: 1.117655
##    ---         
## 34943: 0.000000
## 34944: 2.222424
## 34945: 0.000000
## 34946: 0.000000
## 34947: 0.000000

#34,947

2.2 load goslim mapping from Swiss Prott

link here

#double check to make sure this is reading the data correctly
goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')
#14,672
presence_count <- sum(!is.na(goslims[, 11]) & goslims[, 11] != "") #10,903

2.3 download other transcriptome from clam species

#from Mox blasting Manilla/Mercenaria on Pgen
Manilla_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Manilla_Pgenenerosa_blastx.tab")
Mercenaria_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Mercenaria_Pgenenerosa_blastx.tab")

2.4 load genomes from blast output

Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)

Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)

Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)

Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)

Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)

kable(Solida_genome[1:10,1:5])

V1	V2	V3	V4	V5
PGEN_.00g000060::Scaffold_01:70713-81099	OX365947.1	74.279	416	84
PGEN_.00g000270::Scaffold_01:344820-379444	OX365949.1	75.163	306	58
PGEN_.00g000300::Scaffold_01:503707-557915	OX365947.1	83.648	159	16
PGEN_.00g000320::Scaffold_01:571666-599482	OX365950.1	79.545	220	35
PGEN_.00g000670::Scaffold_01:1389639-1400328	OX365948.1	87.857	140	17
PGEN_.00g001140::Scaffold_01:2257307-2316262	OX365948.1	78.487	423	82
PGEN_.00g001170::Scaffold_01:2366888-2417042	OX365948.1	82.443	131	23
PGEN_.00g001350::Scaffold_01:2794647-2838542	OX365948.1	76.093	343	78
PGEN_.00g001380::Scaffold_01:2878758-2888130	OX365948.1	78.431	204	43
PGEN_.00g001410::Scaffold_01:2961919-2993436	OX365952.1	79.126	206	35

2.4.1 get summary data for genomes

#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"

Mercenaria_summary <- Mercenaria_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"

Quadrangularis_summary <- Quadrangularis_new %>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Manila 
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"

Manila_summary <- Manila_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"

Marissincia_summary <- Marissincia_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"

Solida_summary <- Solida_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE)

2.5 make tables for clam genomes

clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")
kable(clam_genomes[1:5,1:5])

clam	gene	Scaffold	V2	V3
Solida_new	PGEN_.00g000060	Scaffold_01:70713-81099	OX365947.1	74.279
Solida_new	PGEN_.00g000270	Scaffold_01:344820-379444	OX365949.1	75.163
Solida_new	PGEN_.00g000300	Scaffold_01:503707-557915	OX365947.1	83.648
Solida_new	PGEN_.00g000320	Scaffold_01:571666-599482	OX365950.1	79.545
Solida_new	PGEN_.00g000670	Scaffold_01:1389639-1400328	OX365948.1	87.857

3 JOIN DATA

3.1 combine all clams for scatterplot by e-value

#all clam transcriptomes
#Reminder: Merc->Pgen = 6521, Manila->Pgen = 761, Pgen -> Merc = 5,099, Pgen -> Manila = 657
#always join large -> small
compairative_clam_transcripts_right <- left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene") 

compairative_clam_transcripts_left <- left_join(Pgen_Mercenaria_clean, Pgen_Manila_clean, by = "gene")

4 join clam transcriptome with goslims

Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
  left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099

5 long format

tidy_data<-salmonMatrix
head(tidy_data)

##         target_id            gene ctenidia    gonad    heart juvenile   larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612

tidy_data_long<-tidy_data %>%
  gather(key="tissue", value="tpm", ctenidia:larvae) #name key as time, name value as tpm, gather columns heart:larvae, converts wide to long format
names(tidy_data_long)[2]<-"gene"
head(tidy_data_long)

##         target_id            gene   tissue      tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179

tidy_data_long$binary=ifelse(tidy_data_long$tpm > 0, 1, 0)

#add goslims below

6 combine datasets into one gene annotation file

pgenerosa_simple_goslims<-salmonMatrix %>% #best pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims<-salmonMatrix %>%
  left_join(goslims, by=c("gene"))
pgenerosa_long<-tidy_data_long %>% #best long pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims_long<-tidy_data_long %>%
  left_join(pgenerosa_simple_goslims, by=c("gene"))
kable(pgenerosa_simple_goslims[1:10, 1:10])

target_id	gene	ctenidia	gonad	heart	juvenile	larvae	accessions	gene_id	gene_name
PGEN_.00g000010	PGEN_.00g000010	0.000000	0.000000	0.000000	4.851851	3.423667	Q86IC9;Q552T5	8620183	omt5
PGEN_.00g000020	PGEN_.00g000020	0.000000	0.000000	0.000000	0.840164	0.370266	P04177	25085	Th
PGEN_.00g000030	PGEN_.00g000030	0.225494	0.000000	0.000000	0.069598	0.052545	NA	NA	NA
PGEN_.00g000040	PGEN_.00g000040	0.565506	0.000000	0.254843	0.376364	1.131114	NA	NA	NA
PGEN_.00g000050	PGEN_.00g000050	0.508251	4.745168	0.228272	0.182728	1.117655	Q8L840;O04092;Q9FT71	837636	RECQL4A
PGEN_.00g000060	PGEN_.00g000060	6.490179	0.000000	0.000000	6.742646	5.936612	Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7	18080	Nin
PGEN_.00g000070	PGEN_.00g000070	0.000000	0.000000	0.000000	0.025013	1.103535	NA	NA	NA
PGEN_.00g000080	PGEN_.00g000080	0.000000	0.000000	0.119488	0.229570	0.000000	A1E2V0	489433	BIRC3
PGEN_.00g000090	PGEN_.00g000090	0.187374	0.000000	0.000000	0.173312	0.000000	P34456	186266
PGEN_.00g000100	PGEN_.00g000100	0.000000	0.000000	0.000000	0.304322	0.000000	NA	NA	NA

6.1 break semi-colon in Term column and count unique values only

6.1.1 distinct goslims

# check for duplicate values within each cell using strsplit() and anyDuplicated()
duplicates <- sapply(strsplit(pgenerosa_simple_goslims$gene, ";"), anyDuplicated)

# print the row indices of cells with duplicate values
if (any(duplicates)) {
  print(which(duplicates > 0))
} else {
  print("No duplicates found")
}

## [1] "No duplicates found"

#accession numbers = no duplicates
#gene_ID = no duplicates
#allGoID duplicates = column 24458
#BP_GO_ID duplicates = column 24458
#CC_GO_ID duplicates = no duplicates
#MF_GO_ID duplicates = columns 4561  8040 13044 16138
#--------------------------------
#GOslim duplicates lots and lots 
#Terms duplicates lots and lots 

#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>%
  mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>% 
  distinct(GOslim_unique, .keep_all = TRUE)

kable(distinct_goslims[1:10,1:10])

target_id	gene	ctenidia	gonad	heart	juvenile	larvae	accessions	gene_id	gene_name
PGEN_.00g000010	PGEN_.00g000010	0.000000	0.000000	0.000000	4.851851	3.423667	Q86IC9;Q552T5	8620183	omt5
PGEN_.00g000020	PGEN_.00g000020	0.000000	0.000000	0.000000	0.840164	0.370266	P04177	25085	Th
PGEN_.00g000050	PGEN_.00g000050	0.508251	4.745168	0.228272	0.182728	1.117655	Q8L840;O04092;Q9FT71	837636	RECQL4A
PGEN_.00g000060	PGEN_.00g000060	6.490179	0.000000	0.000000	6.742646	5.936612	Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7	18080	Nin
PGEN_.00g000080	PGEN_.00g000080	0.000000	0.000000	0.119488	0.229570	0.000000	A1E2V0	489433	BIRC3
PGEN_.00g000090	PGEN_.00g000090	0.187374	0.000000	0.000000	0.173312	0.000000	P34456	186266
PGEN_.00g000210	PGEN_.00g000210	2.810603	0.000000	2.775616	3.495122	0.898959	O00463;B4DIS9;B4E0A2;Q6FHY1	7188	TRAF5
PGEN_.00g000300	PGEN_.00g000300	4.091597	2.739254	3.785783	5.053620	1.087948	Q5REG4	100171717	DTX3
PGEN_.00g000380	PGEN_.00g000380	0.475060	0.000000	0.000000	0.315384	0.000000	Q8QG60;Q8QG52;Q8QGQ5	374092	CRY2
PGEN_.00g000440	PGEN_.00g000440	11.483098	5.728598	3.123012	6.873106	10.243535	Q9H583;Q5T3Q8;Q6P197;Q9NW23	55127	HEATR1

7 SPECIES

7.1 Separate the Term columb by semi-colons to count occurances for table

# For Pgenerosa alone (wide format) 
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen <- distinct_goslims %>% #make sure you have removed duplicate values beforehand
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen)

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows

# For Pgenerosa using pgenerosa_long (versus wide)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen_long <- pgenerosa_long %>%  # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen_long)

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows

#exactly the same as Pgen_wide


#for Manila Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_manila <- Manila_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila)

## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            129
##  2 signaling                                   113
##  3 cell differentiation                         86
##  4 cytoskeleton organization                    65
##  5 DNA-templated transcription                  51
##  6 reproductive process                         48
##  7 regulation of DNA-templated transcription    47
##  8 cell motility                                40
##  9 programmed cell death                        40
## 10 protein-containing complex assembly          39
## # ℹ 55 more rows

# Manila Reverse
term_count_manila_r <- Pgen_Manila_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila_r)

## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            149
##  2 signaling                                   126
##  3 cell differentiation                         96
##  4 cytoskeleton organization                    68
##  5 DNA-templated transcription                  60
##  6 regulation of DNA-templated transcription    55
##  7 reproductive process                         55
##  8 programmed cell death                        44
##  9 immune system process                        42
## 10 protein-containing complex assembly          42
## # ℹ 55 more rows

#for Mercenaria Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria <- Mercenaria_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria)

## # A tibble: 69 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            247
##  2 signaling                                   189
##  3 cell differentiation                        170
##  4 DNA-templated transcription                  88
##  5 cytoskeleton organization                    87
##  6 cell motility                                83
##  7 regulation of DNA-templated transcription    82
##  8 reproductive process                         71
##  9 immune system process                        66
## 10 programmed cell death                        63
## # ℹ 59 more rows

#for Mercenaria Clam reverse
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria_r <- Pgen_Mercenaria_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria_r) #over 3x more terms than Mercenaria not reversed...wonder why that is....

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            768
##  2 signaling                                   635
##  3 cell differentiation                        517
##  4 DNA-templated transcription                 267
##  5 regulation of DNA-templated transcription   245
##  6 cell motility                               241
##  7 cytoskeleton organization                   237
##  8 immune system process                       232
##  9 programmed cell death                       229
## 10 vesicle-mediated transport                  218
## # ℹ 61 more rows

#combine all 5 datasets
term_count_all <- bind_rows(term_count_Pgen, term_count_manila, term_count_manila_r, term_count_Mercenaria, term_count_Mercenaria_r, .id = "transcriptome") 

term_count_best <- term_count_all %>% 
  mutate(transcriptome = ifelse(transcriptome == 1, "Pgenerosa", 
                    ifelse(transcriptome == 2, "Manila",
                           ifelse(transcriptome == 3, "Manila_r", 
                                  ifelse(transcriptome == 4, "Mercenaria", 
                                         ifelse(transcriptome == 5, "Mercenaria_r", NA))))))

7.2 NORMALIZE

#normalize by Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

normal_counts<- term_count_best %>%
  group_by(transcriptome) %>%
  summarise(total_n = sum(n))
#Pgen = 17,611
#Manila = 1235
#Manila_r = 1390
#Mercenaria = 2198
#Mercenaria_r = 7027


filtered_data1 <- term_count_best %>%
  filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
  mutate(n_divided = case_when(
                               transcriptome == "Manila" ~ n/1235,
                               transcriptome == "Manila_r" ~ n/1390,
                               transcriptome == "Mercenaria" ~ n/2198,
                               transcriptome == "Mercenaria_r" ~ n/7027))

grouped_data1<-left_join(filtered_data1, Pgen_normalized, by="Term_unique")

species_normal<-grouped_data1 %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_species = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(species_normal[1:5,1:5])

transcriptome	Term_unique	n_species	n_divided	n_Pgen
Manila	anatomical structure development	129	0.1044534	1853
Manila	signaling	113	0.0914980	1603
Manila	cell differentiation	86	0.0696356	1281
Manila	cytoskeleton organization	65	0.0526316	516
Manila	DNA-templated transcription	51	0.0412955	741

7.3 count GOterms unique to species

#####make new variables with binary data

#wide format data

Manila_2<-Manila_Pgen_goslims %>%
  select(-c(3:11, 13:23))

Mercenaria_2<-Mercenaria_Pgen_goslims %>%
   select(-c(3:11, 13:23))

species_joined<-left_join(Mercenaria_2, Manila_2, by="gene")

## Warning in left_join(Mercenaria_2, Manila_2, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

#Mercenaria
species_joined$`e-value.x`=ifelse(species_joined$`e-value.x` > 0, 1, 0)
#Manila
species_joined$`e-value.y`=ifelse(species_joined$`e-value.y` > 0, 1, 0)

species_matrix<-species_joined %>% select(gene, `e-value.x`, `e-value.y`) %>%
  rename(Mercenaria_evalue = `e-value.x`)%>%
  rename(Manila_evalue = `e-value.y`)%>%
  distinct(gene, .keep_all = TRUE)%>%
  mutate(Manila_evalue = ifelse(is.na(Manila_evalue), 0, Manila_evalue))%>%
  mutate(Pgen_evalue = 1)

species_matrix<-tibble::column_to_rownames(species_matrix, var="gene")

species_matrix<-as.matrix(species_matrix) #count matrix for UpSet plots

matrix_species = make_comb_mat(species_matrix, mode="distinct")
matrix_species_intersect = make_comb_mat(species_matrix, mode="intersect")
matrix_species_union = make_comb_mat(species_matrix, mode="union")
set_size(matrix_species)

## Mercenaria_evalue     Manila_evalue       Pgen_evalue 
##               980               210              1029

comb_size(matrix_species)

## 111 101 011 001 
## 202 778   8  41

# create sample data frame
species_count<- as.data.frame(species_matrix) %>%
  mutate(Mercenaria_Manila = NA) %>%
  mutate(Manila_Pgen = NA) %>%
  mutate(Pgen_Mercenaria = NA) %>%
  mutate(all=NA)%>%
  mutate(Pgen= NA)

# iterate through rows
########################
#Mercenaria_Manila does not include Pgen genome or else will return all zeros
species_count$Mercenaria_Manila <-ifelse(species_count$Mercenaria_evalue ==1 & species_count$Manila_evalue == 1, 1, 0)

#Manila + Pgenerosa
species_count$Manila_Pgen <-ifelse(species_count$Mercenaria_evalue ==0 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria + Pgenerosa
species_count$Pgen_Mercenaria <-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria +Pgenerosa + Manila
species_count$all<-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Pgen alone
species_count$Pgen <-ifelse(species_count$Mercenaria_evalue == 0 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#return column names when species == 1
# identify row indices where Pgen_Mercenaria column equals 1
row_indices_a<- which(species_count$Mercenaria_Manila == 1)
row_indices_b<- which(species_count$Manila_Pgen == 1)
row_indices_c<- which(species_count$Pgen_Mercenaria == 1)
row_indices_d<- which(species_count$all == 1)
row_indices_e<- which(species_count$Pgen == 1)

# extract corresponding row names
genes_Mer_Man <- as.data.frame(rownames(species_count[row_indices_a, ])) 
names(genes_Mer_Man)[1]<-"gene"

genes_Man_Pgen <- as.data.frame(rownames(species_count[row_indices_b, ])) 
names(genes_Man_Pgen)[1]<-"gene"

genes_Mer_Pgen <- as.data.frame(rownames(species_count[row_indices_c, ])) 
names(genes_Mer_Pgen)[1]<-"gene"

genes_all <- as.data.frame(rownames(species_count[row_indices_d, ])) 
names(genes_all)[1]<-"gene"

genes_Pgen <- as.data.frame(rownames(species_count[row_indices_e, ])) 
names(genes_Pgen)[1]<-"gene"

#goslims
goslims_genes_MM<-left_join(genes_Mer_Man, goslims, by="gene")
goslims_gene_ManPgen<-left_join(genes_Man_Pgen, goslims, by ="gene")
goslims_genes_MerPgen<-left_join(genes_Mer_Pgen, goslims, by="gene")
goslims_genes_all<-left_join(genes_all, goslims, by="gene")
goslims_genes_Pgen<-left_join(genes_Pgen, goslims, by="gene")

#Unique GOterms only
#Mercenaria + Manila
MM_count_unique <- goslims_genes_MM %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Manila")

Man_Pgen_count_unique <- goslims_gene_ManPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Manila-Pgen")

Mer_Pgen_count_unique <- goslims_genes_MerPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Pgen")

all_count_unique <- goslims_genes_all %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "all-3")

Pgen_count_unique <- goslims_genes_Pgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Pgen_alone")

species_unique<-bind_rows(Mer_Pgen_count_unique, Man_Pgen_count_unique, MM_count_unique, Pgen_count_unique, all_count_unique)

kable(species_unique[1:4, 1:4])

Term_unique	n	count_unique	species
NA	261	68	Mercenaria-Pgen
anatomical structure development	203	68	Mercenaria-Pgen
signaling	181	68	Mercenaria-Pgen
cell differentiation	134	68	Mercenaria-Pgen

8 Oyster female gonad comparison

8.1 load data from Dheilly et al. 2012 from Supplemental Table 3 and filter for genebank accession numbers

oyster_gonad<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")
oyster_gonad_raw<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")

oyster_gonad<- oyster_gonad %>%
  group_by(Cluster) %>%
  select(Genebank, Cluster, Tissue.specificity, Description)

kable(oyster_gonad[1:10, 1:4])

Genebank	Cluster	Tissue.specificity	Description
BQ427044	1		(sp:P34714) SPARC OS=Caenorhabditis elegans GN=ost-1 PE=1 SV=1
CU683682	1		0
DV736722	1		0
BQ426244	1		0
AM854135	1		0
CU999082	1		0
FP010274	1		0
CU682512	1		0
ES789495	1		0
CU683161	1	adductor muscle	(sp:A2ASS6) Titin OS=Mus musculus GN=Ttn PE=1 SV=1

8.2 load results from BLASTn from Dheilly 2012 paper fasta against Pgen transcriptome

used Panopea-generosa-v2-db as transcriptome

#blast results = cgigas_gonad_pgen
cgigas_gonad_pgen<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/cgigas_gonad_pgen.tab", header=FALSE, sep = '\t')
cgigas_gonad_pgen<- separate(cgigas_gonad_pgen, V2, into = c("Pgen_gene", "Scaffold"), sep = "::")
names(cgigas_gonad_pgen)[1]<-"Genebank"
names(cgigas_gonad_pgen)[2]<-"gene"
names(cgigas_gonad_pgen)[12]<-"e-value"

gonad_genes<-left_join(cgigas_gonad_pgen, oyster_gonad, by="Genebank")
kable(gonad_genes[1:10, 14:16])

Cluster	Tissue.specificity	Description
NA	NA	NA
NA	NA	NA
3		(sp:Q5R416) Catenin alpha-2 OS=Pongo abelii GN=CTNNA2 PE=2 SV=3
NA	NA	NA
NA	NA	NA
NA	NA	NA
NA	NA	NA
NA	NA	NA
NA	NA	NA
NA	NA	NA

8.3 Filter by genes present in female gonad

return list of gonad genes in C. gigas and P. generosa

gonad_genes<-gonad_genes %>%
  filter(!is.na(Cluster)) 

kable(gonad_genes[1:10, 14:16])

Cluster	Tissue.specificity	Description
3		(sp:Q5R416) Catenin alpha-2 OS=Pongo abelii GN=CTNNA2 PE=2 SV=3
6		(sp:Q498H0) Sister chromatid cohesion protein PDS5 homolog B-A OS=Xenopus laevis GN=pds5b-A PE=1 SV=1
9		(sp:P83917) Chromobox protein homolog 1 OS=Mus musculus GN=Cbx1 PE=1 SV=1
3	female gonad	(sp:Q5ZJI0) Protein-tyrosine sulfotransferase 2 OS=Gallus gallus GN=TPST2 PE=2 SV=1
3	female gonad	(sp:Q8CG46) Structural maintenance of chromosomes protein 5 OS=Mus musculus GN=Smc5 PE=2 SV=1
7		(sp:P67963) Casein kinase I isoform alpha OS=Xenopus laevis GN=csnk1a1 PE=2 SV=1
5		(sp:Q9JLI7) Sperm-associated antigen 6 OS=Mus musculus GN=Spag6 PE=1 SV=1
5		(sp:A3KMX7) Putative malate dehydrogenase 1B OS=Bos taurus GN=MDH1B PE=2 SV=1
3		(sp:Q6NSM8) Serine/threonine-protein kinase SIK3 homolog OS=Danio rerio GN=zgc:66101 PE=2 SV=1
2	female gonad	(sp:Q8TBZ3) WD repeat-containing protein 20 OS=Homo sapiens GN=WDR20 PE=1 SV=2

gonad_genes<-left_join(gonad_genes, goslims, by="gene")

gonad_genes4 <- gonad_genes %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";")))

library(dplyr)
library(stringr)

gonad_genes4 <- gonad_genes %>%
  mutate(Term_unique = sapply(strsplit(as.character(Term), ";"), function(x) str_c(unique(x), collapse = ";")),
         GOslim_unique = sapply(strsplit(as.character(GOslim), ";"), function(x) str_c(unique(x), collapse = ";")))

write.csv(gonad_genes, file = "/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/tables/reproductive_genes.csv")

write.csv(gonad_genes4, file = "/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/tables/reproductive_genes2.csv")

gonad_list <- gonad_genes %>%
  group_by(Cluster) %>%
  count(Cluster, sort = TRUE)

kable(gonad_list[1:10, 1:2])

Cluster	n
2	14
3	13
7	13
5	8
1	7
9	7
4	5
6	4
10	2
8	1

9 TISSUE

9.1 Count Goslims by tissue type

#count top GOterms
tissue_goslims <- pgenerosa_long %>%
    group_by(tissue) %>% 
  filter(binary == 1) %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>%
  separate_rows(Term_unique, sep = ";") %>%
  mutate(Term_unique = trimws(Term_unique)) %>%
  count(Term_unique, tissue, sort = TRUE)

10 NORMALIZE

10.1 normalize goterm count by % in geoduck transcriptome by % tissue goterm counts by tissue transcriptome and return another % which is the normalized goterm count

#add new column to variable term_count_Pgen

#normalize data by % hit in Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

#normalize tissue data
normal_counts<- tissue_goslims %>%
  group_by(tissue) %>%
  summarise(total_n = sum(n))
#heart = 16,021
#ctenidia = 15,911
#gonad = 14,715
#juv = 17,277
#larvae = 16,632

filtered_data <- tissue_goslims %>%
  filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
  mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
                               tissue == "heart" ~ n/16021,
                               tissue == "gonad" ~ n/14715,
                               tissue == "larvae" ~ n/16632,
                               tissue == "juvenile" ~ n/17277))

#combine Pgen_normalized with filtered_data and add new column normalized where tissue_n/geoduck_n to produce new normalized value

grouped_data<-left_join(filtered_data, Pgen_normalized, by="Term_unique")

Tis_normal<-grouped_data %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_tissue = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(Tis_normal[1:5, 1:8])

tissue	Term_unique	n_tissue	n_divided	n_Pgen	Pgen_normalized	normalized	log_scale
juvenile	anatomical structure development	1817	0.1051687	1853	0.1052183	0.9995285	-0.0004716
larvae	anatomical structure development	1743	0.1047980	1853	0.1052183	0.9960050	-0.0040030
heart	anatomical structure development	1678	0.1047375	1853	0.1052183	0.9954305	-0.0045800
ctenidia	anatomical structure development	1661	0.1043932	1853	0.1052183	0.9921578	-0.0078731
juvenile	signaling	1572	0.0909880	1603	0.0910227	0.9996195	-0.0003806

10.2 filter by tissue type

not actually unique goslims, only unique to tissue types

tidy1<-pgenerosa_goslims_long %>%
  group_by(tissue) %>%
  filter(tpm > 0) 

ctenidia<-pgenerosa_goslims %>%
  filter(ctenidia > 0 & gonad == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(ctenidia)

##     n
## 1 340

#340 obs (unique to ctenidia)
write.csv(ctenidia, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/ctenidia_unique.csv")

gonad<-pgenerosa_goslims %>%
  filter(gonad > 0 & ctenidia == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(gonad)

##     n
## 1 119

#119 obs (unique to gonad)
write.csv(gonad, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/gonad_unique.csv")

heart<-pgenerosa_goslims %>%
  filter(heart > 0 & ctenidia == 0 & gonad == 0 & juvenile == 0 & larvae == 0)
count(heart)

##     n
## 1 371

#371 obs (unique to heart)
write.csv(heart, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/heart_unique.csv")

juvenile<-pgenerosa_goslims %>%
  filter(juvenile > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & larvae == 0)
count(juvenile)

##      n
## 1 2180

#2,180 obs (unique to juvenile alone)
write.csv(juvenile, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/juvenile_unique.csv")

larvae<-pgenerosa_goslims %>%
  filter(larvae > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & juvenile == 0)
count(larvae)

##     n
## 1 868

#868 obs (unique to larvae alone)
write.csv(larvae, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/larvae_unique.csv")

juvorlarv<-pgenerosa_goslims %>%
  filter(ctenidia == 0 & gonad == 0 & heart == 0) %>%
  filter(juvenile > 0 & larvae > 0)
#5,022 obs unique to both juveniles or larvae
#1,974 obs unique to juveniles and larvae together

all_unique<-pgenerosa_goslims %>%
   filter(ctenidia > 0 & gonad > 0 & heart > 0 & juvenile > 0 & larvae > 0)

10.3 count GOterms for individual tissues

gonad larvae heart ctenidia juvenile

gonad_count_unique <- gonad %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "gonad")

larvae_count_unique <- larvae %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "larvae")

heart_count_unique <- heart %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "heart")

ctenidia_count_unique <- ctenidia %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "ctenidia")

juvenile_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "juvenile")

tissue_all_unique_genes_GOterms<-rbind(heart_count_unique, gonad_count_unique, ctenidia_count_unique, larvae_count_unique, juvenile_count_unique)

kable(tissue_all_unique_genes_GOterms[1:4, 1:4])

Term_unique	n	count_unique	tissue
NA	342	41	heart
signaling	13	41	heart
anatomical structure development	11	41	heart
cell differentiation	9	41	heart

11 organize data for UpSet plot

library(dplyr)
#####make new variables with binary data
tidy_data$heart=ifelse(tidy_data$heart > 0, 1, 0)
tidy_data$gonad=ifelse(tidy_data$gonad >0, 1, 0)
tidy_data$ctenidia=ifelse(tidy_data$ctenidia > 0, 1, 0)
tidy_data$larvae=ifelse(tidy_data$larvae > 0, 1, 0)
tidy_data$juvenile=ifelse(tidy_data$juvenile > 0, 1, 0)
new_data<-tidy_data %>% select(target_id,heart, gonad, ctenidia, larvae, juvenile)
new_data<-tibble::column_to_rownames(new_data, var="target_id")
new_data<-as.matrix(new_data) #count matrix for UpSet plots

12 Make UpSet plot for visualization of overlapping data:

link to UpSet R page

m1 = make_comb_mat(new_data) #only distinct intersections
m2=make_comb_mat(new_data, mode="intersect")
set_size(m1)

##    heart    gonad ctenidia   larvae juvenile 
##    17602    13682    17479    19449    23417

comb_size(m1)

## 11111 11110 11101 11011 10111 01111 11100 11010 11001 10110 10101 10011 01110 
## 11294    24   692   397  2147   337    54    13   122    51   658  1007     8 
## 01101 01011 00111 11000 10100 10010 10001 01100 01010 01001 00110 00101 00011 
##   100   258   886    43   124    60   545    20    46   155    79   665  1974 
## 10000 01000 00100 00010 00001 00000 
##   371   119   340   868  2180  9310

comb_data<-as.data.frame(comb_size(m1))

p1<-UpSet(m1 [comb_degree(m1)<=2], column_title ="All intersections of 2 or less") #intersections 2 or less
p2<-UpSet(m1 [comb_degree(m1)>=3], column_title= "All intersection of 3 or more") #intersections 3 or more
p3<-UpSet(m1, top_annotation = HeatmapAnnotation(
    degree = as.character(comb_degree(m1)),
    "Intersection\nsize" = anno_barplot(comb_size(m1), 
        border = FALSE, 
        gp = gpar(fill = "black"), 
        height = unit(2, "cm")
    ), 
    annotation_name_side = "left", 
    annotation_name_rot = 0), column_title="p. generosa transcriptome distinct intersections") #distinct mode

p5<-UpSet(m2 [comb_degree(m2)<=2], column_title ="All intersections of 2 or less")
p6<-UpSet(m2 [comb_degree(m2)>=3], column_title ="All intersections of 3 or more")
#####testing comb_set sizes
x2<-comb_size(m1)
x3<-as.data.frame(x2)
sum(x3$x2) #34,947 --distinct mode

## [1] 34947

x4<-comb_size(m2)
x5<-as.data.frame(x4)
sum(x5$x4) #450,489 --intersect mode

## [1] 450489

13 UPset Plots

for looking at unique genes per RNA-seq tissue library

p1

p2

p3

14 Make gene list for unique genes per tissue type

# create sample data frame
count_df<- as.data.frame(new_data) %>%
  mutate(heart_gonad = NA) %>%
  mutate(heart_ctenidia = NA) %>%
  mutate(ctenidia_gonad = NA)


# iterate through rows
########################
#heart + gonad
# create a new column for shared data between heart and gonad columns
count_df$heart_gonad <- ifelse(count_df$heart == 1 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)

#heart + ctenidia
count_df$heart_ctenidia <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#ctenidia + gonad
count_df$ctenidia_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#larvae + juvenile
count_df$larv_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 1, 1, 0)

#ctenidia + juvenile 
count_df$ctenidia_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#heart + juvenile
count_df$heart_juv <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#gonad + juvenile
count_df$gonad_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#ctenidia + larvae
count_df$larv_ctenidia <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#heart + larvae
count_df$larv_heart <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#gonad + larvae
count_df$larv_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#return column names when tissues == 1
# identify row indices where heart_gonad column equals 1
row_indices  <- which(count_df$heart_gonad == 1)
row_indices_2<- which(count_df$heart_ctenidia == 1)
row_indices_3<- which(count_df$ctenidia_gonad == 1)
row_indices_4<- which(count_df$larv_juv == 1)
row_indices_5<- which(count_df$ctenidia_juv == 1)
row_indices_6<- which(count_df$heart_juv == 1)
row_indices_7<- which(count_df$gonad_juv == 1)
row_indices_8<- which(count_df$larv_ctenidia == 1)
row_indices_9<- which(count_df$larv_heart == 1)
row_indices_10<- which(count_df$larv_gonad == 1)


# extract corresponding row names
genes_heart_gonad <- as.data.frame(rownames(count_df[row_indices, ])) 
names(genes_heart_gonad)[1]<-"gene"

genes_heart_ctenidia <- as.data.frame(rownames(count_df[row_indices_2,]))
names(genes_heart_ctenidia)[1]<-"gene"

genes_ctenidia_gonad <- as.data.frame(rownames(count_df[row_indices_3, ])) 
names(genes_ctenidia_gonad)[1]<-"gene"

genes_larv_juv <- as.data.frame(rownames(count_df[row_indices_4, ])) 
names(genes_larv_juv)[1]<-"gene"

genes_ctenidia_juv <-as.data.frame(rownames(count_df[row_indices_5, ])) 
names(genes_ctenidia_juv)[1]<-"gene"

genes_heart_juv <- as.data.frame(rownames(count_df[row_indices_6, ])) 
names(genes_heart_juv)[1]<-"gene"

genes_gonad_juv <- as.data.frame(rownames(count_df[row_indices_7, ])) 
names(genes_gonad_juv)[1]<-"gene"

genes_larv_ctenidia <- as.data.frame(rownames(count_df[row_indices_8, ])) 
names(genes_larv_ctenidia)[1]<-"gene"

genes_larv_heart <- as.data.frame(rownames(count_df[row_indices_9, ])) 
names(genes_larv_heart)[1]<-"gene"

genes_larv_gonad <- as.data.frame(rownames(count_df[row_indices_10, ])) 
names(genes_larv_gonad)[1]<-"gene"

#goslims
goslims_genes_hg<-left_join(genes_heart_gonad, goslims, by="gene")
goslims_gene_hc<-left_join(genes_heart_ctenidia, goslims, by ="gene")
goslims_genes_cg<-left_join(genes_ctenidia_gonad, goslims, by="gene")
goslims_gene_lj<-left_join(genes_larv_juv, goslims, by="gene")
goslims_genes_cj<-left_join(genes_ctenidia_juv, goslims, by="gene")
goslims_genes_hj<-left_join(genes_heart_juv, goslims, by="gene")
goslims_genes_gj<-left_join(genes_gonad_juv, goslims, by="gene")
goslims_genes_cl<-left_join(genes_larv_ctenidia, goslims, by="gene")
goslims_genes_hl<-left_join(genes_larv_heart, goslims, by="gene")
goslims_genes_gl<-left_join(genes_larv_gonad, goslims, by="gene")


#Unique GOterms only
#heart-gonad
hg_count_unique <- goslims_genes_hg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-gonad")

#heart-ctenidia
hc_count_unique <- goslims_gene_hc %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>%# Count the occurrences and sort by descending count
  mutate(count_unique = n()-1, tissues= "heart-ctenidia")

#ctenidia-gonad
cg_count_unique <- goslims_genes_cg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-gonad")

#larvae-juvenile
lj_count_unique <- goslims_gene_lj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "larvae-juvenile") #count genes with GOslims and remove 1 (for NAs)

#ctenidia-juvenile
cj_count_unique <- goslims_genes_cj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart-juvenile
hj_count_unique <- goslims_genes_hj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-juvenile") #count genes with GOslims and remove 1 (for NAs)

#gonad-juvenile
gj_count_unique <- goslims_genes_gj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart + larvae
hl_count_unique <- goslims_genes_hl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-larvae") #count genes with GOslims and remove 1 (for NAs)

#ctenidia + larvae
cl_count_unique <- goslims_genes_cl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-larvae") #count genes with GOslims and remove 1 (for NAs)

#gonad + larvae
gl_count_unique <- goslims_genes_gl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-larvae") #count genes with GOslims and remove 1 (for NAs)

#term count Juv only
juv_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "juvenile") #count genes with GOslims and remove 1 (for NAs)

write.csv(juv_count_unique, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/data/juv_count_unique.csv")

#combine into one DF
unique_overlapping_genes<-bind_rows(hg_count_unique, hc_count_unique, cg_count_unique, lj_count_unique, cj_count_unique, hj_count_unique, gj_count_unique, cl_count_unique, hl_count_unique, gl_count_unique)

15 Make stacked bar-plot for visualizing Unique Terms

#filter Terms down a bit 
species_normal_filtered<- species_normal%>%
  group_by(Term_unique) %>%
  filter(n_species >= 10) %>%
  filter(!is.na("Terms_unique"))

terms_filtered <- term_count_best %>%
  filter(transcriptome == "Pgenerosa" | transcriptome == "Manila_r" | transcriptome == "Mercenaria_r") %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

terms_filtered2 <- term_count_best %>%
  filter(transcriptome == "Manila_r" | transcriptome == "Mercenaria_r") %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

#load top 20 hits from Github
top20terms<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/output/tables/top20terms.csv")

compairative_transcriptomes<-top20terms %>%
  filter(transcriptome != "Pgenerosa") %>%
  filter(n >= 15)

cols = rainbow(25, s=.6, v=.9)[sample(1:25,25)]
cols2 = rainbow(69, s=.6, v=.9)[sample(1:69,69)]

stacked_barplot<-ggplot(top20terms, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()+scale_fill_manual(values=cols)+theme(axis.text.x = element_text(angle = 45, hjust = 1, size=10, face="bold"))+theme(axis.text.y=element_text(size=10))+labs(fill="Biological GO Terms")

stacked_barplot2<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity", position="fill")+theme_bw()+scale_fill_manual(values=cols)+theme(axis.text.x = element_text(angle = 45, hjust = 1, size=10, face="bold"))+theme(axis.text.y=element_text(size=10))+labs(fill="Biological GO Terms")

stacked_barplot3<-ggplot(terms_filtered, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(position="fill", stat = "identity")+theme_bw()+scale_fill_manual(values=cols2)+labs(x="Transcriptome")+labs(fill="Biological GO Terms")

stacked_barplot4<-ggplot(terms_filtered, aes(x=Term_unique, y=n, fill=transcriptome))+geom_bar(position="stack", stat="identity")+theme_bw()+theme(axis.text.x = element_text(hjust = 1, size=10))+coord_flip()+labs(x="Biological GO Terms")+labs(fill="Transcriptome")

stacked_barplot5<-ggplot(terms_filtered2, aes(x=Term_unique, y=n, fill=transcriptome))+geom_bar(position="stack", stat="identity")+theme_bw()+theme(axis.text.x = element_text (hjust = 1, size=10))+coord_flip()+labs(x="Biological GO Terms")+labs(fill="Transcriptome")

stacked_barplot

stacked_barplot2

stacked_barplot4

stacked_barplot5

16 Heatmaps

heatmap7

heatmap8

heatmap9

heatmap10

17 pie charts

Functional classification of P generosa contigs by RNA-Seq library

# Filter out NA values in the column n.Pgenerosa
na_zero <- replace(top20_long, is.na(top20_long), 0)

# Plot the pie chart using the filtered data
pie_geoduck<-ggplot(na_zero, aes(x="", y=n.Pgenerosa, fill=Term_unique))+geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +theme_void() #do not like the rainbow

#Plot the pie using base R: not a very good plot
pie_geoduck_R<-pie(na_zero$n.Pgenerosa, labels = na_zero$Term_unique, main="Geoduck Gene Library w/ Biological GO Terms")

18 Next Steps

gather e-value data from Genomes and make scatter plots with individual RNA-seq libraries

scatter<-ggplot(distinct_goslims, aes(x=gonad, y=juvenile))+geom_point()+theme_bw()

scatter

scatter2<-ggplot(distinct_goslims, aes(x=heart, y=juvenile))+geom_point()+theme_bw()

scatter2

#compairative_clam_transcripts_right

scatter4<-ggplot(compairative_clam_transcripts_right, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast db")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#removed 3394 rows containing missing values

scatter5<-ggplot(compairative_clam_transcripts_left, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast query")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#Removed 4686 rows containing missing values (`geom_point()`).

geoduck genome linked transcriptome

Olivia Cattau

2023-04-06