1 LOAD DATA

1.1 load new genome linked transcriptome PGEN

salmonMatrix<-read.table(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"
data.table(salmonMatrix)

##              target_id            gene ctenidia    gonad    heart juvenile
##     1: PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851
##     2: PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164
##     3: PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598
##     4: PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364
##     5: PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728
##    ---                                                                    
## 34943: PGEN_.00g349430 PGEN_.00g349430 0.000000 0.000000 0.000000 0.000000
## 34944: PGEN_.00g349440 PGEN_.00g349440 1.955226 1.489901 1.621033 0.000000
## 34945: PGEN_.00g349450 PGEN_.00g349450 0.000000 0.000000 0.159121 0.000000
## 34946: PGEN_.00g349460 PGEN_.00g349460 0.000000 0.000000 0.000000 0.000000
## 34947: PGEN_.00g349470 PGEN_.00g349470 0.000000 0.000000 0.000000 0.000000
##          larvae
##     1: 3.423667
##     2: 0.370266
##     3: 0.052545
##     4: 1.131114
##     5: 1.117655
##    ---         
## 34943: 0.000000
## 34944: 2.222424
## 34945: 0.000000
## 34946: 0.000000
## 34947: 0.000000

#34,947

1.2 load geoduck gene annotation file from

mappingMatrix<-read.table(file="https://gannet.fish.washington.edu/Atumefaciens/20220419-pgen-gene_annotation_mapping/20220419-pgen-gene-accessions-gene_id-gene_name-gene_description-alt_gene_description-go_ids.tab", header=TRUE, sep = '\t', fill=TRUE)
colnames(mappingMatrix) <- c("target_id", "identifiers", "gene_id", "gene_name", "annotation", "alt_annotation", "goterms")
#7,736

1.3 load goslim mapping from Swiss Prott

link here

#double check to make sure this is reading the data correctly
goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')
#14,672

1.4 download other trasncriptome from clam species

#from Mox blasting Manilla/Mercenaria on Pgen
Manilla_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Manilla_Pgenenerosa_blastx.tab")

1.5 load genomes from blast output

Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)

Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)

Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)

Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)

Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)

kable(Solida_genome[1:10,1:5])

V1	V2	V3	V4	V5
PGEN_.00g000060::Scaffold_01:70713-81099	OX365947.1	74.279	416	84
PGEN_.00g000270::Scaffold_01:344820-379444	OX365949.1	75.163	306	58
PGEN_.00g000300::Scaffold_01:503707-557915	OX365947.1	83.648	159	16
PGEN_.00g000320::Scaffold_01:571666-599482	OX365950.1	79.545	220	35
PGEN_.00g000670::Scaffold_01:1389639-1400328	OX365948.1	87.857	140	17
PGEN_.00g001140::Scaffold_01:2257307-2316262	OX365948.1	78.487	423	82
PGEN_.00g001170::Scaffold_01:2366888-2417042	OX365948.1	82.443	131	23
PGEN_.00g001350::Scaffold_01:2794647-2838542	OX365948.1	76.093	343	78
PGEN_.00g001380::Scaffold_01:2878758-2888130	OX365948.1	78.431	204	43
PGEN_.00g001410::Scaffold_01:2961919-2993436	OX365952.1	79.126	206	35

1.5.1 get summary data for genomes

#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"

Mercenaria_summary <- Mercenaria_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"

Quadrangularis_summary <- Quadrangularis_new %>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Manila 
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"

Manila_summary <- Manila_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"

Marissincia_summary <- Marissincia_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"

Solida_summary <- Solida_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE)

1.6 make tables for clam genomes

clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")
kable(clam_genomes[1:5,1:5])

clam	gene	Scaffold	V2	V3
Solida_new	PGEN_.00g000060	Scaffold_01:70713-81099	OX365947.1	74.279
Solida_new	PGEN_.00g000270	Scaffold_01:344820-379444	OX365949.1	75.163
Solida_new	PGEN_.00g000300	Scaffold_01:503707-557915	OX365947.1	83.648
Solida_new	PGEN_.00g000320	Scaffold_01:571666-599482	OX365950.1	79.545
Solida_new	PGEN_.00g000670	Scaffold_01:1389639-1400328	OX365948.1	87.857

2 JOIN DATA

2.1 combine all clams for scatterplot by e-value

#all clam transcriptomes
#Reminder: Merc->Pgen = 6521, Manila->Pgen = 761, Pgen -> Merc = 5,099, Pgen -> Manila = 657
#always join large -> small
compairative_clam_transcripts_right <- left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene")

## Warning in left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

compairative_clam_transcripts_left <- left_join(Pgen_Mercenaria_clean, Pgen_Manila_clean, by = "gene")

3 join clam transcriptome with goslims

Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
  left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099

4 long format

tidy_data<-salmonMatrix
head(tidy_data)

##         target_id            gene ctenidia    gonad    heart juvenile   larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612

tidy_data_long<-tidy_data %>%
  gather(key="tissue", value="tpm", ctenidia:larvae) #name key as time, name value as tpm, gather columns heart:larvae, converts wide to long format
names(tidy_data_long)[2]<-"gene"
head(tidy_data_long)

##         target_id            gene   tissue      tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179

tidy_data_long$binary=ifelse(tidy_data_long$tpm > 0, 1, 0)

#add goslims below

5 combine datasets into one gene annotation file

pgenerosa_complete<-salmonMatrix %>% 
  left_join(mappingMatrix, by=c("target_id")) 
# could also do this with goslims
pgenerosa_simple_goslims<-salmonMatrix %>% #best pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims<-pgenerosa_complete %>%
  left_join(goslims, by=c("gene"))
pgenerosa_long<-tidy_data_long %>% #best long pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims_long<-tidy_data_long %>%
  left_join(pgenerosa_simple_goslims, by=c("gene"))
kable(pgenerosa_simple_goslims[1:10, 1:10])

target_id	gene	ctenidia	gonad	heart	juvenile	larvae	accessions	gene_id	gene_name
PGEN_.00g000010	PGEN_.00g000010	0.000000	0.000000	0.000000	4.851851	3.423667	Q86IC9;Q552T5	8620183	omt5
PGEN_.00g000020	PGEN_.00g000020	0.000000	0.000000	0.000000	0.840164	0.370266	P04177	25085	Th
PGEN_.00g000030	PGEN_.00g000030	0.225494	0.000000	0.000000	0.069598	0.052545	NA	NA	NA
PGEN_.00g000040	PGEN_.00g000040	0.565506	0.000000	0.254843	0.376364	1.131114	NA	NA	NA
PGEN_.00g000050	PGEN_.00g000050	0.508251	4.745168	0.228272	0.182728	1.117655	Q8L840;O04092;Q9FT71	837636	RECQL4A
PGEN_.00g000060	PGEN_.00g000060	6.490179	0.000000	0.000000	6.742646	5.936612	Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7	18080	Nin
PGEN_.00g000070	PGEN_.00g000070	0.000000	0.000000	0.000000	0.025013	1.103535	NA	NA	NA
PGEN_.00g000080	PGEN_.00g000080	0.000000	0.000000	0.119488	0.229570	0.000000	A1E2V0	489433	BIRC3
PGEN_.00g000090	PGEN_.00g000090	0.187374	0.000000	0.000000	0.173312	0.000000	P34456	186266
PGEN_.00g000100	PGEN_.00g000100	0.000000	0.000000	0.000000	0.304322	0.000000	NA	NA	NA

5.1 break semi-colon in Term column and count unique values only

5.1.1 distinct goslims

# check for duplicate values within each cell using strsplit() and anyDuplicated()
duplicates <- sapply(strsplit(pgenerosa_simple_goslims$gene, ";"), anyDuplicated)

# print the row indices of cells with duplicate values
if (any(duplicates)) {
  print(which(duplicates > 0))
} else {
  print("No duplicates found")
}

## [1] "No duplicates found"

#accession numbers = no duplicates
#gene_ID = no duplicates
#allGoID duplicates = column 24458
#BP_GO_ID duplicates = column 24458
#CC_GO_ID duplicates = no duplicates
#MF_GO_ID duplicates = columns 4561  8040 13044 16138
#--------------------------------
#GOslim duplicates lots and lots 
#Terms duplicates lots and lots 

#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>%
  mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>% 
  distinct(GOslim_unique, .keep_all = TRUE)

kable(distinct_goslims[1:10,1:10])

target_id	gene	ctenidia	gonad	heart	juvenile	larvae	accessions	gene_id	gene_name
PGEN_.00g000010	PGEN_.00g000010	0.000000	0.000000	0.000000	4.851851	3.423667	Q86IC9;Q552T5	8620183	omt5
PGEN_.00g000020	PGEN_.00g000020	0.000000	0.000000	0.000000	0.840164	0.370266	P04177	25085	Th
PGEN_.00g000050	PGEN_.00g000050	0.508251	4.745168	0.228272	0.182728	1.117655	Q8L840;O04092;Q9FT71	837636	RECQL4A
PGEN_.00g000060	PGEN_.00g000060	6.490179	0.000000	0.000000	6.742646	5.936612	Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7	18080	Nin
PGEN_.00g000080	PGEN_.00g000080	0.000000	0.000000	0.119488	0.229570	0.000000	A1E2V0	489433	BIRC3
PGEN_.00g000090	PGEN_.00g000090	0.187374	0.000000	0.000000	0.173312	0.000000	P34456	186266
PGEN_.00g000210	PGEN_.00g000210	2.810603	0.000000	2.775616	3.495122	0.898959	O00463;B4DIS9;B4E0A2;Q6FHY1	7188	TRAF5
PGEN_.00g000300	PGEN_.00g000300	4.091597	2.739254	3.785783	5.053620	1.087948	Q5REG4	100171717	DTX3
PGEN_.00g000380	PGEN_.00g000380	0.475060	0.000000	0.000000	0.315384	0.000000	Q8QG60;Q8QG52;Q8QGQ5	374092	CRY2
PGEN_.00g000440	PGEN_.00g000440	11.483098	5.728598	3.123012	6.873106	10.243535	Q9H583;Q5T3Q8;Q6P197;Q9NW23	55127	HEATR1

6 SPECIES

6.1 Separate the Term columb by semi-colons to count occurances for table

# For Pgenerosa alone (wide format) 
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen <- distinct_goslims %>% #make sure you have removed duplicate values beforehand
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen)

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows

# For Pgenerosa using pgenerosa_long (versus wide)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen_long <- pgenerosa_long %>%  # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen_long)

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows

#exactly the same as Pgen_wide


#for Manila Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_manila <- Manila_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila)

## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            129
##  2 signaling                                   113
##  3 cell differentiation                         86
##  4 cytoskeleton organization                    65
##  5 DNA-templated transcription                  51
##  6 reproductive process                         48
##  7 regulation of DNA-templated transcription    47
##  8 cell motility                                40
##  9 programmed cell death                        40
## 10 protein-containing complex assembly          39
## # ℹ 55 more rows

# Manila Reverse
term_count_manila_r <- Pgen_Manila_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila_r)

## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            149
##  2 signaling                                   126
##  3 cell differentiation                         96
##  4 cytoskeleton organization                    68
##  5 DNA-templated transcription                  60
##  6 regulation of DNA-templated transcription    55
##  7 reproductive process                         55
##  8 programmed cell death                        44
##  9 immune system process                        42
## 10 protein-containing complex assembly          42
## # ℹ 55 more rows

#for Mercenaria Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria <- Mercenaria_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria)

## # A tibble: 69 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            247
##  2 signaling                                   189
##  3 cell differentiation                        170
##  4 DNA-templated transcription                  88
##  5 cytoskeleton organization                    87
##  6 cell motility                                83
##  7 regulation of DNA-templated transcription    82
##  8 reproductive process                         71
##  9 immune system process                        66
## 10 programmed cell death                        63
## # ℹ 59 more rows

#for Mercenaria Clam reverse
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria_r <- Pgen_Mercenaria_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria_r) #over 3x more terms than Mercenaria not reversed...wonder why that is....

## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            768
##  2 signaling                                   635
##  3 cell differentiation                        517
##  4 DNA-templated transcription                 267
##  5 regulation of DNA-templated transcription   245
##  6 cell motility                               241
##  7 cytoskeleton organization                   237
##  8 immune system process                       232
##  9 programmed cell death                       229
## 10 vesicle-mediated transport                  218
## # ℹ 61 more rows

#combine all 5 datasets
term_count_all <- bind_rows(term_count_Pgen, term_count_manila, term_count_manila_r, term_count_Mercenaria, term_count_Mercenaria_r, .id = "transcriptome") 

term_count_best <- term_count_all %>% 
  mutate(transcriptome = ifelse(transcriptome == 1, "Pgenerosa", 
                    ifelse(transcriptome == 2, "Manila",
                           ifelse(transcriptome == 3, "Manila_r", 
                                  ifelse(transcriptome == 4, "Mercenaria", 
                                         ifelse(transcriptome == 5, "Mercenaria_r", NA))))))

6.2 NORMALIZE

#normalize by Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

normal_counts<- term_count_best %>%
  group_by(transcriptome) %>%
  summarise(total_n = sum(n))
#Pgen = 17,611
#Manila = 1235
#Manila_r = 1390
#Mercenaria = 2198
#Mercenaria_r = 7027


filtered_data1 <- term_count_best %>%
  filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
  mutate(n_divided = case_when(
                               transcriptome == "Manila" ~ n/1235,
                               transcriptome == "Manila_r" ~ n/1390,
                               transcriptome == "Mercenaria" ~ n/2198,
                               transcriptome == "Mercenaria_r" ~ n/7027))

grouped_data1<-left_join(filtered_data1, Pgen_normalized, by="Term_unique")

species_normal<-grouped_data1 %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_species = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(species_normal[1:5,1:5])

transcriptome	Term_unique	n_species	n_divided	n_Pgen
Manila	anatomical structure development	129	0.1044534	1853
Manila	signaling	113	0.0914980	1603
Manila	cell differentiation	86	0.0696356	1281
Manila	cytoskeleton organization	65	0.0526316	516
Manila	DNA-templated transcription	51	0.0412955	741

6.3 count GOterms unique to species

#####make new variables with binary data

#wide format data

Manila_2<-Manila_Pgen_goslims %>%
  select(-c(3:11, 13:23))

Mercenaria_2<-Mercenaria_Pgen_goslims %>%
   select(-c(3:11, 13:23))

species_joined<-left_join(Mercenaria_2, Manila_2, by="gene")

## Warning in left_join(Mercenaria_2, Manila_2, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

#Mercenaria
species_joined$`e-value.x`=ifelse(species_joined$`e-value.x` > 0, 1, 0)
#Manila
species_joined$`e-value.y`=ifelse(species_joined$`e-value.y` > 0, 1, 0)

species_matrix<-species_joined %>% select(gene, `e-value.x`, `e-value.y`) %>%
  rename(Mercenaria_evalue = `e-value.x`)%>%
  rename(Manila_evalue = `e-value.y`)%>%
  distinct(gene, .keep_all = TRUE)%>%
  mutate(Manila_evalue = ifelse(is.na(Manila_evalue), 0, Manila_evalue))%>%
  mutate(Pgen_evalue = 1)

species_matrix<-tibble::column_to_rownames(species_matrix, var="gene")

species_matrix<-as.matrix(species_matrix) #count matrix for UpSet plots

matrix_species = make_comb_mat(species_matrix, mode="distinct")
matrix_species_intersect = make_comb_mat(species_matrix, mode="intersect")
matrix_species_union = make_comb_mat(species_matrix, mode="union")
set_size(matrix_species)

## Mercenaria_evalue     Manila_evalue       Pgen_evalue 
##               980               210              1029

comb_size(matrix_species)

## 111 101 011 001 
## 202 778   8  41

# create sample data frame
species_count<- as.data.frame(species_matrix) %>%
  mutate(Mercenaria_Manila = NA) %>%
  mutate(Manila_Pgen = NA) %>%
  mutate(Pgen_Mercenaria = NA) %>%
  mutate(all=NA)%>%
  mutate(Pgen= NA)

# iterate through rows
########################
#Mercenaria_Manila does not include Pgen genome or else will return all zeros
species_count$Mercenaria_Manila <-ifelse(species_count$Mercenaria_evalue ==1 & species_count$Manila_evalue == 1, 1, 0)

#Manila + Pgenerosa
species_count$Manila_Pgen <-ifelse(species_count$Mercenaria_evalue ==0 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria + Pgenerosa
species_count$Pgen_Mercenaria <-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria +Pgenerosa + Manila
species_count$all<-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Pgen alone
species_count$Pgen <-ifelse(species_count$Mercenaria_evalue == 0 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#return column names when species == 1
# identify row indices where Pgen_Mercenaria column equals 1
row_indices_a<- which(species_count$Mercenaria_Manila == 1)
row_indices_b<- which(species_count$Manila_Pgen == 1)
row_indices_c<- which(species_count$Pgen_Mercenaria == 1)
row_indices_d<- which(species_count$all == 1)
row_indices_e<- which(species_count$Pgen == 1)

# extract corresponding row names
genes_Mer_Man <- as.data.frame(rownames(species_count[row_indices_a, ])) 
names(genes_Mer_Man)[1]<-"gene"

genes_Man_Pgen <- as.data.frame(rownames(species_count[row_indices_b, ])) 
names(genes_Man_Pgen)[1]<-"gene"

genes_Mer_Pgen <- as.data.frame(rownames(species_count[row_indices_c, ])) 
names(genes_Mer_Pgen)[1]<-"gene"

genes_all <- as.data.frame(rownames(species_count[row_indices_d, ])) 
names(genes_all)[1]<-"gene"

genes_Pgen <- as.data.frame(rownames(species_count[row_indices_e, ])) 
names(genes_Pgen)[1]<-"gene"

#goslims
goslims_genes_MM<-left_join(genes_Mer_Man, goslims, by="gene")
goslims_gene_ManPgen<-left_join(genes_Man_Pgen, goslims, by ="gene")
goslims_genes_MerPgen<-left_join(genes_Mer_Pgen, goslims, by="gene")
goslims_genes_all<-left_join(genes_all, goslims, by="gene")
goslims_genes_Pgen<-left_join(genes_Pgen, goslims, by="gene")

#Unique GOterms only
#Mercenaria + Manila
MM_count_unique <- goslims_genes_MM %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Manila")

Man_Pgen_count_unique <- goslims_gene_ManPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Manila-Pgen")

Mer_Pgen_count_unique <- goslims_genes_MerPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Pgen")

all_count_unique <- goslims_genes_all %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "all-3")

Pgen_count_unique <- goslims_genes_Pgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Pgen_alone")

species_unique<-bind_rows(Mer_Pgen_count_unique, Man_Pgen_count_unique, MM_count_unique, Pgen_count_unique, all_count_unique)

kable(species_unique[1:4, 1:4])

Term_unique	n	count_unique	species
NA	261	68	Mercenaria-Pgen
anatomical structure development	203	68	Mercenaria-Pgen
signaling	181	68	Mercenaria-Pgen
cell differentiation	134	68	Mercenaria-Pgen

7 Oyster female gonad comparison

7.1 load data and filter for genebank accession numbers

oyster_gonad<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")

oyster_gonad<- oyster_gonad %>%
  filter(Description != 0) %>%
  group_by(Cluster) %>%
  select(Genebank, Cluster)

#write.csv(oyster_gonad, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/oyster_gonad_clusters.csv")

kable(oyster_gonad[1:10, 1:2])

Genebank	Cluster
BQ427044	1
CU683161	1
AM856831	1
AM867786	1
FP008634	1
AM854780	1
AM865407	1
AM858220	1
FP002214	1
AM857878	1

8 TISSUE

8.1 Count Goslims by tissue type

#count top GOterms
tissue_goslims <- pgenerosa_long %>%
    group_by(tissue) %>% 
  filter(binary == 1) %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>%
  separate_rows(Term_unique, sep = ";") %>%
  mutate(Term_unique = trimws(Term_unique)) %>%
  count(Term_unique, tissue, sort = TRUE)

9 NORMALIZE

9.1 normalize goterm count by % in geoduck transcriptome by % tissue goterm counts by tissue transcriptome and return another % which is the normalized goterm count

#add new column to variable term_count_Pgen

#normalize data by % hit in Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

#normalize tissue data
normal_counts<- tissue_goslims %>%
  group_by(tissue) %>%
  summarise(total_n = sum(n))
#heart = 16,021
#ctenidia = 15,911
#gonad = 14,715
#juv = 17,277
#larvae = 16,632

filtered_data <- tissue_goslims %>%
  filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
  mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
                               tissue == "heart" ~ n/16021,
                               tissue == "gonad" ~ n/14715,
                               tissue == "larvae" ~ n/16632,
                               tissue == "juvenile" ~ n/17277))

#combine Pgen_normalized with filtered_data and add new column normalized where tissue_n/geoduck_n to produce new normalized value

grouped_data<-left_join(filtered_data, Pgen_normalized, by="Term_unique")

Tis_normal<-grouped_data %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_tissue = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(Tis_normal[1:5, 1:5])

tissue	Term_unique	n_tissue	n_divided	n_Pgen
juvenile	anatomical structure development	1817	0.1051687	1853
larvae	anatomical structure development	1743	0.1047980	1853
heart	anatomical structure development	1678	0.1047375	1853
ctenidia	anatomical structure development	1661	0.1043932	1853
juvenile	signaling	1572	0.0909880	1603

9.2 filter by tissue type

not actually unique goslims, only unique to tissue types

tidy1<-pgenerosa_goslims_long %>%
  group_by(tissue) %>%
  filter(tpm > 0) 

ctenidia<-pgenerosa_goslims %>%
  filter(ctenidia > 0 & gonad == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(ctenidia)

##     n
## 1 340

#340 obs (unique to ctenidia)
write.csv(ctenidia, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/ctenidia_unique.csv")

gonad<-pgenerosa_goslims %>%
  filter(gonad > 0 & ctenidia == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(gonad)

##     n
## 1 119

#119 obs (unique to gonad)
write.csv(gonad, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/gonad_unique.csv")

heart<-pgenerosa_goslims %>%
  filter(heart > 0 & ctenidia == 0 & gonad == 0 & juvenile == 0 & larvae == 0)
count(heart)

##     n
## 1 371

#371 obs (unique to heart)
write.csv(heart, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/heart_unique.csv")

juvenile<-pgenerosa_goslims %>%
  filter(juvenile > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & larvae == 0)
count(juvenile)

##      n
## 1 2180

#2,180 obs (unique to juvenile alone)
write.csv(juvenile, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/juvenile_unique.csv")

larvae<-pgenerosa_goslims %>%
  filter(larvae > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & juvenile == 0)
count(larvae)

##     n
## 1 868

#868 obs (unique to larvae alone)
write.csv(larvae, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/larvae_unique.csv")

juvorlarv<-pgenerosa_goslims %>%
  filter(ctenidia == 0 & gonad == 0 & heart == 0) %>%
  filter(juvenile > 0 & larvae > 0)
#5,022 obs unique to both juveniles or larvae
#1,974 obs unique to juveniles and larvae together

all_unique<-pgenerosa_goslims %>%
   filter(ctenidia > 0 & gonad > 0 & heart > 0 & juvenile > 0 & larvae > 0)

9.3 count GOterms for individual tissues

gonad larvae heart ctenidia juvenile

gonad_count_unique <- gonad %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "gonad")

larvae_count_unique <- larvae %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "larvae")

heart_count_unique <- heart %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "heart")

ctenidia_count_unique <- ctenidia %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "ctenidia")

juvenile_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "juvenile")

tissue_all_unique_genes_GOterms<-rbind(heart_count_unique, gonad_count_unique, ctenidia_count_unique, larvae_count_unique, juvenile_count_unique)

kable(tissue_all_unique_genes_GOterms[1:4, 1:4])

Term_unique	n	count_unique	tissue
NA	342	41	heart
signaling	13	41	heart
anatomical structure development	11	41	heart
cell differentiation	9	41	heart

10 organize data for UpSet plot

library(dplyr)
#####make new variables with binary data
tidy_data$heart=ifelse(tidy_data$heart > 0, 1, 0)
tidy_data$gonad=ifelse(tidy_data$gonad >0, 1, 0)
tidy_data$ctenidia=ifelse(tidy_data$ctenidia > 0, 1, 0)
tidy_data$larvae=ifelse(tidy_data$larvae > 0, 1, 0)
tidy_data$juvenile=ifelse(tidy_data$juvenile > 0, 1, 0)
new_data<-tidy_data %>% select(target_id,heart, gonad, ctenidia, larvae, juvenile)
new_data<-tibble::column_to_rownames(new_data, var="target_id")
new_data<-as.matrix(new_data) #count matrix for UpSet plots

11 Make UpSet plot for visualization of overlapping data:

link to UpSet R page

m1 = make_comb_mat(new_data) #only distinct intersections
m2=make_comb_mat(new_data, mode="intersect")
set_size(m1)

##    heart    gonad ctenidia   larvae juvenile 
##    17602    13682    17479    19449    23417

comb_size(m1)

## 11111 11110 11101 11011 10111 01111 11100 11010 11001 10110 10101 10011 01110 
## 11294    24   692   397  2147   337    54    13   122    51   658  1007     8 
## 01101 01011 00111 11000 10100 10010 10001 01100 01010 01001 00110 00101 00011 
##   100   258   886    43   124    60   545    20    46   155    79   665  1974 
## 10000 01000 00100 00010 00001 00000 
##   371   119   340   868  2180  9310

comb_data<-as.data.frame(comb_size(m1))

p1<-UpSet(m1 [comb_degree(m1)<=2], column_title ="All intersections of 2 or less") #intersections 2 or less
p2<-UpSet(m1 [comb_degree(m1)>=3], column_title= "All intersection of 3 or more") #intersections 3 or more
p3<-UpSet(m1, top_annotation = HeatmapAnnotation(
    degree = as.character(comb_degree(m1)),
    "Intersection\nsize" = anno_barplot(comb_size(m1), 
        border = FALSE, 
        gp = gpar(fill = "black"), 
        height = unit(2, "cm")
    ), 
    annotation_name_side = "left", 
    annotation_name_rot = 0), column_title="p. generosa transcriptome distinct intersections") #distinct mode

p5<-UpSet(m2 [comb_degree(m2)<=2], column_title ="All intersections of 2 or less")
p6<-UpSet(m2 [comb_degree(m2)>=3], column_title ="All intersections of 3 or more")
#####testing comb_set sizes
x2<-comb_size(m1)
x3<-as.data.frame(x2)
sum(x3$x2) #34,947 --distinct mode

## [1] 34947

x4<-comb_size(m2)
x5<-as.data.frame(x4)
sum(x5$x4) #450,489 --intersect mode

## [1] 450489

12 Make gene list for unique genes per tissue type

# create sample data frame
count_df<- as.data.frame(new_data) %>%
  mutate(heart_gonad = NA) %>%
  mutate(heart_ctenidia = NA) %>%
  mutate(ctenidia_gonad = NA)


# iterate through rows
########################
#heart + gonad
# create a new column for shared data between heart and gonad columns
count_df$heart_gonad <- ifelse(count_df$heart == 1 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)

#heart + ctenidia
count_df$heart_ctenidia <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#ctenidia + gonad
count_df$ctenidia_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#larvae + juvenile
count_df$larv_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 1, 1, 0)

#ctenidia + juvenile 
count_df$ctenidia_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#heart + juvenile
count_df$heart_juv <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#gonad + juvenile
count_df$gonad_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#ctenidia + larvae
count_df$larv_ctenidia <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#heart + larvae
count_df$larv_heart <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#gonad + larvae
count_df$larv_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#return column names when tissues == 1
# identify row indices where heart_gonad column equals 1
row_indices  <- which(count_df$heart_gonad == 1)
row_indices_2<- which(count_df$heart_ctenidia == 1)
row_indices_3<- which(count_df$ctenidia_gonad == 1)
row_indices_4<- which(count_df$larv_juv == 1)
row_indices_5<- which(count_df$ctenidia_juv == 1)
row_indices_6<- which(count_df$heart_juv == 1)
row_indices_7<- which(count_df$gonad_juv == 1)
row_indices_8<- which(count_df$larv_ctenidia == 1)
row_indices_9<- which(count_df$larv_heart == 1)
row_indices_10<- which(count_df$larv_gonad == 1)


# extract corresponding row names
genes_heart_gonad <- as.data.frame(rownames(count_df[row_indices, ])) 
names(genes_heart_gonad)[1]<-"gene"

genes_heart_ctenidia <- as.data.frame(rownames(count_df[row_indices_2,]))
names(genes_heart_ctenidia)[1]<-"gene"

genes_ctenidia_gonad <- as.data.frame(rownames(count_df[row_indices_3, ])) 
names(genes_ctenidia_gonad)[1]<-"gene"

genes_larv_juv <- as.data.frame(rownames(count_df[row_indices_4, ])) 
names(genes_larv_juv)[1]<-"gene"

genes_ctenidia_juv <-as.data.frame(rownames(count_df[row_indices_5, ])) 
names(genes_ctenidia_juv)[1]<-"gene"

genes_heart_juv <- as.data.frame(rownames(count_df[row_indices_6, ])) 
names(genes_heart_juv)[1]<-"gene"

genes_gonad_juv <- as.data.frame(rownames(count_df[row_indices_7, ])) 
names(genes_gonad_juv)[1]<-"gene"

genes_larv_ctenidia <- as.data.frame(rownames(count_df[row_indices_8, ])) 
names(genes_larv_ctenidia)[1]<-"gene"

genes_larv_heart <- as.data.frame(rownames(count_df[row_indices_9, ])) 
names(genes_larv_heart)[1]<-"gene"

genes_larv_gonad <- as.data.frame(rownames(count_df[row_indices_10, ])) 
names(genes_larv_gonad)[1]<-"gene"

#goslims
goslims_genes_hg<-left_join(genes_heart_gonad, goslims, by="gene")
goslims_gene_hc<-left_join(genes_heart_ctenidia, goslims, by ="gene")
goslims_genes_cg<-left_join(genes_ctenidia_gonad, goslims, by="gene")
goslims_gene_lj<-left_join(genes_larv_juv, goslims, by="gene")
goslims_genes_cj<-left_join(genes_ctenidia_juv, goslims, by="gene")
goslims_genes_hj<-left_join(genes_heart_juv, goslims, by="gene")
goslims_genes_gj<-left_join(genes_gonad_juv, goslims, by="gene")
goslims_genes_cl<-left_join(genes_larv_ctenidia, goslims, by="gene")
goslims_genes_hl<-left_join(genes_larv_heart, goslims, by="gene")
goslims_genes_gl<-left_join(genes_larv_gonad, goslims, by="gene")


#Unique GOterms only
#heart-gonad
hg_count_unique <- goslims_genes_hg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-gonad")

#heart-ctenidia
hc_count_unique <- goslims_gene_hc %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>%# Count the occurrences and sort by descending count
  mutate(count_unique = n()-1, tissues= "heart-ctenidia")

#ctenidia-gonad
cg_count_unique <- goslims_genes_cg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-gonad")

#larvae-juvenile
lj_count_unique <- goslims_gene_lj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "larvae-juvenile") #count genes with GOslims and remove 1 (for NAs)

#ctenidia-juvenile
cj_count_unique <- goslims_genes_cj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart-juvenile
hj_count_unique <- goslims_genes_hj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-juvenile") #count genes with GOslims and remove 1 (for NAs)

#gonad-juvenile
gj_count_unique <- goslims_genes_gj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart + larvae
hl_count_unique <- goslims_genes_hl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-larvae") #count genes with GOslims and remove 1 (for NAs)

#ctenidia + larvae
cl_count_unique <- goslims_genes_cl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-larvae") #count genes with GOslims and remove 1 (for NAs)

#gonad + larvae
gl_count_unique <- goslims_genes_gl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-larvae") #count genes with GOslims and remove 1 (for NAs)

#term count Juv only
juv_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "juvenile") #count genes with GOslims and remove 1 (for NAs)

write.csv(juv_count_unique, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/data/juv_count_unique.csv")

#combine into one DF
unique_overlapping_genes<-bind_rows(hg_count_unique, hc_count_unique, cg_count_unique, lj_count_unique, cj_count_unique, hj_count_unique, gj_count_unique, cl_count_unique, hl_count_unique, gl_count_unique)

13 Make stacked bar-plot for visualizing Unique Terms

#filter Terms down a bit 

terms_filtered <- term_count_best %>%
  group_by(Term_unique) %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

#load top 20 hits from Github
top20terms<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/output/tables/top20terms.csv")

compairative_transcriptomes<-top20terms %>%
  filter(transcriptome != "Pgenerosa") %>%
  filter(n >= 15)

stacked_barplot<-ggplot(top20terms, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()

stacked_barplot2<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()

stacked_barplot

14 Heatmap

#Z=e-values, Y=Manilla genes, X=Mercenaria genes

top20_long<-reshape(top20terms, idvar = "Term_unique", timevar = "transcriptome", direction = "wide")

tissues_goterms_unique<-read.csv(file="https://raw.githubusercontent.com/ocattau/code-for-Pgenerosa/main/output/tissue_unique_Pgen_terms_heatmap_data.csv")

tissues2<-tissues_goterms_unique %>%
  filter(tissues != "larvae-juvenile")

#heatmap<-ggplot(pgenerosa_long, aes(x=gene, y=tissue, fill=tpm)) + 
 # geom_tile() #too large

heatmap2<-ggplot(top20terms, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")

heatmap3<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw() + scale_fill_gradient(low="#c2f9ff", high="#003460")

heatmap7<- ggplot(tissues2, aes(x=tissues, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")+theme(axis.text.x = element_text(angle = 45, hjust = 1) )

Tis_normal_2<-Tis_normal %>%
  filter(Term_unique != "NA")

heatmap8<-ggplot(Tis_normal_2, aes(tissue, y=Term_unique, fill=normalized))+geom_tile()+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )+ scale_fill_gradient2(low = "yellow", mid = "white", high = "red", midpoint = 0.9, na.value = "grey50")

heatmap9<-ggplot(Tis_normal, aes(tissue, y=Term_unique, fill=log_scale))+geom_tile(color="white")+  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )#need to set color wheel to white = 1 and red = min and blue = max

species_normal_2<-species_normal %>%
  filter(Term_unique != "NA")%>%
  filter(transcriptome == "Manila_r" | transcriptome == "Mercenaria_r")

heatmap10<-ggplot(species_normal_2, aes(transcriptome, y=Term_unique, fill=normalized))+geom_tile(color="white")+  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )

heatmap7

heatmap8

heatmap9

heatmap10

15 scatter plots for e-values

scatter<-ggplot(distinct_goslims, aes(x=gonad, y=juvenile))+geom_point()+theme_bw()

scatter

scatter2<-ggplot(distinct_goslims, aes(x=heart, y=juvenile))+geom_point()+theme_bw()

scatter2

#compairative_clam_transcripts_right

scatter4<-ggplot(compairative_clam_transcripts_right, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast db")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#removed 3394 rows containing missing values

scatter5<-ggplot(compairative_clam_transcripts_left, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast query")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#Removed 4686 rows containing missing values (`geom_point()`).

geoduck genome linked transcriptome

Olivia Cattau

2023-04-06

1 LOAD DATA

1.1 load new genome linked transcriptome PGEN

1.2 load geoduck gene annotation file from

1.3 load goslim mapping from Swiss Prott

1.4 download other trasncriptome from clam species

1.5 load genomes from blast output

1.5.1 get summary data for genomes

1.6 make tables for clam genomes

2 JOIN DATA

2.1 combine all clams for scatterplot by e-value

3 join clam transcriptome with goslims

4 long format

5 combine datasets into one gene annotation file

5.1 break semi-colon in Term column and count unique values only

5.1.1 distinct goslims

6 SPECIES

6.1 Separate the Term columb by semi-colons to count occurances for table

6.2 NORMALIZE

6.3 count GOterms unique to species

7 Oyster female gonad comparison

7.1 load data and filter for genebank accession numbers

8 TISSUE

8.1 Count Goslims by tissue type

9 NORMALIZE

9.1 normalize goterm count by % in geoduck transcriptome by % tissue goterm counts by tissue transcriptome and return another % which is the normalized goterm count

9.2 filter by tissue type

9.3 count GOterms for individual tissues

10 organize data for UpSet plot

11 Make UpSet plot for visualization of overlapping data:

12 Make gene list for unique genes per tissue type

13 Make stacked bar-plot for visualizing Unique Terms

14 Heatmap

15 scatter plots for e-values