https://rpubs.com/ocattau/1041480

1 LOAD DATA

1.1 load new genome linked transcriptome PGEN

salmonMatrix<-read.table(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"
data.table(salmonMatrix)
##              target_id            gene ctenidia    gonad    heart juvenile
##     1: PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851
##     2: PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164
##     3: PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598
##     4: PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364
##     5: PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728
##    ---                                                                    
## 34943: PGEN_.00g349430 PGEN_.00g349430 0.000000 0.000000 0.000000 0.000000
## 34944: PGEN_.00g349440 PGEN_.00g349440 1.955226 1.489901 1.621033 0.000000
## 34945: PGEN_.00g349450 PGEN_.00g349450 0.000000 0.000000 0.159121 0.000000
## 34946: PGEN_.00g349460 PGEN_.00g349460 0.000000 0.000000 0.000000 0.000000
## 34947: PGEN_.00g349470 PGEN_.00g349470 0.000000 0.000000 0.000000 0.000000
##          larvae
##     1: 3.423667
##     2: 0.370266
##     3: 0.052545
##     4: 1.131114
##     5: 1.117655
##    ---         
## 34943: 0.000000
## 34944: 2.222424
## 34945: 0.000000
## 34946: 0.000000
## 34947: 0.000000
#34,947

1.2 load geoduck gene annotation file from

mappingMatrix<-read.table(file="https://gannet.fish.washington.edu/Atumefaciens/20220419-pgen-gene_annotation_mapping/20220419-pgen-gene-accessions-gene_id-gene_name-gene_description-alt_gene_description-go_ids.tab", header=TRUE, sep = '\t', fill=TRUE)
colnames(mappingMatrix) <- c("target_id", "identifiers", "gene_id", "gene_name", "annotation", "alt_annotation", "goterms")
#7,736

1.3 load goslim mapping from Swiss Prott

link here

#double check to make sure this is reading the data correctly
goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')
#14,672

1.4 download other trasncriptome from clam species

#from Mox blasting Manilla/Mercenaria on Pgen
Manilla_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Manilla_Pgenenerosa_blastx.tab")

1.5 load genomes from blast output

Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)

Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)

Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)

Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)

Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)

kable(Solida_genome[1:10,1:5])
V1 V2 V3 V4 V5
PGEN_.00g000060::Scaffold_01:70713-81099 OX365947.1 74.279 416 84
PGEN_.00g000270::Scaffold_01:344820-379444 OX365949.1 75.163 306 58
PGEN_.00g000300::Scaffold_01:503707-557915 OX365947.1 83.648 159 16
PGEN_.00g000320::Scaffold_01:571666-599482 OX365950.1 79.545 220 35
PGEN_.00g000670::Scaffold_01:1389639-1400328 OX365948.1 87.857 140 17
PGEN_.00g001140::Scaffold_01:2257307-2316262 OX365948.1 78.487 423 82
PGEN_.00g001170::Scaffold_01:2366888-2417042 OX365948.1 82.443 131 23
PGEN_.00g001350::Scaffold_01:2794647-2838542 OX365948.1 76.093 343 78
PGEN_.00g001380::Scaffold_01:2878758-2888130 OX365948.1 78.431 204 43
PGEN_.00g001410::Scaffold_01:2961919-2993436 OX365952.1 79.126 206 35

1.5.1 get summary data for genomes

#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"

Mercenaria_summary <- Mercenaria_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"

Quadrangularis_summary <- Quadrangularis_new %>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Manila 
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"

Manila_summary <- Manila_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"

Marissincia_summary <- Marissincia_new %>%
  mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>% 
  distinct(Marissincia_genes, .keep_all = TRUE) 

#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"

Solida_summary <- Solida_new%>%
  mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>% 
  distinct(genes, .keep_all = TRUE) 

1.6 make tables for clam genomes

clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")
kable(clam_genomes[1:5,1:5])
clam gene Scaffold V2 V3
Solida_new PGEN_.00g000060 Scaffold_01:70713-81099 OX365947.1 74.279
Solida_new PGEN_.00g000270 Scaffold_01:344820-379444 OX365949.1 75.163
Solida_new PGEN_.00g000300 Scaffold_01:503707-557915 OX365947.1 83.648
Solida_new PGEN_.00g000320 Scaffold_01:571666-599482 OX365950.1 79.545
Solida_new PGEN_.00g000670 Scaffold_01:1389639-1400328 OX365948.1 87.857

2 JOIN DATA

2.1 combine all clams for scatterplot by e-value

#all clam transcriptomes
#Reminder: Merc->Pgen = 6521, Manila->Pgen = 761, Pgen -> Merc = 5,099, Pgen -> Manila = 657
#always join large -> small
compairative_clam_transcripts_right <- left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene") 
## Warning in left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
compairative_clam_transcripts_left <- left_join(Pgen_Mercenaria_clean, Pgen_Manila_clean, by = "gene") 

3 join clam transcriptome with goslims

Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
  left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099

4 long format

tidy_data<-salmonMatrix
head(tidy_data)
##         target_id            gene ctenidia    gonad    heart juvenile   larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612
tidy_data_long<-tidy_data %>%
  gather(key="tissue", value="tpm", ctenidia:larvae) #name key as time, name value as tpm, gather columns heart:larvae, converts wide to long format
names(tidy_data_long)[2]<-"gene"
head(tidy_data_long)
##         target_id            gene   tissue      tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179
tidy_data_long$binary=ifelse(tidy_data_long$tpm > 0, 1, 0)

#add goslims below

5 combine datasets into one gene annotation file

pgenerosa_complete<-salmonMatrix %>% 
  left_join(mappingMatrix, by=c("target_id")) 
# could also do this with goslims
pgenerosa_simple_goslims<-salmonMatrix %>% #best pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims<-pgenerosa_complete %>%
  left_join(goslims, by=c("gene"))
pgenerosa_long<-tidy_data_long %>% #best long pgenerosa data set
  left_join(goslims, by=c("gene"))
pgenerosa_goslims_long<-tidy_data_long %>%
  left_join(pgenerosa_simple_goslims, by=c("gene"))
kable(pgenerosa_simple_goslims[1:10, 1:10])
target_id gene ctenidia gonad heart juvenile larvae accessions gene_id gene_name
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667 Q86IC9;Q552T5 8620183 omt5
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266 P04177 25085 Th
PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545 NA NA NA
PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114 NA NA NA
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655 Q8L840;O04092;Q9FT71 837636 RECQL4A
PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612 Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 18080 Nin
PGEN_.00g000070 PGEN_.00g000070 0.000000 0.000000 0.000000 0.025013 1.103535 NA NA NA
PGEN_.00g000080 PGEN_.00g000080 0.000000 0.000000 0.119488 0.229570 0.000000 A1E2V0 489433 BIRC3
PGEN_.00g000090 PGEN_.00g000090 0.187374 0.000000 0.000000 0.173312 0.000000 P34456 186266
PGEN_.00g000100 PGEN_.00g000100 0.000000 0.000000 0.000000 0.304322 0.000000 NA NA NA

5.1 break semi-colon in Term column and count unique values only

5.1.1 distinct goslims

# check for duplicate values within each cell using strsplit() and anyDuplicated()
duplicates <- sapply(strsplit(pgenerosa_simple_goslims$gene, ";"), anyDuplicated)

# print the row indices of cells with duplicate values
if (any(duplicates)) {
  print(which(duplicates > 0))
} else {
  print("No duplicates found")
}
## [1] "No duplicates found"
#accession numbers = no duplicates
#gene_ID = no duplicates
#allGoID duplicates = column 24458
#BP_GO_ID duplicates = column 24458
#CC_GO_ID duplicates = no duplicates
#MF_GO_ID duplicates = columns 4561  8040 13044 16138
#--------------------------------
#GOslim duplicates lots and lots 
#Terms duplicates lots and lots 

#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>%
  mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>% 
  distinct(GOslim_unique, .keep_all = TRUE)

kable(distinct_goslims[1:10,1:10])
target_id gene ctenidia gonad heart juvenile larvae accessions gene_id gene_name
PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667 Q86IC9;Q552T5 8620183 omt5
PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266 P04177 25085 Th
PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655 Q8L840;O04092;Q9FT71 837636 RECQL4A
PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612 Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 18080 Nin
PGEN_.00g000080 PGEN_.00g000080 0.000000 0.000000 0.119488 0.229570 0.000000 A1E2V0 489433 BIRC3
PGEN_.00g000090 PGEN_.00g000090 0.187374 0.000000 0.000000 0.173312 0.000000 P34456 186266
PGEN_.00g000210 PGEN_.00g000210 2.810603 0.000000 2.775616 3.495122 0.898959 O00463;B4DIS9;B4E0A2;Q6FHY1 7188 TRAF5
PGEN_.00g000300 PGEN_.00g000300 4.091597 2.739254 3.785783 5.053620 1.087948 Q5REG4 100171717 DTX3
PGEN_.00g000380 PGEN_.00g000380 0.475060 0.000000 0.000000 0.315384 0.000000 Q8QG60;Q8QG52;Q8QGQ5 374092 CRY2
PGEN_.00g000440 PGEN_.00g000440 11.483098 5.728598 3.123012 6.873106 10.243535 Q9H583;Q5T3Q8;Q6P197;Q9NW23 55127 HEATR1

6 SPECIES

6.1 Separate the Term columb by semi-colons to count occurances for table

# For Pgenerosa alone (wide format) 
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen <- distinct_goslims %>% #make sure you have removed duplicate values beforehand
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen)
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows
# For Pgenerosa using pgenerosa_long (versus wide)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen_long <- pgenerosa_long %>%  # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Pgen_long)
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development           1853
##  2 signaling                                  1603
##  3 cell differentiation                       1281
##  4 DNA-templated transcription                 741
##  5 regulation of DNA-templated transcription   678
##  6 immune system process                       670
##  7 programmed cell death                       600
##  8 cell motility                               595
##  9 reproductive process                        536
## 10 cytoskeleton organization                   516
## # ℹ 61 more rows
#exactly the same as Pgen_wide


#for Manila Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_manila <- Manila_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila)
## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            129
##  2 signaling                                   113
##  3 cell differentiation                         86
##  4 cytoskeleton organization                    65
##  5 DNA-templated transcription                  51
##  6 reproductive process                         48
##  7 regulation of DNA-templated transcription    47
##  8 cell motility                                40
##  9 programmed cell death                        40
## 10 protein-containing complex assembly          39
## # ℹ 55 more rows
# Manila Reverse
term_count_manila_r <- Pgen_Manila_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_manila_r)
## # A tibble: 65 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            149
##  2 signaling                                   126
##  3 cell differentiation                         96
##  4 cytoskeleton organization                    68
##  5 DNA-templated transcription                  60
##  6 regulation of DNA-templated transcription    55
##  7 reproductive process                         55
##  8 programmed cell death                        44
##  9 immune system process                        42
## 10 protein-containing complex assembly          42
## # ℹ 55 more rows
#for Mercenaria Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria <- Mercenaria_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria)
## # A tibble: 69 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            247
##  2 signaling                                   189
##  3 cell differentiation                        170
##  4 DNA-templated transcription                  88
##  5 cytoskeleton organization                    87
##  6 cell motility                                83
##  7 regulation of DNA-templated transcription    82
##  8 reproductive process                         71
##  9 immune system process                        66
## 10 programmed cell death                        63
## # ℹ 59 more rows
#for Mercenaria Clam reverse
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria_r <- Pgen_Mercenaria_goslims %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count

# Print the count of each term
print(term_count_Mercenaria_r) #over 3x more terms than Mercenaria not reversed...wonder why that is....
## # A tibble: 71 × 2
##    Term_unique                                   n
##    <chr>                                     <int>
##  1 anatomical structure development            768
##  2 signaling                                   635
##  3 cell differentiation                        517
##  4 DNA-templated transcription                 267
##  5 regulation of DNA-templated transcription   245
##  6 cell motility                               241
##  7 cytoskeleton organization                   237
##  8 immune system process                       232
##  9 programmed cell death                       229
## 10 vesicle-mediated transport                  218
## # ℹ 61 more rows
#combine all 5 datasets
term_count_all <- bind_rows(term_count_Pgen, term_count_manila, term_count_manila_r, term_count_Mercenaria, term_count_Mercenaria_r, .id = "transcriptome") 

term_count_best <- term_count_all %>% 
  mutate(transcriptome = ifelse(transcriptome == 1, "Pgenerosa", 
                    ifelse(transcriptome == 2, "Manila",
                           ifelse(transcriptome == 3, "Manila_r", 
                                  ifelse(transcriptome == 4, "Mercenaria", 
                                         ifelse(transcriptome == 5, "Mercenaria_r", NA))))))

6.2 NORMALIZE

#normalize by Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

normal_counts<- term_count_best %>%
  group_by(transcriptome) %>%
  summarise(total_n = sum(n))
#Pgen = 17,611
#Manila = 1235
#Manila_r = 1390
#Mercenaria = 2198
#Mercenaria_r = 7027


filtered_data1 <- term_count_best %>%
  filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
  mutate(n_divided = case_when(
                               transcriptome == "Manila" ~ n/1235,
                               transcriptome == "Manila_r" ~ n/1390,
                               transcriptome == "Mercenaria" ~ n/2198,
                               transcriptome == "Mercenaria_r" ~ n/7027))

grouped_data1<-left_join(filtered_data1, Pgen_normalized, by="Term_unique")

species_normal<-grouped_data1 %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_species = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(species_normal[1:5,1:5])
transcriptome Term_unique n_species n_divided n_Pgen
Manila anatomical structure development 129 0.1044534 1853
Manila signaling 113 0.0914980 1603
Manila cell differentiation 86 0.0696356 1281
Manila cytoskeleton organization 65 0.0526316 516
Manila DNA-templated transcription 51 0.0412955 741

6.3 count GOterms unique to species

#####make new variables with binary data

#wide format data

Manila_2<-Manila_Pgen_goslims %>%
  select(-c(3:11, 13:23))

Mercenaria_2<-Mercenaria_Pgen_goslims %>%
   select(-c(3:11, 13:23))

species_joined<-left_join(Mercenaria_2, Manila_2, by="gene")
## Warning in left_join(Mercenaria_2, Manila_2, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#Mercenaria
species_joined$`e-value.x`=ifelse(species_joined$`e-value.x` > 0, 1, 0)
#Manila
species_joined$`e-value.y`=ifelse(species_joined$`e-value.y` > 0, 1, 0)

species_matrix<-species_joined %>% select(gene, `e-value.x`, `e-value.y`) %>%
  rename(Mercenaria_evalue = `e-value.x`)%>%
  rename(Manila_evalue = `e-value.y`)%>%
  distinct(gene, .keep_all = TRUE)%>%
  mutate(Manila_evalue = ifelse(is.na(Manila_evalue), 0, Manila_evalue))%>%
  mutate(Pgen_evalue = 1)

species_matrix<-tibble::column_to_rownames(species_matrix, var="gene")

species_matrix<-as.matrix(species_matrix) #count matrix for UpSet plots

matrix_species = make_comb_mat(species_matrix, mode="distinct")
matrix_species_intersect = make_comb_mat(species_matrix, mode="intersect")
matrix_species_union = make_comb_mat(species_matrix, mode="union")
set_size(matrix_species)
## Mercenaria_evalue     Manila_evalue       Pgen_evalue 
##               980               210              1029
comb_size(matrix_species)
## 111 101 011 001 
## 202 778   8  41
# create sample data frame
species_count<- as.data.frame(species_matrix) %>%
  mutate(Mercenaria_Manila = NA) %>%
  mutate(Manila_Pgen = NA) %>%
  mutate(Pgen_Mercenaria = NA) %>%
  mutate(all=NA)%>%
  mutate(Pgen= NA)

# iterate through rows
########################
#Mercenaria_Manila does not include Pgen genome or else will return all zeros
species_count$Mercenaria_Manila <-ifelse(species_count$Mercenaria_evalue ==1 & species_count$Manila_evalue == 1, 1, 0)

#Manila + Pgenerosa
species_count$Manila_Pgen <-ifelse(species_count$Mercenaria_evalue ==0 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria + Pgenerosa
species_count$Pgen_Mercenaria <-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#Mercenaria +Pgenerosa + Manila
species_count$all<-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)

#Pgen alone
species_count$Pgen <-ifelse(species_count$Mercenaria_evalue == 0 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)

#return column names when species == 1
# identify row indices where Pgen_Mercenaria column equals 1
row_indices_a<- which(species_count$Mercenaria_Manila == 1)
row_indices_b<- which(species_count$Manila_Pgen == 1)
row_indices_c<- which(species_count$Pgen_Mercenaria == 1)
row_indices_d<- which(species_count$all == 1)
row_indices_e<- which(species_count$Pgen == 1)

# extract corresponding row names
genes_Mer_Man <- as.data.frame(rownames(species_count[row_indices_a, ])) 
names(genes_Mer_Man)[1]<-"gene"

genes_Man_Pgen <- as.data.frame(rownames(species_count[row_indices_b, ])) 
names(genes_Man_Pgen)[1]<-"gene"

genes_Mer_Pgen <- as.data.frame(rownames(species_count[row_indices_c, ])) 
names(genes_Mer_Pgen)[1]<-"gene"

genes_all <- as.data.frame(rownames(species_count[row_indices_d, ])) 
names(genes_all)[1]<-"gene"

genes_Pgen <- as.data.frame(rownames(species_count[row_indices_e, ])) 
names(genes_Pgen)[1]<-"gene"

#goslims
goslims_genes_MM<-left_join(genes_Mer_Man, goslims, by="gene")
goslims_gene_ManPgen<-left_join(genes_Man_Pgen, goslims, by ="gene")
goslims_genes_MerPgen<-left_join(genes_Mer_Pgen, goslims, by="gene")
goslims_genes_all<-left_join(genes_all, goslims, by="gene")
goslims_genes_Pgen<-left_join(genes_Pgen, goslims, by="gene")

#Unique GOterms only
#Mercenaria + Manila
MM_count_unique <- goslims_genes_MM %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Manila")

Man_Pgen_count_unique <- goslims_gene_ManPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Manila-Pgen")

Mer_Pgen_count_unique <- goslims_genes_MerPgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Mercenaria-Pgen")

all_count_unique <- goslims_genes_all %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "all-3")

Pgen_count_unique <- goslims_genes_Pgen %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, species= "Pgen_alone")

species_unique<-bind_rows(Mer_Pgen_count_unique, Man_Pgen_count_unique, MM_count_unique, Pgen_count_unique, all_count_unique)

kable(species_unique[1:4, 1:4])
Term_unique n count_unique species
NA 261 68 Mercenaria-Pgen
anatomical structure development 203 68 Mercenaria-Pgen
signaling 181 68 Mercenaria-Pgen
cell differentiation 134 68 Mercenaria-Pgen

7 Oyster female gonad comparison

7.1 load data and filter for genebank accession numbers

oyster_gonad<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")

oyster_gonad<- oyster_gonad %>%
  filter(Description != 0) %>%
  group_by(Cluster) %>%
  select(Genebank, Cluster)

#write.csv(oyster_gonad, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/oyster_gonad_clusters.csv")

kable(oyster_gonad[1:10, 1:2])
Genebank Cluster
BQ427044 1
CU683161 1
AM856831 1
AM867786 1
FP008634 1
AM854780 1
AM865407 1
AM858220 1
FP002214 1
AM857878 1

8 TISSUE

8.1 Count Goslims by tissue type

#count top GOterms
tissue_goslims <- pgenerosa_long %>%
    group_by(tissue) %>% 
  filter(binary == 1) %>%
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>%
  separate_rows(Term_unique, sep = ";") %>%
  mutate(Term_unique = trimws(Term_unique)) %>%
  count(Term_unique, tissue, sort = TRUE)

9 NORMALIZE

9.1 normalize goterm count by % in geoduck transcriptome by % tissue goterm counts by tissue transcriptome and return another % which is the normalized goterm count

#add new column to variable term_count_Pgen

#normalize data by % hit in Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n 
  summarise(total_n = sum(n)) #17,611

Pgen_normalized<- term_count_Pgen %>%
  mutate(Pgen_normalized = n/17611)

#normalize tissue data
normal_counts<- tissue_goslims %>%
  group_by(tissue) %>%
  summarise(total_n = sum(n))
#heart = 16,021
#ctenidia = 15,911
#gonad = 14,715
#juv = 17,277
#larvae = 16,632

filtered_data <- tissue_goslims %>%
  filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
  mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
                               tissue == "heart" ~ n/16021,
                               tissue == "gonad" ~ n/14715,
                               tissue == "larvae" ~ n/16632,
                               tissue == "juvenile" ~ n/17277))

#combine Pgen_normalized with filtered_data and add new column normalized where tissue_n/geoduck_n to produce new normalized value

grouped_data<-left_join(filtered_data, Pgen_normalized, by="Term_unique")

Tis_normal<-grouped_data %>%
  mutate(normalized = n_divided/Pgen_normalized)%>%
  rename(n_tissue = n.x) %>%
  rename(n_Pgen = n.y) %>%
  mutate(log_scale = log(normalized))
#for heatmap

kable(Tis_normal[1:5, 1:5])
tissue Term_unique n_tissue n_divided n_Pgen
juvenile anatomical structure development 1817 0.1051687 1853
larvae anatomical structure development 1743 0.1047980 1853
heart anatomical structure development 1678 0.1047375 1853
ctenidia anatomical structure development 1661 0.1043932 1853
juvenile signaling 1572 0.0909880 1603

9.2 filter by tissue type

not actually unique goslims, only unique to tissue types

tidy1<-pgenerosa_goslims_long %>%
  group_by(tissue) %>%
  filter(tpm > 0) 

ctenidia<-pgenerosa_goslims %>%
  filter(ctenidia > 0 & gonad == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(ctenidia)
##     n
## 1 340
#340 obs (unique to ctenidia)
write.csv(ctenidia, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/ctenidia_unique.csv")

gonad<-pgenerosa_goslims %>%
  filter(gonad > 0 & ctenidia == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(gonad)
##     n
## 1 119
#119 obs (unique to gonad)
write.csv(gonad, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/gonad_unique.csv")

heart<-pgenerosa_goslims %>%
  filter(heart > 0 & ctenidia == 0 & gonad == 0 & juvenile == 0 & larvae == 0)
count(heart)
##     n
## 1 371
#371 obs (unique to heart)
write.csv(heart, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/heart_unique.csv")

juvenile<-pgenerosa_goslims %>%
  filter(juvenile > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & larvae == 0)
count(juvenile)
##      n
## 1 2180
#2,180 obs (unique to juvenile alone)
write.csv(juvenile, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/juvenile_unique.csv")

larvae<-pgenerosa_goslims %>%
  filter(larvae > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & juvenile == 0)
count(larvae)
##     n
## 1 868
#868 obs (unique to larvae alone)
write.csv(larvae, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/larvae_unique.csv")

juvorlarv<-pgenerosa_goslims %>%
  filter(ctenidia == 0 & gonad == 0 & heart == 0) %>%
  filter(juvenile > 0 & larvae > 0)
#5,022 obs unique to both juveniles or larvae
#1,974 obs unique to juveniles and larvae together

all_unique<-pgenerosa_goslims %>%
   filter(ctenidia > 0 & gonad > 0 & heart > 0 & juvenile > 0 & larvae > 0)

9.3 count GOterms for individual tissues

gonad larvae heart ctenidia juvenile

gonad_count_unique <- gonad %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "gonad")

larvae_count_unique <- larvae %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "larvae")

heart_count_unique <- heart %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "heart")

ctenidia_count_unique <- ctenidia %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "ctenidia")

juvenile_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissue= "juvenile")

tissue_all_unique_genes_GOterms<-rbind(heart_count_unique, gonad_count_unique, ctenidia_count_unique, larvae_count_unique, juvenile_count_unique)

kable(tissue_all_unique_genes_GOterms[1:4, 1:4])
Term_unique n count_unique tissue
NA 342 41 heart
signaling 13 41 heart
anatomical structure development 11 41 heart
cell differentiation 9 41 heart

10 organize data for UpSet plot

library(dplyr)
#####make new variables with binary data
tidy_data$heart=ifelse(tidy_data$heart > 0, 1, 0)
tidy_data$gonad=ifelse(tidy_data$gonad >0, 1, 0)
tidy_data$ctenidia=ifelse(tidy_data$ctenidia > 0, 1, 0)
tidy_data$larvae=ifelse(tidy_data$larvae > 0, 1, 0)
tidy_data$juvenile=ifelse(tidy_data$juvenile > 0, 1, 0)
new_data<-tidy_data %>% select(target_id,heart, gonad, ctenidia, larvae, juvenile)
new_data<-tibble::column_to_rownames(new_data, var="target_id")
new_data<-as.matrix(new_data) #count matrix for UpSet plots

11 Make UpSet plot for visualization of overlapping data:

link to UpSet R page

m1 = make_comb_mat(new_data) #only distinct intersections
m2=make_comb_mat(new_data, mode="intersect")
set_size(m1)
##    heart    gonad ctenidia   larvae juvenile 
##    17602    13682    17479    19449    23417
comb_size(m1)
## 11111 11110 11101 11011 10111 01111 11100 11010 11001 10110 10101 10011 01110 
## 11294    24   692   397  2147   337    54    13   122    51   658  1007     8 
## 01101 01011 00111 11000 10100 10010 10001 01100 01010 01001 00110 00101 00011 
##   100   258   886    43   124    60   545    20    46   155    79   665  1974 
## 10000 01000 00100 00010 00001 00000 
##   371   119   340   868  2180  9310
comb_data<-as.data.frame(comb_size(m1))

p1<-UpSet(m1 [comb_degree(m1)<=2], column_title ="All intersections of 2 or less") #intersections 2 or less
p2<-UpSet(m1 [comb_degree(m1)>=3], column_title= "All intersection of 3 or more") #intersections 3 or more
p3<-UpSet(m1, top_annotation = HeatmapAnnotation(
    degree = as.character(comb_degree(m1)),
    "Intersection\nsize" = anno_barplot(comb_size(m1), 
        border = FALSE, 
        gp = gpar(fill = "black"), 
        height = unit(2, "cm")
    ), 
    annotation_name_side = "left", 
    annotation_name_rot = 0), column_title="p. generosa transcriptome distinct intersections") #distinct mode

p5<-UpSet(m2 [comb_degree(m2)<=2], column_title ="All intersections of 2 or less")
p6<-UpSet(m2 [comb_degree(m2)>=3], column_title ="All intersections of 3 or more")
#####testing comb_set sizes
x2<-comb_size(m1)
x3<-as.data.frame(x2)
sum(x3$x2) #34,947 --distinct mode
## [1] 34947
x4<-comb_size(m2)
x5<-as.data.frame(x4)
sum(x5$x4) #450,489 --intersect mode
## [1] 450489

12 Make gene list for unique genes per tissue type

# create sample data frame
count_df<- as.data.frame(new_data) %>%
  mutate(heart_gonad = NA) %>%
  mutate(heart_ctenidia = NA) %>%
  mutate(ctenidia_gonad = NA)


# iterate through rows
########################
#heart + gonad
# create a new column for shared data between heart and gonad columns
count_df$heart_gonad <- ifelse(count_df$heart == 1 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)

#heart + ctenidia
count_df$heart_ctenidia <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#ctenidia + gonad
count_df$ctenidia_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0) 

#larvae + juvenile
count_df$larv_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 1, 1, 0)

#ctenidia + juvenile 
count_df$ctenidia_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#heart + juvenile
count_df$heart_juv <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#gonad + juvenile
count_df$gonad_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)

#ctenidia + larvae
count_df$larv_ctenidia <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#heart + larvae
count_df$larv_heart <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#gonad + larvae
count_df$larv_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)

#return column names when tissues == 1
# identify row indices where heart_gonad column equals 1
row_indices  <- which(count_df$heart_gonad == 1)
row_indices_2<- which(count_df$heart_ctenidia == 1)
row_indices_3<- which(count_df$ctenidia_gonad == 1)
row_indices_4<- which(count_df$larv_juv == 1)
row_indices_5<- which(count_df$ctenidia_juv == 1)
row_indices_6<- which(count_df$heart_juv == 1)
row_indices_7<- which(count_df$gonad_juv == 1)
row_indices_8<- which(count_df$larv_ctenidia == 1)
row_indices_9<- which(count_df$larv_heart == 1)
row_indices_10<- which(count_df$larv_gonad == 1)


# extract corresponding row names
genes_heart_gonad <- as.data.frame(rownames(count_df[row_indices, ])) 
names(genes_heart_gonad)[1]<-"gene"

genes_heart_ctenidia <- as.data.frame(rownames(count_df[row_indices_2,]))
names(genes_heart_ctenidia)[1]<-"gene"

genes_ctenidia_gonad <- as.data.frame(rownames(count_df[row_indices_3, ])) 
names(genes_ctenidia_gonad)[1]<-"gene"

genes_larv_juv <- as.data.frame(rownames(count_df[row_indices_4, ])) 
names(genes_larv_juv)[1]<-"gene"

genes_ctenidia_juv <-as.data.frame(rownames(count_df[row_indices_5, ])) 
names(genes_ctenidia_juv)[1]<-"gene"

genes_heart_juv <- as.data.frame(rownames(count_df[row_indices_6, ])) 
names(genes_heart_juv)[1]<-"gene"

genes_gonad_juv <- as.data.frame(rownames(count_df[row_indices_7, ])) 
names(genes_gonad_juv)[1]<-"gene"

genes_larv_ctenidia <- as.data.frame(rownames(count_df[row_indices_8, ])) 
names(genes_larv_ctenidia)[1]<-"gene"

genes_larv_heart <- as.data.frame(rownames(count_df[row_indices_9, ])) 
names(genes_larv_heart)[1]<-"gene"

genes_larv_gonad <- as.data.frame(rownames(count_df[row_indices_10, ])) 
names(genes_larv_gonad)[1]<-"gene"

#goslims
goslims_genes_hg<-left_join(genes_heart_gonad, goslims, by="gene")
goslims_gene_hc<-left_join(genes_heart_ctenidia, goslims, by ="gene")
goslims_genes_cg<-left_join(genes_ctenidia_gonad, goslims, by="gene")
goslims_gene_lj<-left_join(genes_larv_juv, goslims, by="gene")
goslims_genes_cj<-left_join(genes_ctenidia_juv, goslims, by="gene")
goslims_genes_hj<-left_join(genes_heart_juv, goslims, by="gene")
goslims_genes_gj<-left_join(genes_gonad_juv, goslims, by="gene")
goslims_genes_cl<-left_join(genes_larv_ctenidia, goslims, by="gene")
goslims_genes_hl<-left_join(genes_larv_heart, goslims, by="gene")
goslims_genes_gl<-left_join(genes_larv_gonad, goslims, by="gene")


#Unique GOterms only
#heart-gonad
hg_count_unique <- goslims_genes_hg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
 # distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-gonad")

#heart-ctenidia
hc_count_unique <- goslims_gene_hc %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>%# Count the occurrences and sort by descending count
  mutate(count_unique = n()-1, tissues= "heart-ctenidia")

#ctenidia-gonad
cg_count_unique <- goslims_genes_cg %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-gonad")

#larvae-juvenile
lj_count_unique <- goslims_gene_lj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "larvae-juvenile") #count genes with GOslims and remove 1 (for NAs)

#ctenidia-juvenile
cj_count_unique <- goslims_genes_cj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart-juvenile
hj_count_unique <- goslims_genes_hj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-juvenile") #count genes with GOslims and remove 1 (for NAs)

#gonad-juvenile
gj_count_unique <- goslims_genes_gj %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-juvenile") #count genes with GOslims and remove 1 (for NAs)

#heart + larvae
hl_count_unique <- goslims_genes_hl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "heart-larvae") #count genes with GOslims and remove 1 (for NAs)

#ctenidia + larvae
cl_count_unique <- goslims_genes_cl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "ctenidia-larvae") #count genes with GOslims and remove 1 (for NAs)

#gonad + larvae
gl_count_unique <- goslims_genes_gl %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "gonad-larvae") #count genes with GOslims and remove 1 (for NAs)

#term count Juv only
juv_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
  mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% 
  #distinct(Term_unique, .keep_all = TRUE) %>% 
  filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
  separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
  mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces 
  count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count 
  mutate(count_unique = n()-1, tissues= "juvenile") #count genes with GOslims and remove 1 (for NAs)

write.csv(juv_count_unique, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/data/juv_count_unique.csv")

#combine into one DF
unique_overlapping_genes<-bind_rows(hg_count_unique, hc_count_unique, cg_count_unique, lj_count_unique, cj_count_unique, hj_count_unique, gj_count_unique, cl_count_unique, hl_count_unique, gl_count_unique)

13 Make stacked bar-plot for visualizing Unique Terms

#filter Terms down a bit 

terms_filtered <- term_count_best %>%
  group_by(Term_unique) %>%
  filter(n() >= 5) %>%
  filter(!is.na("Terms_unique")) %>%
  filter(n >= 10)

#load top 20 hits from Github
top20terms<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/output/tables/top20terms.csv")

compairative_transcriptomes<-top20terms %>%
  filter(transcriptome != "Pgenerosa") %>%
  filter(n >= 15)

stacked_barplot<-ggplot(top20terms, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()

stacked_barplot2<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()

stacked_barplot

14 Heatmap

#Z=e-values, Y=Manilla genes, X=Mercenaria genes

top20_long<-reshape(top20terms, idvar = "Term_unique", timevar = "transcriptome", direction = "wide")

tissues_goterms_unique<-read.csv(file="https://raw.githubusercontent.com/ocattau/code-for-Pgenerosa/main/output/tissue_unique_Pgen_terms_heatmap_data.csv")

tissues2<-tissues_goterms_unique %>%
  filter(tissues != "larvae-juvenile")

#heatmap<-ggplot(pgenerosa_long, aes(x=gene, y=tissue, fill=tpm)) + 
 # geom_tile() #too large

heatmap2<-ggplot(top20terms, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")

heatmap3<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw() + scale_fill_gradient(low="#c2f9ff", high="#003460")

heatmap7<- ggplot(tissues2, aes(x=tissues, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")+theme(axis.text.x = element_text(angle = 45, hjust = 1) )

Tis_normal_2<-Tis_normal %>%
  filter(Term_unique != "NA")

heatmap8<-ggplot(Tis_normal_2, aes(tissue, y=Term_unique, fill=normalized))+geom_tile()+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )+ scale_fill_gradient2(low = "yellow", mid = "white", high = "red", midpoint = 0.9, na.value = "grey50")

heatmap9<-ggplot(Tis_normal, aes(tissue, y=Term_unique, fill=log_scale))+geom_tile(color="white")+  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )#need to set color wheel to white = 1 and red = min and blue = max

species_normal_2<-species_normal %>%
  filter(Term_unique != "NA")%>%
  filter(transcriptome == "Manila_r" | transcriptome == "Mercenaria_r")

heatmap10<-ggplot(species_normal_2, aes(transcriptome, y=Term_unique, fill=normalized))+geom_tile(color="white")+  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )

heatmap7

heatmap8

heatmap9

heatmap10

15 scatter plots for e-values

scatter<-ggplot(distinct_goslims, aes(x=gonad, y=juvenile))+geom_point()+theme_bw()

scatter

scatter2<-ggplot(distinct_goslims, aes(x=heart, y=juvenile))+geom_point()+theme_bw()

scatter2

#compairative_clam_transcripts_right

scatter4<-ggplot(compairative_clam_transcripts_right, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast db")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#removed 3394 rows containing missing values

scatter5<-ggplot(compairative_clam_transcripts_left, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast query")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#Removed 4686 rows containing missing values (`geom_point()`).