https://rpubs.com/ocattau/1041480
salmonMatrix<-read.table(file="https://raw.githubusercontent.com/sr320/paper-geoduck-gene-exp/main/data/salmon.merged.gene_tpm.tsv", header=TRUE, sep = '\t')
names(salmonMatrix)[1]<-"target_id"
names(salmonMatrix)[2]<-"gene"
data.table(salmonMatrix)## target_id gene ctenidia gonad heart juvenile
## 1: PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851
## 2: PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164
## 3: PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598
## 4: PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364
## 5: PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728
## ---
## 34943: PGEN_.00g349430 PGEN_.00g349430 0.000000 0.000000 0.000000 0.000000
## 34944: PGEN_.00g349440 PGEN_.00g349440 1.955226 1.489901 1.621033 0.000000
## 34945: PGEN_.00g349450 PGEN_.00g349450 0.000000 0.000000 0.159121 0.000000
## 34946: PGEN_.00g349460 PGEN_.00g349460 0.000000 0.000000 0.000000 0.000000
## 34947: PGEN_.00g349470 PGEN_.00g349470 0.000000 0.000000 0.000000 0.000000
## larvae
## 1: 3.423667
## 2: 0.370266
## 3: 0.052545
## 4: 1.131114
## 5: 1.117655
## ---
## 34943: 0.000000
## 34944: 2.222424
## 34945: 0.000000
## 34946: 0.000000
## 34947: 0.000000
#34,947mappingMatrix<-read.table(file="https://gannet.fish.washington.edu/Atumefaciens/20220419-pgen-gene_annotation_mapping/20220419-pgen-gene-accessions-gene_id-gene_name-gene_description-alt_gene_description-go_ids.tab", header=TRUE, sep = '\t', fill=TRUE)
colnames(mappingMatrix) <- c("target_id", "identifiers", "gene_id", "gene_name", "annotation", "alt_annotation", "goterms")
#7,736#double check to make sure this is reading the data correctly
goslims<-read.csv(file="https://raw.githubusercontent.com/RobertsLab/code/master/r_projects/sam/20230328-pgen-gene_annotation-update/outputs/02-goslim-mapping/20230329-pgen-annotations-SwissProt-GO-BP_GOslim.tab", header=TRUE, sep = '\t')
#14,672#from Mox blasting Manilla/Mercenaria on Pgen
Manilla_Pgen<-read.table(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Manilla_Pgenenerosa_blastx.tab")Mercenaria_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_Mercenaria_db_genome_blastn.tab", sep = '\t', header = FALSE)
Quadrangularis_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_quadrangularis_db_genome_blastn.tab", sep = '\t', header = FALSE)
Manila_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_manila_db_genome_blastn.tab", sep = '\t', header = FALSE)
Marissincia_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/Pgenerosa_marissinsa_db_genome_blastn.tab", sep = '\t', header = FALSE)
Solida_genome<-read.csv(file="https://gannet.fish.washington.edu/gigas/data/p.generosa/solida_db_genome_blastn.tab", sep = '\t', header = FALSE)
kable(Solida_genome[1:10,1:5])| V1 | V2 | V3 | V4 | V5 |
|---|---|---|---|---|
| PGEN_.00g000060::Scaffold_01:70713-81099 | OX365947.1 | 74.279 | 416 | 84 |
| PGEN_.00g000270::Scaffold_01:344820-379444 | OX365949.1 | 75.163 | 306 | 58 |
| PGEN_.00g000300::Scaffold_01:503707-557915 | OX365947.1 | 83.648 | 159 | 16 |
| PGEN_.00g000320::Scaffold_01:571666-599482 | OX365950.1 | 79.545 | 220 | 35 |
| PGEN_.00g000670::Scaffold_01:1389639-1400328 | OX365948.1 | 87.857 | 140 | 17 |
| PGEN_.00g001140::Scaffold_01:2257307-2316262 | OX365948.1 | 78.487 | 423 | 82 |
| PGEN_.00g001170::Scaffold_01:2366888-2417042 | OX365948.1 | 82.443 | 131 | 23 |
| PGEN_.00g001350::Scaffold_01:2794647-2838542 | OX365948.1 | 76.093 | 343 | 78 |
| PGEN_.00g001380::Scaffold_01:2878758-2888130 | OX365948.1 | 78.431 | 204 | 43 |
| PGEN_.00g001410::Scaffold_01:2961919-2993436 | OX365952.1 | 79.126 | 206 | 35 |
#Mercenaria
Mercenaria_new <- separate(Mercenaria_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Mercenaria_new)[1]<-"gene"
names(Mercenaria_new)[12]<-"e-value"
Mercenaria_summary <- Mercenaria_new %>%
mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>%
distinct(Marissincia_genes, .keep_all = TRUE)
#Quadrangularis
Quadrangularis_new <- separate(Quadrangularis_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Quadrangularis_new)[1]<-"gene"
names(Quadrangularis_new)[12]<-"e-value"
Quadrangularis_summary <- Quadrangularis_new %>%
mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>%
distinct(genes, .keep_all = TRUE)
#Manila
Manila_new <- separate(Manila_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Manila_new)[1]<-"gene"
names(Manila_new)[12]<-"e-value"
Manila_summary <- Manila_new%>%
mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>%
distinct(genes, .keep_all = TRUE)
#Marissincia
Marissincia_new <- separate(Marissincia_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Marissincia_new)[1]<-"gene"
names(Marissincia_new)[12]<-"e-value"
Marissincia_summary <- Marissincia_new %>%
mutate(Marissincia_genes = lapply(strsplit(as.character(V2), ";"), function(Marissincia_genes) paste0(unique(Marissincia_genes), collapse = ";"))) %>%
distinct(Marissincia_genes, .keep_all = TRUE)
#Solida
Solida_new <- separate(Solida_genome, V1, into = c("gene", "Scaffold"), sep = "::")
names(Solida_new)[1]<-"gene"
names(Solida_new)[12]<-"e-value"
Solida_summary <- Solida_new%>%
mutate(genes = lapply(strsplit(as.character(V2), ";"), function(genes) paste0(unique(genes), collapse = ";"))) %>%
distinct(genes, .keep_all = TRUE) clam_genomes<-bind_rows(lst(Solida_new, Marissincia_new, Mercenaria_new, Manila_new, Quadrangularis_new), .id = "clam")
kable(clam_genomes[1:5,1:5])| clam | gene | Scaffold | V2 | V3 |
|---|---|---|---|---|
| Solida_new | PGEN_.00g000060 | Scaffold_01:70713-81099 | OX365947.1 | 74.279 |
| Solida_new | PGEN_.00g000270 | Scaffold_01:344820-379444 | OX365949.1 | 75.163 |
| Solida_new | PGEN_.00g000300 | Scaffold_01:503707-557915 | OX365947.1 | 83.648 |
| Solida_new | PGEN_.00g000320 | Scaffold_01:571666-599482 | OX365950.1 | 79.545 |
| Solida_new | PGEN_.00g000670 | Scaffold_01:1389639-1400328 | OX365948.1 | 87.857 |
#all clam transcriptomes
#Reminder: Merc->Pgen = 6521, Manila->Pgen = 761, Pgen -> Merc = 5,099, Pgen -> Manila = 657
#always join large -> small
compairative_clam_transcripts_right <- left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene") ## Warning in left_join(Mercenaria_Pgen_clean, Manila_Pgen_clean, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
compairative_clam_transcripts_left <- left_join(Pgen_Mercenaria_clean, Pgen_Manila_clean, by = "gene") Manila_Pgen_goslims <- Manila_Pgen_new %>%
left_join(goslims, by=c("gene"))
Pgen_Manila_goslims <- Pgen_Manila_new %>%
left_join(goslims, by=c("gene"))
Mercenaria_Pgen_goslims<- Mercenaria_Pgen_new %>%
left_join(goslims, by=c("gene")) #6521
Pgen_Mercenaria_goslims<- Pgen_Mercenaria_new %>%
left_join(goslims, by=c("gene")) #5099tidy_data<-salmonMatrix
head(tidy_data)## target_id gene ctenidia gonad heart juvenile larvae
## 1 PGEN_.00g000010 PGEN_.00g000010 0.000000 0.000000 0.000000 4.851851 3.423667
## 2 PGEN_.00g000020 PGEN_.00g000020 0.000000 0.000000 0.000000 0.840164 0.370266
## 3 PGEN_.00g000030 PGEN_.00g000030 0.225494 0.000000 0.000000 0.069598 0.052545
## 4 PGEN_.00g000040 PGEN_.00g000040 0.565506 0.000000 0.254843 0.376364 1.131114
## 5 PGEN_.00g000050 PGEN_.00g000050 0.508251 4.745168 0.228272 0.182728 1.117655
## 6 PGEN_.00g000060 PGEN_.00g000060 6.490179 0.000000 0.000000 6.742646 5.936612
tidy_data_long<-tidy_data %>%
gather(key="tissue", value="tpm", ctenidia:larvae) #name key as time, name value as tpm, gather columns heart:larvae, converts wide to long format
names(tidy_data_long)[2]<-"gene"
head(tidy_data_long)## target_id gene tissue tpm
## 1 PGEN_.00g000010 PGEN_.00g000010 ctenidia 0.000000
## 2 PGEN_.00g000020 PGEN_.00g000020 ctenidia 0.000000
## 3 PGEN_.00g000030 PGEN_.00g000030 ctenidia 0.225494
## 4 PGEN_.00g000040 PGEN_.00g000040 ctenidia 0.565506
## 5 PGEN_.00g000050 PGEN_.00g000050 ctenidia 0.508251
## 6 PGEN_.00g000060 PGEN_.00g000060 ctenidia 6.490179
tidy_data_long$binary=ifelse(tidy_data_long$tpm > 0, 1, 0)
#add goslims belowpgenerosa_complete<-salmonMatrix %>%
left_join(mappingMatrix, by=c("target_id"))
# could also do this with goslims
pgenerosa_simple_goslims<-salmonMatrix %>% #best pgenerosa data set
left_join(goslims, by=c("gene"))
pgenerosa_goslims<-pgenerosa_complete %>%
left_join(goslims, by=c("gene"))
pgenerosa_long<-tidy_data_long %>% #best long pgenerosa data set
left_join(goslims, by=c("gene"))
pgenerosa_goslims_long<-tidy_data_long %>%
left_join(pgenerosa_simple_goslims, by=c("gene"))
kable(pgenerosa_simple_goslims[1:10, 1:10])| target_id | gene | ctenidia | gonad | heart | juvenile | larvae | accessions | gene_id | gene_name |
|---|---|---|---|---|---|---|---|---|---|
| PGEN_.00g000010 | PGEN_.00g000010 | 0.000000 | 0.000000 | 0.000000 | 4.851851 | 3.423667 | Q86IC9;Q552T5 | 8620183 | omt5 |
| PGEN_.00g000020 | PGEN_.00g000020 | 0.000000 | 0.000000 | 0.000000 | 0.840164 | 0.370266 | P04177 | 25085 | Th |
| PGEN_.00g000030 | PGEN_.00g000030 | 0.225494 | 0.000000 | 0.000000 | 0.069598 | 0.052545 | NA | NA | NA |
| PGEN_.00g000040 | PGEN_.00g000040 | 0.565506 | 0.000000 | 0.254843 | 0.376364 | 1.131114 | NA | NA | NA |
| PGEN_.00g000050 | PGEN_.00g000050 | 0.508251 | 4.745168 | 0.228272 | 0.182728 | 1.117655 | Q8L840;O04092;Q9FT71 | 837636 | RECQL4A |
| PGEN_.00g000060 | PGEN_.00g000060 | 6.490179 | 0.000000 | 0.000000 | 6.742646 | 5.936612 | Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 | 18080 | Nin |
| PGEN_.00g000070 | PGEN_.00g000070 | 0.000000 | 0.000000 | 0.000000 | 0.025013 | 1.103535 | NA | NA | NA |
| PGEN_.00g000080 | PGEN_.00g000080 | 0.000000 | 0.000000 | 0.119488 | 0.229570 | 0.000000 | A1E2V0 | 489433 | BIRC3 |
| PGEN_.00g000090 | PGEN_.00g000090 | 0.187374 | 0.000000 | 0.000000 | 0.173312 | 0.000000 | P34456 | 186266 | |
| PGEN_.00g000100 | PGEN_.00g000100 | 0.000000 | 0.000000 | 0.000000 | 0.304322 | 0.000000 | NA | NA | NA |
# check for duplicate values within each cell using strsplit() and anyDuplicated()
duplicates <- sapply(strsplit(pgenerosa_simple_goslims$gene, ";"), anyDuplicated)
# print the row indices of cells with duplicate values
if (any(duplicates)) {
print(which(duplicates > 0))
} else {
print("No duplicates found")
}## [1] "No duplicates found"
#accession numbers = no duplicates
#gene_ID = no duplicates
#allGoID duplicates = column 24458
#BP_GO_ID duplicates = column 24458
#CC_GO_ID duplicates = no duplicates
#MF_GO_ID duplicates = columns 4561 8040 13044 16138
#--------------------------------
#GOslim duplicates lots and lots
#Terms duplicates lots and lots
#break cells and reform with unique values only
distinct_goslims <- pgenerosa_simple_goslims %>%
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
mutate(GOslim_unique = lapply(strsplit(as.character(GOslim), ";"),function(GOslim_unique) paste0(unique(GOslim_unique), collapse = ";"))) %>%
distinct(GOslim_unique, .keep_all = TRUE)
kable(distinct_goslims[1:10,1:10])| target_id | gene | ctenidia | gonad | heart | juvenile | larvae | accessions | gene_id | gene_name |
|---|---|---|---|---|---|---|---|---|---|
| PGEN_.00g000010 | PGEN_.00g000010 | 0.000000 | 0.000000 | 0.000000 | 4.851851 | 3.423667 | Q86IC9;Q552T5 | 8620183 | omt5 |
| PGEN_.00g000020 | PGEN_.00g000020 | 0.000000 | 0.000000 | 0.000000 | 0.840164 | 0.370266 | P04177 | 25085 | Th |
| PGEN_.00g000050 | PGEN_.00g000050 | 0.508251 | 4.745168 | 0.228272 | 0.182728 | 1.117655 | Q8L840;O04092;Q9FT71 | 837636 | RECQL4A |
| PGEN_.00g000060 | PGEN_.00g000060 | 6.490179 | 0.000000 | 0.000000 | 6.742646 | 5.936612 | Q61043;A0A1Y7VJL5;B2RQ73;B7ZMZ9;E9Q488;E9Q4S3;Q674R4;Q6ZPM7 | 18080 | Nin |
| PGEN_.00g000080 | PGEN_.00g000080 | 0.000000 | 0.000000 | 0.119488 | 0.229570 | 0.000000 | A1E2V0 | 489433 | BIRC3 |
| PGEN_.00g000090 | PGEN_.00g000090 | 0.187374 | 0.000000 | 0.000000 | 0.173312 | 0.000000 | P34456 | 186266 | |
| PGEN_.00g000210 | PGEN_.00g000210 | 2.810603 | 0.000000 | 2.775616 | 3.495122 | 0.898959 | O00463;B4DIS9;B4E0A2;Q6FHY1 | 7188 | TRAF5 |
| PGEN_.00g000300 | PGEN_.00g000300 | 4.091597 | 2.739254 | 3.785783 | 5.053620 | 1.087948 | Q5REG4 | 100171717 | DTX3 |
| PGEN_.00g000380 | PGEN_.00g000380 | 0.475060 | 0.000000 | 0.000000 | 0.315384 | 0.000000 | Q8QG60;Q8QG52;Q8QGQ5 | 374092 | CRY2 |
| PGEN_.00g000440 | PGEN_.00g000440 | 11.483098 | 5.728598 | 3.123012 | 6.873106 | 10.243535 | Q9H583;Q5T3Q8;Q6P197;Q9NW23 | 55127 | HEATR1 |
# For Pgenerosa alone (wide format)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen <- distinct_goslims %>% #make sure you have removed duplicate values beforehand
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_Pgen)## # A tibble: 71 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 1853
## 2 signaling 1603
## 3 cell differentiation 1281
## 4 DNA-templated transcription 741
## 5 regulation of DNA-templated transcription 678
## 6 immune system process 670
## 7 programmed cell death 600
## 8 cell motility 595
## 9 reproductive process 536
## 10 cytoskeleton organization 516
## # ℹ 61 more rows
# For Pgenerosa using pgenerosa_long (versus wide)
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Pgen_long <- pgenerosa_long %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_Pgen_long)## # A tibble: 71 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 1853
## 2 signaling 1603
## 3 cell differentiation 1281
## 4 DNA-templated transcription 741
## 5 regulation of DNA-templated transcription 678
## 6 immune system process 670
## 7 programmed cell death 600
## 8 cell motility 595
## 9 reproductive process 536
## 10 cytoskeleton organization 516
## # ℹ 61 more rows
#exactly the same as Pgen_wide
#for Manila Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_manila <- Manila_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_manila)## # A tibble: 65 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 129
## 2 signaling 113
## 3 cell differentiation 86
## 4 cytoskeleton organization 65
## 5 DNA-templated transcription 51
## 6 reproductive process 48
## 7 regulation of DNA-templated transcription 47
## 8 cell motility 40
## 9 programmed cell death 40
## 10 protein-containing complex assembly 39
## # ℹ 55 more rows
# Manila Reverse
term_count_manila_r <- Pgen_Manila_goslims %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_manila_r)## # A tibble: 65 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 149
## 2 signaling 126
## 3 cell differentiation 96
## 4 cytoskeleton organization 68
## 5 DNA-templated transcription 60
## 6 regulation of DNA-templated transcription 55
## 7 reproductive process 55
## 8 programmed cell death 44
## 9 immune system process 42
## 10 protein-containing complex assembly 42
## # ℹ 55 more rows
#for Mercenaria Clam
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria <- Mercenaria_Pgen_goslims %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_Mercenaria)## # A tibble: 69 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 247
## 2 signaling 189
## 3 cell differentiation 170
## 4 DNA-templated transcription 88
## 5 cytoskeleton organization 87
## 6 cell motility 83
## 7 regulation of DNA-templated transcription 82
## 8 reproductive process 71
## 9 immune system process 66
## 10 programmed cell death 63
## # ℹ 59 more rows
#for Mercenaria Clam reverse
# Separate the "Term" column by semi-colons and count the occurrences
term_count_Mercenaria_r <- Pgen_Mercenaria_goslims %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) # Count the occurrences and sort by descending count
# Print the count of each term
print(term_count_Mercenaria_r) #over 3x more terms than Mercenaria not reversed...wonder why that is....## # A tibble: 71 × 2
## Term_unique n
## <chr> <int>
## 1 anatomical structure development 768
## 2 signaling 635
## 3 cell differentiation 517
## 4 DNA-templated transcription 267
## 5 regulation of DNA-templated transcription 245
## 6 cell motility 241
## 7 cytoskeleton organization 237
## 8 immune system process 232
## 9 programmed cell death 229
## 10 vesicle-mediated transport 218
## # ℹ 61 more rows
#combine all 5 datasets
term_count_all <- bind_rows(term_count_Pgen, term_count_manila, term_count_manila_r, term_count_Mercenaria, term_count_Mercenaria_r, .id = "transcriptome")
term_count_best <- term_count_all %>%
mutate(transcriptome = ifelse(transcriptome == 1, "Pgenerosa",
ifelse(transcriptome == 2, "Manila",
ifelse(transcriptome == 3, "Manila_r",
ifelse(transcriptome == 4, "Mercenaria",
ifelse(transcriptome == 5, "Mercenaria_r", NA))))))#normalize by Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n
summarise(total_n = sum(n)) #17,611
Pgen_normalized<- term_count_Pgen %>%
mutate(Pgen_normalized = n/17611)
normal_counts<- term_count_best %>%
group_by(transcriptome) %>%
summarise(total_n = sum(n))
#Pgen = 17,611
#Manila = 1235
#Manila_r = 1390
#Mercenaria = 2198
#Mercenaria_r = 7027
filtered_data1 <- term_count_best %>%
filter(transcriptome %in% c("Mercenaria_r", "Manila", "Mercenaria", "Manila_r")) %>%
mutate(n_divided = case_when(
transcriptome == "Manila" ~ n/1235,
transcriptome == "Manila_r" ~ n/1390,
transcriptome == "Mercenaria" ~ n/2198,
transcriptome == "Mercenaria_r" ~ n/7027))
grouped_data1<-left_join(filtered_data1, Pgen_normalized, by="Term_unique")
species_normal<-grouped_data1 %>%
mutate(normalized = n_divided/Pgen_normalized)%>%
rename(n_species = n.x) %>%
rename(n_Pgen = n.y) %>%
mutate(log_scale = log(normalized))
#for heatmap
kable(species_normal[1:5,1:5])| transcriptome | Term_unique | n_species | n_divided | n_Pgen |
|---|---|---|---|---|
| Manila | anatomical structure development | 129 | 0.1044534 | 1853 |
| Manila | signaling | 113 | 0.0914980 | 1603 |
| Manila | cell differentiation | 86 | 0.0696356 | 1281 |
| Manila | cytoskeleton organization | 65 | 0.0526316 | 516 |
| Manila | DNA-templated transcription | 51 | 0.0412955 | 741 |
#####make new variables with binary data
#wide format data
Manila_2<-Manila_Pgen_goslims %>%
select(-c(3:11, 13:23))
Mercenaria_2<-Mercenaria_Pgen_goslims %>%
select(-c(3:11, 13:23))
species_joined<-left_join(Mercenaria_2, Manila_2, by="gene")## Warning in left_join(Mercenaria_2, Manila_2, by = "gene"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 256 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#Mercenaria
species_joined$`e-value.x`=ifelse(species_joined$`e-value.x` > 0, 1, 0)
#Manila
species_joined$`e-value.y`=ifelse(species_joined$`e-value.y` > 0, 1, 0)
species_matrix<-species_joined %>% select(gene, `e-value.x`, `e-value.y`) %>%
rename(Mercenaria_evalue = `e-value.x`)%>%
rename(Manila_evalue = `e-value.y`)%>%
distinct(gene, .keep_all = TRUE)%>%
mutate(Manila_evalue = ifelse(is.na(Manila_evalue), 0, Manila_evalue))%>%
mutate(Pgen_evalue = 1)
species_matrix<-tibble::column_to_rownames(species_matrix, var="gene")
species_matrix<-as.matrix(species_matrix) #count matrix for UpSet plots
matrix_species = make_comb_mat(species_matrix, mode="distinct")
matrix_species_intersect = make_comb_mat(species_matrix, mode="intersect")
matrix_species_union = make_comb_mat(species_matrix, mode="union")
set_size(matrix_species)## Mercenaria_evalue Manila_evalue Pgen_evalue
## 980 210 1029
comb_size(matrix_species)## 111 101 011 001
## 202 778 8 41
# create sample data frame
species_count<- as.data.frame(species_matrix) %>%
mutate(Mercenaria_Manila = NA) %>%
mutate(Manila_Pgen = NA) %>%
mutate(Pgen_Mercenaria = NA) %>%
mutate(all=NA)%>%
mutate(Pgen= NA)
# iterate through rows
########################
#Mercenaria_Manila does not include Pgen genome or else will return all zeros
species_count$Mercenaria_Manila <-ifelse(species_count$Mercenaria_evalue ==1 & species_count$Manila_evalue == 1, 1, 0)
#Manila + Pgenerosa
species_count$Manila_Pgen <-ifelse(species_count$Mercenaria_evalue ==0 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)
#Mercenaria + Pgenerosa
species_count$Pgen_Mercenaria <-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)
#Mercenaria +Pgenerosa + Manila
species_count$all<-ifelse(species_count$Mercenaria_evalue == 1 & species_count$Manila_evalue == 1 & species_count$Pgen_evalue == 1, 1, 0)
#Pgen alone
species_count$Pgen <-ifelse(species_count$Mercenaria_evalue == 0 & species_count$Manila_evalue == 0 & species_count$Pgen_evalue == 1, 1, 0)
#return column names when species == 1
# identify row indices where Pgen_Mercenaria column equals 1
row_indices_a<- which(species_count$Mercenaria_Manila == 1)
row_indices_b<- which(species_count$Manila_Pgen == 1)
row_indices_c<- which(species_count$Pgen_Mercenaria == 1)
row_indices_d<- which(species_count$all == 1)
row_indices_e<- which(species_count$Pgen == 1)
# extract corresponding row names
genes_Mer_Man <- as.data.frame(rownames(species_count[row_indices_a, ]))
names(genes_Mer_Man)[1]<-"gene"
genes_Man_Pgen <- as.data.frame(rownames(species_count[row_indices_b, ]))
names(genes_Man_Pgen)[1]<-"gene"
genes_Mer_Pgen <- as.data.frame(rownames(species_count[row_indices_c, ]))
names(genes_Mer_Pgen)[1]<-"gene"
genes_all <- as.data.frame(rownames(species_count[row_indices_d, ]))
names(genes_all)[1]<-"gene"
genes_Pgen <- as.data.frame(rownames(species_count[row_indices_e, ]))
names(genes_Pgen)[1]<-"gene"
#goslims
goslims_genes_MM<-left_join(genes_Mer_Man, goslims, by="gene")
goslims_gene_ManPgen<-left_join(genes_Man_Pgen, goslims, by ="gene")
goslims_genes_MerPgen<-left_join(genes_Mer_Pgen, goslims, by="gene")
goslims_genes_all<-left_join(genes_all, goslims, by="gene")
goslims_genes_Pgen<-left_join(genes_Pgen, goslims, by="gene")
#Unique GOterms only
#Mercenaria + Manila
MM_count_unique <- goslims_genes_MM %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, species= "Mercenaria-Manila")
Man_Pgen_count_unique <- goslims_gene_ManPgen %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, species= "Manila-Pgen")
Mer_Pgen_count_unique <- goslims_genes_MerPgen %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, species= "Mercenaria-Pgen")
all_count_unique <- goslims_genes_all %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, species= "all-3")
Pgen_count_unique <- goslims_genes_Pgen %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, species= "Pgen_alone")
species_unique<-bind_rows(Mer_Pgen_count_unique, Man_Pgen_count_unique, MM_count_unique, Pgen_count_unique, all_count_unique)
kable(species_unique[1:4, 1:4])| Term_unique | n | count_unique | species |
|---|---|---|---|
| NA | 261 | 68 | Mercenaria-Pgen |
| anatomical structure development | 203 | 68 | Mercenaria-Pgen |
| signaling | 181 | 68 | Mercenaria-Pgen |
| cell differentiation | 134 | 68 | Mercenaria-Pgen |
oyster_gonad<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/data/pone.0036353.s003.csv")
oyster_gonad<- oyster_gonad %>%
filter(Description != 0) %>%
group_by(Cluster) %>%
select(Genebank, Cluster)
#write.csv(oyster_gonad, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/output/oyster_gonad_clusters.csv")
kable(oyster_gonad[1:10, 1:2])| Genebank | Cluster |
|---|---|
| BQ427044 | 1 |
| CU683161 | 1 |
| AM856831 | 1 |
| AM867786 | 1 |
| FP008634 | 1 |
| AM854780 | 1 |
| AM865407 | 1 |
| AM858220 | 1 |
| FP002214 | 1 |
| AM857878 | 1 |
#count top GOterms
tissue_goslims <- pgenerosa_long %>%
group_by(tissue) %>%
filter(binary == 1) %>%
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>%
separate_rows(Term_unique, sep = ";") %>%
mutate(Term_unique = trimws(Term_unique)) %>%
count(Term_unique, tissue, sort = TRUE)#add new column to variable term_count_Pgen
#normalize data by % hit in Pgen transcriptome
sum_terms<- term_count_Pgen %>% #count terms in n
summarise(total_n = sum(n)) #17,611
Pgen_normalized<- term_count_Pgen %>%
mutate(Pgen_normalized = n/17611)
#normalize tissue data
normal_counts<- tissue_goslims %>%
group_by(tissue) %>%
summarise(total_n = sum(n))
#heart = 16,021
#ctenidia = 15,911
#gonad = 14,715
#juv = 17,277
#larvae = 16,632
filtered_data <- tissue_goslims %>%
filter(tissue %in% c("ctenidia", "heart", "gonad", "larvae", "juvenile")) %>%
mutate(n_divided = case_when(tissue == "ctenidia" ~ n/15911,
tissue == "heart" ~ n/16021,
tissue == "gonad" ~ n/14715,
tissue == "larvae" ~ n/16632,
tissue == "juvenile" ~ n/17277))
#combine Pgen_normalized with filtered_data and add new column normalized where tissue_n/geoduck_n to produce new normalized value
grouped_data<-left_join(filtered_data, Pgen_normalized, by="Term_unique")
Tis_normal<-grouped_data %>%
mutate(normalized = n_divided/Pgen_normalized)%>%
rename(n_tissue = n.x) %>%
rename(n_Pgen = n.y) %>%
mutate(log_scale = log(normalized))
#for heatmap
kable(Tis_normal[1:5, 1:5])| tissue | Term_unique | n_tissue | n_divided | n_Pgen |
|---|---|---|---|---|
| juvenile | anatomical structure development | 1817 | 0.1051687 | 1853 |
| larvae | anatomical structure development | 1743 | 0.1047980 | 1853 |
| heart | anatomical structure development | 1678 | 0.1047375 | 1853 |
| ctenidia | anatomical structure development | 1661 | 0.1043932 | 1853 |
| juvenile | signaling | 1572 | 0.0909880 | 1603 |
not actually unique goslims, only unique to tissue types
tidy1<-pgenerosa_goslims_long %>%
group_by(tissue) %>%
filter(tpm > 0)
ctenidia<-pgenerosa_goslims %>%
filter(ctenidia > 0 & gonad == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(ctenidia)## n
## 1 340
#340 obs (unique to ctenidia)
write.csv(ctenidia, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/ctenidia_unique.csv")
gonad<-pgenerosa_goslims %>%
filter(gonad > 0 & ctenidia == 0 & heart == 0 & juvenile == 0 & larvae == 0)
count(gonad)## n
## 1 119
#119 obs (unique to gonad)
write.csv(gonad, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/gonad_unique.csv")
heart<-pgenerosa_goslims %>%
filter(heart > 0 & ctenidia == 0 & gonad == 0 & juvenile == 0 & larvae == 0)
count(heart)## n
## 1 371
#371 obs (unique to heart)
write.csv(heart, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/heart_unique.csv")
juvenile<-pgenerosa_goslims %>%
filter(juvenile > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & larvae == 0)
count(juvenile)## n
## 1 2180
#2,180 obs (unique to juvenile alone)
write.csv(juvenile, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/juvenile_unique.csv")
larvae<-pgenerosa_goslims %>%
filter(larvae > 0 & ctenidia == 0 & heart == 0 & gonad == 0 & juvenile == 0)
count(larvae)## n
## 1 868
#868 obs (unique to larvae alone)
write.csv(larvae, file="/Users/oliviacattau/Documents/GitHub/code-for-Pgenerosa/output/larvae_unique.csv")
juvorlarv<-pgenerosa_goslims %>%
filter(ctenidia == 0 & gonad == 0 & heart == 0) %>%
filter(juvenile > 0 & larvae > 0)
#5,022 obs unique to both juveniles or larvae
#1,974 obs unique to juveniles and larvae together
all_unique<-pgenerosa_goslims %>%
filter(ctenidia > 0 & gonad > 0 & heart > 0 & juvenile > 0 & larvae > 0)gonad larvae heart ctenidia juvenile
gonad_count_unique <- gonad %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissue= "gonad")
larvae_count_unique <- larvae %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissue= "larvae")
heart_count_unique <- heart %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissue= "heart")
ctenidia_count_unique <- ctenidia %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissue= "ctenidia")
juvenile_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>% # distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissue= "juvenile")
tissue_all_unique_genes_GOterms<-rbind(heart_count_unique, gonad_count_unique, ctenidia_count_unique, larvae_count_unique, juvenile_count_unique)
kable(tissue_all_unique_genes_GOterms[1:4, 1:4])| Term_unique | n | count_unique | tissue |
|---|---|---|---|
| NA | 342 | 41 | heart |
| signaling | 13 | 41 | heart |
| anatomical structure development | 11 | 41 | heart |
| cell differentiation | 9 | 41 | heart |
library(dplyr)
#####make new variables with binary data
tidy_data$heart=ifelse(tidy_data$heart > 0, 1, 0)
tidy_data$gonad=ifelse(tidy_data$gonad >0, 1, 0)
tidy_data$ctenidia=ifelse(tidy_data$ctenidia > 0, 1, 0)
tidy_data$larvae=ifelse(tidy_data$larvae > 0, 1, 0)
tidy_data$juvenile=ifelse(tidy_data$juvenile > 0, 1, 0)
new_data<-tidy_data %>% select(target_id,heart, gonad, ctenidia, larvae, juvenile)
new_data<-tibble::column_to_rownames(new_data, var="target_id")
new_data<-as.matrix(new_data) #count matrix for UpSet plotsm1 = make_comb_mat(new_data) #only distinct intersections
m2=make_comb_mat(new_data, mode="intersect")
set_size(m1)## heart gonad ctenidia larvae juvenile
## 17602 13682 17479 19449 23417
comb_size(m1)## 11111 11110 11101 11011 10111 01111 11100 11010 11001 10110 10101 10011 01110
## 11294 24 692 397 2147 337 54 13 122 51 658 1007 8
## 01101 01011 00111 11000 10100 10010 10001 01100 01010 01001 00110 00101 00011
## 100 258 886 43 124 60 545 20 46 155 79 665 1974
## 10000 01000 00100 00010 00001 00000
## 371 119 340 868 2180 9310
comb_data<-as.data.frame(comb_size(m1))
p1<-UpSet(m1 [comb_degree(m1)<=2], column_title ="All intersections of 2 or less") #intersections 2 or less
p2<-UpSet(m1 [comb_degree(m1)>=3], column_title= "All intersection of 3 or more") #intersections 3 or more
p3<-UpSet(m1, top_annotation = HeatmapAnnotation(
degree = as.character(comb_degree(m1)),
"Intersection\nsize" = anno_barplot(comb_size(m1),
border = FALSE,
gp = gpar(fill = "black"),
height = unit(2, "cm")
),
annotation_name_side = "left",
annotation_name_rot = 0), column_title="p. generosa transcriptome distinct intersections") #distinct mode
p5<-UpSet(m2 [comb_degree(m2)<=2], column_title ="All intersections of 2 or less")
p6<-UpSet(m2 [comb_degree(m2)>=3], column_title ="All intersections of 3 or more")
#####testing comb_set sizes
x2<-comb_size(m1)
x3<-as.data.frame(x2)
sum(x3$x2) #34,947 --distinct mode## [1] 34947
x4<-comb_size(m2)
x5<-as.data.frame(x4)
sum(x5$x4) #450,489 --intersect mode## [1] 450489
# create sample data frame
count_df<- as.data.frame(new_data) %>%
mutate(heart_gonad = NA) %>%
mutate(heart_ctenidia = NA) %>%
mutate(ctenidia_gonad = NA)
# iterate through rows
########################
#heart + gonad
# create a new column for shared data between heart and gonad columns
count_df$heart_gonad <- ifelse(count_df$heart == 1 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)
#heart + ctenidia
count_df$heart_ctenidia <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)
#ctenidia + gonad
count_df$ctenidia_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 0, 1, 0)
#larvae + juvenile
count_df$larv_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 1, 1, 0)
#ctenidia + juvenile
count_df$ctenidia_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)
#heart + juvenile
count_df$heart_juv <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)
#gonad + juvenile
count_df$gonad_juv <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 1 & count_df$larvae == 0, 1, 0)
#ctenidia + larvae
count_df$larv_ctenidia <- ifelse(count_df$heart == 0 & count_df$gonad == 0 & count_df$ctenidia == 1 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)
#heart + larvae
count_df$larv_heart <- ifelse(count_df$heart == 1 & count_df$gonad == 0 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)
#gonad + larvae
count_df$larv_gonad <- ifelse(count_df$heart == 0 & count_df$gonad == 1 & count_df$ctenidia == 0 & count_df$juvenile == 0 & count_df$larvae == 1, 1, 0)
#return column names when tissues == 1
# identify row indices where heart_gonad column equals 1
row_indices <- which(count_df$heart_gonad == 1)
row_indices_2<- which(count_df$heart_ctenidia == 1)
row_indices_3<- which(count_df$ctenidia_gonad == 1)
row_indices_4<- which(count_df$larv_juv == 1)
row_indices_5<- which(count_df$ctenidia_juv == 1)
row_indices_6<- which(count_df$heart_juv == 1)
row_indices_7<- which(count_df$gonad_juv == 1)
row_indices_8<- which(count_df$larv_ctenidia == 1)
row_indices_9<- which(count_df$larv_heart == 1)
row_indices_10<- which(count_df$larv_gonad == 1)
# extract corresponding row names
genes_heart_gonad <- as.data.frame(rownames(count_df[row_indices, ]))
names(genes_heart_gonad)[1]<-"gene"
genes_heart_ctenidia <- as.data.frame(rownames(count_df[row_indices_2,]))
names(genes_heart_ctenidia)[1]<-"gene"
genes_ctenidia_gonad <- as.data.frame(rownames(count_df[row_indices_3, ]))
names(genes_ctenidia_gonad)[1]<-"gene"
genes_larv_juv <- as.data.frame(rownames(count_df[row_indices_4, ]))
names(genes_larv_juv)[1]<-"gene"
genes_ctenidia_juv <-as.data.frame(rownames(count_df[row_indices_5, ]))
names(genes_ctenidia_juv)[1]<-"gene"
genes_heart_juv <- as.data.frame(rownames(count_df[row_indices_6, ]))
names(genes_heart_juv)[1]<-"gene"
genes_gonad_juv <- as.data.frame(rownames(count_df[row_indices_7, ]))
names(genes_gonad_juv)[1]<-"gene"
genes_larv_ctenidia <- as.data.frame(rownames(count_df[row_indices_8, ]))
names(genes_larv_ctenidia)[1]<-"gene"
genes_larv_heart <- as.data.frame(rownames(count_df[row_indices_9, ]))
names(genes_larv_heart)[1]<-"gene"
genes_larv_gonad <- as.data.frame(rownames(count_df[row_indices_10, ]))
names(genes_larv_gonad)[1]<-"gene"
#goslims
goslims_genes_hg<-left_join(genes_heart_gonad, goslims, by="gene")
goslims_gene_hc<-left_join(genes_heart_ctenidia, goslims, by ="gene")
goslims_genes_cg<-left_join(genes_ctenidia_gonad, goslims, by="gene")
goslims_gene_lj<-left_join(genes_larv_juv, goslims, by="gene")
goslims_genes_cj<-left_join(genes_ctenidia_juv, goslims, by="gene")
goslims_genes_hj<-left_join(genes_heart_juv, goslims, by="gene")
goslims_genes_gj<-left_join(genes_gonad_juv, goslims, by="gene")
goslims_genes_cl<-left_join(genes_larv_ctenidia, goslims, by="gene")
goslims_genes_hl<-left_join(genes_larv_heart, goslims, by="gene")
goslims_genes_gl<-left_join(genes_larv_gonad, goslims, by="gene")
#Unique GOterms only
#heart-gonad
hg_count_unique <- goslims_genes_hg %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
# distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "heart-gonad")
#heart-ctenidia
hc_count_unique <- goslims_gene_hc %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>%# Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "heart-ctenidia")
#ctenidia-gonad
cg_count_unique <- goslims_genes_cg %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "ctenidia-gonad")
#larvae-juvenile
lj_count_unique <- goslims_gene_lj %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "larvae-juvenile") #count genes with GOslims and remove 1 (for NAs)
#ctenidia-juvenile
cj_count_unique <- goslims_genes_cj %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "ctenidia-juvenile") #count genes with GOslims and remove 1 (for NAs)
#heart-juvenile
hj_count_unique <- goslims_genes_hj %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "heart-juvenile") #count genes with GOslims and remove 1 (for NAs)
#gonad-juvenile
gj_count_unique <- goslims_genes_gj %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "gonad-juvenile") #count genes with GOslims and remove 1 (for NAs)
#heart + larvae
hl_count_unique <- goslims_genes_hl %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "heart-larvae") #count genes with GOslims and remove 1 (for NAs)
#ctenidia + larvae
cl_count_unique <- goslims_genes_cl %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "ctenidia-larvae") #count genes with GOslims and remove 1 (for NAs)
#gonad + larvae
gl_count_unique <- goslims_genes_gl %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "gonad-larvae") #count genes with GOslims and remove 1 (for NAs)
#term count Juv only
juv_count_unique <- juvenile %>% # remove duplicates from Term column and create Term_unique column
mutate(Term_unique = lapply(strsplit(as.character(Term), ";"), function(Term_unique) paste0(unique(Term_unique), collapse = ";"))) %>%
#distinct(Term_unique, .keep_all = TRUE) %>%
filter(!is.na(Term_unique)) %>% # Remove rows with missing (NA) values in the "Term" column
separate_rows(Term_unique, sep = ";") %>% # Separate the "Term" column by semi-colons
mutate(Term_unique = trimws(Term_unique)) %>% # Remove any leading/trailing white spaces
count(Term_unique, sort = TRUE) %>% # Count the occurrences and sort by descending count
mutate(count_unique = n()-1, tissues= "juvenile") #count genes with GOslims and remove 1 (for NAs)
write.csv(juv_count_unique, file="/Users/oliviacattau/Documents/GitHub/olivia-geoduck/data/juv_count_unique.csv")
#combine into one DF
unique_overlapping_genes<-bind_rows(hg_count_unique, hc_count_unique, cg_count_unique, lj_count_unique, cj_count_unique, hj_count_unique, gj_count_unique, cl_count_unique, hl_count_unique, gl_count_unique)#filter Terms down a bit
terms_filtered <- term_count_best %>%
group_by(Term_unique) %>%
filter(n() >= 5) %>%
filter(!is.na("Terms_unique")) %>%
filter(n >= 10)
#load top 20 hits from Github
top20terms<-read.csv(file="https://raw.githubusercontent.com/course-fish546-2023/olivia-geoduck/main/output/tables/top20terms.csv")
compairative_transcriptomes<-top20terms %>%
filter(transcriptome != "Pgenerosa") %>%
filter(n >= 15)
stacked_barplot<-ggplot(top20terms, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()
stacked_barplot2<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=n, fill=Term_unique))+geom_bar(stat = "identity")+theme_bw()
stacked_barplot#Z=e-values, Y=Manilla genes, X=Mercenaria genes
top20_long<-reshape(top20terms, idvar = "Term_unique", timevar = "transcriptome", direction = "wide")
tissues_goterms_unique<-read.csv(file="https://raw.githubusercontent.com/ocattau/code-for-Pgenerosa/main/output/tissue_unique_Pgen_terms_heatmap_data.csv")
tissues2<-tissues_goterms_unique %>%
filter(tissues != "larvae-juvenile")
#heatmap<-ggplot(pgenerosa_long, aes(x=gene, y=tissue, fill=tpm)) +
# geom_tile() #too large
heatmap2<-ggplot(top20terms, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")
heatmap3<-ggplot(compairative_transcriptomes, aes(x=transcriptome, y=Term_unique, fill=n))+geom_tile()+theme_bw() + scale_fill_gradient(low="#c2f9ff", high="#003460")
heatmap7<- ggplot(tissues2, aes(x=tissues, y=Term_unique, fill=n))+geom_tile()+theme_bw()+ scale_fill_gradient(low="#c2f9ff", high="#003460")+theme(axis.text.x = element_text(angle = 45, hjust = 1) )
Tis_normal_2<-Tis_normal %>%
filter(Term_unique != "NA")
heatmap8<-ggplot(Tis_normal_2, aes(tissue, y=Term_unique, fill=normalized))+geom_tile()+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )+ scale_fill_gradient2(low = "yellow", mid = "white", high = "red", midpoint = 0.9, na.value = "grey50")
heatmap9<-ggplot(Tis_normal, aes(tissue, y=Term_unique, fill=log_scale))+geom_tile(color="white")+ scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )#need to set color wheel to white = 1 and red = min and blue = max
species_normal_2<-species_normal %>%
filter(Term_unique != "NA")%>%
filter(transcriptome == "Manila_r" | transcriptome == "Mercenaria_r")
heatmap10<-ggplot(species_normal_2, aes(transcriptome, y=Term_unique, fill=normalized))+geom_tile(color="white")+ scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1, na.value = "grey50")+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1) )
heatmap7heatmap8heatmap9heatmap10scatter<-ggplot(distinct_goslims, aes(x=gonad, y=juvenile))+geom_point()+theme_bw()
scatterscatter2<-ggplot(distinct_goslims, aes(x=heart, y=juvenile))+geom_point()+theme_bw()
scatter2#compairative_clam_transcripts_right
scatter4<-ggplot(compairative_clam_transcripts_right, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast db")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#removed 3394 rows containing missing values
scatter5<-ggplot(compairative_clam_transcripts_left, aes(x=`e-value.x`, y=`e-value.y`))+geom_point()+theme_bw()+xlab("Mercenaria")+ylab("Manila")+labs(title="Pgenerosa blast query")+ theme(plot.title = element_text(hjust = 0.5, margin = margin(b = 10)))#Removed 4686 rows containing missing values (`geom_point()`).