Finding the right the parameters for assembling transcripts is important. Ultimately, you want to make sure you are capturing biological complexity while filtering potential program errors. Here, I am analyzing the output from HISAT2 mapped reads that have been assembled by StringTie.

Load in necessary packages

library(dplyr)
library(ggvenn)
library(tidyverse)
library(ggpubr)
library(data.table)
library(UpSetR)
library(ComplexHeatmap)

Read in the .tmap file. The .tmap file contains genomic inforamtion about the transcripts that have been identified. Also, read in a file that contains a list of all antisense transcripts that have been identified in each set of parameters

#default
default_tmap <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/default_assemble/gffcompare/gffcompare.default_assemble.gtf.tmap", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)

default_as <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/default_assemble/gffcompare/CPC2_output/default_antisense_tx_names.txt", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)

#test1
test1_tmap <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test1_assemble/gffcompare/gffcompare.test1_assemble.gtf.tmap", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)


test1_as <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test1_assemble/gffcompare/CPC2_output/test1_antisense_tx_names.txt", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)

#test2
test2_tmap <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test2_assemble/gffcompare/gffcompare.test2_assemble.gtf.tmap", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)


test2_as <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test2_assemble/gffcompare/CPC2_output/test2_antisense_tx_names.txt", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)

#test3
test3_tmap <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test3_assemble/gffcompare/gffcompare.test3_assemble.gtf.tmap", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)


test3_as <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test3_assemble/gffcompare/CPC2_output/test3_antisense_tx_names.txt", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)


#test4
test4_tmap <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test4_assemble/gffcompare/gffcompare.at_tissue_atlas_test4.gtf.tmap", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)


test4_as <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test4_assemble/gffcompare/CPC2_output/test4_antisense_tx_names.txt", 
    delim = "\t", escape_double = FALSE, 
    col_names = FALSE, trim_ws = TRUE)

Lets clean up these tmap files and extract generate a file that contains the antisense/pcg pair from each annotation

#default
default_tmap_filt <- dplyr::filter(default_tmap, class_code == 'x') #extract all antisense transcripts

default_tmap_filt<- default_tmap_filt[, c(5,4,2,1,6,10,11,12)] #filter for only the columns of interest and reorder for downstream analyses

default_tmap_filt %>%
  right_join(default_as, by = c("qry_id" = "X1")) -> default_tmap_filt #grab genomic information for antisense transcripts

default_tmap_filt$ref_id <- gsub("transcript:","", as.character(default_tmap_filt$ref_id)) #clean up column information
default_tmap_filt$ref_gene_id <- gsub("gene:","", as.character(default_tmap_filt$ref_gene_id)) #clean up column information

default_gene_pairs <- default_tmap_filt[, c(1,2,3,4)] #generate a file that contains gene and transcript information for the pcg and antisense

default_gene_pairs$ref_id <-gsub("transcript:","", as.character(default_gene_pairs$ref_id)) #clean up column information
default_gene_pairs$ref_gene_id <-gsub("gene:","", as.character(default_gene_pairs$ref_gene_id)) #clean up column information

default_gene_pairs <- default_gene_pairs[, c(2,4)] %>% distinct() #filter for redundancy in the gene pair file

#test1
test1_tmap_filt <- dplyr::filter(test1_tmap, class_code == 'x')

test1_tmap_filt<- test1_tmap_filt[, c(5,4,2,1,6,10,11,12)] 

test1_tmap_filt %>%
  right_join(test1_as, by = c("qry_id" = "X1")) -> test1_tmap_filt 

test1_tmap_filt$ref_id <- gsub("transcript:","", as.character(test1_tmap_filt$ref_id)) 
test1_tmap_filt$ref_gene_id <- gsub("gene:","", as.character(test1_tmap_filt$ref_gene_id))

test1_gene_pairs <- test1_tmap_filt[, c(1,2,3,4)]

test1_gene_pairs$ref_id <-gsub("transcript:","", as.character(test1_gene_pairs$ref_id)) 
test1_gene_pairs$ref_gene_id <-gsub("gene:","", as.character(test1_gene_pairs$ref_gene_id)) 

test1_gene_pairs <- test1_gene_pairs[, c(2,4)] %>% distinct()

#test2
test2_tmap_filt <- dplyr::filter(test2_tmap, class_code == 'x')

test2_tmap_filt<- test2_tmap_filt[, c(5,4,2,1,6,10,11,12)] 

test2_tmap_filt %>%
  right_join(test2_as, by = c("qry_id" = "X1")) -> test2_tmap_filt 

test2_tmap_filt$ref_id <- gsub("transcript:","", as.character(test2_tmap_filt$ref_id)) 
test2_tmap_filt$ref_gene_id <- gsub("gene:","", as.character(test2_tmap_filt$ref_gene_id))

test2_gene_pairs <- test2_tmap_filt[, c(1,2,3,4)]

test2_gene_pairs$ref_id <-gsub("transcript:","", as.character(test2_gene_pairs$ref_id)) 
test2_gene_pairs$ref_gene_id <-gsub("gene:","", as.character(test2_gene_pairs$ref_gene_id)) 

test2_gene_pairs <- test2_gene_pairs[, c(2,4)] %>% distinct()

#test3
test3_tmap_filt <- dplyr::filter(test3_tmap, class_code == 'x')

test3_tmap_filt<- test3_tmap_filt[, c(5,4,2,1,6,10,11,12)] 

test3_tmap_filt %>%
  right_join(test3_as, by = c("qry_id" = "X1")) -> test3_tmap_filt 

test3_tmap_filt$ref_id <- gsub("transcript:","", as.character(test3_tmap_filt$ref_id)) 
test3_tmap_filt$ref_gene_id <- gsub("gene:","", as.character(test3_tmap_filt$ref_gene_id))

test3_gene_pairs <- test3_tmap_filt[, c(1,2,3,4)]

test3_gene_pairs$ref_id <-gsub("transcript:","", as.character(test3_gene_pairs$ref_id)) 
test3_gene_pairs$ref_gene_id <-gsub("gene:","", as.character(test3_gene_pairs$ref_gene_id)) 

test3_gene_pairs <- test3_gene_pairs[, c(2,4)] %>% distinct()

#test4
test4_tmap_filt <- dplyr::filter(test4_tmap, class_code == 'x')

test4_tmap_filt<- test4_tmap_filt[, c(5,4,2,1,6,10,11,12)] 

test4_tmap_filt %>%
  right_join(test4_as, by = c("qry_id" = "X1")) -> test4_tmap_filt 

test4_tmap_filt$ref_id <- gsub("transcript:","", as.character(test4_tmap_filt$ref_id)) 
test4_tmap_filt$ref_gene_id <- gsub("gene:","", as.character(test4_tmap_filt$ref_gene_id))

test4_gene_pairs <- test4_tmap_filt[, c(1,2,3,4)]

test4_gene_pairs$ref_id <-gsub("transcript:","", as.character(test4_gene_pairs$ref_id)) 
test4_gene_pairs$ref_gene_id <-gsub("gene:","", as.character(test4_gene_pairs$ref_gene_id)) 

test4_gene_pairs <- test4_gene_pairs[, c(2,4)] %>% distinct()

What genes are unique to each assembly? What genes are shared across all assembly parameters?

#lets generate files that contain the pcgs that have antisense transcripts for each assembly
default_pcgs <- default_gene_pairs$ref_gene_id
default_pcgs_df <- default_gene_pairs$ref_gene_id %>% as.data.frame() %>% setNames(., c("default_pcgs"))
test1_pcgs <- test1_gene_pairs$ref_gene_id
test1_pcgs_df <- test1_gene_pairs$ref_gene_id %>% as.data.frame() %>% setNames(., c("test1_pcgs"))
test2_pcgs <- test2_gene_pairs$ref_gene_id
test2_pcgs_df <- test2_gene_pairs$ref_gene_id %>% as.data.frame() %>% setNames(., c("test2_pcgs"))
test3_pcgs <- test3_gene_pairs$ref_gene_id
test3_pcgs_df <- test3_gene_pairs$ref_gene_id %>% as.data.frame() %>% setNames(., c("test3_pcgs"))
test4_pcgs <- test4_gene_pairs$ref_gene_id
test4_pcgs_df <- test4_gene_pairs$ref_gene_id %>% as.data.frame() %>% setNames(., c("test4_pcgs"))


#all comparison
input <- list(Default=default_pcgs, 
              Test1=test1_pcgs, 
              Test2=test2_pcgs, 
              Test3=test3_pcgs, 
              Test4=test4_pcgs)

m1 = make_comb_mat(input)
UpSet(m1)

UpSet(m1, 
      set_order = c("Default", "Test1", "Test2", "Test3", "Test4"),
      comb_col = c("#283747"),
      comb_order = order(comb_size(m1),decreasing = TRUE),
      top_annotation = upset_top_annotation(m1, add_numbers = TRUE, axis_param = list(gp = gpar(fontsize = 12))),
      right_annotation = upset_right_annotation(m1, add_numbers = TRUE, axis_param = list(gp = gpar(fontsize = 12))),
      row_names_gp = gpar(fontsize = 12)) -> all_upset_plot

Grab the genes that are in each intersection

default_pcgs_2 <- default_pcgs_df %>% dplyr::mutate(type="default") %>% dplyr::rename(gene=1)
test1_pcgs_2 <- test1_pcgs_df %>% dplyr::mutate(type="test1") %>% dplyr::rename(gene=1)
test2_pcgs_2 <- test2_pcgs_df %>% dplyr::mutate(type="test2") %>% dplyr::rename(gene=1)
test3_pcgs_2 <- test3_pcgs_df %>% dplyr::mutate(type="test3") %>% dplyr::rename(gene=1)
test4_pcgs_2 <- test4_pcgs_df %>% dplyr::mutate(type="test4") %>% dplyr::rename(gene=1)

all <- rbind(default_pcgs_2, 
             test1_pcgs_2, 
             test2_pcgs_2, 
             test3_pcgs_2, 
             test4_pcgs_2)

all_2 <- data.table::dcast(setDT(all), gene ~ type, fun.aggregate = length)  

all_2 %>% as.data.frame %>% dplyr::filter(., default==1 & test1==1 & test2==1 & test3==1 & test4==1) %>% distinct(gene) -> present_all 

all_2 %>% as.data.frame %>% dplyr::filter(., default==1 & test1==0 & test2==0 & test3==0 & test4==0) %>% distinct(gene) -> unique_default

all_2 %>% as.data.frame %>% dplyr::filter(., default==0 & test1==0 & test2==0 & test3==0 & test4==1) %>% distinct(gene) -> unique_test4

all_2 %>% as.data.frame %>% dplyr::filter(.,test3==1 & test4==0) %>% distinct(gene) -> present_t3_not_t4

Let’s pull some information from the tmap file about the pcg/antisense pairs that are unique to the default assembly and test4 assembly

default_unique_vs_t4_tmap <- default_tmap_filt
default_unique_vs_t4_tmap %>%
  right_join(., unique_default, by=c("ref_gene_id" = "gene")) -> default_unique_vs_t4_tmap

test4_unique_vs_default_tmap <- test4_tmap_filt
test4_unique_vs_default_tmap %>%
  right_join(., unique_test4, by=c("ref_gene_id" = "gene")) -> test4_unique_vs_default_tmap

present_t3_not_t4_tmap <- test3_tmap_filt
present_t3_not_t4_tmap %>%
  right_join(., present_t3_not_t4, by=c("ref_gene_id" = "gene")) -> present_t3_not_t4_tmap

How many exons do they have?

hist(default_unique_vs_t4_tmap$num_exons)

hist(test4_unique_vs_default_tmap$num_exons)

default_num_exon <- ggplot(default_tmap_filt, aes(x=num_exons)) + 
  geom_bar(stat = "count", fill = "#5499C7")+
  scale_x_continuous(labels=as.character(default_tmap_filt$num_exons), breaks = default_tmap_filt$num_exons)+
  labs(x= "Number of exons of default antisense transcripts", y= "Transcript count")+
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5)+
  theme_minimal(base_size = 12)

test1_num_exon <- ggplot(test1_tmap_filt, aes(x=num_exons)) + 
  geom_bar(stat = "count", fill = "#2980B9")+
  scale_x_continuous(labels=as.character(test1_tmap_filt$num_exons), breaks = test1_tmap_filt$num_exons)+
  labs(x= "Number of exons of test 1 antisense transcripts", y= "Transcript count")+
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5)+
  theme_minimal(base_size = 12)

test2_num_exon <- ggplot(test2_tmap_filt, aes(x=num_exons)) + 
  geom_bar(stat = "count", fill = "#2471A3")+
  scale_x_continuous(labels=as.character(test2_tmap_filt$num_exons), breaks = test2_tmap_filt$num_exons)+
  labs(x= "Number of exons of test 2 antisense transcripts", y= "Transcript count")+
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5)+
  theme_minimal(base_size = 12)

test3_num_exon <- ggplot(test3_tmap_filt, aes(x=num_exons)) + 
  geom_bar(stat = "count", fill = "#1F618D")+
  scale_x_continuous(labels=as.character(test3_tmap_filt$num_exons), breaks = test3_tmap_filt$num_exons)+
  labs(x= "Number of exons of test 3 antisense transcripts", y= "Transcript count")+
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5)+
  theme_minimal(base_size = 12)

test4_num_exon <- ggplot(test4_tmap_filt, aes(x=num_exons)) + 
  geom_bar(stat = "count", fill = "#1A5276")+
  scale_x_continuous(labels=as.character(test4_tmap_filt$num_exons), breaks = test4_tmap_filt$num_exons)+
  labs(x= "Number of exons of test 4 antisense transcripts", y= "Transcript count")+
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5)+
  theme_minimal(base_size = 12)
num_exon_def_unq <- ggplot(default_unique_vs_t4_tmap, aes(x=num_exons, y=num_exons)) + 
  geom_bar(stat = "identity", fill = "#707B7C")+
  scale_x_continuous(labels=as.character(default_unique_vs_t4_tmap$num_exons), breaks = default_unique_vs_t4_tmap$num_exons)+
  labs(x= "number of exons", y= "transcript count")+
  theme_minimal(base_size = 16)

num_exon_t4_unq <- ggplot(test4_unique_vs_default_tmap, aes(x=num_exons, y=num_exons)) + 
  geom_bar(stat = "identity", fill = "#34495E")+
  scale_x_continuous(labels=as.character(test4_unique_vs_default_tmap$num_exons),breaks=c(test4_unique_vs_default_tmap$num_exons))+
  labs(x= "number of exons", y= "transcript count")+
  geom_text(aes(label = num_exons), vjust = 0)+
  theme_minimal(base_size = 16)

What is the distribution of antisense/pcg loci orientation? How many of the unique default and test3 transcripts fall into each class of the gene orientations?

Read in ccivr outout and tmap file

default_cc <- read_csv("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/default_assemble/gffcompare/CPC2_output/ccivr/ccivr_output/Table.csv")

test1_cc <- read_csv("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test1_assemble/gffcompare/CPC2_output/ccivr/ccivr_output/Table.csv")
  
test2_cc <- read_csv("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test2_assemble/gffcompare/CPC2_output/ccivr/ccivr_output/Table.csv")
  
test3_cc <- read_csv("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test3_assemble/gffcompare/CPC2_output/ccivr/ccivr_output/Table.csv")

test4_cc <- read_csv("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/test4_assemble/gffcompare/CPC2_output/ccivr/ccivr_output/Table.csv")

#lets combine the tmap file ccivr output file

default_cc_tmap <- default_tmap %>%
  dplyr::select(ref_gene_id,
         ref_id,
         qry_gene_id,
         qry_id,
         ref_match_len,
         num_exons)

test1_cc_tmap <- test1_tmap %>%
  dplyr::select(ref_gene_id,
         ref_id,
         qry_gene_id,
         qry_id,
         ref_match_len, 
         num_exons)

test2_cc_tmap <- test2_tmap %>%
  dplyr::select(ref_gene_id,
         ref_id,
         qry_gene_id,
         qry_id,
         ref_match_len, 
         num_exons)

test3_cc_tmap <- test3_tmap %>%
  dplyr::select(ref_gene_id,
         ref_id,
         qry_gene_id,
         qry_id,
         ref_match_len, 
         num_exons)

test4_cc_tmap <- test4_tmap %>%
  dplyr::select(ref_gene_id,
         ref_id,
         qry_gene_id,
         qry_id,
         ref_match_len, 
         num_exons)
#clean up the columns
default_cc_tmap$ref_gene_id <- gsub('gene:', '', default_cc_tmap$ref_gene_id)
default_cc_tmap$ref_id <- gsub('transcript:', '', default_cc_tmap$ref_id)
default_cc_tmap$qry_gene_id <- gsub('gene:', '', default_cc_tmap$qry_gene_id)
default_cc_tmap$qry_id <- gsub('transcript:', '', default_cc_tmap$qry_id)

test1_cc_tmap$ref_gene_id <- gsub('gene:', '', test1_cc_tmap$ref_gene_id)
test1_cc_tmap$ref_id <- gsub('transcript:', '', test1_cc_tmap$ref_id)
test1_cc_tmap$qry_gene_id <- gsub('gene:', '', test1_cc_tmap$qry_gene_id)
test1_cc_tmap$qry_id <- gsub('transcript:', '', test1_cc_tmap$qry_id)

test2_cc_tmap$ref_gene_id <- gsub('gene:', '', test2_cc_tmap$ref_gene_id)
test2_cc_tmap$ref_id <- gsub('transcript:', '', test2_cc_tmap$ref_id)
test2_cc_tmap$qry_gene_id <- gsub('gene:', '', test2_cc_tmap$qry_gene_id)
test2_cc_tmap$qry_id <- gsub('transcript:', '', test2_cc_tmap$qry_id)

test3_cc_tmap$ref_gene_id <- gsub('gene:', '', test3_cc_tmap$ref_gene_id)
test3_cc_tmap$ref_id <- gsub('transcript:', '', test3_cc_tmap$ref_id)
test3_cc_tmap$qry_gene_id <- gsub('gene:', '', test3_cc_tmap$qry_gene_id)
test3_cc_tmap$qry_id <- gsub('transcript:', '', test3_cc_tmap$qry_id)

test4_cc_tmap$ref_gene_id <- gsub('gene:', '', test4_cc_tmap$ref_gene_id)
test4_cc_tmap$ref_id <- gsub('transcript:', '', test4_cc_tmap$ref_id)
test4_cc_tmap$qry_gene_id <- gsub('gene:', '', test4_cc_tmap$qry_gene_id)
test4_cc_tmap$qry_id <- gsub('transcript:', '', test4_cc_tmap$qry_id)

Join the ccivr output and clean tmap file based up sense and antisense

default_j <- left_join(default_cc_tmap,
          default_cc,
          by = c("ref_id" = "id",
                 "qry_id" = "_id")) %>%
          na.omit() %>%
          distinct()

test1_j <- left_join(test1_cc_tmap,
          test1_cc,
          by = c("ref_id" = "id",
                 "qry_id" = "_id")) %>%
          na.omit() %>%
          distinct()


test2_j <- left_join(test2_cc_tmap,
          test2_cc,
          by = c("ref_id" = "id",
                 "qry_id" = "_id")) %>%
          na.omit() %>%
          distinct()

test3_j <- left_join(test3_cc_tmap,
          test3_cc,
          by = c("ref_id" = "id",
                 "qry_id" = "_id")) %>%
          na.omit() %>%
          distinct()

test4_j <- left_join(test4_cc_tmap,
          test4_cc,
          by = c("ref_id" = "id",
                 "qry_id" = "_id")) %>%
          na.omit() %>%
          distinct()

Generate a ccivr output table of the identified NATs

default_nat <- left_join(default_as, default_j,
          by = c("X1" = "qry_id")) %>% 
          na.omit() %>%
          distinct()

default_nat %>%
  distinct(ref_gene_id) -> default_unique_pcg_cc_output

test1_nat <- left_join(test1_as, test1_j,
          by = c("X1" = "qry_id")) %>% 
          distinct()%>%
          na.omit()
test1_nat %>%
  distinct(ref_gene_id) -> test1_unique_pcg_cc_output

test2_nat <- left_join(test2_as, test2_j,
          by = c("X1" = "qry_id")) %>% 
          distinct()%>%
          na.omit()
test2_nat %>%
  distinct(ref_gene_id) -> test2_unique_pcg_cc_output

test3_nat <- left_join(test3_as, test3_j,
          by = c("X1" = "qry_id")) %>% 
          distinct()%>%
          na.omit()
test3_nat %>%
  distinct(ref_gene_id) -> test3_unique_pcg_cc_output

test4_nat <- left_join(test4_as, test4_j,
          by = c("X1" = "qry_id")) %>% 
          distinct()%>%
          na.omit()
test4_nat %>%
  distinct(ref_gene_id) -> test4_unique_pcg_cc_output

#determine the frequency of each class code 
default_type_sum <- default_nat %>%
  group_by(Type) %>%
  summarise(n = n()) %>%
  mutate(freq = (n / sum(n)) * 100,
         trial = "default")
test1_type_sum <- test1_nat %>%
  group_by(Type) %>%
  summarise(n = n()) %>%
  mutate(freq = (n / sum(n)) * 100,
         trial = "test1")

test2_type_sum <- test2_nat %>%
  group_by(Type) %>%
  summarise(n = n()) %>%
  mutate(freq = (n / sum(n)) * 100,
         trial = "test2")

test3_type_sum <- test3_nat %>%
  group_by(Type) %>%
  summarise(n = n()) %>%
  mutate(freq = (n / sum(n)) * 100,
         trial = "test3")

test4_type_sum <- test4_nat %>%
  group_by(Type) %>%
  summarise(n = n()) %>%
  mutate(freq = (n / sum(n)) * 100,
         trial = "test4")
all_test_com <- do.call("rbind", list(default_type_sum,
                                  test1_type_sum,
                                  test2_type_sum,
                                  test3_type_sum, 
                                  test4_type_sum)) #lets combine all of the summary tables for each species into one data frame

level_order <- c('default', 'test1', 'test2', 'test3', 'test4') #specify the order in which you want the species to be ordered in the plot

all_test_com %>%
  ggplot(., aes(fill = Type, x = factor(trial, level = level_order), y = freq)) +
  geom_bar(stat = "identity", alpha = .8) +
  ggsci::scale_fill_aaas() +
  ylab('Proportion of cis-NATs') +
  xlab('Assemebly Parameters') +
  geom_text(aes(label=paste0(sprintf("%1.1f", freq),"%")),
            position=position_stack(vjust=0.5),
            size = 6) +
  theme_classic(base_size = 22) -> all_test_com_graph

Show the proportion of genes that have isoforms

default_as_tx <- default_tmap_filt[, c(1,2)]
write.csv(default_as_tx, "/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/interim_files/default_as_tx.csv")
default_as_gen <- default_tmap_filt[, c(1,2)] %>% distinct(qry_gene_id)

test1_as_tx <- test1_tmap_filt[, c(1,2)]
write.csv(test1_as_tx, "//mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/interim_files/test1_as_tx.csv")
test1_as_gen <- test1_tmap_filt[, c(1,2)] %>% distinct(qry_gene_id)

test2_as_tx <- test2_tmap_filt[, c(1,2)]
write.csv(test2_as_tx, "/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/interim_files/test2_as_tx.csv")
test2_as_gen <- test2_tmap_filt[, c(1,2)] %>% distinct(qry_gene_id)

test3_as_tx <- test3_tmap_filt[, c(1,2)]
write.csv(test3_as_tx, "/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/interim_files/test3_as_tx.csv")
test3_as_gen <- test3_tmap_filt[, c(1,2)] %>% distinct(qry_gene_id)

test4_as_tx <- test4_tmap_filt[, c(1,2)]
write.csv(test4_as_tx, "/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/AT/at_tissue_atlas_trial/interim_files/test4_as_tx.csv")
test4_as_gen <- test4_tmap_filt[, c(1,2)] %>% distinct(qry_gene_id)
isoform_gene <- read_delim("/mnt/Kanta/antisense_lncRNAs/hisat2_mapping/sup_files/isoform_gene.txt", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE)
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Type, trial
## dbl (2): n, freq
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
level_order <- c('default', 'test1', 'test2', 'test3', 'test4')

isoform_gene %>%
  ggplot(., aes(fill = Type, x = factor(trial, level = level_order), y = freq)) +
  geom_bar(stat = "identity", alpha = .7) +
 ggsci::scale_fill_aaas() +
  ylab('Proportion of cis-NATs') +
  xlab('Assemebly Parameters') +
  geom_text(aes(label=paste0(sprintf("%1.1f", freq),"%")),
            position=position_stack(vjust=0.5),
            size = 6) +
  theme_classic(base_size = 18) -> iso_freq_all

True Default and Test4 comparisons

default_unique_ori <- default_unique_vs_t4_tmap[, c(4,3,2,1,5,6,7,8)] 

default_unique_ori_com <- left_join(default_unique_ori, default_j, 
          by = c("ref_gene_id" = "ref_gene_id")) %>% distinct(ref_gene_id, qry_gene_id.x, Type)
## Warning in left_join(default_unique_ori, default_j, by = c(ref_gene_id = "ref_gene_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 17 of `x` matches multiple rows in `y`.
## ℹ Row 149 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.