# Load supplementary table 2 from its CSV export into a data frame.
supp2 <- read.csv(file = "~/Documents/alternativeSplicing/supp2.csv")

# Mutually exclusive (MX) exons: the rows of supp2 flagged with MX == 1, each
# labelled with a "<chromosome>:<start>-<end>" coordinate string.
mx <- supp2 %>%
  filter(MX == 1) %>%
  mutate(
    # "<start>-<end>" span built from the two position columns
    coordinate = paste(Start.position, End.position, sep = "-"),
    # prepend Chromosome with its first three characters dropped
    # (presumably a leading "chr" prefix -- confirm against the CSV)
    coordinate = paste(substring(Chromosome, 4), coordinate, sep = ":")
  )

# Tally the MX exon rows per gene: one output row per Gene.Symbol, with the
# number of rows for that gene stored in `count`.
gene_and_mx_exon_count <- summarize(
  group_by(mx, Gene.Symbol),
  count = n()
)

# Keep only the MX exons that belong to genes with more than one MX exon.
# The qualifying gene symbols are joined back onto mx, so the result holds
# every mx row of those genes (Gene.Symbol first, then the other mx columns).
gene_with_more_than_one_exon <- inner_join(
  select(filter(gene_and_mx_exon_count, count > 1), Gene.Symbol),
  mx,
  by = "Gene.Symbol"
)

# Cassette exons: the rows of supp2 flagged with CASSETTE == 1, each labelled
# with a "<chromosome>:<start>-<end>" coordinate string.
cassette <- supp2 %>%
  filter(CASSETTE == 1) %>%
  mutate(
    # "<start>-<end>" span built from the two position columns
    coordinate = paste(Start.position, End.position, sep = "-"),
    # prepend Chromosome with its first three characters dropped
    # (presumably a leading "chr" prefix -- confirm against the CSV)
    coordinate = paste(substring(Chromosome, 4), coordinate, sep = ":")
  )

Number of genes with multiple mutually exclusive exons: 209

Number of total mutually exclusive exons: 559

Number of genes with cassette exons: 3032

Number of total cassette exons: 5051

- extract nucleotide sequences using the chromosome coordinates of these exons

- tblastx against lamprey, spotted gar, zebrafish, fugu, and coelacanth; e-value threshold: 0.1

- CDS sequences of these five species can be found at https://uswest.ensembl.org/info/about/species.html

Number of blast hits

| Species | Mutually exclusive exons | Cassette exons |
|---|---|---|
| lamprey | 321 | 2751 |
| spotted gar | 380 | 3337 |
| zebrafish | 337 | 2934 |
| fugu | 353 | 3042 |
| coelacanth | 354 | 3145 |
# list of exon hits

# mutually exclusive: one named entry per species, holding the qseqid column
# of that species' hit table (presumably the BLAST query IDs -- confirm)
listInput <- lapply(
  list(
    lamprey = lamprey,
    spotted_gar = spotted_gar,
    zebrafish = zebrafish,
    fugu = fugu,
    coelacanth = coelacanth
  ),
  function(hits) hits$qseqid
)

# UpSet plot of the per-species sets, ordered by intersection frequency;
# intersections with no members are drawn as well
upset(
  fromList(listInput),
  order.by = "freq",
  empty.intersections = "on"
)

# cassette: one named entry per species, holding the qseqid column of that
# species' cassette hit table (presumably the BLAST query IDs -- confirm)
listInput_cassette <- lapply(
  list(
    lamprey_cassette = lamprey_cassette,
    spotted_gar_cassette = spotted_gar_cassette,
    zebrafish_cassette = zebrafish_cassette,
    fugu_cassette = fugu_cassette,
    coelacanth_cassette = coelacanth_cassette
  ),
  function(hits) hits$qseqid
)

# UpSet plot of the per-species sets, ordered by intersection frequency;
# intersections with no members are drawn as well
upset(
  fromList(listInput_cassette),
  order.by = "freq",
  empty.intersections = "on"
)