Download the data (skip this step if the files are already present; I already have them downloaded).
# Fetch the deduplicated, sorted Bismark alignment (BAM) and its index (BAI).
# Stop if the data directory is missing so the files do not land elsewhere.
cd ../data || exit 1
# -f: fail on HTTP errors instead of saving the server's error page as the file
# -L: follow redirects
# -O: keep the remote file name
curl -fLO https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/120321-cvBS/19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam
curl -fLO https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/120321-cvBS/19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam.bai
Download the gene BED file.
# Fetch the C_virginica-3.0 Gnomon gene track (BED) into the data directory.
# Stop if the data directory is missing so the file does not land elsewhere.
cd ../data || exit 1
# -f: fail on HTTP errors instead of saving the server's error page as the file
# -L: follow redirects
# -O: keep the remote file name
curl -fLO https://eagle.fish.washington.edu/Cvirg_tracks/C_virginica-3.0_Gnomon_genes.bed
Convert the BAM file to a BED file, then compute coverage of each gene
# Convert the alignments to BED intervals. Relative paths are used, as in the
# other steps, so the file written here is the one the coverage step reads.
# The trailing && keeps coverage from running on a partial BED file if the
# conversion fails.
/home/shared/bedtools2/bin/bedtools bamtobed \
  -i ../data/19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam \
  > ../output/08-19F.bed &&
# For each gene interval (-a), summarise how it is covered by the read
# intervals (-b).
/home/shared/bedtools2/bin/bedtools coverage \
  -a ../data/C_virginica-3.0_Gnomon_genes.bed \
  -b ../output/08-19F.bed \
  > ../output/08-gene-19F-coverage.out
Inspect the coverage output file.
# Preview the first ten lines of the per-gene coverage table.
head -n 10 ../output/08-gene-19F-coverage.out
## NC_035780.1 13578 14594 gene-LOC111116054 0 + 65 1008 1016 0.9921260
## NC_035780.1 28961 33324 gene-LOC111126949 0 + 2679 4363 4363 1.0000000
## NC_035780.1 43111 66897 gene-LOC111110729 0 - 11794 23132 23786 0.9725049
## NC_035780.1 85606 95254 gene-LOC111112434 0 - 4618 9648 9648 1.0000000
## NC_035780.1 99840 106460 gene-LOC111120752 0 + 4023 6620 6620 1.0000000
## NC_035780.1 108305 110077 gene-LOC111128944 0 - 1527 1772 1772 1.0000000
## NC_035780.1 151859 157536 gene-LOC111128953 0 + 3334 5677 5677 1.0000000
## NC_035780.1 163809 183798 gene-LOC111105691 0 - 6683 19625 19989 0.9817900
## NC_035780.1 164820 166793 gene-LOC111105685 0 + 684 1973 1973 1.0000000
## NC_035780.1 169468 170178 gene-LOC111105702 0 - 138 710 710 1.0000000
Download the gene annotations (GFF), transposable elements (BED), and lncRNAs (GFF).
# Fetch the cgigas_uk_roslin_v1 gene (GFF), repeat-masker TE (BED) and
# lncRNA (GFF) tracks into the data directory.
# Stop if the data directory is missing so the files do not land elsewhere.
cd ../data || exit 1
# -f: fail on HTTP errors instead of saving the server's error page as the file
# -L: follow redirects (these are plain-http URLs)
# -O: keep the remote file name
curl -fLO http://owl.fish.washington.edu/halfshell/genomic-databank/cgigas_uk_roslin_v1_gene.gff
curl -fLO http://owl.fish.washington.edu/halfshell/genomic-databank/cgigas_uk_roslin_v1_rm.te.bed
curl -fLO http://owl.fish.washington.edu/halfshell/genomic-databank/cgigas_uk_roslin_v1_lncRNA.gff
Find where the genes and the transposable elements intersect.
# Intersect the gene records (-a) with the transposable-element intervals (-b)
# and save the result.
/home/shared/bedtools2/bin/bedtools intersect -a ../data/cgigas_uk_roslin_v1_gene.gff -b ../data/cgigas_uk_roslin_v1_rm.te.bed > ../output/08-gene-TE-intersect.out
# Preview the first two records of the result.
head -n 2 ../output/08-gene-TE-intersect.out
## NC_047559.1 Gnomon gene 15715 15759 . + . ID=gene-LOC109621113;Dbxref=GeneID:109621113;Name=LOC109621113;gbkey=Gene;gene=LOC109621113;gene_biotype=protein_coding
## NC_047559.1 Gnomon gene 19138 19160 . - . ID=gene-LOC117687066;Dbxref=GeneID:117687066;Name=LOC117687066;gbkey=Gene;gene=LOC117687066;gene_biotype=protein_coding
# For every lncRNA record (-a), report the closest record in the gene GFF (-b).
# NOTE(review): the gene GFF also holds gene_biotype=lncRNA entries, so an
# lncRNA can be matched to its own gene record -- confirm this is intended.
/home/shared/bedtools2/bin/bedtools closest -a ../data/cgigas_uk_roslin_v1_lncRNA.gff -b ../data/cgigas_uk_roslin_v1_gene.gff > ../output/08-lnc-gene-closet.out
Take a look at the output file.
# Preview the first ten lines of the lncRNA-to-gene closest-feature table.
head -n 10 ../output/08-lnc-gene-closet.out
## NC_047559.1 Gnomon lnc_RNA 9839 11386 . + . ID=rna-XR_004604272.1;Parent=gene-LOC117693020;Dbxref=GeneID:117693020,Genbank:XR_004604272.1;Name=XR_004604272.1;gbkey=ncRNA;gene=LOC117693020;model_evidence=Supporting evidence includes similarity to: 1 EST%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 15 samples with support for all annotated introns;product=uncharacterized LOC117693020;transcript_id=XR_004604272.1 NC_047559.1 Gnomon gene 9839 11386 . + . ID=gene-LOC117693020;Dbxref=GeneID:117693020;Name=LOC117693020;gbkey=Gene;gene=LOC117693020;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 167270 168430 . - . ID=rna-XR_004601744.1;Parent=gene-LOC117689460;Dbxref=GeneID:117689460,Genbank:XR_004601744.1;Name=XR_004601744.1;gbkey=ncRNA;gene=LOC117689460;model_evidence=Supporting evidence includes similarity to: 3 long SRA reads%2C and 98%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 52 samples with support for all annotated introns;product=uncharacterized LOC117689460;transcript_id=XR_004601744.1 NC_047559.1 Gnomon gene 151758 185673 . + . ID=gene-LOC117687070;Dbxref=GeneID:117687070;Name=LOC117687070;gbkey=Gene;gene=LOC117687070;gene_biotype=protein_coding
## NC_047559.1 Gnomon lnc_RNA 167270 168430 . - . ID=rna-XR_004601744.1;Parent=gene-LOC117689460;Dbxref=GeneID:117689460,Genbank:XR_004601744.1;Name=XR_004601744.1;gbkey=ncRNA;gene=LOC117689460;model_evidence=Supporting evidence includes similarity to: 3 long SRA reads%2C and 98%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 52 samples with support for all annotated introns;product=uncharacterized LOC117689460;transcript_id=XR_004601744.1 NC_047559.1 Gnomon gene 167270 168430 . - . ID=gene-LOC117689460;Dbxref=GeneID:117689460;Name=LOC117689460;gbkey=Gene;gene=LOC117689460;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 226703 229170 . + . ID=rna-XR_004596449.1;Parent=gene-LOC105326952;Dbxref=GeneID:105326952,Genbank:XR_004596449.1;Name=XR_004596449.1;gbkey=ncRNA;gene=LOC105326952;model_evidence=Supporting evidence includes similarity to: 1 long SRA read%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 5 samples with support for all annotated introns;product=uncharacterized LOC105326952;transcript_id=XR_004596449.1 NC_047559.1 Gnomon gene 226703 229170 . + . ID=gene-LOC105326952;Dbxref=GeneID:105326952;Name=LOC105326952;gbkey=Gene;gene=LOC105326952;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 242189 242939 . - . ID=rna-XR_004602779.1;Parent=gene-LOC117690921;Dbxref=GeneID:117690921,Genbank:XR_004602779.1;Name=XR_004602779.1;gbkey=ncRNA;gene=LOC117690921;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 46 samples with support for all annotated introns;product=uncharacterized LOC117690921;transcript_id=XR_004602779.1 NC_047559.1 Gnomon gene 242189 242939 . - . ID=gene-LOC117690921;Dbxref=GeneID:117690921;Name=LOC117690921;gbkey=Gene;gene=LOC117690921;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 254843 258147 . + . ID=rna-XR_004602775.1;Parent=gene-LOC117690907;Dbxref=GeneID:117690907,Genbank:XR_004602775.1;Name=XR_004602775.1;gbkey=ncRNA;gene=LOC117690907;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 9 samples with support for all annotated introns;product=uncharacterized LOC117690907;transcript_id=XR_004602775.1 NC_047559.1 Gnomon gene 254843 258147 . + . ID=gene-LOC117690907;Dbxref=GeneID:117690907;Name=LOC117690907;gbkey=Gene;gene=LOC117690907;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 415528 418254 . - . ID=rna-XR_004598226.1;Parent=gene-LOC117684466;Dbxref=GeneID:117684466,Genbank:XR_004598226.1;Name=XR_004598226.1;gbkey=ncRNA;gene=LOC117684466;model_evidence=Supporting evidence includes similarity to: 2 long SRA reads%2C and 92%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 125 samples with support for all annotated introns;product=uncharacterized LOC117684466%2C transcript variant X1;transcript_id=XR_004598226.1 NC_047559.1 Gnomon gene 355807 433459 . + . ID=gene-LOC105346550;Dbxref=GeneID:105346550;Name=LOC105346550;gbkey=Gene;gene=LOC105346550;gene_biotype=protein_coding
## NC_047559.1 Gnomon lnc_RNA 415528 418254 . - . ID=rna-XR_004598226.1;Parent=gene-LOC117684466;Dbxref=GeneID:117684466,Genbank:XR_004598226.1;Name=XR_004598226.1;gbkey=ncRNA;gene=LOC117684466;model_evidence=Supporting evidence includes similarity to: 2 long SRA reads%2C and 92%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 125 samples with support for all annotated introns;product=uncharacterized LOC117684466%2C transcript variant X1;transcript_id=XR_004598226.1 NC_047559.1 Gnomon gene 415528 421174 . - . ID=gene-LOC117684466;Dbxref=GeneID:117684466;Name=LOC117684466;gbkey=Gene;gene=LOC117684466;gene_biotype=lncRNA
## NC_047559.1 Gnomon lnc_RNA 415530 421174 . - . ID=rna-XR_004598230.1;Parent=gene-LOC117684466;Dbxref=GeneID:117684466,Genbank:XR_004598230.1;Name=XR_004598230.1;gbkey=ncRNA;gene=LOC117684466;model_evidence=Supporting evidence includes similarity to: 2 long SRA reads%2C and 92%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 6 samples with support for all annotated introns;product=uncharacterized LOC117684466%2C transcript variant X4;transcript_id=XR_004598230.1 NC_047559.1 Gnomon gene 355807 433459 . + . ID=gene-LOC105346550;Dbxref=GeneID:105346550;Name=LOC105346550;gbkey=Gene;gene=LOC105346550;gene_biotype=protein_coding
## NC_047559.1 Gnomon lnc_RNA 415530 421174 . - . ID=rna-XR_004598230.1;Parent=gene-LOC117684466;Dbxref=GeneID:117684466,Genbank:XR_004598230.1;Name=XR_004598230.1;gbkey=ncRNA;gene=LOC117684466;model_evidence=Supporting evidence includes similarity to: 2 long SRA reads%2C and 92%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 6 samples with support for all annotated introns;product=uncharacterized LOC117684466%2C transcript variant X4;transcript_id=XR_004598230.1 NC_047559.1 Gnomon gene 415528 421174 . - . ID=gene-LOC117684466;Dbxref=GeneID:117684466;Name=LOC117684466;gbkey=Gene;gene=LOC117684466;gene_biotype=lncRNA