Homework 2

# Pre-process the bed files by sorting instances
bedtools sort -i ERa_hg18.bed > out/ERa_hg18_sorted.bed
bedtools sort -i ERb_hg18.bed > out/ERb_hg18_sorted.bed

# Calculate genome coverage
#bedtools genomecov -i out/ERa_hg18_sorted.bed -g hg18_chrom_sizes.txt > out/ERa_hg18_coverage.txt

#bedtools genomecov -i out/ERb_hg18_sorted.bed -g hg18_chrom_sizes.txt > out/ERb_hg18_coverage.txt
# Load packages
library("ggplot2")
library("VennDiagram")
## Loading required package: grid

## Loading required package: futile.logger
# Read the coverage files
df_ERa <- read.table("out/ERa_hg18_coverage.txt")
df_ERb <- read.table("out/ERb_hg18_coverage.txt")
# Add a column with the site name
df_ERa$site <- "ERa"
df_ERb$site <- "ERb"
# Combine 2 dataframes
df_ER <- rbind(df_ERa, df_ERb)
# Edit column names
colnames(df_ER) <- c("chr", "depth", "coverage_size", "chr_size", "fraction", "site")
# Remove rows of 'genome'
df_ER <- df_ER[df_ER$chr != 'genome', ]
# Keep only rows with coverage
df_ER_hits <- df_ER[df_ER$depth == 1, ]
# Plot
p <- ggplot(df_ER_hits, aes(chr, fraction))
p + geom_bar(stat = "identity", aes(fill = site), position = "dodge")

wc -l out/ERa_hg18_sorted.bed
# A = 581

wc -l out/ERb_hg18_sorted.bed
# B = 485

bedtools intersect -a out/ERa_hg18_sorted.bed -b out/ERb_hg18_sorted.bed -c | awk '$4 == 1' | wc -l
# A and B = 345

bedtools intersect -a out/ERa_hg18_sorted.bed -b out/ERb_hg18_sorted.bed -v | wc -l
# A and not B = 236

bedtools intersect -b out/ERa_hg18_sorted.bed -a out/ERb_hg18_sorted.bed -v | wc -l
# B and not A = 140
##      581 out/ERa_hg18_sorted.bed
##      485 out/ERb_hg18_sorted.bed
##      345
##      236
##      140
venn.plot <- draw.pairwise.venn(581, 485, 345, 
                                c("ERa", "ERb"), fill = c("salmon", "turquoise"));
grid.draw(venn.plot);

grid.newpage();