CREs prediction using Cistrome data

Load packages

library(knitr)
library(ggplot2)
library(dplyr)

import narrow peak data and export the coordinates for extraction

Here is the format instruction 1. chrom - Name of the chromosome (or contig, scaffold, etc.).
2. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
3. chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
4. name - Name given to a region (preferably unique). Use “.” if no name is assigned.
5. score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were “‘0”’ when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
6. strand - +/- to denote strand or orientation (whenever applicable). Use”.” if no orientation is assigned.
7. signalValue - Measurement of overall (usually, average) enrichment for the region.
8. pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
9. qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
10. peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.

### Load narrow peaks
Peak <- read.table("/Users/leon/Documents/Project/Cistrome/peaks/HSF_tnt/HSF7_colamp_a/chr1-5/chr1-5_GEM_events.narrowPeak", header = F)
colnames(Peak) <- c("chr","start", "end", "index", "score", "strand", "signalValue", "pvalue", "qvalue", "peak" )

### sort coordindates
Peak <- Peak[order(Peak[,"chr"], Peak[,"start"]),]
head(Peak)
### Extract positive sequences 
Peak$summit <- Peak$start + 101
Peak$positive_start <- Peak$summit - 15
Peak$positive_end <- Peak$summit + 15
Peak$postitve_index <- paste0(Peak$chr, ":", Peak$positive_start, "-", Peak$positive_end)

### Extract  two negative sequences 
Peak$negative1_start <- Peak$start - 30
Peak$negative1_index <- paste0(Peak$chr, ":", Peak$negative1_start, "-", Peak$start)

Peak$negative2_end <- Peak$end + 30
Peak$negative2_index <- paste0(Peak$chr, ":", Peak$end, "-", Peak$negative2_end)

### Export the Peak information 
Peak_index <- Peak[,c("postitve_index", "negative1_index", "negative2_index")]
head(Peak_index,20)
#write.table(Peak_index, "/Users/leon/Documents/Project/Cistrome/HSF7_colamp_a_PeakIndex")

Load coordinates of DL-derived binding motif

HSFA6B <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSFA6B_col.csv",header = F)

HSFA6B_clean <- data.frame(t(HSFA6B))
colnames(HSFA6B_clean) <- HSFA6B_clean[1,]
HSFA6B_clean <- HSFA6B_clean[2:987,]

HSFA6B_clean$Window_str <- seq(-1971, -1,by = 2)
HSFA6B_clean$Window_end <- seq(-2001, -31,by = 2)
HSFA6B_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSFA6B_clean)
tail(HSFA6B_clean)
HSF7 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF7_col_a.csv",header = F)
HSF7_clean <- data.frame(t(HSF7))
colnames(HSF7_clean) <- HSF7_clean[1,]
HSF7_clean <- HSF7_clean[2:987,]

HSF7_clean$Window_str <- seq(-1971, -1,by = 2)
HSF7_clean$Window_end <- seq(-2001, -31,by = 2)
HSF7_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSF7_clean)
HSF6 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF6_col_a.csv",header = F)
HSF6_clean <- data.frame(t(HSF6))
colnames(HSF6_clean) <- HSF6_clean[1,]
HSF6_clean <- HSF6_clean[2:987,]

HSF6_clean$Window_str <- seq(-1971, -1,by = 2)
HSF6_clean$Window_end <- seq(-2001, -31,by = 2)
HSF6_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSF6_clean)

Load the plot table

Gene_plot <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/Plot_GeneList.csv", header = T)

head(Gene_plot, 20)

Plot the genes of interests of HSFA6B

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSFA6B_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSFA6B")))
}

Plot the genes of interests of HSF6

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSF6_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSF6")))
}

Plot the genes of interests of HSF7

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSF7_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSF7")))
}