DL-based CREs predition of cotton TFs

CREs prediction using Cistrome data

Load packages

library(knitr)
library(ggplot2)
library(dplyr)

import narrow peak data and export the coordinates for extraction

Here is the format instruction 1. chrom - Name of the chromosome (or contig, scaffold, etc.).
2. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
3. chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
4. name - Name given to a region (preferably unique). Use “.” if no name is assigned.
5. score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were “‘0”’ when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
6. strand - +/- to denote strand or orientation (whenever applicable). Use”.” if no orientation is assigned.
7. signalValue - Measurement of overall (usually, average) enrichment for the region.
8. pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
9. qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
10. peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.

### Load narrow peaks
Peak <- read.table("/Users/leon/Documents/Project/Cistrome/peaks/HSF_tnt/HSF7_colamp_a/chr1-5/chr1-5_GEM_events.narrowPeak", header = F)
colnames(Peak) <- c("chr","start", "end", "index", "score", "strand", "signalValue", "pvalue", "qvalue", "peak" )

### sort coordindates
Peak <- Peak[order(Peak[,"chr"], Peak[,"start"]),]
head(Peak)

### Extract positive sequences 
Peak$summit <- Peak$start + 101
Peak$positive_start <- Peak$summit - 15
Peak$positive_end <- Peak$summit + 15
Peak$postitve_index <- paste0(Peak$chr, ":", Peak$positive_start, "-", Peak$positive_end)

### Extract  two negative sequences 
Peak$negative1_start <- Peak$start - 30
Peak$negative1_index <- paste0(Peak$chr, ":", Peak$negative1_start, "-", Peak$start)

Peak$negative2_end <- Peak$end + 30
Peak$negative2_index <- paste0(Peak$chr, ":", Peak$end, "-", Peak$negative2_end)

### Export the Peak information 
Peak_index <- Peak[,c("postitve_index", "negative1_index", "negative2_index")]
head(Peak_index,20)

#write.table(Peak_index, "/Users/leon/Documents/Project/Cistrome/HSF7_colamp_a_PeakIndex")

Load coordinates of DL-derived binding motif

HSFA6B <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSFA6B_col.csv",header = F)

HSFA6B_clean <- data.frame(t(HSFA6B))
colnames(HSFA6B_clean) <- HSFA6B_clean[1,]
HSFA6B_clean <- HSFA6B_clean[2:987,]

HSFA6B_clean$Window_str <- seq(-1971, -1,by = 2)
HSFA6B_clean$Window_end <- seq(-2001, -31,by = 2)
HSFA6B_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSFA6B_clean)

tail(HSFA6B_clean)

HSF7 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF7_col_a.csv",header = F)
HSF7_clean <- data.frame(t(HSF7))
colnames(HSF7_clean) <- HSF7_clean[1,]
HSF7_clean <- HSF7_clean[2:987,]

HSF7_clean$Window_str <- seq(-1971, -1,by = 2)
HSF7_clean$Window_end <- seq(-2001, -31,by = 2)
HSF7_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSF7_clean)

HSF6 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF6_col_a.csv",header = F)
HSF6_clean <- data.frame(t(HSF6))
colnames(HSF6_clean) <- HSF6_clean[1,]
HSF6_clean <- HSF6_clean[2:987,]

HSF6_clean$Window_str <- seq(-1971, -1,by = 2)
HSF6_clean$Window_end <- seq(-2001, -31,by = 2)
HSF6_clean$Window_mid <- seq(-1986, -16,by = 2)

head(HSF6_clean)

Load the plot table

Gene_plot <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/Plot_GeneList.csv", header = T)

head(Gene_plot, 20)

Plot the genes of interests of HSFA6B

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSFA6B_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSFA6B")))
}

Plot the genes of interests of HSF6

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSF6_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSF6")))
}

Plot the genes of interests of HSF7

### reload plot table 
for (i in 1:nrow(Gene_plot)){
  Plot_df <- HSF7_clean[,c(Gene_plot[i,1], "Window_mid")]
  Plot_df[,1] <- as.numeric(Plot_df[,1])
  
  print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
    geom_point(size = 0.3, alpha = 0.5) +
    geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
    theme_bw(base_size = 10) +
    theme(axis.text.x=element_text(angle=0)) + 
    ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
    xlab ("Distance to TSS (bp)") +
    ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSF7")))
}