library(knitr)
library(ggplot2)
library(dplyr)
Here is the format instruction
1. chrom - Name of the chromosome (or contig, scaffold, etc.).
2. chromStart - The starting position of the feature in the chromosome
or scaffold. The first base in a chromosome is numbered 0.
3. chromEnd - The ending position of the feature in the chromosome or
scaffold. The chromEnd base is not included in the display of the
feature. For example, the first 100 bases of a chromosome are defined as
chromStart=0, chromEnd=100, and span the bases numbered 0-99.
4. name - Name given to a region (preferably unique). Use “.” if no name
is assigned.
5. score - Indicates how dark the peak will be displayed in the browser
(0-1000). If all scores were “‘0”’ when the data were submitted to the
DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the
average signalValue per base spread is between 100-1000.
6. strand - +/- to denote strand or orientation (whenever applicable).
Use”.” if no orientation is assigned.
7. signalValue - Measurement of overall (usually, average) enrichment
for the region.
8. pValue - Measurement of statistical significance (-log10). Use -1 if
no pValue is assigned.
9. qValue - Measurement of statistical significance using false
discovery rate (-log10). Use -1 if no qValue is assigned.
10. peak - Point-source called for this peak; 0-based offset from
chromStart. Use -1 if no point-source called.
### Load narrow peaks
Peak <- read.table("/Users/leon/Documents/Project/Cistrome/peaks/HSF_tnt/HSF7_colamp_a/chr1-5/chr1-5_GEM_events.narrowPeak", header = F)
colnames(Peak) <- c("chr","start", "end", "index", "score", "strand", "signalValue", "pvalue", "qvalue", "peak" )
### sort coordindates
Peak <- Peak[order(Peak[,"chr"], Peak[,"start"]),]
head(Peak)
### Extract positive sequences
Peak$summit <- Peak$start + 101
Peak$positive_start <- Peak$summit - 15
Peak$positive_end <- Peak$summit + 15
Peak$postitve_index <- paste0(Peak$chr, ":", Peak$positive_start, "-", Peak$positive_end)
### Extract two negative sequences
Peak$negative1_start <- Peak$start - 30
Peak$negative1_index <- paste0(Peak$chr, ":", Peak$negative1_start, "-", Peak$start)
Peak$negative2_end <- Peak$end + 30
Peak$negative2_index <- paste0(Peak$chr, ":", Peak$end, "-", Peak$negative2_end)
### Export the Peak information
Peak_index <- Peak[,c("postitve_index", "negative1_index", "negative2_index")]
head(Peak_index,20)
#write.table(Peak_index, "/Users/leon/Documents/Project/Cistrome/HSF7_colamp_a_PeakIndex")
HSFA6B <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSFA6B_col.csv",header = F)
HSFA6B_clean <- data.frame(t(HSFA6B))
colnames(HSFA6B_clean) <- HSFA6B_clean[1,]
HSFA6B_clean <- HSFA6B_clean[2:987,]
HSFA6B_clean$Window_str <- seq(-1971, -1,by = 2)
HSFA6B_clean$Window_end <- seq(-2001, -31,by = 2)
HSFA6B_clean$Window_mid <- seq(-1986, -16,by = 2)
head(HSFA6B_clean)
tail(HSFA6B_clean)
HSF7 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF7_col_a.csv",header = F)
HSF7_clean <- data.frame(t(HSF7))
colnames(HSF7_clean) <- HSF7_clean[1,]
HSF7_clean <- HSF7_clean[2:987,]
HSF7_clean$Window_str <- seq(-1971, -1,by = 2)
HSF7_clean$Window_end <- seq(-2001, -31,by = 2)
HSF7_clean$Window_mid <- seq(-1986, -16,by = 2)
head(HSF7_clean)
HSF6 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/wOTU-HSF6_col_a.csv",header = F)
HSF6_clean <- data.frame(t(HSF6))
colnames(HSF6_clean) <- HSF6_clean[1,]
HSF6_clean <- HSF6_clean[2:987,]
HSF6_clean$Window_str <- seq(-1971, -1,by = 2)
HSF6_clean$Window_end <- seq(-2001, -31,by = 2)
HSF6_clean$Window_mid <- seq(-1986, -16,by = 2)
head(HSF6_clean)
Gene_plot <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/Plot_GeneList.csv", header = T)
head(Gene_plot, 20)
### reload plot table
for (i in 1:nrow(Gene_plot)){
Plot_df <- HSFA6B_clean[,c(Gene_plot[i,1], "Window_mid")]
Plot_df[,1] <- as.numeric(Plot_df[,1])
print(ggplot(Plot_df,aes(x = Window_mid,y = get(Gene_plot[i,1]))) +
geom_point(size = 0.3, alpha = 0.5) +
geom_line(alpha = 0.5,size = 0.4, color = "steelblue") +
theme_bw(base_size = 10) +
theme(axis.text.x=element_text(angle=0)) +
ylab(paste0("Score of confidence of ", Gene_plot[i,7])) +
xlab ("Distance to TSS (bp)") +
ggtitle(paste0("CREs on 2Kb promoter of ", Gene_plot[i,1], " regulated by HSFA6B")))
}