.libPaths("/n/data1/cores/ntc/R-4.0.1/library/")
setwd("/n/scratch3/groups/hms/cores/ntc/Weinstock008")
library(DESeq2)
taking in filtered bed file made from running mhm on ATAC data
counting around peak centers.
have to revert to peak span for working bed file but excluding all
dREGs that have been removed based on low ATAC read count (<6 tot
reads).
linked value based on “dREG” list and filtered bed contains 11567
peak hits. final bed should agree.
centers <- read.table("PDX_filtered.bed", sep="\t")
peaks <- read.table("TCL_PDX_dist_peak.bed", sep="\t")
colnames(centers) <- c("Chr", "Start_center", "End_center", "dREG", "Blank", "Strand")
colnames(peaks) <- c("Chr", "Start", "End", "dREG", "Score", "Strand", "Center_Start", "Center_End")
combined <- peaks[(peaks$dREG %in% centers$dREG),]
write.table(combined, "PDX_filtered.bed", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
working on master data file with scripts run (bedtools closest and
intersect) to link GGA annot genes to dREG filtered annot enhancers.
add in counts data from PRO and ATAC across dREG span.
full <- read.table("overall.bed", sep="\t")
colnames(full) <- c("chr_gene", "start_gene", "end_gene", "name_gene", "score_gene", "strand_gene",
"chr_peak", "start_peak", "end_peak", "name_peak", "score_peak", "strand_peak", "center_start", "center_end", "biotype", "distance")
write.table(full, "PDX_Gene_to_Enhancer_master.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
load DEseq and normalize with DEseq size factors based on depth
Import gene body counts table generated in O2 into R
normalizing counts for all data types with DEseq size factors and
melding to full table
raw_atac <- read.table("PDX_ATAC_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_pro <- read.table("PDX_PRO_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_rna <- read.table("PDX_RNA_counts.txt", header = TRUE, sep = "\t", row.names = 1)
### Create metadata object to categorize samples:
metadata_atac <- data.frame(row.names = colnames(raw_atac[, 6:ncol(raw_atac)]),
disease = c(rep("AITL",2),rep("Other",5)))
metadata_atac # verify the table looks ok
#confirm that row and column names match
all(rownames(metadata_atac) %in% colnames(raw_atac))
all(rownames(metadata_atac) == colnames(raw_atac)[6:ncol(raw_atac)])
## Create DESeq Data Set from the count table, and sample list:
dds_atac <- DESeqDataSetFromMatrix(countData = raw_atac[, 6:ncol(raw_atac)], # the matrix of counts from featureCounts;
colData = metadata_atac, # the sample table created in the last step;
design= ~disease) # this is R's notation for models
dds_atac <- estimateSizeFactors(dds_atac)
deseq_sizes_atac <- as.data.frame(sizeFactors(dds_atac))
deseq_sizes_atac
write.table(deseq_sizes_atac, "ATAC_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
normalization of ATAC data
dds_atac <- estimateDispersions(dds_atac)
pdf("Dispersion_ATAC_norm.pdf")
plotDispEsts(dds_atac)
dev.off()
# check results of normalization
colSums(counts(dds_atac)) # raw read counts per sample
colSums(counts(dds_atac, normalized=T)) # Total number of normalized counts per sample
data.frame(colSums(counts(dds_atac)), (colSums(counts(dds_atac, normalized=T))), row.names=colnames(counts(dds_atac)))
write.table(counts(dds_atac, normalized=TRUE), file="ATAC_norm_counts.txt", sep="\t")
counts_atac <- read.table("ATAC_norm_counts.txt", sep="\t")
counts_atac$dREG <- row.names(counts_atac)
full <- merge(full, counts_atac, by.x = "name_peak", by.y = "dREG", all.x=TRUE)
Now PRO
Create metadata object to categorize samples:
metadata_pro <- data.frame(row.names = colnames(raw_pro[, 1:ncol(raw_pro)]),
disease = c(rep("AITL",2),rep("Other",5)))
metadata_pro # verify the table looks ok
#confirm that row and column names match
all(rownames(metadata_pro) %in% colnames(raw_pro))
all(rownames(metadata_pro) == colnames(raw_pro)[1:ncol(raw_pro)])
## Create DESeq Data Set from the count table, and sample list:
dds_pro <- DESeqDataSetFromMatrix(countData = raw_pro[, 1:ncol(raw_pro)], # the matrix of counts from featureCounts;
colData = metadata_pro, # the sample table created in the last step;
design= ~disease) # this is R's notation for models
dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
write.table(deseq_sizes_pro, "PRO_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
normalization of PRO data
dds_pro <- estimateDispersions(dds_pro)
pdf("Dispersion_PRO_norm.pdf")
plotDispEsts(dds_pro)
dev.off()
# check results of normalization
colSums(counts(dds_pro)) # raw read counts per sample
colSums(counts(dds_pro, normalized=T)) # Total number of normalized counts per sample
data.frame(colSums(counts(dds_pro)), (colSums(counts(dds_pro, normalized=T))), row.names=colnames(counts(dds_pro)))
write.table(counts(dds_pro, normalized=TRUE), file="PRO_norm_counts.txt", sep="\t")
counts_pro <- read.table("PRO_norm_counts.txt", sep="\t")
counts_pro$dREG <- row.names(counts_pro)
full <- merge(full, counts_pro, by.x = "name_peak", by.y = "dREG", all.x=TRUE)
Now RNA
Create metadata object to categorize samples:
metadata_rna <- data.frame(row.names = colnames(raw_rna[, 6:ncol(raw_rna)]),
disease = c(rep("AITL",2),rep("Other",5)))
metadata_rna # verify the table looks ok
#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(raw_rna))
all(rownames(metadata_rna) == colnames(raw_rna)[6:ncol(raw_rna)])
## Create DESeq Data Set from the count table, and sample list:
dds_rna <- DESeqDataSetFromMatrix(countData = raw_rna[, 6:ncol(raw_rna)], # the matrix of counts from featureCounts;
colData = metadata_rna, # the sample table created in the last step;
design= ~disease) # this is R's notation for models
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "RNA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
normalization of RNA data
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_RNA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()
# check results of normalization
colSums(counts(dds_rna)) # raw read counts per sample
colSums(counts(dds_rna, normalized=T)) # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))
write.table(counts(dds_rna, normalized=TRUE), file="RNA_norm_counts.txt", sep="\t")
counts_rna <- read.table("RNA_norm_counts.txt", sep="\t")
counts_rna$dREG <- row.names(counts_rna)
full <- merge(full, counts_rna, by.x = "name_peak", by.y = "dREG", all.x=TRUE)
final table completed
write.table(full, "PDX_Final_gene_to_enhancer.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
# clean-up and save out DEseq data sets
saveRDS(dds_atac, file="ATAC.rds")
saveRDS(dds_pro, file="PRO.rds")
saveRDS(dds_rna, file="RNA.rds")
# RNA counts for GGA locations #########
gga_rna <- read.table("RNA_GGA_counts.txt", header = TRUE, sep = "\t", row.names = 1)
### Create metadata object to categorize samples:
metadata_rna <- data.frame(row.names = colnames(gga_rna[, 6:ncol(gga_rna)]),
disease = c(rep("AITL",2),rep("Other",5)))
metadata_rna # verify the table looks ok
#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(gga_rna))
all(rownames(metadata_rna) == colnames(gga_rna)[6:ncol(gga_rna)])
# Create DESeq Data Set from the count table, and sample list:
dds_rna <- DESeqDataSetFromMatrix(countData = gga_rna[, 6:ncol(gga_rna)], # the matrix of counts from featureCounts;
colData = metadata_rna, # the sample table created in the last step;
design= ~disease) # this is R's notation for models
dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "GGA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
normalization of RNA data
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_GGA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()
# check results of normalization
colSums(counts(dds_rna)) # raw read counts per sample
colSums(counts(dds_rna, normalized=T)) # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))
write.table(counts(dds_rna, normalized=TRUE), file="TES_TSS_RNA_norm_counts.txt", sep="\t")
######### END #########
## MKJ 26SEP23
## MKJ 04OCT23
---
title: "Combined PDX Analysis of RNA-seq, PRO-seq, and ATAC-seq"
output: html_notebook
editor_options: 
chunk_output_type: inline
---

```{r}
.libPaths("/n/data1/cores/ntc/R-4.0.1/library/")
setwd("/n/scratch3/groups/hms/cores/ntc/Weinstock008")

library(DESeq2)

```

taking in filtered bed file made from running mhm on ATAC data counting around peak centers.

have to revert to peak span for working bed file but excluding all dREGs that have been removed based on low ATAC read count (<6 tot reads).

linked value based on "dREG" list and filtered bed contains 11567 peak hits. final bed should agree.

```{r}
centers <- read.table("PDX_filtered.bed", sep="\t")
peaks <- read.table("TCL_PDX_dist_peak.bed", sep="\t")

colnames(centers) <- c("Chr", "Start_center", "End_center", "dREG", "Blank", "Strand")
colnames(peaks) <- c("Chr", "Start", "End", "dREG", "Score", "Strand", "Center_Start", "Center_End")

combined <- peaks[(peaks$dREG %in% centers$dREG),]
write.table(combined, "PDX_filtered.bed", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

```

working on master data file with scripts run (bedtools closest and intersect) to link GGA annot genes to dREG filtered annot enhancers.

add in counts data from PRO and ATAC across dREG span.

```{r}
full <- read.table("overall.bed", sep="\t")
colnames(full) <- c("chr_gene", "start_gene", "end_gene", "name_gene", "score_gene", "strand_gene", 
     "chr_peak", "start_peak", "end_peak", "name_peak", "score_peak", "strand_peak", "center_start", "center_end", "biotype", "distance")

write.table(full, "PDX_Gene_to_Enhancer_master.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

```

load DEseq and normalize with DEseq size factors based on depth

Import gene body counts table generated in O2 into R

normalizing counts for all data types with DEseq size factors and melding to full table

```{r}
raw_atac <- read.table("PDX_ATAC_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_pro <- read.table("PDX_PRO_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_rna <- read.table("PDX_RNA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_atac <- data.frame(row.names = colnames(raw_atac[, 6:ncol(raw_atac)]),
                       disease = c(rep("AITL",2),rep("Other",5)))
                       
metadata_atac # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_atac) %in% colnames(raw_atac))
all(rownames(metadata_atac) == colnames(raw_atac)[6:ncol(raw_atac)])


## Create DESeq Data Set from the count table, and sample list:

dds_atac <- DESeqDataSetFromMatrix(countData = raw_atac[, 6:ncol(raw_atac)],      # the matrix of counts from featureCounts;
                              colData = metadata_atac,                     # the sample table created in the last step;
                              design= ~disease)            # this is R's notation for models

dds_atac <- estimateSizeFactors(dds_atac)
deseq_sizes_atac <- as.data.frame(sizeFactors(dds_atac))
deseq_sizes_atac
write.table(deseq_sizes_atac, "ATAC_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of ATAC data

```{r}
dds_atac <- estimateDispersions(dds_atac)
pdf("Dispersion_ATAC_norm.pdf")
plotDispEsts(dds_atac)
dev.off()

# check results of normalization
colSums(counts(dds_atac))                # raw read counts per sample
colSums(counts(dds_atac, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_atac)), (colSums(counts(dds_atac, normalized=T))), row.names=colnames(counts(dds_atac)))

write.table(counts(dds_atac, normalized=TRUE), file="ATAC_norm_counts.txt", sep="\t")
counts_atac <- read.table("ATAC_norm_counts.txt", sep="\t")
counts_atac$dREG <- row.names(counts_atac)

full <- merge(full, counts_atac, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```
Now PRO

Create metadata object to categorize samples:

```{r}
metadata_pro <- data.frame(row.names = colnames(raw_pro[, 1:ncol(raw_pro)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_pro # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_pro) %in% colnames(raw_pro))
all(rownames(metadata_pro) == colnames(raw_pro)[1:ncol(raw_pro)])


## Create DESeq Data Set from the count table, and sample list:

dds_pro <- DESeqDataSetFromMatrix(countData = raw_pro[, 1:ncol(raw_pro)],      # the matrix of counts from featureCounts;
                                   colData = metadata_pro,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
write.table(deseq_sizes_pro, "PRO_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of PRO data

```{r}
dds_pro <- estimateDispersions(dds_pro)
pdf("Dispersion_PRO_norm.pdf")
plotDispEsts(dds_pro)
dev.off()

# check results of normalization
colSums(counts(dds_pro))                # raw read counts per sample
colSums(counts(dds_pro, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_pro)), (colSums(counts(dds_pro, normalized=T))), row.names=colnames(counts(dds_pro)))

write.table(counts(dds_pro, normalized=TRUE), file="PRO_norm_counts.txt", sep="\t")
counts_pro <- read.table("PRO_norm_counts.txt", sep="\t")
counts_pro$dREG <- row.names(counts_pro)


full <- merge(full, counts_pro, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```
Now RNA

Create metadata object to categorize samples:

```{r}
metadata_rna <- data.frame(row.names = colnames(raw_rna[, 6:ncol(raw_rna)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(raw_rna))
all(rownames(metadata_rna) == colnames(raw_rna)[6:ncol(raw_rna)])


## Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = raw_rna[, 6:ncol(raw_rna)],      # the matrix of counts from featureCounts;
                                   colData = metadata_rna,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "RNA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of RNA data

```{r}
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_RNA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="RNA_norm_counts.txt", sep="\t")
counts_rna <- read.table("RNA_norm_counts.txt", sep="\t")
counts_rna$dREG <- row.names(counts_rna)

full <- merge(full, counts_rna, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```

final table completed

```{r}
write.table(full, "PDX_Final_gene_to_enhancer.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

# clean-up and save out DEseq data sets

saveRDS(dds_atac, file="ATAC.rds")
saveRDS(dds_pro, file="PRO.rds")
saveRDS(dds_rna, file="RNA.rds")

```

```{r}
# RNA counts for GGA locations #########

gga_rna <- read.table("RNA_GGA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_rna <- data.frame(row.names = colnames(gga_rna[, 6:ncol(gga_rna)]),
                           disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(gga_rna))
all(rownames(metadata_rna) == colnames(gga_rna)[6:ncol(gga_rna)])

# Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = gga_rna[, 6:ncol(gga_rna)],      # the matrix of counts from featureCounts;
                                  colData = metadata_rna,                     # the sample table created in the last step;
                                  design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "GGA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of RNA data

```{r}
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_GGA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="TES_TSS_RNA_norm_counts.txt", sep="\t")

######### END #########
## MKJ 26SEP23
## MKJ 04OCT23
```