.libPaths("/n/data1/cores/ntc/R-4.0.1/library/")
setwd("/n/scratch3/groups/hms/cores/ntc/Weinstock008")

library(DESeq2)

taking in filtered bed file made from running mhm on ATAC data counting around peak centers.

have to revert to peak span for working bed file but excluding all dREGs that have been removed based on low ATAC read count (<6 tot reads).

linked value based on “dREG” list and filtered bed contains 11567 peak hits. final bed should agree.

centers <- read.table("PDX_filtered.bed", sep="\t")
peaks <- read.table("TCL_PDX_dist_peak.bed", sep="\t")

colnames(centers) <- c("Chr", "Start_center", "End_center", "dREG", "Blank", "Strand")
colnames(peaks) <- c("Chr", "Start", "End", "dREG", "Score", "Strand", "Center_Start", "Center_End")

combined <- peaks[(peaks$dREG %in% centers$dREG),]
write.table(combined, "PDX_filtered.bed", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

working on master data file with scripts run (bedtools closest and intersect) to link GGA annot genes to dREG filtered annot enhancers.

add in counts data from PRO and ATAC across dREG span.

full <- read.table("overall.bed", sep="\t")
colnames(full) <- c("chr_gene", "start_gene", "end_gene", "name_gene", "score_gene", "strand_gene", 
     "chr_peak", "start_peak", "end_peak", "name_peak", "score_peak", "strand_peak", "center_start", "center_end", "biotype", "distance")

write.table(full, "PDX_Gene_to_Enhancer_master.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

load DEseq and normalize with DEseq size factors based on depth

Import gene body counts table generated in O2 into R

normalizing counts for all data types with DEseq size factors and melding to full table

raw_atac <- read.table("PDX_ATAC_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_pro <- read.table("PDX_PRO_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_rna <- read.table("PDX_RNA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_atac <- data.frame(row.names = colnames(raw_atac[, 6:ncol(raw_atac)]),
                       disease = c(rep("AITL",2),rep("Other",5)))
                       
metadata_atac # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_atac) %in% colnames(raw_atac))
all(rownames(metadata_atac) == colnames(raw_atac)[6:ncol(raw_atac)])


## Create DESeq Data Set from the count table, and sample list:

dds_atac <- DESeqDataSetFromMatrix(countData = raw_atac[, 6:ncol(raw_atac)],      # the matrix of counts from featureCounts;
                              colData = metadata_atac,                     # the sample table created in the last step;
                              design= ~disease)            # this is R's notation for models

dds_atac <- estimateSizeFactors(dds_atac)
deseq_sizes_atac <- as.data.frame(sizeFactors(dds_atac))
deseq_sizes_atac
write.table(deseq_sizes_atac, "ATAC_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

normalization of ATAC data

dds_atac <- estimateDispersions(dds_atac)
pdf("Dispersion_ATAC_norm.pdf")
plotDispEsts(dds_atac)
dev.off()

# check results of normalization
colSums(counts(dds_atac))                # raw read counts per sample
colSums(counts(dds_atac, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_atac)), (colSums(counts(dds_atac, normalized=T))), row.names=colnames(counts(dds_atac)))

write.table(counts(dds_atac, normalized=TRUE), file="ATAC_norm_counts.txt", sep="\t")
counts_atac <- read.table("ATAC_norm_counts.txt", sep="\t")
counts_atac$dREG <- row.names(counts_atac)

full <- merge(full, counts_atac, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

Now PRO

Create metadata object to categorize samples:

metadata_pro <- data.frame(row.names = colnames(raw_pro[, 1:ncol(raw_pro)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_pro # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_pro) %in% colnames(raw_pro))
all(rownames(metadata_pro) == colnames(raw_pro)[1:ncol(raw_pro)])


## Create DESeq Data Set from the count table, and sample list:

dds_pro <- DESeqDataSetFromMatrix(countData = raw_pro[, 1:ncol(raw_pro)],      # the matrix of counts from featureCounts;
                                   colData = metadata_pro,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
write.table(deseq_sizes_pro, "PRO_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

normalization of PRO data

dds_pro <- estimateDispersions(dds_pro)
pdf("Dispersion_PRO_norm.pdf")
plotDispEsts(dds_pro)
dev.off()

# check results of normalization
colSums(counts(dds_pro))                # raw read counts per sample
colSums(counts(dds_pro, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_pro)), (colSums(counts(dds_pro, normalized=T))), row.names=colnames(counts(dds_pro)))

write.table(counts(dds_pro, normalized=TRUE), file="PRO_norm_counts.txt", sep="\t")
counts_pro <- read.table("PRO_norm_counts.txt", sep="\t")
counts_pro$dREG <- row.names(counts_pro)


full <- merge(full, counts_pro, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

Now RNA

Create metadata object to categorize samples:

metadata_rna <- data.frame(row.names = colnames(raw_rna[, 6:ncol(raw_rna)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(raw_rna))
all(rownames(metadata_rna) == colnames(raw_rna)[6:ncol(raw_rna)])


## Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = raw_rna[, 6:ncol(raw_rna)],      # the matrix of counts from featureCounts;
                                   colData = metadata_rna,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "RNA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

normalization of RNA data

dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_RNA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="RNA_norm_counts.txt", sep="\t")
counts_rna <- read.table("RNA_norm_counts.txt", sep="\t")
counts_rna$dREG <- row.names(counts_rna)

full <- merge(full, counts_rna, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

final table completed

write.table(full, "PDX_Final_gene_to_enhancer.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

# clean-up and save out DEseq data sets

saveRDS(dds_atac, file="ATAC.rds")
saveRDS(dds_pro, file="PRO.rds")
saveRDS(dds_rna, file="RNA.rds")
# RNA counts for GGA locations #########

gga_rna <- read.table("RNA_GGA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_rna <- data.frame(row.names = colnames(gga_rna[, 6:ncol(gga_rna)]),
                           disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(gga_rna))
all(rownames(metadata_rna) == colnames(gga_rna)[6:ncol(gga_rna)])

# Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = gga_rna[, 6:ncol(gga_rna)],      # the matrix of counts from featureCounts;
                                  colData = metadata_rna,                     # the sample table created in the last step;
                                  design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "GGA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

normalization of RNA data

dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_GGA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="TES_TSS_RNA_norm_counts.txt", sep="\t")

######### END #########
## MKJ 26SEP23
## MKJ 04OCT23
---
title: "Combined PDX Analysis of RNA-seq, PRO-seq, and ATAC-seq"
output: html_notebook
editor_options: 
chunk_output_type: inline
---

```{r}
.libPaths("/n/data1/cores/ntc/R-4.0.1/library/")
setwd("/n/scratch3/groups/hms/cores/ntc/Weinstock008")

library(DESeq2)

```

taking in filtered bed file made from running mhm on ATAC data counting around peak centers.

have to revert to peak span for working bed file but excluding all dREGs that have been removed based on low ATAC read count (<6 tot reads).

linked value based on "dREG" list and filtered bed contains 11567 peak hits. final bed should agree.

```{r}
centers <- read.table("PDX_filtered.bed", sep="\t")
peaks <- read.table("TCL_PDX_dist_peak.bed", sep="\t")

colnames(centers) <- c("Chr", "Start_center", "End_center", "dREG", "Blank", "Strand")
colnames(peaks) <- c("Chr", "Start", "End", "dREG", "Score", "Strand", "Center_Start", "Center_End")

combined <- peaks[(peaks$dREG %in% centers$dREG),]
write.table(combined, "PDX_filtered.bed", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

```

working on master data file with scripts run (bedtools closest and intersect) to link GGA annot genes to dREG filtered annot enhancers.

add in counts data from PRO and ATAC across dREG span.

```{r}
full <- read.table("overall.bed", sep="\t")
colnames(full) <- c("chr_gene", "start_gene", "end_gene", "name_gene", "score_gene", "strand_gene", 
     "chr_peak", "start_peak", "end_peak", "name_peak", "score_peak", "strand_peak", "center_start", "center_end", "biotype", "distance")

write.table(full, "PDX_Gene_to_Enhancer_master.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

```

load DEseq and normalize with DEseq size factors based on depth

Import gene body counts table generated in O2 into R

normalizing counts for all data types with DEseq size factors and melding to full table

```{r}
raw_atac <- read.table("PDX_ATAC_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_pro <- read.table("PDX_PRO_counts.txt", header = TRUE, sep = "\t", row.names = 1)
raw_rna <- read.table("PDX_RNA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_atac <- data.frame(row.names = colnames(raw_atac[, 6:ncol(raw_atac)]),
                       disease = c(rep("AITL",2),rep("Other",5)))
                       
metadata_atac # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_atac) %in% colnames(raw_atac))
all(rownames(metadata_atac) == colnames(raw_atac)[6:ncol(raw_atac)])


## Create DESeq Data Set from the count table, and sample list:

dds_atac <- DESeqDataSetFromMatrix(countData = raw_atac[, 6:ncol(raw_atac)],      # the matrix of counts from featureCounts;
                              colData = metadata_atac,                     # the sample table created in the last step;
                              design= ~disease)            # this is R's notation for models

dds_atac <- estimateSizeFactors(dds_atac)
deseq_sizes_atac <- as.data.frame(sizeFactors(dds_atac))
deseq_sizes_atac
write.table(deseq_sizes_atac, "ATAC_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of ATAC data

```{r}
dds_atac <- estimateDispersions(dds_atac)
pdf("Dispersion_ATAC_norm.pdf")
plotDispEsts(dds_atac)
dev.off()

# check results of normalization
colSums(counts(dds_atac))                # raw read counts per sample
colSums(counts(dds_atac, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_atac)), (colSums(counts(dds_atac, normalized=T))), row.names=colnames(counts(dds_atac)))

write.table(counts(dds_atac, normalized=TRUE), file="ATAC_norm_counts.txt", sep="\t")
counts_atac <- read.table("ATAC_norm_counts.txt", sep="\t")
counts_atac$dREG <- row.names(counts_atac)

full <- merge(full, counts_atac, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```
Now PRO

Create metadata object to categorize samples:

```{r}
metadata_pro <- data.frame(row.names = colnames(raw_pro[, 1:ncol(raw_pro)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_pro # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_pro) %in% colnames(raw_pro))
all(rownames(metadata_pro) == colnames(raw_pro)[1:ncol(raw_pro)])


## Create DESeq Data Set from the count table, and sample list:

dds_pro <- DESeqDataSetFromMatrix(countData = raw_pro[, 1:ncol(raw_pro)],      # the matrix of counts from featureCounts;
                                   colData = metadata_pro,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_pro <- estimateSizeFactors(dds_pro)
deseq_sizes_pro <- as.data.frame(sizeFactors(dds_pro))
deseq_sizes_pro
write.table(deseq_sizes_pro, "PRO_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of PRO data

```{r}
dds_pro <- estimateDispersions(dds_pro)
pdf("Dispersion_PRO_norm.pdf")
plotDispEsts(dds_pro)
dev.off()

# check results of normalization
colSums(counts(dds_pro))                # raw read counts per sample
colSums(counts(dds_pro, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_pro)), (colSums(counts(dds_pro, normalized=T))), row.names=colnames(counts(dds_pro)))

write.table(counts(dds_pro, normalized=TRUE), file="PRO_norm_counts.txt", sep="\t")
counts_pro <- read.table("PRO_norm_counts.txt", sep="\t")
counts_pro$dREG <- row.names(counts_pro)


full <- merge(full, counts_pro, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```
Now RNA

Create metadata object to categorize samples:

```{r}
metadata_rna <- data.frame(row.names = colnames(raw_rna[, 6:ncol(raw_rna)]),
                            disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(raw_rna))
all(rownames(metadata_rna) == colnames(raw_rna)[6:ncol(raw_rna)])


## Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = raw_rna[, 6:ncol(raw_rna)],      # the matrix of counts from featureCounts;
                                   colData = metadata_rna,                     # the sample table created in the last step;
                                   design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "RNA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of RNA data

```{r}
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_RNA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="RNA_norm_counts.txt", sep="\t")
counts_rna <- read.table("RNA_norm_counts.txt", sep="\t")
counts_rna$dREG <- row.names(counts_rna)

full <- merge(full, counts_rna, by.x = "name_peak", by.y = "dREG", all.x=TRUE)

```

final table completed

```{r}
write.table(full, "PDX_Final_gene_to_enhancer.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

# clean-up and save out DEseq data sets

saveRDS(dds_atac, file="ATAC.rds")
saveRDS(dds_pro, file="PRO.rds")
saveRDS(dds_rna, file="RNA.rds")

```

```{r}
# RNA counts for GGA locations #########

gga_rna <- read.table("RNA_GGA_counts.txt", header = TRUE, sep = "\t", row.names = 1)

### Create metadata object to categorize samples:

metadata_rna <- data.frame(row.names = colnames(gga_rna[, 6:ncol(gga_rna)]),
                           disease = c(rep("AITL",2),rep("Other",5)))

metadata_rna # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata_rna) %in% colnames(gga_rna))
all(rownames(metadata_rna) == colnames(gga_rna)[6:ncol(gga_rna)])

# Create DESeq Data Set from the count table, and sample list:

dds_rna <- DESeqDataSetFromMatrix(countData = gga_rna[, 6:ncol(gga_rna)],      # the matrix of counts from featureCounts;
                                  colData = metadata_rna,                     # the sample table created in the last step;
                                  design= ~disease)            # this is R's notation for models

dds_rna <- estimateSizeFactors(dds_rna)
deseq_sizes_rna <- as.data.frame(sizeFactors(dds_rna))
deseq_sizes_rna
write.table(deseq_sizes_rna, "GGA_norm_factors.txt", quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)

```

normalization of RNA data

```{r}
dds_rna <- estimateDispersions(dds_rna)
pdf("Dispersion_GGA_norm.pdf")
plotDispEsts(dds_rna)
dev.off()

# check results of normalization
colSums(counts(dds_rna))                # raw read counts per sample
colSums(counts(dds_rna, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds_rna)), (colSums(counts(dds_rna, normalized=T))), row.names=colnames(counts(dds_rna)))

write.table(counts(dds_rna, normalized=TRUE), file="TES_TSS_RNA_norm_counts.txt", sep="\t")

######### END #########
## MKJ 26SEP23
## MKJ 04OCT23
```