##################################################################################
##### DESeq2 R script for Differential Expression using pairwise comparisons #####
##################################################################################

## Set working directory
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts")
getwd()
[1] "/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts"
## Before running this script start by transferring the necessary files to the working directory set above:
# All .out.bam files: these were generated from the NTC standard pipeline
# .gtf file with annotations of genome of interest: can be downloaded from multiple sources; or there are dedicated R packages

## Load DESeq2 and Rsubread packages

library(DESeq2)        # Loads DESeq2 package
library(Rsubread)      # Loads Rsubread package (counts fragments for each feature)
library(tidyverse)     # Loads tidyverse which allows various convenient data manipulations
library(RColorBrewer)
library(pheatmap)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
library(ggpubr)

### featureCount steps performed on O2 to avoid having to make local copies of .sam files

### shell script used to run feature count ###  

# Import gene body counts table generated in O2 into R
fc <- read.table("PRO_ALCL_featurecounts", sep="\t", header=TRUE, row.names=1)

# Import SAF file for later use mapping tx_id to gene names.  convert to tibble.  If you are able to get annotation from your counts table, this may not be necessary.
#gtf <- read.table(".gtf", sep = "\t", header = TRUE, row.names=1) 
###  Give nice names to the columns in counts

# Store original colnames
#orig_names <- colnames(fc)

### Renaming columns.  This is really inelegant but it worked.  Depending on naming, may need to adjust trim length.

# Trim 38 char from left side, delete "_sorted.sam" from right, replace . with _
#new_sample_colnames <- str_sub(colnames(fc), 29, str_length(colnames(fc))) %>% str_replace("_sorted.sam","") %>% str_replace("\\.","_")
#names_col1_5 <- colnames(fc)[1:5]                                                # save the names of columns 1-5
#new_names <- c(names_col1_5, new_sample_colnames[6:length(new_sample_colnames)])     # combine names of cols 1-5 with shortened samples names
#new_names

#colnames(fc) <- new_names             # replace old column names with new names
colnames(fc)[6:14] <- c("KARAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(fc)                          # check names
 [1] "Chr"      "Start"    "End"      "Strand"   "Length"   "KARAS299" "L82"      "SUPM2"    "SUDHL1"  
[10] "KIJK"     "SR786"    "FEPD"     "DL40"     "MAC2A"   
### Create metadata object to categorize samples:

metadata <- data.frame(row.names = colnames(fc[, 6:ncol(fc)]),
                       subtype = c(rep("ALK_positive",6),rep("ALK_negative",3))
)  # genotype+timepoint, used for DEseq conditions
metadata # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata) %in% colnames(fc))
[1] TRUE
all(rownames(metadata) == colnames(fc)[6:ncol(fc)])
[1] TRUE
## Create DESeq Data Set from the count table, and sample list:

dds <- DESeqDataSetFromMatrix(countData = fc[, 6:ncol(fc)],      # the matrix of counts from featureCounts;
                              colData = metadata,                     # the sample table created in the last step;
                              design= ~subtype)            # determine what value you'd like DEseq to compare
Warning in DESeqDataSet(se, design = design, ignoreRank) :
  some variables in design formula are characters, converting to factors
dds <- estimateSizeFactors(dds)
deseq_sizes <- as.data.frame(sizeFactors(dds))
deseq_sizes

keep <- rowMax(counts(dds, normalize=TRUE)) >= 20
dds <- dds[keep,]
saveRDS(dds, file="ALCL_DEseq_dds.rds")
#dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds),]
#write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
           # quote=FALSE)
# sizeFactors(dds) <- spike_factors

## Estimate and plot dispersions for standard normalization DESeq data set:
dds <- estimateDispersions(dds)
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
pdf("Dispersion_ALCL.pdf")
plotDispEsts(dds)


# check results of normalization
colSums(counts(dds))                # raw read counts per sample
KARAS299      L82    SUPM2   SUDHL1     KIJK    SR786     FEPD     DL40    MAC2A 
27955213 11405038  6390226  7076909  8115871  8363398  7897084  5772651 14835319 
colSums(counts(dds, normalized=T))  # Total number of normalized counts per sample
KARAS299      L82    SUPM2   SUDHL1     KIJK    SR786     FEPD     DL40    MAC2A 
 9735115  9581010  8993572  9074870  9153605  9419018  8840723  8847171 10752376 
data.frame(colSums(counts(dds)), (colSums(counts(dds, normalized=T))), row.names=colnames(counts(dds)))

# For these samples, spike ratios are close so DEseq2 normalization will be used.

#run DESeq
dds <- DESeq(dds)
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
dds <- nbinomWaldTest(dds)                                     # Wald test for differential expression
found results columns, replacing these
# user requested comparisons

#   ALK+ v. ALK-

# create results objects for each requested contrast

res_neg_v_pos  <- results(dds, contrast = c("subtype","ALK_negative","ALK_positive"), alpha = 0.01)

# Calculate mean normalized counts for each condition

AvgNorm_pos        <- as.data.frame(counts(dds, normalized = TRUE)) %>% select(contains("ALK_positive")) %>% rowMeans(.)
AvgNorm_neg        <- as.data.frame(counts(dds, normalized = TRUE)) %>% select(contains("ALK_negative")) %>% rowMeans(.)

# extract gene names from saf annotation file
#geneNames <- dplyr::select(row.names(fc), c("EnsemblID"))

# Start to build results tables

# Results table for comparisons 

resTable <- data.frame("EnsemblID"=as.data.frame(rownames(dds)))
#resTable$geneNames <- geneNames$GeneName

resTable$AvgNormCounts_ALK_positive      <- AvgNorm_pos
resTable$AvgNormCounts_ALK_negative         <- AvgNorm_neg
resTable$baseMean                   <- as.data.frame(res_neg_v_pos)$baseMean

# VS positive

resTable$log2FoldChange_neg_v_pos        <- as.data.frame(res_neg_v_pos)$log2FoldChange
resTable$log2fcSE_neg_v_pos              <- as.data.frame(res_neg_v_pos)$lfcSE
resTable$pvalue_neg_v_pos                <- as.data.frame(res_neg_v_pos)$pvalue
resTable$padj_neg_v_pos                  <- as.data.frame(res_neg_v_pos)$padj

# Write data tables to text files

write.csv(counts(dds, normalized=FALSE), file="ALK_positive_and_negative_WholeGene_raw_counts.csv")
write.csv(counts(dds, normalized=TRUE), file="ALK_positive_and_negative_DEseq2normalized_counts.csv")

write_tsv(resTable,"ALCL_results.txt",
          append = FALSE,
          col_names = TRUE,
          na = "NA"
)

#### PLOTS ####

# EnhancedVolcano plots

ProjectName <- "PROseq_ALCL"
padjCutoff <- 1e-6
FCcutoff <- 1.32

pdf("ALCL_volcano.pdf")
EnhancedVolcano(res_neg_v_pos,
                lab = resTable$rownames.dds.,
                title = '_',
                x = 'log2FoldChange',
                y = 'padj',
                #xlim = c(-6,6),
                xlab = bquote(~Log[2]~ 'fold change'),
                ylab = bquote('-'~Log[10]~ 'adj p-val'),
                pCutoff = padjCutoff,
                FCcutoff = FCcutoff,
                pointSize = 1.0,
                labSize = 2.0,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 6,
                legendIconSize = 3.0,
                drawConnectors = TRUE,
                widthConnectors = 0.2,
                colConnectors = 'grey30')
Warning: ggrepel: 93 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## MA plots

# vs positive

pdf("ALCL_MA.pdf")
DESeq2::plotMA(res_neg_v_pos, alpha = 0.01, main = "_")


# plotDispEsts(dds)
vsd<-vst(dds,blind=FALSE)
rld<-rlog(dds,blind=FALSE)
pdf("ALCL_PCA.pdf")
plotPCA(vsd, intgroup=c("subtype")) #same as before for analysis

# distance plot
sampleDists<-dist(t(assay(vsd)))
sampleDistMatrix<-as.matrix(sampleDists)
#row.names(sampleDistMatrix)<-paste(vsd$subtype, rep, sep="_")
colnames(sampleDistMatrix)<-NULL
colors <- colorRampPalette(rev(brewer.pal(9,"Blues")))(255)
pheatmap(sampleDistMatrix, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE)


sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 11.6

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggpubr_0.4.0                EnhancedVolcano_1.10.0      ggrepel_0.9.1              
 [4] pheatmap_1.0.12             RColorBrewer_1.1-2          forcats_0.5.1              
 [7] stringr_1.4.0               dplyr_1.0.7                 purrr_0.3.4                
[10] readr_2.1.1                 tidyr_1.1.4                 tibble_3.1.6               
[13] ggplot2_3.3.5               tidyverse_1.3.1             Rsubread_2.6.4             
[16] DESeq2_1.32.0               SummarizedExperiment_1.22.0 Biobase_2.52.0             
[19] MatrixGenerics_1.4.3        matrixStats_0.61.0          GenomicRanges_1.44.0       
[22] GenomeInfoDb_1.28.4         IRanges_2.26.0              S4Vectors_0.30.2           
[25] BiocGenerics_0.38.0        

loaded via a namespace (and not attached):
  [1] ggbeeswarm_0.6.0       colorspace_2.0-2       ggsignif_0.6.3         ellipsis_0.3.2        
  [5] XVector_0.32.0         fs_1.5.2               rstudioapi_0.13        farver_2.1.0          
  [9] bit64_4.0.5            AnnotationDbi_1.54.1   fansi_0.5.0            lubridate_1.8.0       
 [13] xml2_1.3.3             splines_4.1.1          extrafont_0.17         cachem_1.0.6          
 [17] geneplotter_1.70.0     knitr_1.36             jsonlite_1.7.2         broom_0.7.10          
 [21] Rttf2pt1_1.3.9         annotate_1.70.0        dbplyr_2.1.1           png_0.1-7             
 [25] compiler_4.1.1         httr_1.4.2             backports_1.4.1        assertthat_0.2.1      
 [29] Matrix_1.4-0           fastmap_1.1.0          cli_3.1.0              htmltools_0.5.2       
 [33] tools_4.1.1            gtable_0.3.0           glue_1.5.1             GenomeInfoDbData_1.2.6
 [37] maps_3.4.0             Rcpp_1.0.7             carData_3.0-4          cellranger_1.1.0      
 [41] jquerylib_0.1.4        vctrs_0.3.8            Biostrings_2.60.2      ggalt_0.4.0           
 [45] extrafontdb_1.0        xfun_0.29              rvest_1.0.2            lifecycle_1.0.1       
 [49] rstatix_0.7.0          XML_3.99-0.8           zlibbioc_1.38.0        MASS_7.3-54           
 [53] scales_1.1.1           vroom_1.5.7            hms_1.1.1              proj4_1.0-10.1        
 [57] yaml_2.2.1             memoise_2.0.1          ggrastr_1.0.1          stringi_1.7.6         
 [61] RSQLite_2.2.9          genefilter_1.74.1      BiocParallel_1.26.2    rlang_0.4.12          
 [65] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.14          lattice_0.20-45       
 [69] labeling_0.4.2         bit_4.0.4              tidyselect_1.1.1       magrittr_2.0.1        
 [73] R6_2.5.1               generics_0.1.1         DelayedArray_0.18.0    DBI_1.1.1             
 [77] pillar_1.6.4           haven_2.4.3            withr_2.4.3            abind_1.4-5           
 [81] survival_3.2-13        KEGGREST_1.32.0        RCurl_1.98-1.5         ash_1.0-15            
 [85] car_3.0-12             modelr_0.1.8           crayon_1.4.2           KernSmooth_2.23-20    
 [89] utf8_1.2.2             tzdb_0.2.0             rmarkdown_2.11         locfit_1.5-9.4        
 [93] grid_4.1.1             readxl_1.3.1           blob_1.2.2             reprex_2.0.1          
 [97] digest_0.6.29          xtable_1.8-4           munsell_0.5.0          beeswarm_0.4.0        
[101] vipor_0.4.5           
---
title: "PRO ALCL DEseq"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
##################################################################################
##### DESeq2 R script for Differential Expression using pairwise comparisons #####
##################################################################################

## Set working directory
setwd("/Users/kuroko/Dropbox (Partners HealthCare)/Weinstock NTC Collab/Experiments/TCL Cell Lines/Data/ALCL/Full R project scripts")
getwd()

## Before running this script start by transferring the necessary files to the working directory set above:
# All .out.bam files: these were generated from the NTC standard pipeline
# .gtf file with annotations of genome of interest: can be downloaded from multiple sources; or there are dedicated R packages

## Load DESeq2 and Rsubread packages

library(DESeq2)        # Loads DESeq2 package
library(Rsubread)      # Loads Rsubread package (counts fragments for each feature)
library(tidyverse)     # Loads tidyverse which allows various convenient data manipulations
library(RColorBrewer)
library(pheatmap)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
library(ggpubr)

### featureCount steps performed on O2 to avoid having to make local copies of .sam files

### shell script used to run feature count ###  

# Import gene body counts table generated in O2 into R
fc <- read.table("PRO_ALCL_featurecounts", sep="\t", header=TRUE, row.names=1)

# Import SAF file for later use mapping tx_id to gene names.  convert to tibble.  If you are able to get annotation from your counts table, this may not be necessary.
#gtf <- read.table(".gtf", sep = "\t", header = TRUE, row.names=1) 
###  Give nice names to the columns in counts

# Store original colnames
#orig_names <- colnames(fc)

### Renaming columns.  This is really inelegant but it worked.  Depending on naming, may need to adjust trim length.

# Trim 38 char from left side, delete "_sorted.sam" from right, replace . with _
#new_sample_colnames <- str_sub(colnames(fc), 29, str_length(colnames(fc))) %>% str_replace("_sorted.sam","") %>% str_replace("\\.","_")
#names_col1_5 <- colnames(fc)[1:5]                                                # save the names of columns 1-5
#new_names <- c(names_col1_5, new_sample_colnames[6:length(new_sample_colnames)])     # combine names of cols 1-5 with shortened samples names
#new_names

#colnames(fc) <- new_names             # replace old column names with new names
colnames(fc)[6:14] <- c("KARAS299", "L82", "SUPM2", "SUDHL1", "KIJK", "SR786", "FEPD", "DL40", "MAC2A")
colnames(fc)                          # check names
### Create metadata object to categorize samples:

metadata <- data.frame(row.names = colnames(fc[, 6:ncol(fc)]),
                       subtype = c(rep("ALK_positive",6),rep("ALK_negative",3))
)  # genotype+timepoint, used for DEseq conditions
metadata # verify the table looks ok

#confirm that row and column names match
all(rownames(metadata) %in% colnames(fc))
all(rownames(metadata) == colnames(fc)[6:ncol(fc)])


## Create DESeq Data Set from the count table, and sample list:

dds <- DESeqDataSetFromMatrix(countData = fc[, 6:ncol(fc)],      # the matrix of counts from featureCounts;
                              colData = metadata,                     # the sample table created in the last step;
                              design= ~subtype)            # determine what value you'd like DEseq to compare

dds <- estimateSizeFactors(dds)
deseq_sizes <- as.data.frame(sizeFactors(dds))
deseq_sizes

keep <- rowMax(counts(dds, normalize=TRUE)) >= 20
dds <- dds[keep,]
saveRDS(dds, file="ALCL_DEseq_dds.rds")
#dREG_readFilt <- dREG[dREG$dREGpeakID %in% rownames(dds),]
#write.table(dREG_readFilt, file="dREGpeaks.readFilt.bed", sep="\t", col.names=FALSE, row.names=FALSE,
           # quote=FALSE)
# sizeFactors(dds) <- spike_factors

## Estimate and plot dispersions for standard normalization DESeq data set:
dds <- estimateDispersions(dds)
pdf("Dispersion_ALCL.pdf")
plotDispEsts(dds)
dev.off()


# check results of normalization
colSums(counts(dds))                # raw read counts per sample
colSums(counts(dds, normalized=T))  # Total number of normalized counts per sample
data.frame(colSums(counts(dds)), (colSums(counts(dds, normalized=T))), row.names=colnames(counts(dds)))

# For these samples, spike ratios are close so DEseq2 normalization will be used.

#run DESeq
dds <- DESeq(dds)
dds <- nbinomWaldTest(dds)                                     # Wald test for differential expression

# user requested comparisons

#   ALK+ v. ALK-

# create results objects for each requested contrast

res_neg_v_pos  <- results(dds, contrast = c("subtype","ALK_negative","ALK_positive"), alpha = 0.01)

# Calculate mean normalized counts for each condition

AvgNorm_pos        <- as.data.frame(counts(dds, normalized = TRUE)) %>% select(contains("ALK_positive")) %>% rowMeans(.)
AvgNorm_neg        <- as.data.frame(counts(dds, normalized = TRUE)) %>% select(contains("ALK_negative")) %>% rowMeans(.)

# extract gene names from saf annotation file
#geneNames <- dplyr::select(row.names(fc), c("EnsemblID"))

# Start to build results tables

# Results table for comparisons 

resTable <- data.frame("EnsemblID"=as.data.frame(rownames(dds)))
#resTable$geneNames <- geneNames$GeneName

resTable$AvgNormCounts_ALK_positive      <- AvgNorm_pos
resTable$AvgNormCounts_ALK_negative         <- AvgNorm_neg
resTable$baseMean                   <- as.data.frame(res_neg_v_pos)$baseMean

# VS positive

resTable$log2FoldChange_neg_v_pos        <- as.data.frame(res_neg_v_pos)$log2FoldChange
resTable$log2fcSE_neg_v_pos              <- as.data.frame(res_neg_v_pos)$lfcSE
resTable$pvalue_neg_v_pos                <- as.data.frame(res_neg_v_pos)$pvalue
resTable$padj_neg_v_pos                  <- as.data.frame(res_neg_v_pos)$padj

# Write data tables to text files

write.csv(counts(dds, normalized=FALSE), file="ALK_positive_and_negative_WholeGene_raw_counts.csv")
write.csv(counts(dds, normalized=TRUE), file="ALK_positive_and_negative_DEseq2normalized_counts.csv")

write_tsv(resTable,"ALCL_results.txt",
          append = FALSE,
          col_names = TRUE,
          na = "NA"
)

#### PLOTS ####

# EnhancedVolcano plots

ProjectName <- "PROseq_ALCL"
padjCutoff <- 1e-6
FCcutoff <- 1.32

pdf("ALCL_volcano.pdf")
EnhancedVolcano(res_neg_v_pos,
                lab = resTable$rownames.dds.,
                title = '_',
                x = 'log2FoldChange',
                y = 'padj',
                #xlim = c(-6,6),
                xlab = bquote(~Log[2]~ 'fold change'),
                ylab = bquote('-'~Log[10]~ 'adj p-val'),
                pCutoff = padjCutoff,
                FCcutoff = FCcutoff,
                pointSize = 1.0,
                labSize = 2.0,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 6,
                legendIconSize = 3.0,
                drawConnectors = TRUE,
                widthConnectors = 0.2,
                colConnectors = 'grey30')
dev.off()


## MA plots

# vs positive

pdf("ALCL_MA.pdf")
DESeq2::plotMA(res_neg_v_pos, alpha = 0.01, main = "_")
dev.off()

# plotDispEsts(dds)
vsd<-vst(dds,blind=FALSE)
rld<-rlog(dds,blind=FALSE)
pdf("ALCL_PCA.pdf")
plotPCA(vsd, intgroup=c("subtype")) #same as before for analysis
dev.off()

# distance plot
sampleDists<-dist(t(assay(vsd)))
sampleDistMatrix<-as.matrix(sampleDists)
#row.names(sampleDistMatrix)<-paste(vsd$subtype, rep, sep="_")
colnames(sampleDistMatrix)<-NULL
colors <- colorRampPalette(rev(brewer.pal(9,"Blues")))(255)
pheatmap(sampleDistMatrix, col=colors, border_color=NA, show_colnames=FALSE, show_rownames=TRUE)


sessionInfo()
```
