Data Retrieval

This project is focused on analyzing the TCGA-THCA data from a multiomics approach. Specifically, we will be analyzing mRNA, miRNA, lncRNA, and methylation data to describe the trends and relationships between the multiomic profiles of patients and their prognostic characteristics (e.g. survival and/or recurrence).

To start off, we need to import the relevant data we will be using. This data is sourced from TCGA (The Cancer Genome Atlas), although the mRNA, miRNA, methylation, and clinical data came from Firehose (from the Broad Institute), and the lncRNA data came from TANRIC.

Let’s first import the datasets we will be working with, and assign them to variables.

library(readxl)
Warning: package ‘readxl’ was built under R version 4.2.2
#Inputting the data into variables that we can manipulate
clinical <- read_excel("THCA_Clinical_Data.xlsx")
lncRNA <- read_excel("THCA_lncRNA_Data.xlsx")
#methylation <- read_excel("THCA_Methylation_Data.xlsx")
miRNA <- read_excel("THCA_miRNA_Data_Final.xlsx")
mRNA <- read_excel("THCA_mRNA_Data_Final.xlsx")

Let’s look at the head of these tables to understand their structure.

#Accessing the first few rows of each dataset
head(clinical)
head(lncRNA)
#head(methylation)
head(miRNA)
head(mRNA)

Data Preprocessing

After looking at the table heads, there are a couple of issues that we can see:

1) The sample labels aren't identical - some have extra elements at the beginning or end, which         need to be standardized

2) The number of samples in each file aren't the same - for example, miRNA only has 541 samples,        and lncRNA has 556. This also needs to be standardized to make sure only samples with all           data are used.

Let’s start with step 1.

The general format of the data should look something like this: ‘TCGA-XX-XXXX-XX’, so let’s look at each file to see how we can fix it.

lncRNA seems to have extra labels at the start which we can remove, as well as underscores which we can convert to hyphens. However, what’s also important here is that we have normal and tumor samples explictly labeled, whereas other files use the ‘01’ marking for tumors and ‘11’ marking for normal samples. We need to adapt our lncRNA data to follow this structure.

#We are going through each column in the lncRNA dataset
for (i in 1:ncol(lncRNA)) {
  #If the column name - which is the sample ID - contains Normal, we'll add a -11 and format the name to remove the beginning and convert all underscores to hyphens.
  
  if (grepl("Normal", colnames(lncRNA)[i], fixed = TRUE) == TRUE) {
    colnames(lncRNA)[i] <- sub(".*-TC", "TC", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- gsub("_", "-", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- paste(colnames(lncRNA)[i], '-11', sep="")
  } else {
    #Otherwise, we'll still format the name, but we'll add a -01 at the end instead.
    colnames(lncRNA)[i] <- sub(".*-TC", "TC", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- gsub("_", "-", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- paste(colnames(lncRNA)[i], '-01', sep="")
  }
    
}

#We'll convert the first column to the standard name Gene_ID and remove the extra marker in the Ensemble ID after the decimal point.
colnames(lncRNA)[1] <- "Gene_ID"
lncRNA$Gene_ID <- sub("\\..*", "", lncRNA$Gene_ID)
head(lncRNA)

That looks more in line with the standard sample notation.

Both the methylation and the mRNA file have underscores that we can convert to hyphens.

#We'll iterate over each column and convert the underscores to dashes, for both datasets. Then, we'll convert the first column to the Gene_ID label.
for (i in 1:ncol(mRNA)) {
  colnames(mRNA)[i] <- gsub("_", "-", colnames(mRNA)[i])
}
colnames(mRNA)[1] <- "Gene_ID"
head(mRNA)

for (i in 1:ncol(methylation)) {
  colnames(methylation)[i] <- gsub("_", "-", colnames(methylation)[i])
}
colnames(methylation)[1] <- "Gene_ID"
head(methylation)

That seems to be fixed as well.

For miRNA data, there is a lot of extraneous data at the end about sample and vials, which we don’t need, so let’s remove that.

#We'll iterate over the column names and remove any extraneous information after our normal or tumor sample labels, and then create the Gene_ID column.

for (i in 1:ncol(miRNA)) {
  colnames(miRNA)[i] <- sub("_01.*", "_01", colnames(miRNA)[i])
  colnames(miRNA)[i] <- sub("_01.*", "_01", colnames(miRNA)[i])
  colnames(miRNA)[i] <- gsub("_", "-", colnames(miRNA)[i])
}
colnames(miRNA)[1] <- "Gene_ID"
head(miRNA)

Done!

For clinical data, we have our ‘Sample ID’ column with appropriate notation, so we don’t need to change that.

Now, for step 2, we have to identify which samples are missing from any of the files, as we only want data points present in each file. To accomplish this, I am first going to access all the sample IDs from each file. I’m then going to add them together to form a vector, and use the table function to see how many times each value appears in the vector. Given that we have 5 files of data, a sample present in all files should appear at least 5 times. Thus, we will isolate any values that appear less than 5 times.

#Here we access the specific sample names through indexing of the data (which starts from column 2).
clin_samples <- clinical$`Sample ID`
lnc_samples <- colnames(lncRNA)[2:557]
meth_samples <- colnames(methylation)[5:571]
miR_samples <- colnames(miRNA)[2:570]
mRNA_samples <- colnames(mRNA)[2:569]

#We concatenate all the sample data into a vector.
temp <- c(lnc_samples, meth_samples, miR_samples, mRNA_samples)
#Next, we count the occurrence of each unique sample ID in our vector using the table function - if a sample is present in all 4 datasets, it'll have a count of 4.
temp_table = table(temp)
#Now, we just want to keep the samples that have an occurrence of 4.
keep1 <- names(temp_table[temp_table >= 4])
keep1 <- c("Gene_ID", keep1)

Now, we will remove their columns/rows from each data file in a sequential manner.

#Next, we just subset the datasets to only keep columns that match a value in our vector of accepted samples - using the subset and select functions.
miRNA_new <- subset(miRNA, select=c(keep1))
mRNA_new <- subset(mRNA, select=c(keep1))
methylation_new <- subset(methylation, select=c(keep1))
lncRNA_new <- subset(lncRNA, select=c(keep1))

One additional step we need to take is to make sure that all our tumor samples have clinical data available. Thus, I’m going to compare our tumor samples (with the -01 ending) to our clinical sample IDs to find and also remove any samples that don’t overlap, using a similar method.

#Now, since clinical data only contains tumor samples, we need to separate out our tumor samples by searching for all samples with the -01 tag.
tumor_samples <- grep("-01", keep1, value=TRUE)
normal_samples <- grep("-11", keep1, value=TRUE)

#Same as above - vector, count, keep anything occurring twice since we are comparing 2 datasets.
temp2 <- c(tumor_samples, clin_samples)
temp_table_2 = table(temp2)
keep2 <- names(temp_table_2[temp_table_2 >= 2])
keep2 <- c(keep2)

#We then delete any rows that don't correspond to a sample in our list, by using the in function to only keep rows with values in our vector.

clin_new <- clinical[clinical$`Sample ID` %in% keep2,]
clin_new <- data.frame(clin_new)

clin_new$'Metastasis_Group' <- with(clin_new, ifelse(clin_new$American.Joint.Committee.on.Cancer.Metastasis.Stage.Code == "M1", 1, 2))

clin_new$'Recurrence_Group' <- with(clin_new, ifelse(clin_new$Disease.Free.Status == "1:Recurred/Progressed", 1, 2))

clin_new

Now, all our data formatting is done - each of our datasets is composed of 542 samples, with 492 tumor samples (as the clinical data suggests) and 50 normal samples. Before we move into our actual analysis, we need to make sure all our samples have data points for the RNAs being analyzed. Since a lot of the files have ‘NA’ values in them, we can’t work with this, so we’ll just eliminate all rows containing an NA.

library(dplyr)
library(readr)
library(edgeR)
#For each dataset, we are going to remove any rows with NA in them, and then convert it to a data frame to work with our DESeq2 functions.
miRNA_new[miRNA_new == "NA"] <- NA
miRNA_final <- miRNA_new[complete.cases(miRNA_new), ]
miRNA_final = data.frame(miRNA_final)

mRNA_new[mRNA_new == "NA"] <- NA
mRNA_final <- mRNA_new[complete.cases(mRNA_new), ]
mRNA_final = data.frame(mRNA_final)

lncRNA_new[lncRNA_new == "NA"] <- NA
lncRNA_final <- lncRNA_new[complete.cases(lncRNA_new), ]
lncRNA_final = data.frame(lncRNA_final)

methylation_new[methylation_new == "NA"] <- NA 
methylation_final <- methylation_new[complete.cases(methylation_new), ]
methylation_final = data.frame(methylation_final)

Differential Gene Expression

We finally have our final datasets that we can work with. Let’s begin with analysis.

We first should create a proper metadata variable to begin analysis, by considering tumor and normal samples. However, there are different levels of analysis we can conduct, which means we’ll create a couple of metadata files based on clinical data.

First, let’s create a file with a simple distribution of group 0 for normal tissue and group 1 for tumor tissue.

library(DESeq2)
library(ggplot2)

#Access sample IDs to form column 1, then check for the .01 tag to assign groups as 1 or 0 for tumor or normal.
metadata <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))
metadata$'Group' <- ifelse(grepl(".01", metadata$Sample_ID, fixed = TRUE) == TRUE, 1, 0)
metadata$Group <- as.factor(metadata$Group)

for (i in 1:542) {
  sample <- metadata[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Overall.Survival..Months.
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata[i, c("days_to_death")] <- 10000
  } else{
    metadata[i, c("days_to_death")] <- as.numeric(group1) * 30
  }
}

Next, let’s create a metadata file that adds in recurrent samples as being group 2, and primary samples as being group 1.

#Access sample IDs to form column 1, then if the tumor is normal, assign group 0. If the tumor is not normal, reference clinical data for disease free status, and assign group 2 to recurred samples.
metadata_recur <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))
for (i in 1:542) {
  sample <- metadata_recur[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Recurrence_Group
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata_recur[i, c("Group")] <- 0
    metadata_recur[i, c("days_to_death")] <- 10000
  } else {
    if (group1 == 1) {
      metadata_recur[i, c("Group")] <- 1
    } else{
      metadata_recur[i, c("Group")] <- 2
    }
    metadata_recur[i, c("days_to_death")] <- as.numeric(row1$Overall.Survival..Months.) * 30
  }
}

metadata_recur$Group <- as.factor(metadata_recur$Group)

We can also create a metadata file to evaluate metastatic vs. non-metastatic samples, which can form group 1 and 2 respectively.

#Access sample IDs to form column 1, then if the tumor is normal, assign group 0. If the tumor is not normal, reference clinical data for metastatic status, and assign group 1 to metastatic samples.
metadata_metas <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))

for (i in 1:542) {
  sample <- metadata_metas[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Metastasis_Group
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata_metas[i, c("Group")] <- 0
    metadata_metas[i, c("days_to_death")] <- 10000
  } else {
    if (group1 == 1) {
      metadata_metas[i, c("Group")] <- 1
    } else{
      metadata_metas[i, c("Group")] <- 2
    }
    metadata_metas[i, c("days_to_death")] <- as.numeric(row1$Overall.Survival..Months.)*30
  }
}

metadata_metas$Group <- as.factor(metadata_metas$Group)

Now, for each of our datasets, we can format them to work with DESeq2, and then run the appropriate analysis/visualizations for them.

lncRNA

#Need to convert the gene names to the actual row names, and remove that column
lnc_DESEQ <- lncRNA_final[,-1]
rownames(lnc_DESEQ) <- lncRNA_final$Gene_ID
Metadata Metastasis
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(lnc_DESEQ), colData=metadata, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 15 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_lnc_metas <- results(dds, tidy=TRUE)
rownames(res_lnc_metas) <- res_lnc_metas[, 1]
res_lnc_metas <- res_lnc_metas %>% 
  arrange(res_lnc_metas$padj)
head(res_lnc_metas)

summary(res_lnc_metas)
     row               baseMean        log2FoldChange       lfcSE            stat        
 Length:12727       Min.   :  0.0000   Min.   :-3.384   Min.   :0.000   Min.   :-14.998  
 Class :character   1st Qu.:  0.0000   1st Qu.:-0.239   1st Qu.:0.270   1st Qu.: -0.520  
 Mode  :character   Median :  0.0000   Median : 0.049   Median :1.316   Median :  0.010  
                    Mean   :  0.3318   Mean   :-0.065   Mean   :2.163   Mean   : -0.397  
                    3rd Qu.:  0.0134   3rd Qu.: 0.084   3rd Qu.:4.674   3rd Qu.:  0.032  
                    Max.   :391.1179   Max.   : 5.322   Max.   :4.674   Max.   : 16.550  
                                       NA's   :7953     NA's   :7953    NA's   :7953     
     pvalue           padj      
 Min.   :0.000   Min.   :0.000  
 1st Qu.:0.106   1st Qu.:0.001  
 Median :0.910   Median :0.080  
 Mean   :0.633   Mean   :0.271  
 3rd Qu.:0.990   3rd Qu.:0.525  
 Max.   :1.000   Max.   :1.000  
 NA's   :7953    NA's   :10655  
#Analyze the distribution of p-values (likely skewed)
hist(res_lnc_metas$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_lnc_metas)[i], intgroup="Group", main=rownames(res_lnc_metas)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_lnc_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_lnc_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_lnc_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Now, let’s use the same methodology to work through the other RNA types we have with us:

miRNA

#Need to convert the gene names to the actual row names, and remove that column
mi_DESEQ <- miRNA_final[,-1]
rownames(mi_DESEQ) <- miRNA_final$Gene_ID
Metadata Normal
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 3 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_miR_main <- results(dds, tidy=TRUE)
rownames(res_miR_main) <- res_miR_main[, 1]
res_miR_main <- res_miR_main %>% 
  arrange(res_miR_main$padj)
head(res_miR_main)

summary(res_miR_main)
     row               baseMean        log2FoldChange          lfcSE        
 Length:139         Min.   :    25.7   Min.   :-2.126281   Min.   :0.05621  
 Class :character   1st Qu.:   196.9   1st Qu.:-0.347177   1st Qu.:0.08113  
 Mode  :character   Median :  1190.2   Median :-0.003324   Median :0.10156  
                    Mean   : 25381.0   Mean   : 0.151709   Mean   :0.11360  
                    3rd Qu.:  6978.9   3rd Qu.: 0.476889   3rd Qu.:0.14036  
                    Max.   :986817.4   Max.   : 5.976988   Max.   :0.27290  
      stat               pvalue               padj          
 Min.   :-12.31630   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.: -3.46220   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Median : -0.04179   Median :0.0000522   Median :0.0001037  
 Mean   :  1.27820   Mean   :0.1382176   Mean   :0.1532180  
 3rd Qu.:  5.12964   3rd Qu.:0.0981589   3rd Qu.:0.1305359  
 Max.   : 25.71245   Max.   :0.9765655   Max.   :0.9765655  
#Analyze the distribution of p-values (likely skewed)
hist(res_miR_main$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_main)[i], intgroup="Group", main=rownames(res_miR_main)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_miR_main, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_main, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_main, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
Warning: package ‘pheatmap’ was built under R version 4.2.2
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Metadata Recurrence
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata_recur, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 31 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_miR_recur <- results(dds, tidy=TRUE)
rownames(res_miR_recur) <- res_miR_recur[, 1]
res_miR_recur <- res_miR_recur %>% 
  arrange(res_miR_recur$padj)
head(res_miR_recur)

summary(res_miR_recur)
     row               baseMean         log2FoldChange         lfcSE              stat          
 Length:517         Min.   :      0.4   Min.   :-2.41223   Min.   :0.05337   Min.   :-11.53610  
 Class :character   1st Qu.:      2.7   1st Qu.:-0.37298   1st Qu.:0.10972   1st Qu.: -2.52993  
 Mode  :character   Median :     23.1   Median : 0.01108   Median :0.16785   Median :  0.06862  
                    Mean   :  10488.8   Mean   : 0.13365   Mean   :0.18855   Mean   :  0.60866  
                    3rd Qu.:    498.2   3rd Qu.: 0.55130   3rd Qu.:0.24149   3rd Qu.:  3.42082  
                    Max.   :1031530.8   Max.   : 5.63717   Max.   :0.70103   Max.   : 22.40207  
     pvalue              padj          
 Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000001   1st Qu.:0.0000041  
 Median :0.005420   Median :0.0108183  
 Mean   :0.161064   Mean   :0.1858572  
 3rd Qu.:0.210264   3rd Qu.:0.2801715  
 Max.   :0.993476   Max.   :0.9934762  
#Analyze the distribution of p-values (likely skewed)
hist(res_miR_recur$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_recur)[i], intgroup="Group", main=rownames(res_miR_recur)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_miR_recur, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_recur, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_recur, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Metadata Metastasis
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata_metas, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 25 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_miR_metas <- results(dds, tidy=TRUE)
rownames(res_miR_metas) <- res_miR_metas[, 1]
res_miR_metas <- res_miR_metas %>% 
  arrange(res_miR_metas$padj)
head(res_miR_metas)

summary(res_miR_metas)
     row               baseMean         log2FoldChange         lfcSE              stat         
 Length:517         Min.   :      0.4   Min.   :-2.43592   Min.   :0.05318   Min.   :-11.7341  
 Class :character   1st Qu.:      2.7   1st Qu.:-0.37799   1st Qu.:0.10932   1st Qu.: -2.5484  
 Mode  :character   Median :     23.1   Median : 0.01754   Median :0.16730   Median :  0.1431  
                    Mean   :  10488.9   Mean   : 0.14134   Mean   :0.18822   Mean   :  0.6393  
                    3rd Qu.:    498.2   3rd Qu.: 0.55949   3rd Qu.:0.24097   3rd Qu.:  3.5059  
                    Max.   :1031530.8   Max.   : 5.66327   Max.   :0.71200   Max.   : 22.5772  
     pvalue               padj          
 Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000004   1st Qu.:0.0000016  
 Median :0.0047254   Median :0.0094085  
 Mean   :0.1587452   Mean   :0.1828219  
 3rd Qu.:0.2185203   3rd Qu.:0.2908206  
 Max.   :0.9995498   Max.   :0.9995498  
#Analyze the distribution of p-values (likely skewed)
hist(res_miR_metas$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_metas)[i], intgroup="Group", main=rownames(res_miR_metas)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_miR_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

mRNA

#Need to convert the gene names to the actual row names, and remove that column
m_DESEQ <- mRNA_final[,-1]
rownames(m_DESEQ) <- mRNA_final$Gene_ID
Metadata Normal
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1292 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_m_main <- results(dds, tidy=TRUE)
rownames(res_m_main) <- res_m_main[, 1]
res_m_main <- res_m_main %>% 
  arrange(res_m_main$padj)
head(res_m_main)

summary(res_m_main)
     row               baseMean         log2FoldChange        lfcSE             stat         
 Length:20473       Min.   :      0.0   Min.   :-4.7300   Min.   :0.0220   Min.   :-18.4327  
 Class :character   1st Qu.:     24.6   1st Qu.:-0.3090   1st Qu.:0.0593   1st Qu.: -3.1685  
 Mode  :character   Median :    611.7   Median : 0.0325   Median :0.1061   Median :  0.0297  
                    Mean   :   2643.9   Mean   : 0.1126   Mean   :0.3284   Mean   :  0.4885  
                    3rd Qu.:   2355.0   3rd Qu.: 0.3527   3rd Qu.:0.2193   3rd Qu.:  3.6408  
                    Max.   :2153260.9   Max.   : 8.6128   Max.   :4.6740   Max.   : 30.6929  
                                        NA's   :367       NA's   :367      NA's   :367       
     pvalue            padj       
 Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0007   Median :0.0010  
 Mean   :0.1706   Mean   :0.1736  
 3rd Qu.:0.1769   3rd Qu.:0.1977  
 Max.   :0.9999   Max.   :0.9999  
 NA's   :367      NA's   :757     
#Analyze the distribution of p-values (likely skewed)
hist(res_m_main$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_main)[i], intgroup="Group", main=rownames(res_m_main)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_m_main, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_main, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_main, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Metadata Recurrence
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata_recur, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1290 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_m_recur <- results(dds, tidy=TRUE)
rownames(res_m_recur) <- res_m_recur[, 1]
res_m_recur <- res_m_recur %>% 
  arrange(res_m_recur$padj)
head(res_m_recur)

summary(res_m_recur)
     row               baseMean         log2FoldChange        lfcSE             stat              pvalue      
 Length:20531       Min.   :      0.0   Min.   :-4.7194   Min.   :0.0239   Min.   :-16.7630   Min.   :0.0000  
 Class :character   1st Qu.:     24.0   1st Qu.:-0.2854   1st Qu.:0.0641   1st Qu.: -2.8046   1st Qu.:0.0000  
 Mode  :character   Median :    606.9   Median : 0.0488   Median :0.1149   Median :  0.0566   Median :0.0017  
                    Mean   :   2641.9   Mean   : 0.1409   Mean   :0.3587   Mean   :  0.5535   Mean   :0.1811  
                    3rd Qu.:   2347.3   3rd Qu.: 0.3738   3rd Qu.:0.2378   3rd Qu.:  3.4825   3rd Qu.:0.2230  
                    Max.   :2128218.5   Max.   : 8.7262   Max.   :5.0175   Max.   : 27.6647   Max.   :0.9996  
                                        NA's   :373       NA's   :373      NA's   :373        NA's   :373     
      padj       
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0019  
 Mean   :0.1683  
 3rd Qu.:0.2032  
 Max.   :0.9996  
 NA's   :1154    
#Analyze the distribution of p-values (likely skewed)
hist(res_m_recur$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_recur)[i], intgroup="Group", main=rownames(res_m_recur)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_m_recur, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_recur, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_recur, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Metadata Metastasis
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata_metas, design=~Group)
converting counts to integer mode
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1136 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
#Plot the dispersion estimates of the differential data
plotDispEsts(dds)


#Access and store the results of DEG
res_m_metas <- results(dds, tidy=TRUE)
rownames(res_m_metas) <- res_m_metas[, 1]
res_m_metas <- res_m_metas %>% 
  arrange(res_m_metas$padj)
head(res_m_metas)

summary(res_m_metas)
     row               baseMean         log2FoldChange        lfcSE             stat              pvalue      
 Length:20531       Min.   :      0.0   Min.   :-4.7832   Min.   :0.0238   Min.   :-17.0028   Min.   :0.0000  
 Class :character   1st Qu.:     24.0   1st Qu.:-0.2860   1st Qu.:0.0639   1st Qu.: -2.8185   1st Qu.:0.0000  
 Mode  :character   Median :    606.5   Median : 0.0509   Median :0.1145   Median :  0.0575   Median :0.0016  
                    Mean   :   2642.3   Mean   : 0.1421   Mean   :0.3576   Mean   :  0.5661   Mean   :0.1799  
                    3rd Qu.:   2347.3   3rd Qu.: 0.3786   3rd Qu.:0.2372   3rd Qu.:  3.5152   3rd Qu.:0.2163  
                    Max.   :2128218.5   Max.   : 8.6091   Max.   :4.9986   Max.   : 27.8391   Max.   :0.9999  
                                        NA's   :373       NA's   :373      NA's   :373        NA's   :373     
      padj       
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0024  
 Mean   :0.1858  
 3rd Qu.:0.2398  
 Max.   :0.9999  
 NA's   :764     
#Analyze the distribution of p-values (likely skewed)
hist(res_m_metas$padj)


#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)


for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_metas)[i], intgroup="Group", main=rownames(res_m_metas)[i])
}


plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))

# Make a basic volcano plot
with(res_m_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))


library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")


rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)

Let’s test out the GDCRNATools package.

---
title: "TCGA_THCA Analysis"
output:
  html_document:
    df_print: paged
  html_notebook: default
  pdf_document: default
---
## *Data Retrieval*

This project is focused on analyzing the TCGA-THCA data from a multiomics approach. Specifically, we will be analyzing mRNA, miRNA, lncRNA, and methylation data to describe the trends and relationships between the multiomic profiles of patients and their prognostic characteristics (e.g. survival and/or recurrence).

To start off, we need to import the relevant data we will be using. This data is sourced from TCGA (The Cancer Genome Atlas), although the mRNA, miRNA, methylation, and clinical data came from Firehose (from the Broad Institute), and the lncRNA data came from TANRIC.

Let's first import the datasets we will be working with, and assign them to variables.
```{r}
library(readxl)

#Inputting the data into variables that we can manipulate
clinical <- read_excel("THCA_Clinical_Data.xlsx")
lncRNA <- read_excel("THCA_lncRNA_Data.xlsx")
#methylation <- read_excel("THCA_Methylation_Data.xlsx")
miRNA <- read_excel("THCA_miRNA_Data_Final.xlsx")
mRNA <- read_excel("THCA_mRNA_Data_Final.xlsx")
```

Let's look at the head of these tables to understand their structure.

```{r}
#Accessing the first few rows of each dataset
head(clinical)
head(lncRNA)
#head(methylation)
head(miRNA)
head(mRNA)
```
## *Data Preprocessing*

After looking at the table heads, there are a couple of issues that we can see:

    1) The sample labels aren't identical - some have extra elements at the beginning or end, which         need to be standardized
    
    2) The number of samples in each file aren't the same - for example, miRNA only has 541 samples,        and lncRNA has 556. This also needs to be standardized to make sure only samples with all           data are used.

Let's start with step 1.

The general format of the data should look something like this: 'TCGA-XX-XXXX-XX', so let's look at each file to see how we can fix it.

lncRNA seems to have extra labels at the start which we can remove, as well as underscores which we can convert to hyphens. However, what's also important here is that we have normal and tumor samples explictly labeled, whereas other files use the '01' marking for tumors and '11' marking for normal samples. We need to adapt our lncRNA data to follow this structure.
```{r}
#We are going through each column in the lncRNA dataset
for (i in 1:ncol(lncRNA)) {
  #If the column name - which is the sample ID - contains Normal, we'll add a -11 and format the name to remove the beginning and convert all underscores to hyphens.
  
  if (grepl("Normal", colnames(lncRNA)[i], fixed = TRUE) == TRUE) {
    colnames(lncRNA)[i] <- sub(".*-TC", "TC", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- gsub("_", "-", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- paste(colnames(lncRNA)[i], '-11', sep="")
  } else {
    #Otherwise, we'll still format the name, but we'll add a -01 at the end instead.
    colnames(lncRNA)[i] <- sub(".*-TC", "TC", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- gsub("_", "-", colnames(lncRNA)[i])
    colnames(lncRNA)[i] <- paste(colnames(lncRNA)[i], '-01', sep="")
  }
    
}

#We'll convert the first column to the standard name Gene_ID and remove the extra marker in the Ensemble ID after the decimal point.
colnames(lncRNA)[1] <- "Gene_ID"
lncRNA$Gene_ID <- sub("\\..*", "", lncRNA$Gene_ID)
head(lncRNA)
```

That looks more in line with the standard sample notation.

Both the methylation and the mRNA file have underscores that we can convert to hyphens.

```{r}
#We'll iterate over each column and convert the underscores to dashes, for both datasets. Then, we'll convert the first column to the Gene_ID label.
for (i in 1:ncol(mRNA)) {
  colnames(mRNA)[i] <- gsub("_", "-", colnames(mRNA)[i])
}
colnames(mRNA)[1] <- "Gene_ID"
head(mRNA)

#for (i in 1:ncol(methylation)) {
#  colnames(methylation)[i] <- gsub("_", "-", colnames(methylation)[i])
#}
#colnames(methylation)[1] <- "Gene_ID"
#head(methylation)
```
That seems to be fixed as well.

For miRNA data, there is a lot of extraneous data at the end about sample and vials, which we don't need, so let's remove that.

```{r}
#We'll iterate over the column names and remove any extraneous information after our normal or tumor sample labels, and then create the Gene_ID column.

for (i in 1:ncol(miRNA)) {
  colnames(miRNA)[i] <- sub("_01.*", "_01", colnames(miRNA)[i])
  colnames(miRNA)[i] <- sub("_01.*", "_01", colnames(miRNA)[i])
  colnames(miRNA)[i] <- gsub("_", "-", colnames(miRNA)[i])
}
colnames(miRNA)[1] <- "Gene_ID"
head(miRNA)
```
Done!

For clinical data, we have our 'Sample ID' column with appropriate notation, so we don't need to change that.

Now, for step 2, we have to identify which samples are missing from any of the files, as we only want data points present in each file. To accomplish this, I am first going to access all the sample IDs from each file. I'm then going to add them together to form a vector, and use the table function to see how many times each value appears in the vector. Given that we have 5 files of data, a sample present in all files should appear at least 5 times. Thus, we will isolate any values that appear less than 5 times.

```{r}
#Here we access the specific sample names through indexing of the data (which starts from column 2).
clin_samples <- clinical$`Sample ID`
lnc_samples <- colnames(lncRNA)[2:557]
meth_samples <- colnames(methylation)[5:571]
miR_samples <- colnames(miRNA)[2:570]
mRNA_samples <- colnames(mRNA)[2:569]

#We concatenate all the sample data into a vector.
temp <- c(lnc_samples, meth_samples, miR_samples, mRNA_samples)
#Next, we count the occurrence of each unique sample ID in our vector using the table function - if a sample is present in all 4 datasets, it'll have a count of 4.
temp_table = table(temp)
#Now, we just want to keep the samples that have an occurrence of 4.
keep1 <- names(temp_table[temp_table >= 4])
keep1 <- c("Gene_ID", keep1)

```

Now, we will remove their columns/rows from each data file in a sequential manner.

```{r}
#Next, we just subset the datasets to only keep columns that match a value in our vector of accepted samples - using the subset and select functions.
miRNA_new <- subset(miRNA, select=c(keep1))
mRNA_new <- subset(mRNA, select=c(keep1))
methylation_new <- subset(methylation, select=c(keep1))
lncRNA_new <- subset(lncRNA, select=c(keep1))
```

One additional step we need to take is to make sure that all our tumor samples have clinical data available. Thus, I'm going to compare our tumor samples (with the -01 ending) to our clinical sample IDs to find and also remove any samples that don't overlap, using a similar method.

```{r}
#Now, since clinical data only contains tumor samples, we need to separate out our tumor samples by searching for all samples with the -01 tag.
tumor_samples <- grep("-01", keep1, value=TRUE)
normal_samples <- grep("-11", keep1, value=TRUE)

#Same as above - vector, count, keep anything occurring twice since we are comparing 2 datasets.
temp2 <- c(tumor_samples, clin_samples)
temp_table_2 = table(temp2)
keep2 <- names(temp_table_2[temp_table_2 >= 2])
keep2 <- c(keep2)

#We then delete any rows that don't correspond to a sample in our list, by using the in function to only keep rows with values in our vector.

clin_new <- clinical[clinical$`Sample ID` %in% keep2,]
clin_new <- data.frame(clin_new)

clin_new$'Metastasis_Group' <- with(clin_new, ifelse(clin_new$American.Joint.Committee.on.Cancer.Metastasis.Stage.Code == "M1", 1, 2))

clin_new$'Recurrence_Group' <- with(clin_new, ifelse(clin_new$Disease.Free.Status == "1:Recurred/Progressed", 1, 2))

clin_new
```

Now, all our data formatting is done - each of our datasets is composed of 542 samples, with 492 tumor samples (as the clinical data suggests) and 50 normal samples. Before we move into our actual analysis, we need to make sure all our samples have data points for the RNAs being analyzed. Since a lot of the files have 'NA' values in them, we can't work with this, so we'll just eliminate all rows containing an NA.

```{r}
library(dplyr)
library(readr)
library(edgeR)
#For each dataset, we are going to remove any rows with NA in them, and then convert it to a data frame to work with our DESeq2 functions.
miRNA_new[miRNA_new == "NA"] <- NA
miRNA_final <- miRNA_new[complete.cases(miRNA_new), ]
miRNA_final = data.frame(miRNA_final)

mRNA_new[mRNA_new == "NA"] <- NA
mRNA_final <- mRNA_new[complete.cases(mRNA_new), ]
mRNA_final = data.frame(mRNA_final)

lncRNA_new[lncRNA_new == "NA"] <- NA
lncRNA_final <- lncRNA_new[complete.cases(lncRNA_new), ]
lncRNA_final = data.frame(lncRNA_final)

methylation_new[methylation_new == "NA"] <- NA 
methylation_final <- methylation_new[complete.cases(methylation_new), ]
methylation_final = data.frame(methylation_final)

```

## *Differential Gene Expression*


We finally have our final datasets that we can work with. Let's begin with analysis. 

We first should create a proper metadata variable to begin analysis, by considering tumor and normal samples. However, there are different levels of analysis we can conduct, which means we'll create a couple of metadata files based on clinical data.

First, let's create a file with a simple distribution of group 0 for normal tissue and group 1 for tumor tissue.

```{r}
library(DESeq2)
library(ggplot2)

#Access sample IDs to form column 1, then check for the .01 tag to assign groups as 1 or 0 for tumor or normal.
metadata <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))
metadata$'Group' <- ifelse(grepl(".01", metadata$Sample_ID, fixed = TRUE) == TRUE, 1, 0)
metadata$Group <- as.factor(metadata$Group)

for (i in 1:542) {
  sample <- metadata[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Overall.Survival..Months.
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata[i, c("days_to_death")] <- 10000
  } else{
    metadata[i, c("days_to_death")] <- as.numeric(group1) * 30
  }
}

```

Next, let's create a metadata file that adds in recurrent samples as being group 2, and primary samples as being group 1.

```{r}
#Access sample IDs to form column 1, then if the tumor is normal, assign group 0. If the tumor is not normal, reference clinical data for disease free status, and assign group 2 to recurred samples.
metadata_recur <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))
for (i in 1:542) {
  sample <- metadata_recur[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Recurrence_Group
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata_recur[i, c("Group")] <- 0
    metadata_recur[i, c("days_to_death")] <- 10000
  } else {
    if (group1 == 1) {
      metadata_recur[i, c("Group")] <- 1
    } else{
      metadata_recur[i, c("Group")] <- 2
    }
    metadata_recur[i, c("days_to_death")] <- as.numeric(row1$Overall.Survival..Months.) * 30
  }
}

metadata_recur$Group <- as.factor(metadata_recur$Group)
```

We can also create a metadata file to evaluate metastatic vs. non-metastatic samples, which can form group 1 and 2 respectively.

```{r}
#Access sample IDs to form column 1, then if the tumor is normal, assign group 0. If the tumor is not normal, reference clinical data for metastatic status, and assign group 1 to metastatic samples.
metadata_metas <- data.frame("Sample_ID"=c(colnames(lncRNA_final)[2:543]))

for (i in 1:542) {
  sample <- metadata_metas[i, 1]
  sample2 <- gsub("\\.", "-", sample)
  row1 <- clin_new[which(clin_new$Sample.ID == sample2), ]
  group1 <- row1$Metastasis_Group
  if (grepl(".11", sample, fixed=TRUE) == TRUE) {
    metadata_metas[i, c("Group")] <- 0
    metadata_metas[i, c("days_to_death")] <- 10000
  } else {
    if (group1 == 1) {
      metadata_metas[i, c("Group")] <- 1
    } else{
      metadata_metas[i, c("Group")] <- 2
    }
    metadata_metas[i, c("days_to_death")] <- as.numeric(row1$Overall.Survival..Months.)*30
  }
}

metadata_metas$Group <- as.factor(metadata_metas$Group)
```

Now, for each of our datasets, we can format them to work with DESeq2, and then run the appropriate analysis/visualizations for them.

#### lncRNA

```{r}
#Need to convert the gene names to the actual row names, and remove that column
lnc_DESEQ <- lncRNA_final[,-1]
rownames(lnc_DESEQ) <- lncRNA_final$Gene_ID
```



##### Metadata Metastasis
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(lnc_DESEQ), colData=metadata, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_lnc_metas <- results(dds, tidy=TRUE)
rownames(res_lnc_metas) <- res_lnc_metas[, 1]
res_lnc_metas <- res_lnc_metas %>% 
  arrange(res_lnc_metas$padj)
head(res_lnc_metas)

summary(res_lnc_metas)

#Analyze the distribution of p-values (likely skewed)
hist(res_lnc_metas$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_lnc_metas)[i], intgroup="Group", main=rownames(res_lnc_metas)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_lnc_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_lnc_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_lnc_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

Now, let's use the same methodology to work through the other RNA types we have with us:

#### miRNA

```{r}
#Need to convert the gene names to the actual row names, and remove that column
mi_DESEQ <- miRNA_final[,-1]
rownames(mi_DESEQ) <- miRNA_final$Gene_ID
```

##### Metadata Normal
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_miR_main <- results(dds, tidy=TRUE)
rownames(res_miR_main) <- res_miR_main[, 1]
res_miR_main <- res_miR_main %>% 
  arrange(res_miR_main$padj)
head(res_miR_main)

summary(res_miR_main)

#Analyze the distribution of p-values (likely skewed)
hist(res_miR_main$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_main)[i], intgroup="Group", main=rownames(res_miR_main)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_miR_main, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_main, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_main, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

##### Metadata Recurrence
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata_recur, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_miR_recur <- results(dds, tidy=TRUE)
rownames(res_miR_recur) <- res_miR_recur[, 1]
res_miR_recur <- res_miR_recur %>% 
  arrange(res_miR_recur$padj)
head(res_miR_recur)

summary(res_miR_recur)

#Analyze the distribution of p-values (likely skewed)
hist(res_miR_recur$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_recur)[i], intgroup="Group", main=rownames(res_miR_recur)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_miR_recur, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_recur, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_recur, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

##### Metadata Metastasis
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(mi_DESEQ), colData=metadata_metas, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_miR_metas <- results(dds, tidy=TRUE)
rownames(res_miR_metas) <- res_miR_metas[, 1]
res_miR_metas <- res_miR_metas %>% 
  arrange(res_miR_metas$padj)
head(res_miR_metas)

summary(res_miR_metas)

#Analyze the distribution of p-values (likely skewed)
hist(res_miR_metas$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_miR_metas)[i], intgroup="Group", main=rownames(res_miR_metas)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_miR_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_miR_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_miR_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

#### mRNA

```{r}
#Need to convert the gene names to the actual row names, and remove that column
m_DESEQ <- mRNA_final[,-1]
rownames(m_DESEQ) <- mRNA_final$Gene_ID
```

##### Metadata Normal
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_m_main <- results(dds, tidy=TRUE)
rownames(res_m_main) <- res_m_main[, 1]
res_m_main <- res_m_main %>% 
  arrange(res_m_main$padj)
head(res_m_main)

summary(res_m_main)

#Analyze the distribution of p-values (likely skewed)
hist(res_m_main$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_main)[i], intgroup="Group", main=rownames(res_m_main)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_m_main, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_main, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_main, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

##### Metadata Recurrence
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata_recur, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_m_recur <- results(dds, tidy=TRUE)
rownames(res_m_recur) <- res_m_recur[, 1]
res_m_recur <- res_m_recur %>% 
  arrange(res_m_recur$padj)
head(res_m_recur)

summary(res_m_recur)

#Analyze the distribution of p-values (likely skewed)
hist(res_m_recur$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_recur)[i], intgroup="Group", main=rownames(res_m_recur)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_m_recur, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_recur, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_recur, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```

##### Metadata Metastasis
```{r}
#Create the DESeq data and analyze it
dds <- DESeqDataSetFromMatrix(countData=round(m_DESEQ), colData=metadata_metas, design=~Group)
dds <- DESeq(dds)

#Plot the dispersion estimates of the differential data
plotDispEsts(dds)

#Access and store the results of DEG
res_m_metas <- results(dds, tidy=TRUE)
rownames(res_m_metas) <- res_m_metas[, 1]
res_m_metas <- res_m_metas %>% 
  arrange(res_m_metas$padj)
head(res_m_metas)

summary(res_m_metas)

#Analyze the distribution of p-values (likely skewed)
hist(res_m_metas$padj)

#Analyze distribution of p-values with LFC threshold of 1.5.
res2 <- results(dds, lfcThreshold=1.5, alpha=0.01)
hist(res2$padj)

for (i in 1:10) {
  plotCounts(dds, gene=rownames(res_m_metas)[i], intgroup="Group", main=rownames(res_m_metas)[i])
}

plotMA(dds, lfcThreshold=1.5, alpha=0.01)
#reset par
par(mfrow=c(1,1))
# Make a basic volcano plot
with(res_m_metas, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-3,3)))

# Add colored points: blue if padj<0.01, red if log2FC>1 and padj<0.05)
with(subset(res_m_metas, padj<.01 ), points(log2FoldChange, -log10(pvalue), pch=20, col="blue"))
with(subset(res_m_metas, padj<.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

library(pheatmap)
vsdata <- varianceStabilizingTransformation(dds, blind=TRUE)
plotPCA(vsdata, intgroup="Group")

rld_mat <- assay(vsdata)
rld_cor = cor(rld_mat)
pheatmap(rld_cor, cluster_rows=TRUE, cluster_cols=TRUE, show_colnames=FALSE, show_rownames = FALSE)
```


Let's test out the GDCRNATools package.

```{r}
library(GDCRNATools)
library("AnnotationDbi")
library("org.Hs.eg.db")

m_DESEQ <- m_DESEQ[30:20531,]
m_DESEQ$'ENTREZ' <- c(rownames(m_DESEQ))
m_DESEQ$'ENTREZ' <- gsub(".*\\|", "", m_DESEQ$'ENTREZ')
m_DESEQ$'Ensemble' <- mapIds(org.Hs.eg.db, keys=m_DESEQ$'ENTREZ', keytype="ENTREZID", column="ENSEMBL")
rownames(m_DESEQ) <- make.names(m_DESEQ$Ensemble, unique=TRUE)
m_DESEQ$Ensemble <- NULL
m_DESEQ$ENTREZ <- NULL
metadata_new <- data.frame(c(metadata$Sample_ID))
metadata_new$'Group' <- ifelse(metadata$Group == 0, 'NormalTissue', 'PrimaryTumor')

DEGAll <- gdcDEAnalysis(counts=round(m_DESEQ), group=metadata_new$Group, comparison='NormalTissue-PrimaryTumor', method='DESeq2')

dePC <- gdcDEReport(deg=DEGAll)
head(dePC)

gdcBarPlot(dePC, angle=45, data.type='RNAseq')



enrichOutput <- gdcEnrichAnalysis(gene=rownames(dePC)[1:500], simplify=TRUE)
gdcEnrichPlot(enrichment=enrichOutput, type='bar', category='KEGG', num.terms=5)
```

```{r}

```


