title: “compare BRCA data with the filtered data set before NMF analysis”
author: “Emine Guven”
date: “10 June 2020”
output:
library(matrixStats)
library(edgeR)
## Loading required package: limma
library(RColorBrewer)
library(ggplot2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(LSD)
library(NMF)
## Loading required package: pkgmaker
## Loading required package: registry
## Loading required package: rngtools
## Loading required package: cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [OK] | Cores 3/4

Breast = read.csv(file = “Breast.csv”,header=T,stringsAsFactors=F)

saveRDS(Breast, file = “breast.rds”)

load the BRCA data

BRCA <- readRDS(file = "breast.rds")
final_filtered_set<-readRDS(file="060920filtered.rds")


patients<-colnames(BRCA) #this gives just numbers of the expressed genes
BRCA[1:3,1:3]
##            GeneNames TCGA.A2.A25E.01A.11R.A169.07.tumor
## 1  ENSG00000242268.2                        0.000000000
## 2  ENSG00000270112.3                        0.003031674
## 3 ENSG00000167578.15                        3.022232825
##   TCGA.C8.A3M7.01A.12R.A21T.07.tumor
## 1                          0.1615208
## 2                          0.0000000
## 3                          2.4301671
patients<-BRCA[1,]
patients<-patients[-1]
patients[1:3] #checking patients 1 to 3
##   TCGA.A2.A25E.01A.11R.A169.07.tumor TCGA.C8.A3M7.01A.12R.A21T.07.tumor
## 1                                  0                          0.1615208
##   TCGA.D8.A1JG.01B.11R.A13Q.07.tumor
## 1                         0.01724332
geneNames = BRCA[,1] # okay finally this gives the geneNames
head(geneNames)
## [1] "ENSG00000242268.2"  "ENSG00000270112.3"  "ENSG00000167578.15"
## [4] "ENSG00000273842.1"  "ENSG00000078237.5"  "ENSG00000146083.10"
BRCA=BRCA[,-1] #this drops the first column; geneNames
#newBRCA=BRCA[1:100,1:100]

rownames(BRCA)=geneNames

BRCA<-data.matrix(BRCA,rownames.force = NA)
rownames(BRCA)=geneNames;

######filtered data##########

dim(final_filtered_set)
## [1]  688 1222
subset_genes=rownames(final_filtered_set)
head(subset_genes)
## [1] "ENSG00000251615.3"  "ENSG00000143224.16" "ENSG00000131845.13"
## [4] "ENSG00000166896.6"  "ENSG00000066697.13" "ENSG00000111684.9"
length(subset_genes)
## [1] 688
subset_patients=colnames(final_filtered_set)
head(subset_patients)
## [1] "TCGA.A2.A25E.01A.11R.A169.07.tumor" "TCGA.C8.A3M7.01A.12R.A21T.07.tumor"
## [3] "TCGA.D8.A1JG.01B.11R.A13Q.07.tumor" "TCGA.AO.A0JA.01A.11R.A056.07.tumor"
## [5] "TCGA.A2.A4S0.01A.21R.A266.07.tumor" "TCGA.B6.A40C.01A.11R.A239.07.tumor"
length(subset_patients)
## [1] 1222
summary(as.numeric(BRCA))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0      0.0      0.0      5.1      0.7 430170.0
summary(as.numeric(round(BRCA)))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0      0.0      0.0      5.1      1.0 430170.0
summary(as.numeric(final_filtered_set))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   4.000   4.953   6.000 111.000
summary(as.numeric(round(final_filtered_set)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   4.000   4.953   6.000 111.000
global_mean=mean(BRCA)
filtered_mean=mean(final_filtered_set)
#pdf(file = "/Users/user/Documents/github/APOBEC3-CANCER/analysis/scatter_BRCA_1.pdf")
#do with log2
par(mfrow=c(1,2))
smoothScatter(log2(BRCA), xlab = 'expressed genes:log2FPKM', ylab='patients') #,xlim=c(0,7),ylim=c(0,7),pch=20
abline(v = log2(global_mean), col = 'red')
text(1, 0, "global_mean", col = "black", adj = c(0, -.1))
smoothScatter(log2(final_filtered_set), xlab = 'filtered expressed genes:log2FPKM', ylab='patients') #,xlim=c(0,7),ylim=c(0,7),pch=20
abline(v = log2(filtered_mean), col = 'red')
text(1, 0, "filtered_mean", col = "black", adj = c(0, -.2))

another way of doing scatter plot
#the same here would make more sense with log2
#pdf("~/Documents/NMFmodel_TCGAdata/plots/log_FPKMvsfilteredFPKM.pdf")
#par(mfrow=c(1,2))
plot(log2(BRCA), col = "navy blue", 
     pch = 17, cex=0.5, xlab = 'expressed genes:log2(FPKM)', ylab = 'patients')
abline(v = log2(global_mean))
#text(1, 0, "global_mean", col = "black", adj = c(0, -.1))

par(new=TRUE)
plot(log2(final_filtered_set), col= "pink", pch = 4, xaxt='n',yaxt='n', cex=0.2, xlab='',ylab='')

#text(1, 0, "filtered_mean", col = "black", adj = c(0, -.1))
#dev.off()
make heat scatter
par(mfrow=c(1,2))
heatscatter(BRCA[,1], BRCA[,2], pch = 19, add.contour = TRUE,
            colpal = "spectral", cor=FALSE)
abline(v = global_mean)
text(1, 0, "mean(FPKM)", col = "blue", adj = c(0, -.1))

heatscatter(final_filtered_set[,1], final_filtered_set[,2], pch = 19, add.contour = TRUE, colpal = "spectral", cor=FALSE)
abline(v = filtered_mean)
text(1, 0, "mean(FPKM)", col = "blue", adj = c(1, -.1))

make heat scatter
par(mfrow=c(1,2))
heatscatter(log2(BRCA[,1]), log2(BRCA[,2]), pch = 19, add.contour = TRUE,
            colpal = "spectral", cor=FALSE)
abline(v = log2(global_mean))
text(1, 0, "log2(mean(FPKM))", col = "blue", adj = c(0, -.1))

heatscatter(log2(final_filtered_set[,1]), log2(final_filtered_set[,2]), pch = 19, add.contour = TRUE, colpal = "spectral", cor=FALSE)
abline(v = log2(filtered_mean))
text(1, 0, "log2(mean(FPKM))", col = "blue", adj = c(1, -.1))

make histogram with the row logs
par(mfrow=c(2,2))
hist(BRCA,main="raw data")
hist(log2(BRCA), breaks = length(BRCA)/100, main="log(raw) data")
hist(final_filtered_set,breaks=30, main="filtered_data raw" )
hist(log2(final_filtered_set), breaks=30,main="log(filtered_data)")

check cv values for each row
fpkm_sd = rowSds(BRCA)
fpkm_mean = rowMeans(BRCA)

fpkm_cv = fpkm_sd/fpkm_mean


filtered_sd = rowSds(final_filtered_set)
filtered_mean = rowMeans(final_filtered_set)

filtered_cv = filtered_sd/filtered_mean
#plot cv values for the big data
#The first thing that comes to mind is to use the Standard Deviation (SD). #This will give us a measure of the variability in the spread of the data #from the averages. Using something like the Coefficient of Variation (CV), #which is the SD divided by the average, one can express the same value #relative to the average. I think the latter is more appropriate in this #case, since it is easier to compare across features/genes.
par(mfrow=c(2,2))
plot(fpkm_mean, fpkm_cv, main = "cv  vs mean BRCA raw data")
plot(log2(fpkm_mean), log2(fpkm_cv), main = "cv vs mean BRCA log2(fpkm) transformation")

plot(fpkm_sd, fpkm_cv, main = "cv vs std BRCA raw data")
plot(log2(fpkm_sd), log2(fpkm_cv), main = "cv vs std BRCA log2(fpkm) transformation")

###plot cv vs mean etc for filtered data set
par(mfrow=c(2,2))
plot(filtered_mean, filtered_cv, main = "filtered raw data")
plot(log2(filtered_mean), log2(filtered_cv), main = "cv vs mean log2(filtered) tranformation")

plot(filtered_sd, filtered_cv, main = "filtered raw data")
plot(log2(filtered_sd), log2(filtered_cv), main = "cv vs std log2(filtered) tranformation")

#pdf("heatmap_filterd.pdf")
aheatmap(final_filtered_set, Rowv = NA, Colv = NA, main= "heatmap of whole filtered",color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.1, cexCol = 0.1 )

aheatmap(final_filtered_set[1:100,1:100],  main= "heatmap of filtered subset=100", Rowv = NA, Colv = NA, color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.3, cexCol = 0.3 )

#dev.off()
aheatmap(BRCA[1:100,1:100],   Rowv = NA, Colv = NA, main= "heatmap of whole data subset=100",color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.2, cexCol = 0.2 )

#dev.off()