load the BRCA data
BRCA <- readRDS(file = "breast.rds")
final_filtered_set<-readRDS(file="060920filtered.rds")
patients<-colnames(BRCA) #this gives just numbers of the expressed genes
BRCA[1:3,1:3]
## GeneNames TCGA.A2.A25E.01A.11R.A169.07.tumor
## 1 ENSG00000242268.2 0.000000000
## 2 ENSG00000270112.3 0.003031674
## 3 ENSG00000167578.15 3.022232825
## TCGA.C8.A3M7.01A.12R.A21T.07.tumor
## 1 0.1615208
## 2 0.0000000
## 3 2.4301671
patients<-BRCA[1,]
patients<-patients[-1]
patients[1:3] #checking patients 1 to 3
## TCGA.A2.A25E.01A.11R.A169.07.tumor TCGA.C8.A3M7.01A.12R.A21T.07.tumor
## 1 0 0.1615208
## TCGA.D8.A1JG.01B.11R.A13Q.07.tumor
## 1 0.01724332
geneNames = BRCA[,1] # okay finally this gives the geneNames
head(geneNames)
## [1] "ENSG00000242268.2" "ENSG00000270112.3" "ENSG00000167578.15"
## [4] "ENSG00000273842.1" "ENSG00000078237.5" "ENSG00000146083.10"
BRCA=BRCA[,-1] #this drops the first column; geneNames
#newBRCA=BRCA[1:100,1:100]
rownames(BRCA)=geneNames
BRCA<-data.matrix(BRCA,rownames.force = NA)
rownames(BRCA)=geneNames;
######filtered data##########
dim(final_filtered_set)
## [1] 688 1222
subset_genes=rownames(final_filtered_set)
head(subset_genes)
## [1] "ENSG00000251615.3" "ENSG00000143224.16" "ENSG00000131845.13"
## [4] "ENSG00000166896.6" "ENSG00000066697.13" "ENSG00000111684.9"
length(subset_genes)
## [1] 688
subset_patients=colnames(final_filtered_set)
head(subset_patients)
## [1] "TCGA.A2.A25E.01A.11R.A169.07.tumor" "TCGA.C8.A3M7.01A.12R.A21T.07.tumor"
## [3] "TCGA.D8.A1JG.01B.11R.A13Q.07.tumor" "TCGA.AO.A0JA.01A.11R.A056.07.tumor"
## [5] "TCGA.A2.A4S0.01A.21R.A266.07.tumor" "TCGA.B6.A40C.01A.11R.A239.07.tumor"
length(subset_patients)
## [1] 1222
summary(as.numeric(BRCA))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 5.1 0.7 430170.0
summary(as.numeric(round(BRCA)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 5.1 1.0 430170.0
summary(as.numeric(final_filtered_set))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 4.000 4.953 6.000 111.000
summary(as.numeric(round(final_filtered_set)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 4.000 4.953 6.000 111.000
global_mean=mean(BRCA)
filtered_mean=mean(final_filtered_set)
#pdf(file = "/Users/user/Documents/github/APOBEC3-CANCER/analysis/scatter_BRCA_1.pdf")
#do with log2
par(mfrow=c(1,2))
smoothScatter(log2(BRCA), xlab = 'expressed genes:log2FPKM', ylab='patients') #,xlim=c(0,7),ylim=c(0,7),pch=20
abline(v = log2(global_mean), col = 'red')
text(1, 0, "global_mean", col = "black", adj = c(0, -.1))
smoothScatter(log2(final_filtered_set), xlab = 'filtered expressed genes:log2FPKM', ylab='patients') #,xlim=c(0,7),ylim=c(0,7),pch=20
abline(v = log2(filtered_mean), col = 'red')
text(1, 0, "filtered_mean", col = "black", adj = c(0, -.2))

another way of doing scatter plot
#the same here would make more sense with log2
#pdf("~/Documents/NMFmodel_TCGAdata/plots/log_FPKMvsfilteredFPKM.pdf")
#par(mfrow=c(1,2))
plot(log2(BRCA), col = "navy blue",
pch = 17, cex=0.5, xlab = 'expressed genes:log2(FPKM)', ylab = 'patients')
abline(v = log2(global_mean))
#text(1, 0, "global_mean", col = "black", adj = c(0, -.1))
par(new=TRUE)
plot(log2(final_filtered_set), col= "pink", pch = 4, xaxt='n',yaxt='n', cex=0.2, xlab='',ylab='')

#text(1, 0, "filtered_mean", col = "black", adj = c(0, -.1))
#dev.off()
make heat scatter
par(mfrow=c(1,2))
heatscatter(BRCA[,1], BRCA[,2], pch = 19, add.contour = TRUE,
colpal = "spectral", cor=FALSE)
abline(v = global_mean)
text(1, 0, "mean(FPKM)", col = "blue", adj = c(0, -.1))
heatscatter(final_filtered_set[,1], final_filtered_set[,2], pch = 19, add.contour = TRUE, colpal = "spectral", cor=FALSE)
abline(v = filtered_mean)
text(1, 0, "mean(FPKM)", col = "blue", adj = c(1, -.1))

make heat scatter
par(mfrow=c(1,2))
heatscatter(log2(BRCA[,1]), log2(BRCA[,2]), pch = 19, add.contour = TRUE,
colpal = "spectral", cor=FALSE)
abline(v = log2(global_mean))
text(1, 0, "log2(mean(FPKM))", col = "blue", adj = c(0, -.1))
heatscatter(log2(final_filtered_set[,1]), log2(final_filtered_set[,2]), pch = 19, add.contour = TRUE, colpal = "spectral", cor=FALSE)
abline(v = log2(filtered_mean))
text(1, 0, "log2(mean(FPKM))", col = "blue", adj = c(1, -.1))

make histogram with the row logs
par(mfrow=c(2,2))
hist(BRCA,main="raw data")
hist(log2(BRCA), breaks = length(BRCA)/100, main="log(raw) data")
hist(final_filtered_set,breaks=30, main="filtered_data raw" )
hist(log2(final_filtered_set), breaks=30,main="log(filtered_data)")

check cv values for each row
fpkm_sd = rowSds(BRCA)
fpkm_mean = rowMeans(BRCA)
fpkm_cv = fpkm_sd/fpkm_mean
filtered_sd = rowSds(final_filtered_set)
filtered_mean = rowMeans(final_filtered_set)
filtered_cv = filtered_sd/filtered_mean
#plot cv values for the big data
#The first thing that comes to mind is to use the Standard Deviation (SD). #This will give us a measure of the variability in the spread of the data #from the averages. Using something like the Coefficient of Variation (CV), #which is the SD divided by the average, one can express the same value #relative to the average. I think the latter is more appropriate in this #case, since it is easier to compare across features/genes.
par(mfrow=c(2,2))
plot(fpkm_mean, fpkm_cv, main = "cv vs mean BRCA raw data")
plot(log2(fpkm_mean), log2(fpkm_cv), main = "cv vs mean BRCA log2(fpkm) transformation")
plot(fpkm_sd, fpkm_cv, main = "cv vs std BRCA raw data")
plot(log2(fpkm_sd), log2(fpkm_cv), main = "cv vs std BRCA log2(fpkm) transformation")

###plot cv vs mean etc for filtered data set
par(mfrow=c(2,2))
plot(filtered_mean, filtered_cv, main = "filtered raw data")
plot(log2(filtered_mean), log2(filtered_cv), main = "cv vs mean log2(filtered) tranformation")
plot(filtered_sd, filtered_cv, main = "filtered raw data")
plot(log2(filtered_sd), log2(filtered_cv), main = "cv vs std log2(filtered) tranformation")

#pdf("heatmap_filterd.pdf")
aheatmap(final_filtered_set, Rowv = NA, Colv = NA, main= "heatmap of whole filtered",color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.1, cexCol = 0.1 )

aheatmap(final_filtered_set[1:100,1:100], main= "heatmap of filtered subset=100", Rowv = NA, Colv = NA, color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.3, cexCol = 0.3 )

#dev.off()
aheatmap(BRCA[1:100,1:100], Rowv = NA, Colv = NA, main= "heatmap of whole data subset=100",color = colorRampPalette(c("light blue", "red", "green", "yellow"))(20), cexRow = 0.2, cexCol = 0.2 )

#dev.off()