## Create a directory in Windows
C:\Users\chaadams\Dropbox\COH\Hispanic tumours\Overexpression select genes\Linux version

## Create a Linux directory 
cd /home/chaadams/cadams.grp/Seq/180618/cadams_analysis/
  module add R/3.5.1
R

## Dependencies 
source("http://bioconductor.org/biocLite.R")
biocLite("annotate")
biocLite("GO.db")
biocLite("org.Hs.egENSEMBL")
biocLite("limma")
biocLite("Glimma")
biocLite("gplots")
biocLite("org.Hs.eg.db")
biocLite("RColorBrewer")
biocLite("biomaRt")
biocLite("edgeR")

library(GO.db)
library(edgeR)
library(gplots)
library(RColorBrewer)
library(edgeR)
library(limma)
library(Glimma)
library(gplots)
library(org.Hs.eg.db)
library(RColorBrewer)
library(biomaRt)
library(annotate)

Code

setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis")
#Read the sample information into R
sampleinfo <- read.csv("./data/final_pheno.csv")

# Read the data into R
seqdata=read.csv("data/sneuhausen_180612_180615AB_180618_hg19_125_RNAseq_HTSeq_RAW_fixed_sampleIDs.csv",stringsAsFactors = FALSE)

# Use the gene symbols to fetch the ENTREZIDs (we have Ensembl IDs)
hs <- org.Hs.eg.db
my.symbols <- seqdata[,2]
entrezid=select(hs, 
        keys = my.symbols,
        columns = c("ENTREZID", "SYMBOL"),
        keytype = "SYMBOL")
entrezid_keep <- na.omit(entrezid)
dim(entrezid_keep)
entrezid_keep$symbol=entrezid_keep$SYMBOL
seqdata2 <- merge(seqdata,entrezid_keep,by="symbol")
dim(seqdata2)
length(unique(seqdata2$symbol)) == nrow(seqdata2)
length(unique(seqdata2$ENTREZID)) == nrow(seqdata2)
seqdata4=seqdata2[!duplicated(seqdata2[,136]),]
dim(seqdata4)
seqdata4 <- seqdata4[order(seqdata4$ENTREZID),]
write.csv(seqdata4, "./data/seqdata_with_entrezID.csv")

# Remove first two columns from seqdata
countdata <- seqdata4[,-(1:9)]

# Store EntrezGeneID as rownames
rownames(countdata) <- countdata[,127]

# Remove the 2 extra columns at the end
countdata <- countdata[,-(126:127)]

substr("ThisIsAString", start=1, stop=5)
# using substr, you extract the characters starting at position 1 and stopping at position 7 of the colnames
colnames(countdata) <- substr(colnames(countdata), 2, 7)

# Obtain CPMs
myCPM <- cpm(countdata)
# Have a look at the output
head(myCPM)

# Which values in myCPM are greater than 0.5?
thresh <- myCPM > 0.5
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)

# Summary of how many TRUEs there are in each row
table(rowSums(thresh))

# Keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2 #all rows fit this criteria

# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)
dim(counts.keep)

dgeObj <- DGEList(counts.keep)
# have a look at dgeObj
dgeObj
# See what slots are stored in dgeObj
names(dgeObj)
# Library size information is stored in the samples slot
dgeObj$samples

# Get log2 counts per million
logcounts <- cpm(dgeObj,log=TRUE)

# Bring in the gene annotation 
ann <- select(org.Hs.eg.db,keys=rownames(logcounts),columns=c("ENTREZID","SYMBOL","GENENAME"))
# Have a look at the annotation
head(ann, n=2)

table(ann$ENTREZID==rownames(logcounts)) 

logcounts.annotated <- cbind(logcounts, ann)
head(logcounts.annotated$GENENAME, n=2)
setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis/Overexpression")
write.csv(logcounts.annotated,file="./data/Annotated_logcounts_for_overexpression_select_genes.csv",row.names=FALSE)

# Set-up the data:
#shorten the name for ease of typing
#transform it such that the rownames are the gene symbol
#and then transpose it such that the columns are genes 

mydata=logcounts.annotated
mylist=c("2064", "672", "675", "4609", "79728")

data=mydata[mylist, ]
rownames(data)=data$SYMBOL

#transpose such that the genes identified by ENTREZID are the colnames; patients the rows
data_t=t(data)

#remove the extra rows
data_t_clean=data_t[-c(126:128),]

#give it a final nice name
d=data_t_clean
head(d)
str(d)

#change to a data frame from a list
#change factors to numeric

asNumeric <- function(x) as.numeric(as.character(x))
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)],   
                                                   asNumeric))
f=factorsNumeric(d)
str(f)

save(dgeObj,logcounts, logcounts.annotated, sampleinfo,counts.keep, seqdata4, f, file="./data/overexpression_select_genes.Rdata")


#get histograms
#get them all on one page vs one-by-one
hist(f$ERBB2)
library(tidyr)
library(ggplot2)

png(file="./figures/Histograms_5genes.png")
ggplot(gather(f), aes(value)) + 
  geom_histogram(bins = 10) + 
  facet_wrap(~key, scales = 'free_x')
dev.off()

## Copy over the main Linux directory to my Windows directory to access it and run Markdown

Histrograms for BRCA1, BRCA2, ERBB2 (HER2), MYC, PALB2