## Create a directory in Windows
C:\Users\chaadams\Dropbox\COH\Hispanic tumours\Overexpression select genes\Linux version
## Create a Linux directory
cd /home/chaadams/cadams.grp/Seq/180618/cadams_analysis/
module add R/3.5.1
R
## Dependencies
source("http://bioconductor.org/biocLite.R")
biocLite("annotate")
biocLite("GO.db")
biocLite("org.Hs.egENSEMBL")
biocLite("limma")
biocLite("Glimma")
biocLite("gplots")
biocLite("org.Hs.eg.db")
biocLite("RColorBrewer")
biocLite("biomaRt")
biocLite("edgeR")
library(GO.db)
library(edgeR)
library(gplots)
library(RColorBrewer)
library(edgeR)
library(limma)
library(Glimma)
library(gplots)
library(org.Hs.eg.db)
library(RColorBrewer)
library(biomaRt)
library(annotate)
Code
setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis")
#Read the sample information into R
sampleinfo <- read.csv("./data/final_pheno.csv")
# Read the data into R
seqdata=read.csv("data/sneuhausen_180612_180615AB_180618_hg19_125_RNAseq_HTSeq_RAW_fixed_sampleIDs.csv",stringsAsFactors = FALSE)
# Use the gene symbols to fetch the ENTREZIDs (we have Ensembl IDs)
hs <- org.Hs.eg.db
my.symbols <- seqdata[,2]
entrezid=select(hs,
keys = my.symbols,
columns = c("ENTREZID", "SYMBOL"),
keytype = "SYMBOL")
entrezid_keep <- na.omit(entrezid)
dim(entrezid_keep)
entrezid_keep$symbol=entrezid_keep$SYMBOL
seqdata2 <- merge(seqdata,entrezid_keep,by="symbol")
dim(seqdata2)
length(unique(seqdata2$symbol)) == nrow(seqdata2)
length(unique(seqdata2$ENTREZID)) == nrow(seqdata2)
seqdata4=seqdata2[!duplicated(seqdata2[,136]),]
dim(seqdata4)
seqdata4 <- seqdata4[order(seqdata4$ENTREZID),]
write.csv(seqdata4, "./data/seqdata_with_entrezID.csv")
# Remove first two columns from seqdata
countdata <- seqdata4[,-(1:9)]
# Store EntrezGeneID as rownames
rownames(countdata) <- countdata[,127]
# Remove the 2 extra columns at the end
countdata <- countdata[,-(126:127)]
substr("ThisIsAString", start=1, stop=5)
# using substr, you extract the characters starting at position 1 and stopping at position 7 of the colnames
colnames(countdata) <- substr(colnames(countdata), 2, 7)
# Obtain CPMs
myCPM <- cpm(countdata)
# Have a look at the output
head(myCPM)
# Which values in myCPM are greater than 0.5?
thresh <- myCPM > 0.5
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)
# Summary of how many TRUEs there are in each row
table(rowSums(thresh))
# Keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2 #all rows fit this criteria
# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)
dim(counts.keep)
dgeObj <- DGEList(counts.keep)
# have a look at dgeObj
dgeObj
# See what slots are stored in dgeObj
names(dgeObj)
# Library size information is stored in the samples slot
dgeObj$samples
# Get log2 counts per million
logcounts <- cpm(dgeObj,log=TRUE)
# Bring in the gene annotation
ann <- select(org.Hs.eg.db,keys=rownames(logcounts),columns=c("ENTREZID","SYMBOL","GENENAME"))
# Have a look at the annotation
head(ann, n=2)
table(ann$ENTREZID==rownames(logcounts))
logcounts.annotated <- cbind(logcounts, ann)
head(logcounts.annotated$GENENAME, n=2)
setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis/Overexpression")
write.csv(logcounts.annotated,file="./data/Annotated_logcounts_for_overexpression_select_genes.csv",row.names=FALSE)
# Set-up the data:
#shorten the name for ease of typing
#transform it such that the rownames are the gene symbol
#and then transpose it such that the columns are genes
mydata=logcounts.annotated
mylist=c("2064", "672", "675", "4609", "79728")
data=mydata[mylist, ]
rownames(data)=data$SYMBOL
#transpose such that the genes identified by ENTREZID are the colnames; patients the rows
data_t=t(data)
#remove the extra rows
data_t_clean=data_t[-c(126:128),]
#give it a final nice name
d=data_t_clean
head(d)
str(d)
#change to a data frame from a list
#change factors to numeric
asNumeric <- function(x) as.numeric(as.character(x))
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)],
asNumeric))
f=factorsNumeric(d)
str(f)
save(dgeObj,logcounts, logcounts.annotated, sampleinfo,counts.keep, seqdata4, f, file="./data/overexpression_select_genes.Rdata")
#get histograms
#get them all on one page vs one-by-one
hist(f$ERBB2)
library(tidyr)
library(ggplot2)
png(file="./figures/Histograms_5genes.png")
ggplot(gather(f), aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = 'free_x')
dev.off()
## Copy over the main Linux directory to my Windows directory to access it and run Markdown
Histrograms for BRCA1, BRCA2, ERBB2 (HER2), MYC, PALB2