Distribution of five genes

Code

setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis")
#Read the sample information into R
sampleinfo <- read.csv("./data/final_pheno.csv")

# Read the data into R
seqdata=read.csv("data/sneuhausen_180612_180615AB_180618_hg19_125_RNAseq_HTSeq_RAW_fixed_sampleIDs.csv",stringsAsFactors = FALSE)

# Use the gene symbols to fetch the ENTREZIDs (we have Ensembl IDs)
hs <- org.Hs.eg.db
my.symbols <- seqdata[,2]
entrezid=select(hs, 
        keys = my.symbols,
        columns = c("ENTREZID", "SYMBOL"),
        keytype = "SYMBOL")
entrezid_keep <- na.omit(entrezid)
dim(entrezid_keep)
entrezid_keep$symbol=entrezid_keep$SYMBOL
seqdata2 <- merge(seqdata,entrezid_keep,by="symbol")
dim(seqdata2)
length(unique(seqdata2$symbol)) == nrow(seqdata2)
length(unique(seqdata2$ENTREZID)) == nrow(seqdata2)
seqdata4=seqdata2[!duplicated(seqdata2[,136]),]
dim(seqdata4)
seqdata4 <- seqdata4[order(seqdata4$ENTREZID),]
write.csv(seqdata4, "./data/seqdata_with_entrezID.csv")

# Remove first two columns from seqdata
countdata <- seqdata4[,-(1:9)]

# Store EntrezGeneID as rownames
rownames(countdata) <- countdata[,127]

# Remove the 2 extra columns at the end
countdata <- countdata[,-(126:127)]

substr("ThisIsAString", start=1, stop=5)
# using substr, you extract the characters starting at position 1 and stopping at position 7 of the colnames
colnames(countdata) <- substr(colnames(countdata), 2, 7)

# Obtain CPMs
myCPM <- cpm(countdata)
# Have a look at the output
head(myCPM)

# Which values in myCPM are greater than 0.5?
thresh <- myCPM > 0.5
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)

# Summary of how many TRUEs there are in each row
table(rowSums(thresh))

# Keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2 #all rows fit this criteria

# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)
dim(counts.keep)

dgeObj <- DGEList(counts.keep)
# have a look at dgeObj
dgeObj
# See what slots are stored in dgeObj
names(dgeObj)
# Library size information is stored in the samples slot
dgeObj$samples

# Get log2 counts per million
logcounts <- cpm(dgeObj,log=TRUE)

# Bring in the gene annotation 
ann <- select(org.Hs.eg.db,keys=rownames(logcounts),columns=c("ENTREZID","SYMBOL","GENENAME"))
# Have a look at the annotation
head(ann, n=2)

table(ann$ENTREZID==rownames(logcounts)) 

logcounts.annotated <- cbind(logcounts, ann)
head(logcounts.annotated$GENENAME, n=2)
setwd("/home/chaadams/cadams.grp/Seq/180618/cadams_analysis/Overexpression")
write.csv(logcounts.annotated,file="./data/Annotated_logcounts_for_overexpression_select_genes.csv",row.names=FALSE)

# Set-up the data:
#shorten the name for ease of typing
#transform it such that the rownames are the gene symbol
#and then transpose it such that the columns are genes 

mydata=logcounts.annotated
mylist=c("2064", "672", "675", "4609", "79728")

data=mydata[mylist, ]
rownames(data)=data$SYMBOL

#transpose such that the genes identified by ENTREZID are the colnames; patients the rows
data_t=t(data)

#remove the extra rows
data_t_clean=data_t[-c(126:128),]

#give it a final nice name
d=data_t_clean
head(d)
str(d)

#change to a data frame from a list
#change factors to numeric

asNumeric <- function(x) as.numeric(as.character(x))
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)],   
                                                   asNumeric))
f=factorsNumeric(d)
str(f)

save(dgeObj,logcounts, logcounts.annotated, sampleinfo,counts.keep, seqdata4, f, file="./data/overexpression_select_genes.Rdata")


#get histograms
#get them all on one page vs one-by-one
hist(f$ERBB2)
library(tidyr)
library(ggplot2)

png(file="./figures/Histograms_5genes.png")
ggplot(gather(f), aes(value)) + 
  geom_histogram(bins = 10) + 
  facet_wrap(~key, scales = 'free_x')
dev.off()

## Copy over the main Linux directory to my Windows directory to access it and run Markdown
Distribution of five genes

Charleen D. Adams

September 26, 2018

Code

Histrograms for BRCA1, BRCA2, ERBB2 (HER2), MYC, PALB2