#/////////////////////////////////////////////////////////////////////////////
#   LIBRARIES AND CUSTOM FUNCTIONS
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


library(edgeR)
## Loading required package: limma
library(car)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(reshape)
library(stringr)
#library(affycoretools)
library(limma)
library(ggplot2)
require(biomaRt)
## Loading required package: biomaRt
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:biomaRt':
## 
##     select
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Run once #################################################################################################
library(biomaRt)
ensMart<-useMart("ensembl")
ensembl_hs_mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")


#head(listAttributes(ensembl_hs_mart),n=157)
#grep("length",listAttributes(ensembl_hs_mart)$description,ignore.case = T,value=T)

ensembl_df <- getBM(attributes=c("ensembl_gene_id", "refseq_mrna","description",
                                 "hgnc_symbol","chromosome_name"),mart=ensembl_hs_mart)

head(ensembl_df)
##   ensembl_gene_id refseq_mrna
## 1 ENSG00000261657            
## 2 ENSG00000223116            
## 3 ENSG00000233440            
## 4 ENSG00000207157            
## 5 ENSG00000229483            
## 6 ENSG00000252952   NR_046932
##                                                                                         description
## 1 solute carrier family 25 (S-adenosylmethionine carrier), member 26 [Source:HGNC Symbol;Acc:20661]
## 2                                                                                                  
## 3                         high mobility group AT-hook 1 pseudogene 6 [Source:HGNC Symbol;Acc:19121]
## 4                                 RNA, Ro-associated Y3 pseudogene 4 [Source:HGNC Symbol;Acc:42488]
## 5                         long intergenic non-protein coding RNA 362 [Source:HGNC Symbol;Acc:42682]
## 6                               RNA, U6 small nuclear 58, pseudogene [Source:HGNC Symbol;Acc:42548]
##   hgnc_symbol chromosome_name
## 1    SLC25A26     HG991_PATCH
## 2                          13
## 3     HMGA1P6              13
## 4      RNY3P4              13
## 5   LINC00362              13
## 6    RNU6-58P              13
# /Run once #################################################################################################



# Set wordking folder for results files
setwd("~/Projects/2014-08-19_GlucoseIslets_2014//Analysis")

#/////////////////////////////////////////////////////////////////////////////
# Read in data ------------------------------------------------------------
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


## Read in the data making the row names the first column
counttable_all <- read.table("~/Projects/2014-08-19_GlucoseIslets_2014/RawData/RNAseq_human_islets.featureCounts.nolastline.tsv", header=T, row.names=1)
#head(counttable_all)

colnames(counttable_all)<-sapply(strsplit(colnames(counttable_all), "\\."), `[`, 4)

# fix stupid naming
colnames(counttable_all)[55] <- "HTL0006"




#/////////////////////////////////////////////////////////////////////////////
# Bring in metadata
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


meta <- read.csv("~/Projects/2013-10_HTL_RNASeq/Metadata/20131020_phenotypes_from_intranet.csv",row.names=1,sep="\t")

meta$Glycemic<-factor(recode(meta$Hba1c,"0:6='Normo';6:6.5='IGT';6.5:1000='T2D'"))
meta$Glycemic <- relevel(meta$Glycemic,ref="Normo")

meta$Donor <- rownames(meta)

#/////////////////////////////////////////////////////////////////////////////
# Add  information for high glucose 
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

highcl <-  colnames(counttable_all)[grep("High",colnames(counttable_all))]

HighSamples <- gsub("HighGlucose","",highcl)

meta$Treatment <- "Low"


meta$InGlucosePair <- 0
meta[HighSamples,]$InGlucosePair <- 1



MetaHigh <- meta[HighSamples,]
rownames(MetaHigh) <- highcl
MetaHigh$Treatment <- "High"

MetaComplete <- rbind(meta,MetaHigh)



#/////////////////////////////////////////////////////////////////////////////
# Final clean up
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


MetaSeqSamples <- MetaComplete[colnames(counttable_all),]

MetaSeqSamples$Batch <- ifelse(as.integer(substr(MetaSeqSamples$Donor,5,8)) > 110,"B2","B1")


#/////////////////////////////////////////////////////////////////////////////
# Remove samples with < 10M counts
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Samples to remove
remove_to_few_reads<-colnames(counttable_all[,colSums(counttable_all) / 1000000 < 3])

kept_samples<-colnames(counttable_all[,colSums(counttable_all) / 1000000 > 3])

CounttableQC <- counttable_all[,kept_samples]
MetaSeqSamplesQC <- MetaSeqSamples[colnames(CounttableQC),]



rownames(CounttableQC) <- substr(rownames(CounttableQC),1,15)


#/////////////////////////////////////////////////////////////////////////////
# All low samples
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


par(mfrow=c(1,1))

design<-model.matrix(~Batch+Treatment,MetaSeqSamplesQC)
y <- DGEList(counts=CounttableQC, genes=rownames(CounttableQC))
#isexpr <- rowSums(cpm(y)>1) >= 10
#y <- y[isexpr,]
v <- voom(y,design,plot=T)

plot of chunk unnamed-chunk-1

plotMDS(v,col=rainbow(2)[as.factor(MetaSeqSamplesQC$Treatment)],top=50,gene.selection="common")

plot of chunk unnamed-chunk-1

plotMDS(v,col=topo.colors(2)[as.factor(MetaSeqSamplesQC$Gender)],top=50,gene.selection="common")

plot of chunk unnamed-chunk-1