Experiment design

  • Organism : Mus musculus
  • Experiment type : RNA Sequences
  • Background : Hyperhomocysteinemia (HHcy) is regarded as an independent predictor of cardiovascular morbidity and mortality in end-stage renal disease. Hhcy exerts its pathogenic action on the main processes involved in the progression of vascular damage. Research has shown Hhcy suggests enhanced risks for inflammation and endothelial injury which lead to cardiovascular disease (CVD), stroke, and CKD. We used RNASeq to detail the global programme of gene expression underlying hyperhomocysteinemia and identified distinct classes of differentially regulated genes in kidney.
  • Samples information :
  1. Control: Control group
  2. H-Hcy: Hyperhomocysteinemia group
  3. Ischemia control: Ischemia control group
  4. Ischemia-H-Hcy: Ischemia + Hyperhomocysteinemia group
Group Barcode Sample_ID index Qubit
control c667 17-R-100 96-25 28.0
control c669 17-R-101 96-26 33.4
control c670 17-R-102 96-27 40.2
control c678 17-R-103 96-28 40.9
control c683 17-R-104 96-29 35.8
H-Hcy h653 17-R-105 96-30 39.1
H-Hcy h654 17-R-106 96-31 42.4
H-Hcy h655 17-R-107 96-32 34.6
H-Hcy h656 17-R-108 96-33 33.9
H-Hcy h658 17-R-109 96-34 34.6
Ischemia control Ic668 17-R-110 96-35 33.5
Ischemia control Ic673 17-R-111 96-36 41.4
Ischemia control Ic677 17-R-112 96-37 33.8
Ischemia control Ic679 17-R-113 96-38 31.9
Ischemia control Ic684 17-R-114 96-39 29.0
Ischemia-H-Hcy Ih649 17-R-115 96-40 34.3
Ischemia-H-Hcy Ih651 17-R-116 96-41 35.7
Ischemia-H-Hcy Ih652 17-R-117 96-42 40.3
Ischemia-H-Hcy Ih659 17-R-118 96-43 43.5
Ischemia-H-Hcy Ih661 17-R-119 96-44 42.7

RNASeq Analysis Environment Set Up

Reference Genomes and annotations

Reference Genomes and annotations from UCSC mm10: http://hgdownload.cse.ucsc.edu/goldenPath/mm10/

Adapter Trim And Fastq File QC

#!/bin/bash
#annotation and data
project_dir=/zzh_gpfs02/jixing/Project/leishjie_Hcy
fastq_raw=/zzh_gpfs02/jixing/DataBase/Hcy_before
#adapter file
adapter=/zzh_gpfs02/jixing/Project/leishjie_Hcy/adapter.fasta

#tools
trim_adapter=/zzh_gpfs/home/zzhgroup/jixing/workspace/biosoft/Trimmomatic-0.36/trimmomatic-0.36.jar
Fastqc=/zzh_gpfs/apps/FastQC/fastqc

P=8 #use 16 threads/cores
mkdir ${project_dir}/processed_data/fastq_processed
mkdir ${project_dir}/processed_data/fastqc

cd ${fastq_raw}
find . -name '*_R1_001.fastq.gz'| sed 's/_R1_001.fastq.gz//g'| tr -d './' > ${project_dir}/sample_id.txt

cd ${project_dir}/processed_data/fastq_processed

for SAMPLE_ID in `cat ${project_dir}/sample_id.txt`
do
cat > ${SAMPLE_ID}.sh <<EOF
java -jar ${trim_adapter} PE -phred33 ${fastq_raw}/${SAMPLE_ID}_R1_001.fastq.gz ${fastq_raw}/${SAMPLE_ID}_R2_001.fastq.gz\
      ${SAMPLE_ID}_R1.fastq.gz unpaired_${SAMPLE_ID}_R1.fastq.gz ${SAMPLE_ID}_R2.fastq.gz unpaired_${SAMPLE_ID}_R2.fastq.gz \
      ILLUMINACLIP:${adapter}:2:30:10:1:true \
      LEADING:3 TRAILING:3 \
      AVGQUAL:20 \
      SLIDINGWINDOW:7:15 MINLEN:50

${Fastqc} ${SAMPLE_ID}_R1.fastq.gz ${SAMPLE_ID}_R2.fastq.gz -o ${project_dir}/processed_data/fastqc

EOF
bsub < ${SAMPLE_ID}.sh
rm ${SAMPLE_ID}.sh
done

STAR Indexing

project_dir=/zzh_gpfs02/jixing/Project/leishjie_Hcy
ref_gtf=/zzh_gpfs02/jixing/Annotation/mm10/idx_hisat2/genes.gtf
ref_fasta=/zzh_gpfs02/jixing/Annotation/mm10/fa/genome.fa

#tools
RUN_STAR=/zzh_gpfs/home/zzhgroup/jixing/miniconda2/bin/STAR

#parameter
P=8 #use 8 threads/cores
sjdbOverhang=149

cd  ${project_dir}/processed_data
mkdir star_index
$RUN_STAR --runMode genomeGenerate --runThreadN $P \
--genomeDir ${project_dir}/processed_data/star_index \
--limitGenomeGenerateRAM 136410676608 \
--genomeFastaFiles ${ref_fasta} \
--sjdbGTFfile ${ref_gtf} \
--sjdbOverhang ${sjdbOverhang}

STAR Alignment

#!/bin/r
project_dir=/zzh_gpfs02/jixing/Project/leishjie_Hcy
fastq_clean=${project_dir}/processed_data/fastq_processed


ref_gtf=/zzh_gpfs02/jixing/Annotation/mm10/idx_hisat2/genes.gtf
ref_fasta=/zzh_gpfs02/jixing/Annotation/mm10/fa/genome.fa
star_index=${project_dir}/processed_data/star_index

#tools=================================================
RUN_STAR=/zzh_gpfs/home/zzhgroup/jixing/miniconda2/bin/STAR
SAMTOOLS=/zzh_gpfs/home/zzhgroup/jixing/workspace/rnaseq/tools/samtools-1.1/samtools

P=8 #use 8 threads/cores
sjdbOverhang=149

mkdir ${project_dir}/processed_data/algn_star
cd ${project_dir}/processed_data/algn_star
for SAMPLE_ID in `cat ${project_dir}/sample_id.txt`
do
cat > ${SAMPLE_ID}.sh <<EOF
#!/bin/bash
mkdir ${project_dir}/processed_data/algn_star/${SAMPLE_ID}
cd ${project_dir}/processed_data/algn_star/${SAMPLE_ID}

${RUN_STAR}   --runMode alignReads --runThreadN $P --sjdbOverhang $sjdbOverhang \
              --sjdbGTFfile ${ref_gtf} \
              --genomeDir  ${star_index} \
              --outSAMstrandField intronMotif \
              --outFileNamePrefix ${SAMPLE_ID}. \
              --outSAMtype BAM SortedByCoordinate \
              --outFilterScoreMinOverLread 0.33 --outFilterMatchNminOverLread 0.33 \
              --readFilesIn ${fastq_clean}/${SAMPLE_ID}_R1.fastq.gz  ${fastq_clean}/${SAMPLE_ID}_R2.fastq.gz \
              --readFilesCommand zcat

${SAMTOOLS} sort ${SAMPLE_ID}.Aligned.sortedByCoord.out.bam ${SAMPLE_ID}.star.sorted
${SAMTOOLS} index ${SAMPLE_ID}.star.sorted.bam
EOF
bsub < ${SAMPLE_ID}.sh
rm ${SAMPLE_ID}.sh
done

Expression Count wit htseq-count

#!/bin/bash
project_dir=/zzh_gpfs02/jixing/Project/leishjie_Hcy
ref_gtf=/zzh_gpfs02/jixing/Annotation/mm10/idx_hisat2/genes.gtf
ref_fasta=/zzh_gpfs02/jixing/Annotation/mm10/fa/genome.fa

fastq_clean=${project_dir}/processed_data/fastq_processed
bam_dir=${project_dir}/processed_data/algn_star

#tools========
#HTSeq version 0.6.0.
run_htseqCount=/zzh_gpfs/home/zzhgroup/jixing/anaconda3/bin/htseq-count

P=8 #use 8 threads/cores

mkdir ${project_dir}/processed_data/count_htseq_star
cd ${project_dir}/processed_data/count_htseq_star

for SAMPLE_ID in `cat ${project_dir}/sample_id.txt`
do
cat > ${SAMPLE_ID}.sh <<EOF
#!/bin/bash
$run_htseqCount \
    --format bam --order pos \
    --mode intersection-strict \
    --stranded reverse \
    --minaqual 1 --type exon --idattr gene_id \
    ${bam_dir}/${SAMPLE_ID}/${SAMPLE_ID}.star.sorted.bam ${ref_gtf} > ${SAMPLE_ID}_gene.tsv
EOF
bsub < ${SAMPLE_ID}.sh
rm ${SAMPLE_ID}.sh
done

# -r pos sort.bam base position
# -r name sorted.bam base read name
# a new advice : use a name sorted bamfile for htseq

Differential Expression Analysis Use DESeq2

Count matrix input

library(tidyverse)
#第一列是公司测序ID 第二列是实验室ID
library(readxl)
metadata <- read_excel("./NFYKD_dataOutput.20171113.20171117.xlsx")
colnames(metadata)[1:2] <- c("company_ID","Lab_ID")
library(readr)
countmatrix <- read_csv("star_count_merged.csv")
colnames(countmatrix)[2:dim(countmatrix)[2]] <- 
  colnames(countmatrix)[2:dim(countmatrix)[2]] %>% str_sub(1,nchar(metadata$company_ID[1])) 

colnames(countmatrix)[2:dim(countmatrix)[2]] <- 
  metadata %>% arrange(company_ID) %>% dplyr::select(Lab_ID) %>% .[[1]]

rownames(countmatrix) <- countmatrix$gene_name
countmatrix <- dplyr::select(countmatrix,starts_with("I"), -Ic673, -Ic684)
colnames(countmatrix)

Experiment design matrix

ExpDesign <- data.frame(row.names = colnames(countmatrix), 
                        name =  colnames(countmatrix),
                        hcy = c(rep("untrt",3), rep("trt",5)))
dds <- DESeq2::DESeqDataSetFromMatrix(countData = countmatrix, colData = ExpDesign, design = ~hcy)

Exploratory analysis and visualization

Pre-filtering the dataset

nrow(dds)
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]# remove no-expression gene
nrow(dds)

The rlog and variance stabilizing transformations

library("dplyr")
library("ggplot2")
library(DESeq2)
rld <- rlog(dds, blind = FALSE)
vsd <- vst(dds, blind = FALSE)
dds <- estimateSizeFactors(dds)

df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
  
colnames(df)[1:2] <- c("x", "y")  

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation)  

Sample distances

sampleDists <- dist(t(assay(rld)))
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(rld)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

PCA(Principal Components Analysis) plot

library(ggrepel)

plotPCA(rld, intgroup = c( "hcy"))
(pcaData <- plotPCA(rld, intgroup = c( "hcy"), returnData = TRUE))

percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = hcy)) +
  geom_point(size =3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed()+
  theme_bw() +
  ggrepel::geom_label_repel(aes(label = name), data = pcaData)

DEG Analysis: Ischemia-H-Hcy VS Ischemia control

dds <- DESeq2::DESeq(dds)
library(dplyr)
#res <- results(dds, contrast=c("dex","trt","untrt"))
res.05 <- results(dds, contrast=c("hcy","trt","untrt"), alpha = 0.05)
summary(res.05)

Annotating and exporting results

library("AnnotationDbi")
library("org.Mm.eg.db")
columns(org.Mm.eg.db)
#tolower() toupper()
res.05$symbol <- mapIds(org.Mm.eg.db,
                        keys=row.names(res.05),
                        column="SYMBOL",
                        keytype="SYMBOL",
                        multiVals="first")
res.05$ensembl <- mapIds(org.Mm.eg.db,
                         keys=row.names(res.05),
                         column="ENSEMBL",
                         keytype="SYMBOL",
                         multiVals="first")
res.05$entrezid <- mapIds(org.Mm.eg.db,
                          keys=row.names(res.05),
                          column="ENTREZID",
                          keytype="SYMBOL",
                          multiVals="first")

make res.05 dataframe for html report

  • padj <= 0.05
  • log2FoldChange >= 1
library(dplyr)
dir.create("./table/DEG_report/Ih_vs_Ic",recursive = TRUE)
DEG <- as.data.frame(res.05)

DEG %<>% 
  dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>% 
  dplyr::mutate(FoldChange = 2^(log2FoldChange))  %>% 
  dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/Ih_vs_Ic/results_DEG.csv")

report res.05 with ReportingTools

library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>% 
          dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
                      reportDirectory="./table/DEG_report/Ih_vs_Ic")
publish(report, htmlRep)
url <- finish(htmlRep)

Enrichment Analysis: GO and KEGG

GO Biological Process

library(enrichR)
listEnrichrDbs()
DEG %>% #combine
  dplyr::filter(padj <= 0.05) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% as.character() %>% 
  enrichr(., databases = "GO_Biological_Process_2017")

GO Cellular Component

DEG %>% #combine
  dplyr::filter(padj <= 0.05) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% as.character() %>% 
  enrichr(., databases = "GO_Cellular_Component_2017")

GO Molecular Function

DEG %>% #combine
  dplyr::filter(padj <= 0.05) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% as.character() %>% 
  enrichr(., databases = "GO_Molecular_Function_2017")

KEGG

DEG %>% #combine
  dplyr::filter(padj <= 0.05) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% as.character() %>% 
  enrichr(., databases = "KEGG_2016")

Plotting some results

Counts plot

topGene <- rownames(res.05)[which.min(res.05$padj)]
plotCounts(dds, gene = topGene, intgroup=c("hcy"))
plotCounts(dds, gene = "Numb", intgroup=c("hcy"))
plotCounts(dds, gene = "Trp53", intgroup=c("hcy"))

MA-plot

res <- lfcShrink(dds, contrast=c("hcy","trt","untrt"), res=res.05)
plotMA(res, ylim = c(-5, 5))

histogram of the p values

hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
     col = "grey50", border = "white")

heatmap

pheatmap

library("genefilter")
topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat  <- assay(rld)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)
#anno <- as.data.frame(colData(rld)[, c("hcy")])
pheatmap(mat)

Session information

devtools::session_info()
## ─ Session info ──────────────────────────────────────────────────────────
##  setting  value                                      
##  version  R version 3.4.4 Patched (2018-03-19 r74624)
##  os       macOS High Sierra 10.13.6                  
##  system   x86_64, darwin15.6.0                       
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  ctype    en_US.UTF-8                                
##  tz       Asia/Shanghai                              
##  date     2018-11-21                                 
## 
## ─ Packages ──────────────────────────────────────────────────────────────
##  package     * version     date       lib
##  assertthat    0.2.0       2017-04-11 [1]
##  backports     1.1.2       2017-12-13 [1]
##  base64enc     0.1-3       2015-07-28 [1]
##  callr         3.0.0       2018-08-24 [1]
##  cli           1.0.1       2018-09-25 [1]
##  crayon        1.3.4       2017-09-16 [1]
##  debugme       1.1.0       2017-10-22 [1]
##  desc          1.2.0       2018-05-01 [1]
##  devtools      2.0.1       2018-10-26 [1]
##  digest        0.6.18      2018-10-10 [1]
##  dplyr         0.7.99.9000 2018-10-13 [1]
##  evaluate      0.12        2018-10-09 [1]
##  fs            1.2.6       2018-08-23 [1]
##  glue          1.3.0       2018-08-23 [1]
##  htmldeps      0.1.1       2018-07-29 [1]
##  htmltools     0.3.6       2017-04-28 [1]
##  knitr         1.20.15     2018-10-08 [1]
##  magrittr      1.5         2014-11-22 [1]
##  memoise       1.1.0       2018-07-28 [1]
##  pillar        1.3.0       2018-07-14 [1]
##  pkgbuild      1.0.2       2018-10-16 [1]
##  pkgconfig     2.0.2       2018-08-16 [1]
##  pkgload       1.0.2       2018-10-29 [1]
##  prettyunits   1.0.2       2015-07-13 [1]
##  processx      3.2.0       2018-08-16 [1]
##  ps            1.2.1       2018-11-06 [1]
##  purrr         0.2.5       2018-05-29 [1]
##  R6            2.3.0       2018-10-04 [1]
##  Rcpp          1.0.0       2018-11-07 [1]
##  remotes       2.0.2       2018-10-30 [1]
##  rlang         0.3.0.1     2018-10-25 [1]
##  rmarkdown     1.10.13     2018-10-08 [1]
##  rprojroot     1.3-2       2018-01-03 [1]
##  rstudioapi    0.8         2018-10-02 [1]
##  sessioninfo   1.1.1       2018-11-05 [1]
##  stringi       1.2.4       2018-07-20 [1]
##  stringr       1.3.1       2018-05-10 [1]
##  testthat      2.0.1       2018-10-13 [1]
##  tibble        1.4.2       2018-01-22 [1]
##  tidyselect    0.2.5       2018-10-11 [1]
##  usethis       1.4.0       2018-08-14 [1]
##  withr         2.1.2       2018-07-28 [1]
##  xfun          0.4         2018-10-23 [1]
##  yaml          2.2.0       2018-07-25 [1]
##  source                            
##  CRAN (R 3.4.0)                    
##  CRAN (R 3.4.3)                    
##  CRAN (R 3.4.0)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.1)                    
##  CRAN (R 3.4.2)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  Github (tidyverse/dplyr@1573f60)  
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  Github (tidyverse/glue@4e74901)   
##  Github (rstudio/htmldeps@c1023e0) 
##  CRAN (R 3.4.0)                    
##  Github (yihui/knitr@3d6a6c3)      
##  CRAN (R 3.4.0)                    
##  Github (hadley/memoise@06d16ec)   
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.0)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  Github (rstudio/rmarkdown@6931cdb)
##  CRAN (R 3.4.3)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.3)                    
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
##  Github (jimhester/withr@fe56f20)  
##  CRAN (R 3.4.4)                    
##  CRAN (R 3.4.4)                    
## 
## [1] /Library/Frameworks/R.framework/Versions/3.4/Resources/library