$ ftp ftp.ensembl.org $ ftp> cd pub/release-88/fasta/homo_sapiens/cdna/ $mget Homo_sapiens.GRCh38.cdna.all.fa.gz
$ kallisto index -i hg38.cdna.kallisto.idx Homo_sapiens.GRCh38.cdna.all.fa.gz
Processing messages: [build] loading fasta file Homo_sapiens.GRCh38.cdna.all.fa.gz [build] k-mer length: 31 [build] warning: clipped off poly-A tail (longer than 10) from 1421 target sequences [build] warning: replaced 3 non-ACGUT characters in the input sequence with pseudorandom nucleotides [build] counting k-mers … done. [build] building target de Bruijn graph … done [build] creating equivalence classes … done [build] target de Bruijn graph has 1077577 contigs and contains 106161285 k-mers
$ kallisto quant -i ../ensemble_hg38/hg38.cdna.kallisto.idx -o output –single -l 100 -s 20 V1_R1.fastq
[quant] fragment length distribution is truncated gaussian with mean = 100, sd = 20 [index] k-mer length: 31 [index] number of targets: 179,973 [index] number of k-mers: 106,161,285 [index] number of equivalence classes: 718,281 [quant] running in single-end mode [quant] will process file 1: V1_R1.fastq [quant] finding pseudoalignments for the reads … done [quant] processed 29,894,821 reads, 26,489,621 reads pseudoaligned [ em] quantifying the abundances … done [ em] the Expectation-Maximization algorithm ran for 1,241 rounds
results: |– kallisto | |– D1_R1 | | |– abundance.h5 | | |– abundance.tsv | | -- run_info.json | |-- D2_R1 | | |-- abundance.h5 | | |-- abundance.tsv | |
– run_info.json | |– D3_R1 | | |– abundance.h5 | | |– abundance.tsv | | -- run_info.json | |-- V1_R1 | | |-- abundance.h5 | | |-- abundance.tsv | |
– run_info.json | |– V2_R1 | | |– abundance.h5 | | |– abundance.tsv | | -- run_info.json | |-- V3_R1 | | |-- abundance.h5 | | |-- abundance.tsv | |
– run_info.json
First load up sleuth
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# source("http://bioconductor.org/biocLite.R")
# biocLite("rhdf5")
# install.packages("devtools")
# devtools::install_github("pachterlab/sleuth")
library("sleuth")
Next we will construct a dataframe that contains metadata describing the experiment.
base_dir <- "~/Documents/MOLB7621/final_project/"
# get sample ids
sample_id <- dir(file.path(base_dir, "kallisto"))
# get full paths to kallisto directories
paths <- dir(file.path(base_dir, "kallisto"), full.names = T)
# specify the sample type
conditions <- c(rep("kd", 3), rep("control", 3))
# put it all in a dataframe
meta_data <- data_frame(sample = sample_id,
condition = conditions,
path = paths)
meta_data
## # A tibble: 6 × 3
## sample condition
## <chr> <chr>
## 1 D1_R1 kd
## 2 D2_R1 kd
## 3 D3_R1 kd
## 4 V1_R1 control
## 5 V2_R1 control
## 6 V3_R1 control
## # ... with 1 more variables: path <chr>
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
## recent version of R; see http://bioconductor.org/install
biocLite("biomaRt")
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
## Installing package(s) 'biomaRt'
##
## The downloaded binary packages are in
## /var/folders/_8/6rwft5_j0b33b0zv_wh7g18m0000gn/T//RtmpqUE2xr/downloaded_packages
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = 'ensembl.org')
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
"external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
# generate sleuth object, supply formula for model, and convert ens to gene names
so <- sleuth_prep(meta_data, ~ condition, target_mapping = t2g)
## reading in kallisto results
## ......
##
## Warning in check_target_mapping(tmp_names, target_mapping): intersection
## between target_id from kallisto runs and the target_mapping is empty.
## attempted to fix problem by removing .N from target_id, then merging back
## into target_mapping. please check obj$target_mapping to ensure this new
## mapping is correct.
## normalizing est_counts
## 45999 targets passed the filter
## normalizing tpm
## merging in metadata
## normalizing bootstrap samples
## summarizing bootstraps
# sleuth objs are essentially lists that contain other R objects
# to get a decription
#?sleuth_prep
# To access each object
# so$full_model
so <- sleuth_fit(so)
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas
so <- sleuth_fit(so, ~1, 'reduced')
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas
so <- sleuth_lrt(so, 'reduced', 'full')
res <- sleuth_results(so, "reduced:full", test_type = "lrt")
res <- as_data_frame(res)
res %>% write.csv("sleuth_analysis_results.csv")
# sleuth_live(so)
```