FTP download the hg38 cdna fasta file

ftp ftp.ensembl.org 
ftp> cd pub/release-88/fasta/homo_sapiens/cdna/
mget Homo_sapiens.GRCh38.cdna.all.fa.gz

Kallisto index the hg38.cdna.fasta file

kallisto index -i hg38.cdna.kallisto.idx Homo_sapiens.GRCh38.cdna.all.fa.gz 

Processing messages: [build] loading fasta file Homo_sapiens.GRCh38.cdna.all.fa.gz [build] k-mer length: 31 [build] warning: clipped off poly-A tail (longer than 10) from 1421 target sequences [build] warning: replaced 3 non-ACGUT characters in the input sequence with pseudorandom nucleotides [build] counting k-mers … done. [build] building target de Bruijn graph … done [build] creating equivalence classes … done [build] target de Bruijn graph has 1077577 contigs and contains 106161285 k-mers

Psuedo-align and count reads

kallisto quant -i ../ensemble_hg38/hg38.cdna.kallisto.idx -o output --single -l 100 -s 20 V1_R1.fastq 

Processing messages: [quant] fragment length distribution is truncated gaussian with mean = 100, sd = 20 [index] k-mer length: 31 [index] number of targets: 179,973 [index] number of k-mers: 106,161,285 [index] number of equivalence classes: 718,281 [quant] running in single-end mode [quant] will process file 1: V1_R1.fastq [quant] finding pseudoalignments for the reads … done [quant] processed 29,894,821 reads, 26,489,621 reads pseudoaligned [ em] quantifying the abundances … done [ em] the Expectation-Maximization algorithm ran for 1,241 rounds

Check newly generated kallisto files

Caption for the picture.

Caption for the picture.

Basic Differential Expression analysis with Sleuth

1. load up sleuth

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# source("http://bioconductor.org/biocLite.R")
# biocLite("rhdf5")
# install.packages("devtools")
# devtools::install_github("pachterlab/sleuth")
library("sleuth")

2. Construct a dataframe that contains metadata describing the experiment.

base_dir <- "~/Documents/MOLB7621/final_project/"
# get sample ids
sample_id <- dir(file.path(base_dir, "kallisto"))
# get full paths to kallisto directories
paths <- dir(file.path(base_dir, "kallisto"), full.names = T)
# specify the sample type
conditions <- c(rep("shRNA", 3), rep("control", 3))
# put it all in a dataframe
meta_data <- data_frame(sample = sample_id, 
               condition = conditions,
               path = paths)
meta_data
## # A tibble: 6 × 3
##   sample condition
##    <chr>     <chr>
## 1  D1_R1     shRNA
## 2  D2_R1     shRNA
## 3  D3_R1     shRNA
## 4  V1_R1   control
## 5  V2_R1   control
## 6  V3_R1   control
## # ... with 1 more variables: path <chr>

3. Generate a t2g file containing info to map gene symbols to ensemble transcript IDs

source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
##   recent version of R; see http://bioconductor.org/install
# biocLite("biomaRt")
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = 'ensembl.org')

t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id","external_gene_name"), mart = mart)

t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
  ens_gene = ensembl_gene_id, ext_gene = external_gene_name)

t2g %>% head()
##         target_id        ens_gene  ext_gene
## 1 ENST00000583496 ENSG00000264452     snoZ6
## 2 ENST00000620853 ENSG00000278324    PVT1_3
## 3 ENST00000636749 ENSG00000283502        U6
## 4 ENST00000476140 ENSG00000241226 RN7SL836P
## 5 ENST00000516795 ENSG00000252604  RNU2-44P
## 6 ENST00000616110 ENSG00000274494   MIR6832

4. Generate Sleuth object and convert ens to gene names

so <- sleuth_prep(meta_data, ~ condition, target_mapping = t2g)
## reading in kallisto results
## ......
## 
## Warning in check_target_mapping(tmp_names, target_mapping): intersection
## between target_id from kallisto runs and the target_mapping is empty.
## attempted to fix problem by removing .N from target_id, then merging back
## into target_mapping. please check obj$target_mapping to ensure this new
## mapping is correct.
## normalizing est_counts
## 45999 targets passed the filter
## normalizing tpm
## merging in metadata
## normalizing bootstrap samples
## summarizing bootstraps
# sleuth objs are essentially lists that contain other R objects
# to get a decription
#?sleuth_prep
# To access each object
# so$full_model

5. Fit full and reduced models in sleuth

so <- sleuth_fit(so)
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas
so <- sleuth_fit(so, ~1, 'reduced')
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas

6. Perform sleuth stats

so <- sleuth_lrt(so, 'reduced', 'full')

7. Gnerate sleuth results and writes it as a csv file

res <- sleuth_results(so, "reduced:full", test_type = "lrt")
res <- as_data_frame(res)
res %>% write.csv("sleuth_analysis_results.csv")

8. Visulize sleuth results

# sleuth_live(so)

8.1. Knock down efficiency

Caption for the picture.

Caption for the picture.

8.2. Condition_density_plot

Caption for the picture.

Caption for the picture.

8.3. representative scatter_plot (V1 vs D1)

Caption for the picture.

Caption for the picture.

8.4. pca_plot

Caption for the picture.

Caption for the picture.

Identify DEGs

1. Make a file that contains all est_counts together. Source code from: http://achri.blogspot.com/2017/03/quick-and-dirty-sample-sex-swap-sanity.html

# first, make a file head
perl -e 'print "target_id\t",join("\t",map {/(.*)\//;$1} @ARGV),"\n";' kallisto/*/abundance.tsv > all_trinity_abundance.tsv

# then, extract est_counts from eachi invidivual file and append them to the all_trinity_abundance file
paste kallisto/*/abundance.tsv | perl -ane 'print $F[0];for (1..$#F){print "\t$F[$_]" if /[49]$/}print "\n"' | tail -n +2 >> all_trinity_abundance.tsv

3. read in all_trinity_abundance.tsv file and remove low reads

all <- read_tsv("all_trinity_abundance.tsv") %>% # read in file
    mutate(sum = rowSums(.[2:7])) %>% # generate an additional column containing sum of the reads per row
    filter(sum >= 6) %>% # filter out low read genes
    select(-sum) # remove sum column
## Parsed with column specification:
## cols(
##   target_id = col_character(),
##   `kallisto/D1_R1` = col_double(),
##   `kallisto/D2_R1` = col_double(),
##   `kallisto/D3_R1` = col_double(),
##   `kallisto/V1_R1` = col_double(),
##   `kallisto/V2_R1` = col_double(),
##   `kallisto/V3_R1` = col_double()
## )
all
## # A tibble: 28,870 × 7
##            target_id `kallisto/D1_R1` `kallisto/D2_R1` `kallisto/D3_R1`
##                <chr>            <dbl>            <dbl>            <dbl>
## 1  ENST00000390343.2         1.561680          1.56950         2.238590
## 2  ENST00000390344.2         2.891130          3.37049         4.655110
## 3  ENST00000390477.2         1.602830          1.35651         2.143060
## 4  ENST00000611116.1        23.475700         20.17130        23.390400
## 5  ENST00000453673.3         1.237500          0.00000         0.180674
## 6  ENST00000390289.2         0.935487          2.06837         0.491691
## 7  ENST00000390514.1         6.243490          0.00000         0.000000
## 8  ENST00000390488.1         0.000000          6.93646         0.000000
## 9  ENST00000390473.1         0.000000          0.00000         0.000000
## 10 ENST00000390476.1         0.000000          0.00000         0.000000
## # ... with 28,860 more rows, and 3 more variables: `kallisto/V1_R1` <dbl>,
## #   `kallisto/V2_R1` <dbl>, `kallisto/V3_R1` <dbl>

4. select DEGs with qval < 0.001

res %>% 
  filter(qval < 0.001) -> degs
degs
## # A tibble: 468 × 15
##            target_id test_stat         pval         qval       rss
##                <chr>     <dbl>        <dbl>        <dbl>     <dbl>
## 1  ENST00000361050.3  36.77675 1.324607e-09 6.092000e-05 12.829649
## 2  ENST00000369409.8  35.04435 3.222801e-09 7.307899e-05 10.617438
## 3  ENST00000290551.4  34.28231 4.766954e-09 7.307899e-05  7.658977
## 4  ENST00000262878.4  32.61531 1.123264e-08 1.033200e-04  5.178877
## 5  ENST00000423485.5  32.73581 1.055742e-08 1.033200e-04  7.311694
## 6  ENST00000296370.3  31.15736 2.379357e-08 1.229476e-04 11.856147
## 7  ENST00000523944.5  30.90997 2.702795e-08 1.229476e-04  5.090015
## 8  ENST00000303115.7  30.27845 3.742617e-08 1.229476e-04 20.623211
## 9  ENST00000264156.2  31.34731 2.157573e-08 1.229476e-04  4.194092
## 10 ENST00000376588.3  31.05478 2.508476e-08 1.229476e-04  8.324726
## # ... with 458 more rows, and 10 more variables: sigma_sq <dbl>,
## #   tech_var <dbl>, mean_obs <dbl>, var_obs <dbl>, sigma_sq_pmax <dbl>,
## #   smooth_sigma_sq <dbl>, final_sigma_sq <dbl>, degrees_free <int>,
## #   ens_gene <chr>, ext_gene <chr>

5. left_join the est_counts to the degs list

degs %>%
  left_join(all) -> degs_counts
## Joining, by = "target_id"
degs_counts %>%
  write.csv("degs_and_est_counts.csv")

degs_counts
## # A tibble: 468 × 21
##            target_id test_stat         pval         qval       rss
##                <chr>     <dbl>        <dbl>        <dbl>     <dbl>
## 1  ENST00000361050.3  36.77675 1.324607e-09 6.092000e-05 12.829649
## 2  ENST00000369409.8  35.04435 3.222801e-09 7.307899e-05 10.617438
## 3  ENST00000290551.4  34.28231 4.766954e-09 7.307899e-05  7.658977
## 4  ENST00000262878.4  32.61531 1.123264e-08 1.033200e-04  5.178877
## 5  ENST00000423485.5  32.73581 1.055742e-08 1.033200e-04  7.311694
## 6  ENST00000296370.3  31.15736 2.379357e-08 1.229476e-04 11.856147
## 7  ENST00000523944.5  30.90997 2.702795e-08 1.229476e-04  5.090015
## 8  ENST00000303115.7  30.27845 3.742617e-08 1.229476e-04 20.623211
## 9  ENST00000264156.2  31.34731 2.157573e-08 1.229476e-04  4.194092
## 10 ENST00000376588.3  31.05478 2.508476e-08 1.229476e-04  8.324726
## # ... with 458 more rows, and 16 more variables: sigma_sq <dbl>,
## #   tech_var <dbl>, mean_obs <dbl>, var_obs <dbl>, sigma_sq_pmax <dbl>,
## #   smooth_sigma_sq <dbl>, final_sigma_sq <dbl>, degrees_free <int>,
## #   ens_gene <chr>, ext_gene <chr>, `kallisto/D1_R1` <dbl>,
## #   `kallisto/D2_R1` <dbl>, `kallisto/D3_R1` <dbl>,
## #   `kallisto/V1_R1` <dbl>, `kallisto/V2_R1` <dbl>, `kallisto/V3_R1` <dbl>