ftp ftp.ensembl.org
ftp> cd pub/release-88/fasta/homo_sapiens/cdna/
mget Homo_sapiens.GRCh38.cdna.all.fa.gz
kallisto index -i hg38.cdna.kallisto.idx Homo_sapiens.GRCh38.cdna.all.fa.gz
Processing messages: [build] loading fasta file Homo_sapiens.GRCh38.cdna.all.fa.gz [build] k-mer length: 31 [build] warning: clipped off poly-A tail (longer than 10) from 1421 target sequences [build] warning: replaced 3 non-ACGUT characters in the input sequence with pseudorandom nucleotides [build] counting k-mers … done. [build] building target de Bruijn graph … done [build] creating equivalence classes … done [build] target de Bruijn graph has 1077577 contigs and contains 106161285 k-mers
kallisto quant -i ../ensemble_hg38/hg38.cdna.kallisto.idx -o output --single -l 100 -s 20 V1_R1.fastq
Processing messages: [quant] fragment length distribution is truncated gaussian with mean = 100, sd = 20 [index] k-mer length: 31 [index] number of targets: 179,973 [index] number of k-mers: 106,161,285 [index] number of equivalence classes: 718,281 [quant] running in single-end mode [quant] will process file 1: V1_R1.fastq [quant] finding pseudoalignments for the reads … done [quant] processed 29,894,821 reads, 26,489,621 reads pseudoaligned [ em] quantifying the abundances … done [ em] the Expectation-Maximization algorithm ran for 1,241 rounds
Caption for the picture.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# source("http://bioconductor.org/biocLite.R")
# biocLite("rhdf5")
# install.packages("devtools")
# devtools::install_github("pachterlab/sleuth")
library("sleuth")
base_dir <- "~/Documents/MOLB7621/final_project/"
# get sample ids
sample_id <- dir(file.path(base_dir, "kallisto"))
# get full paths to kallisto directories
paths <- dir(file.path(base_dir, "kallisto"), full.names = T)
# specify the sample type
conditions <- c(rep("shRNA", 3), rep("control", 3))
# put it all in a dataframe
meta_data <- data_frame(sample = sample_id,
condition = conditions,
path = paths)
meta_data
## # A tibble: 6 × 3
## sample condition
## <chr> <chr>
## 1 D1_R1 shRNA
## 2 D2_R1 shRNA
## 3 D3_R1 shRNA
## 4 V1_R1 control
## 5 V2_R1 control
## 6 V3_R1 control
## # ... with 1 more variables: path <chr>
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
## recent version of R; see http://bioconductor.org/install
# biocLite("biomaRt")
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = 'ensembl.org')
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id","external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
t2g %>% head()
## target_id ens_gene ext_gene
## 1 ENST00000583496 ENSG00000264452 snoZ6
## 2 ENST00000620853 ENSG00000278324 PVT1_3
## 3 ENST00000636749 ENSG00000283502 U6
## 4 ENST00000476140 ENSG00000241226 RN7SL836P
## 5 ENST00000516795 ENSG00000252604 RNU2-44P
## 6 ENST00000616110 ENSG00000274494 MIR6832
so <- sleuth_prep(meta_data, ~ condition, target_mapping = t2g)
## reading in kallisto results
## ......
##
## Warning in check_target_mapping(tmp_names, target_mapping): intersection
## between target_id from kallisto runs and the target_mapping is empty.
## attempted to fix problem by removing .N from target_id, then merging back
## into target_mapping. please check obj$target_mapping to ensure this new
## mapping is correct.
## normalizing est_counts
## 45999 targets passed the filter
## normalizing tpm
## merging in metadata
## normalizing bootstrap samples
## summarizing bootstraps
# sleuth objs are essentially lists that contain other R objects
# to get a decription
#?sleuth_prep
# To access each object
# so$full_model
so <- sleuth_fit(so)
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas
so <- sleuth_fit(so, ~1, 'reduced')
## fitting measurement error models
## shrinkage estimation
## Adding missing grouping variables: `x_group`
## computing variance of betas
so <- sleuth_lrt(so, 'reduced', 'full')
res <- sleuth_results(so, "reduced:full", test_type = "lrt")
res <- as_data_frame(res)
res %>% write.csv("sleuth_analysis_results.csv")
# sleuth_live(so)
Caption for the picture.
Caption for the picture.
Caption for the picture.
Caption for the picture.
# first, make a file head
perl -e 'print "target_id\t",join("\t",map {/(.*)\//;$1} @ARGV),"\n";' kallisto/*/abundance.tsv > all_trinity_abundance.tsv
# then, extract est_counts from eachi invidivual file and append them to the all_trinity_abundance file
paste kallisto/*/abundance.tsv | perl -ane 'print $F[0];for (1..$#F){print "\t$F[$_]" if /[49]$/}print "\n"' | tail -n +2 >> all_trinity_abundance.tsv
all <- read_tsv("all_trinity_abundance.tsv") %>% # read in file
mutate(sum = rowSums(.[2:7])) %>% # generate an additional column containing sum of the reads per row
filter(sum >= 6) %>% # filter out low read genes
select(-sum) # remove sum column
## Parsed with column specification:
## cols(
## target_id = col_character(),
## `kallisto/D1_R1` = col_double(),
## `kallisto/D2_R1` = col_double(),
## `kallisto/D3_R1` = col_double(),
## `kallisto/V1_R1` = col_double(),
## `kallisto/V2_R1` = col_double(),
## `kallisto/V3_R1` = col_double()
## )
all
## # A tibble: 28,870 × 7
## target_id `kallisto/D1_R1` `kallisto/D2_R1` `kallisto/D3_R1`
## <chr> <dbl> <dbl> <dbl>
## 1 ENST00000390343.2 1.561680 1.56950 2.238590
## 2 ENST00000390344.2 2.891130 3.37049 4.655110
## 3 ENST00000390477.2 1.602830 1.35651 2.143060
## 4 ENST00000611116.1 23.475700 20.17130 23.390400
## 5 ENST00000453673.3 1.237500 0.00000 0.180674
## 6 ENST00000390289.2 0.935487 2.06837 0.491691
## 7 ENST00000390514.1 6.243490 0.00000 0.000000
## 8 ENST00000390488.1 0.000000 6.93646 0.000000
## 9 ENST00000390473.1 0.000000 0.00000 0.000000
## 10 ENST00000390476.1 0.000000 0.00000 0.000000
## # ... with 28,860 more rows, and 3 more variables: `kallisto/V1_R1` <dbl>,
## # `kallisto/V2_R1` <dbl>, `kallisto/V3_R1` <dbl>
res %>%
filter(qval < 0.001) -> degs
degs
## # A tibble: 468 × 15
## target_id test_stat pval qval rss
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000361050.3 36.77675 1.324607e-09 6.092000e-05 12.829649
## 2 ENST00000369409.8 35.04435 3.222801e-09 7.307899e-05 10.617438
## 3 ENST00000290551.4 34.28231 4.766954e-09 7.307899e-05 7.658977
## 4 ENST00000262878.4 32.61531 1.123264e-08 1.033200e-04 5.178877
## 5 ENST00000423485.5 32.73581 1.055742e-08 1.033200e-04 7.311694
## 6 ENST00000296370.3 31.15736 2.379357e-08 1.229476e-04 11.856147
## 7 ENST00000523944.5 30.90997 2.702795e-08 1.229476e-04 5.090015
## 8 ENST00000303115.7 30.27845 3.742617e-08 1.229476e-04 20.623211
## 9 ENST00000264156.2 31.34731 2.157573e-08 1.229476e-04 4.194092
## 10 ENST00000376588.3 31.05478 2.508476e-08 1.229476e-04 8.324726
## # ... with 458 more rows, and 10 more variables: sigma_sq <dbl>,
## # tech_var <dbl>, mean_obs <dbl>, var_obs <dbl>, sigma_sq_pmax <dbl>,
## # smooth_sigma_sq <dbl>, final_sigma_sq <dbl>, degrees_free <int>,
## # ens_gene <chr>, ext_gene <chr>
degs %>%
left_join(all) -> degs_counts
## Joining, by = "target_id"
degs_counts %>%
write.csv("degs_and_est_counts.csv")
degs_counts
## # A tibble: 468 × 21
## target_id test_stat pval qval rss
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000361050.3 36.77675 1.324607e-09 6.092000e-05 12.829649
## 2 ENST00000369409.8 35.04435 3.222801e-09 7.307899e-05 10.617438
## 3 ENST00000290551.4 34.28231 4.766954e-09 7.307899e-05 7.658977
## 4 ENST00000262878.4 32.61531 1.123264e-08 1.033200e-04 5.178877
## 5 ENST00000423485.5 32.73581 1.055742e-08 1.033200e-04 7.311694
## 6 ENST00000296370.3 31.15736 2.379357e-08 1.229476e-04 11.856147
## 7 ENST00000523944.5 30.90997 2.702795e-08 1.229476e-04 5.090015
## 8 ENST00000303115.7 30.27845 3.742617e-08 1.229476e-04 20.623211
## 9 ENST00000264156.2 31.34731 2.157573e-08 1.229476e-04 4.194092
## 10 ENST00000376588.3 31.05478 2.508476e-08 1.229476e-04 8.324726
## # ... with 458 more rows, and 16 more variables: sigma_sq <dbl>,
## # tech_var <dbl>, mean_obs <dbl>, var_obs <dbl>, sigma_sq_pmax <dbl>,
## # smooth_sigma_sq <dbl>, final_sigma_sq <dbl>, degrees_free <int>,
## # ens_gene <chr>, ext_gene <chr>, `kallisto/D1_R1` <dbl>,
## # `kallisto/D2_R1` <dbl>, `kallisto/D3_R1` <dbl>,
## # `kallisto/V1_R1` <dbl>, `kallisto/V2_R1` <dbl>, `kallisto/V3_R1` <dbl>