library(sleuth)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
setwd("/Users/nuriteliash/OneDrive - OIST/Repos/varroa-virus-networks")

# The first step in a sleuth analysis is to specify where the kallisto results are stored. A variable is created for this purpose with
sample_id <- list.dirs(path = "sleuth/kallistoAll", full.names = FALSE) 
#sample_id <- sub("(.*).tsv.gz","\\1", sample_id)


#original code. אני לא בטוחה מה ההבדל בין הפקודות. אבל הפקודה המקורית מחזירה וקטור ריק.  
#sample_id <- dir(file.path("..", "results"))


# A list of paths to the kallisto results indexed by the sample IDs is collated with
kal_dirs <- file.path("sleuth/kallistoAll", sample_id)
#remove the first element (i dont know why, but the first element is an emtpy folder)
kal_dirs <- kal_dirs[-1]

#original code:
#kal_dirs <- file.path("..", "results", sample_id, "kallisto")

# The next step is to load an auxillary table that describes the experimental design and the relationship between the kallisto directories and the samples:
# read the table with the libraries description:
ModTraitFac_71 <- read.csv("/Users/nuriteliash/OneDrive - OIST/Repos/varroa-virus-networks/module_trait_factors_71.csv", stringsAsFactors=TRUE) 

# load the viruses load by library, for all 23 viruses
lnames <- load(file = "viruses_load.RData"); 

# make a data frame of the viral loads of DWVa and VDV2 to each library (the explanatory variable)
exp_var <- viruses_load %>%
    filter(description %in% c("DWVa", "VDV2")) %>%
    column_to_rownames("description") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("Library")
  

# join the two tables (rename the library as "sample" to match the next steps)
# **for DWVa**:
s2c <- left_join(exp_var, ModTraitFac_71 ,by = "Library") %>%
    dplyr::select(sample = Library, condition = DWVa)
## Warning: Column `Library` joining character vector and factor, coercing into
## character vector
# *there is a problem joining 4 libs, their factors doesnt join. i dont know why*


# Now the directories must be appended in a new column to the table describing the experiment. This column must be labeled path, otherwise sleuth will report an error. This is to ensure that samples can be associated with kallisto quantifications.
s2c <- dplyr::mutate(s2c, path = kal_dirs)

#It is important to check that the pairings are correct:
print(s2c)
##        sample   condition                          path
## 1  SRR3632582 2.03929e-01 sleuth/kallistoAll/SRR3632582
## 2  SRR3633003 1.31147e+01 sleuth/kallistoAll/SRR3633003
## 3  SRR3634700 2.14957e-01 sleuth/kallistoAll/SRR3634700
## 4  SRR3634772 7.15612e+01 sleuth/kallistoAll/SRR3634772
## 5  SRR3634929 2.11970e+00 sleuth/kallistoAll/SRR3634929
## 6  SRR3634942 9.10705e-03 sleuth/kallistoAll/SRR3634942
## 7  SRR3635001 2.36305e-01 sleuth/kallistoAll/SRR3635001
## 8  SRR3635050 0.00000e+00 sleuth/kallistoAll/SRR3635050
## 9  SRR3635105 0.00000e+00 sleuth/kallistoAll/SRR3635105
## 10 SRR3927486 3.27937e+05 sleuth/kallistoAll/SRR3927486
## 11 SRR3927496 1.00938e+05 sleuth/kallistoAll/SRR3927496
## 12 SRR5109825 0.00000e+00 sleuth/kallistoAll/SRR5109825
## 13 SRR5109827 1.90782e+00 sleuth/kallistoAll/SRR5109827
## 14  SRR533974 8.62546e+04  sleuth/kallistoAll/SRR533974
## 15 SRR5377263 3.01273e+05 sleuth/kallistoAll/SRR5377263
## 16 SRR5377264 5.77487e+04 sleuth/kallistoAll/SRR5377264
## 17 SRR5377265 6.90621e+04 sleuth/kallistoAll/SRR5377265
## 18 SRR5377266 3.76915e+05 sleuth/kallistoAll/SRR5377266
## 19 SRR5377267 3.47962e+05 sleuth/kallistoAll/SRR5377267
## 20 SRR5377268 4.15723e+05 sleuth/kallistoAll/SRR5377268
## 21 SRR5377269 5.24063e+05 sleuth/kallistoAll/SRR5377269
## 22 SRR5377270 1.53606e+03 sleuth/kallistoAll/SRR5377270
## 23 SRR5760812 8.55280e+01 sleuth/kallistoAll/SRR5760812
## 24 SRR5760813 1.27648e+01 sleuth/kallistoAll/SRR5760813
## 25 SRR5760814 7.96905e+03 sleuth/kallistoAll/SRR5760814
## 26 SRR5760815 6.76260e+00 sleuth/kallistoAll/SRR5760815
## 27 SRR5760816 2.44386e+00 sleuth/kallistoAll/SRR5760816
## 28 SRR5760817 7.36708e+00 sleuth/kallistoAll/SRR5760817
## 29 SRR5760818 7.72868e+00 sleuth/kallistoAll/SRR5760818
## 30 SRR5760819 1.54966e+00 sleuth/kallistoAll/SRR5760819
## 31 SRR5760820 2.92096e+01 sleuth/kallistoAll/SRR5760820
## 32 SRR5760821 1.04083e+01 sleuth/kallistoAll/SRR5760821
## 33 SRR5760822 8.08372e-01 sleuth/kallistoAll/SRR5760822
## 34 SRR5760823 3.67167e+01 sleuth/kallistoAll/SRR5760823
## 35 SRR5760824 1.44482e-01 sleuth/kallistoAll/SRR5760824
## 36 SRR5760825 1.02654e+00 sleuth/kallistoAll/SRR5760825
## 37 SRR5760826 1.05556e+01 sleuth/kallistoAll/SRR5760826
## 38 SRR5760827 1.34207e-01 sleuth/kallistoAll/SRR5760827
## 39 SRR5760828 6.65275e-01 sleuth/kallistoAll/SRR5760828
## 40 SRR5760829 9.72406e-01 sleuth/kallistoAll/SRR5760829
## 41 SRR5760830 2.59728e+00 sleuth/kallistoAll/SRR5760830
## 42 SRR5760831 1.69175e-01 sleuth/kallistoAll/SRR5760831
## 43 SRR5760832 1.08245e+00 sleuth/kallistoAll/SRR5760832
## 44 SRR5760833 1.19564e-01 sleuth/kallistoAll/SRR5760833
## 45 SRR5760834 2.22716e+02 sleuth/kallistoAll/SRR5760834
## 46 SRR5760835 1.20884e+01 sleuth/kallistoAll/SRR5760835
## 47 SRR5760836 2.02158e-01 sleuth/kallistoAll/SRR5760836
## 48 SRR5760837 2.66654e-01 sleuth/kallistoAll/SRR5760837
## 49 SRR5760838 2.32044e+00 sleuth/kallistoAll/SRR5760838
## 50 SRR5760839 3.74878e+00 sleuth/kallistoAll/SRR5760839
## 51 SRR5760840 4.30509e+01 sleuth/kallistoAll/SRR5760840
## 52 SRR5760841 2.80647e+01 sleuth/kallistoAll/SRR5760841
## 53 SRR5760842 1.51384e+02 sleuth/kallistoAll/SRR5760842
## 54 SRR5760843 2.97921e+02 sleuth/kallistoAll/SRR5760843
## 55 SRR5760844 4.87463e+01 sleuth/kallistoAll/SRR5760844
## 56 SRR5760845 1.97376e+01 sleuth/kallistoAll/SRR5760845
## 57 SRR5760846 1.19626e+02 sleuth/kallistoAll/SRR5760846
## 58 SRR5760847 2.79756e+01 sleuth/kallistoAll/SRR5760847
## 59 SRR5760848 1.93277e+01 sleuth/kallistoAll/SRR5760848
## 60 SRR5760849 1.01137e+05 sleuth/kallistoAll/SRR5760849
## 61 SRR5760850 1.90209e+01 sleuth/kallistoAll/SRR5760850
## 62 SRR5760851 1.09937e+01 sleuth/kallistoAll/SRR5760851
## 63 SRR6823684 2.11700e+05 sleuth/kallistoAll/SRR6823684
## 64 SRR6823686 3.97361e+03 sleuth/kallistoAll/SRR6823686
## 65 SRR6824277 2.22181e+05 sleuth/kallistoAll/SRR6824277
## 66 SRR7339931 8.58536e+00 sleuth/kallistoAll/SRR7339931
## 67 SRR8100122 1.94596e+05 sleuth/kallistoAll/SRR8100122
## 68 SRR8100123 2.87988e+05 sleuth/kallistoAll/SRR8100123
## 69 SRR8100124 8.73502e+04 sleuth/kallistoAll/SRR8100124
## 70 SRR8864012 2.77030e+01 sleuth/kallistoAll/SRR8864012
## 71 SRR8867385 5.78789e+05 sleuth/kallistoAll/SRR8867385

Next, the “sleuth object” can be constructed. This object will store not only the information about the experiment, but also details of the model to be used for differential testing, and the results. It is prepared and used with four commands that (1) load the kallisto processed data into the object (2) estimate parameters for the sleuth response error measurement (full) model (3) estimate parameters for the sleuth reduced model, and (4) perform differential analysis (testing) using the likelihood ratio test. On a laptop the four steps should take about a few minutes altogether.

# The sleuth object must first be initialized with (this can take a few minutes)
so <- sleuth_prep(s2c, extra_bootstrap_summary = TRUE)
## Warning in check_num_cores(num_cores): It appears that you are running Sleuth from within Rstudio.
## Because of concerns with forking processes from a GUI, 'num_cores' is being set to 1.
## If you wish to take advantage of multiple cores, please consider running sleuth from the command line.
## reading in kallisto results
## dropping unused factor levels
## .......................................................................
## Warning in check_kal_pack(kal_list): More than one version of kallisto was used: 0.46.0
## 0.44.0
## normalizing est_counts
## 23725 targets passed the filter
## normalizing tpm
## merging in metadata
## summarizing bootstraps
## .................................................
## ......................
# somewhere in here i need to take out the viruses reads from the analyais. 
# load the "virusId" file with viruses accessions: 
#lnames <- load(file = "viruses.RData");

fit models

# Then the full model is fit with
so <- sleuth_fit(so, ~condition, 'full')
## fitting measurement error models
## shrinkage estimation
## 2 NA values were found during variance shrinkage estimation due to mean observation values outside of the range used for the LOESS fit.
## The LOESS fit will be repeated using exact computation of the fitted surface to extrapolate the missing values.
## These are the target ids with NA values: XM_022807559.1, XM_022789639.1
## computing variance of betas
# What this has accomplished is to “smooth” the raw kallisto abundance estimates for each sample using a linear model with a parameter that represents the experimental condition (in this case DWVa as a continouse explenatory variable). To test for transcripts that are differential expressed between the conditions, sleuth performs a second fit to a “reduced” model that presumes abundances are equal in the *two conditions*. To identify differential expressed transcripts sleuth will then identify transcripts with a significantly better fit with the “full” model.

# The “reduced” model is fit with
so <- sleuth_fit(so, ~1, 'reduced')
## fitting measurement error models
## shrinkage estimation
## 2 NA values were found during variance shrinkage estimation due to mean observation values outside of the range used for the LOESS fit.
## The LOESS fit will be repeated using exact computation of the fitted surface to extrapolate the missing values.
## These are the target ids with NA values: XM_022807559.1, XM_022789639.1
## computing variance of betas
# and the test is performed with
so <- sleuth_lrt(so, 'reduced', 'full')

# In general, sleuth can utilize the likelihood ratio test with any pair of models that are nested, and other walkthroughs illustrate the power of such a framework for accounting for batch effects and more complex experimental designs.
# The models that have been fit can always be examined with the models() function.
models(so)
## [  full  ]
## formula:  ~condition 
## data modeled:  obs_counts 
## transform sync'ed:  TRUE 
## coefficients:
##  (Intercept)
##      condition
## [  reduced  ]
## formula:  ~1 
## data modeled:  obs_counts 
## transform sync'ed:  TRUE 
## coefficients:
##  (Intercept)
# The results of the test can be examined with
sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE)
sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)
head(sleuth_significant, 20)
##                          target_id         pval         qval test_stat
## 1                      NC_004830.2 2.245856e-14 5.328293e-10  58.30401
## 2                   XR_002671571.1 8.268930e-14 9.809018e-10  55.74068
## 3  ENA|CEND01000001|CEND01000001.1 3.575113e-11 2.827319e-07  43.83362
## 4                   XM_022803779.1 3.119319e-10 1.850146e-06  39.59844
## 5                   XM_022805293.1 6.894261e-10 2.960675e-06  38.05034
## 6                   XR_002673434.1 7.487481e-10 2.960675e-06  37.88930
## 7                   XM_022795096.1 1.529143e-09 4.075927e-06  36.49684
## 8                   XM_022797387.1 1.546189e-09 4.075927e-06  36.47523
## 9                   XM_022810810.1 1.253857e-09 4.075927e-06  36.88376
## 10                  XM_022799285.1 2.696539e-09 6.397538e-06  35.39155
## 11                  XM_022800907.1 3.740455e-09 7.898836e-06  34.75433
## 12                  XM_022809515.1 3.995196e-09 7.898836e-06  34.62607
## 13                  XM_022816676.1 7.441463e-09 1.358067e-05  33.41586
## 14                  XM_022805100.1 9.304543e-09 1.471669e-05  32.98139
## 15                  XM_022813058.1 9.097150e-09 1.471669e-05  33.02521
## 16                  XM_022805674.1 1.084855e-08 1.608636e-05  32.68294
## 17                  XM_022789825.1 1.410679e-08 1.767856e-05  32.17258
## 18                  XM_022816735.1 1.415775e-08 1.767856e-05  32.16557
## 19                  XR_002671213.1 1.413868e-08 1.767856e-05  32.16819
## 20                  XM_022797444.1 1.660506e-08 1.950611e-05  31.85583
##          rss degrees_free mean_obs   var_obs    tech_var  sigma_sq
## 1  2224.9177            1 8.625955 31.784538 0.043148712 31.741389
## 2   502.5164            1 4.289255  7.178806 0.650368504  6.528438
## 3   748.8718            1 2.387008 10.698169 0.061981929 10.636187
## 4   484.6737            1 4.034053  6.923910 0.162505822  6.761404
## 5   107.7911            1 7.013951  1.539872 0.005112173  1.534760
## 6   329.1371            1 3.397208  4.701959 0.644515682  4.057443
## 7   416.7514            1 3.842505  5.953592 0.355351639  5.598240
## 8   261.6728            1 4.964515  3.738183 0.336493058  3.401690
## 9   298.8711            1 4.455266  4.269587 0.631902194  3.637685
## 10  350.0282            1 4.802161  5.000403 1.160330609  3.840073
## 11  140.8891            1 6.152407  2.012701 0.622560998  1.390140
## 12  440.8807            1 4.176413  6.298295 1.346505993  4.951789
## 13  400.1443            1 4.468576  5.716346 0.198639283  5.517707
## 14  513.8851            1 3.939343  7.341216 0.625115616  6.716100
## 15  404.8795            1 5.696151  5.783993 0.504974235  5.279019
## 16  451.4481            1 3.983778  6.449259 1.288164812  5.161094
## 17  334.5537            1 5.055394  4.779338 1.226286530  3.553052
## 18  696.4168            1 4.477313  9.948811 0.710442452  9.238369
## 19  136.9754            1 7.056152  1.956792 0.520292867  1.436499
## 20  259.6211            1 3.479995  3.708873 0.975604326  2.733268
##    smooth_sigma_sq final_sigma_sq
## 1        0.2162747      31.741389
## 2        2.1784882       6.528438
## 3        3.0967006      10.636187
## 4        2.3277214       6.761404
## 5        0.5777492       1.534760
## 6        2.7074251       4.057443
## 7        2.4376242       5.598240
## 8        1.7489570       3.401690
## 9        2.0757333       3.637685
## 10       1.8541661       3.840073
## 11       0.9690730       1.390140
## 12       2.2453993       4.951789
## 13       2.0672250       5.517707
## 14       2.3819557       6.716100
## 15       1.2566663       5.279019
## 16       2.3565197       5.161094
## 17       1.6891633       3.553052
## 18       2.0616378       9.238369
## 19       0.5631793       1.436499
## 20       2.6573992       2.733268
# The table shown above displays the top 20 significant genes with a (Benjamini-Hochberg multiple testing corrected) q-value <= 0.05.
plot_bootstrap(so, "NC_004830.2", units = "est_counts", color_by = "condition")