options(install.packages.check.source = "yes")
install.packages("openssl", dependencies = TRUE, quiet=TRUE )
install.packages("fs", dependencies = TRUE, quiet=TRUE )
install.packages("broom", dependencies = TRUE, quiet=TRUE )
install.packages("dbplyr", dependencies = TRUE, quiet=TRUE )
install.packages("dplyr", dependencies = TRUE, quiet=TRUE )
install.packages("dplyr", dependencies = TRUE, quiet=TRUE )
install.packages("haven", dependencies = TRUE, quiet=TRUE )
install.packages("httr", dependencies = TRUE, quiet=TRUE )
install.packages("modelr", dependencies = TRUE, quiet=TRUE )
install.packages("readr", dependencies = TRUE, quiet=TRUE )
install.packages("tidyverse", dependencies = TRUE, quiet=TRUE )
devtools::install_github("traversc/trqwe", dependencies = T,quiet=TRUE)

Background data (sampled from null distribution): sim_null

library(tidyverse)

#bg: Background.
replicates   = 3
bg_proteins  = 3000           #Background proteins. 
log2_mean_bg = 27             #Background mean.
log2_sd_bg   = 2              #Background standard deviation.

bg_reps_by_prot             <- rep((2*replicates), bg_proteins)
bg_all_3000_prots_by_6_reps <- rep(1:bg_proteins,bg_reps_by_prot)                   
bg_distrib_all_samples      <- rnorm(2*replicates*bg_proteins, mean = log2_mean_bg, sd = log2_sd_bg) 


sim_null <- data_frame(
  name  = paste0("bg_", bg_all_3000_prots_by_6_reps),
  ID    = bg_all_3000_prots_by_6_reps,
  var   = rep(c("control_1", "control_2", "control_3", "treatment_1","treatment_2","treatment_3"), bg_proteins), 
  val   = 2^bg_distrib_all_samples)

# Histogram overlaid with kernel density curve
ggplot(as.data.frame(bg_distrib_all_samples), aes(x=bg_distrib_all_samples)) +  geom_histogram(aes(y=..density..),      # Histogram with density instead of count on y-axis
                   binwidth=.5, colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666")  # Overlay with transparent density plot

ggplot(sim_null, aes(x=val)) +     geom_histogram(aes(y=..density..),      # Histogram with density instead of count on y-axis
                   binwidth=.5, colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666")  # Overlay with transparent density plot

Data for DE proteins (sampled from alternative distribution): sim_diff

DE_proteins            = 300
log2_mean_DE_control   = 25
log2_mean_DE_treatment = 30
log2_sd_DE             = 2

DE_reps_by_prot              <- rep(replicates,    DE_proteins)
DE_all_3000_prots_by_6_reps  <- rep(1:DE_proteins, DE_reps_by_prot)
DE_distrib_control_samples   <- rnorm(replicates*DE_proteins, mean = log2_mean_DE_control, sd = log2_sd_DE)
DE_distrib_treatment_samples <- rnorm(replicates*DE_proteins, mean = log2_mean_DE_treatment, sd = log2_sd_DE)

sim_diff <- rbind(
  data_frame(
    name = paste0("DE_", DE_all_3000_prots_by_6_reps),
    ID   = rep(  (bg_proteins+1):(bg_proteins+DE_proteins),  DE_reps_by_prot),
    var  = rep(c("control_1", "control_2", "control_3"), DE_proteins), 
    val  = 2^DE_distrib_control_samples),
  data_frame(
    name = paste0("DE_", DE_all_3000_prots_by_6_reps),
    ID = rep((bg_proteins+1):(bg_proteins+DE_proteins), DE_reps_by_prot),
    var = rep(c("treatment_1", "treatment_2", "treatment_3"), DE_proteins),
    val = 2^DE_distrib_treatment_samples))


rbind(sim_null, sim_diff)
# Combine null and DE data
#Funciones tradicionales
sim <- rbind(sim_null, sim_diff) %>% 
             spread(key = var, value = val) %>% 
             arrange(ID)
# Operación inversa
sim %>% gather(key = "var", value = "val", -name, -ID)
#################################################
# Con funciones nuevas:
sim <-  rbind(sim_null, sim_diff) %>% 
        pivot_wider(names_from = var, values_from = val)%>% 
        arrange(ID)
sim %>% pivot_longer(cols = !c(name,ID),  names_to = "var", values_to = "val")

Introduce missing values

missing at random (MAR) or missing not at random (MNAR). To mimick these two types of missing values, we introduce missing values randomly over all data points (MAR) and we introduce missing values in the control samples of 100 differentially expressed proteins (MNAR).

# Generate a MAR matrix
MAR_fraction = 0.05
MAR_matrix <- matrix(
                     data = sample(c(TRUE, FALSE), 
                            size = 2*replicates*(bg_proteins+DE_proteins), 
                            replace = TRUE, 
                            prob = c(MAR_fraction, 1-MAR_fraction)), 
                     nrow = bg_proteins+DE_proteins, 
                     ncol = 2*replicates)
# Introduce missing values at random (MAR)
controls   <- grep("control", colnames(sim))
treatments <- grep("treatment", colnames(sim))
sim[, c(controls, treatments)][MAR_matrix] <- NA
#sim$MAR <- apply(MAR_matrix, 1, any)
# Introduce missing values not at random (MNAR)
MNAR_proteins = 100
DE_protein_IDs <- grep("DE", sim$name)
DE_first_100   <- DE_protein_IDs[1:MNAR_proteins]
sim[DE_first_100, controls] <- NA
#sim$MNAR <- FALSE
#sim$MNAR[DE_first_100] <- TRUE
library(dplyr)
sim %>% slice_sample(n=100) 

Missing values and outliers

options(install.packages.check.source = "yes")
install.packages('outForest', dependencies = TRUE, quiet=TRUE)
install.packages('OutlierDetection', dependencies = TRUE, quiet=TRUE)
install.packages('missRanger', dependencies = TRUE, quiet=TRUE)
library(outForest)
library(dplyr)
library(OutlierDetection)
select_if(sim, is.numeric)
sim[,-c(1,2)]
sim %>%select(starts_with(c("tr",'co'))) -> only.my.numeric.data

only.my.numeric.data.with.outliers <- generateOutliers(only.my.numeric.data) %>% abs()

Lidiando con datos ausentes

is.na(sim)           %>% colSums
##        name          ID   control_1   control_2   control_3 treatment_1 
##           0           0         258         267         242         148 
## treatment_2 treatment_3 
##         170         175
is.na(drop_na(sim))  %>% colSums
##        name          ID   control_1   control_2   control_3 treatment_1 
##           0           0           0           0           0           0 
## treatment_2 treatment_3 
##           0           0
summary(sim)
##      name                 ID           control_1           control_2        
##  Length:3300        Min.   :   1.0   Min.   :6.531e+05   Min.   :6.009e+05  
##  Class :character   1st Qu.: 825.8   1st Qu.:4.466e+07   1st Qu.:4.927e+07  
##  Mode  :character   Median :1650.5   Median :1.190e+08   Median :1.321e+08  
##                     Mean   :1650.5   Mean   :3.325e+08   Mean   :3.281e+08  
##                     3rd Qu.:2475.2   3rd Qu.:3.133e+08   3rd Qu.:3.226e+08  
##                     Max.   :3300.0   Max.   :2.007e+10   Max.   :1.933e+10  
##                                      NA's   :258         NA's   :267        
##    control_3          treatment_1         treatment_2       
##  Min.   :1.082e+06   Min.   :1.968e+06   Min.   :7.877e+05  
##  1st Qu.:4.870e+07   1st Qu.:5.735e+07   1st Qu.:5.713e+07  
##  Median :1.240e+08   Median :1.574e+08   Median :1.523e+08  
##  Mean   :3.659e+08   Mean   :5.604e+08   Mean   :5.861e+08  
##  3rd Qu.:3.137e+08   3rd Qu.:4.196e+08   3rd Qu.:4.457e+08  
##  Max.   :6.473e+10   Max.   :5.461e+10   Max.   :4.589e+10  
##  NA's   :242         NA's   :148         NA's   :170        
##   treatment_3       
##  Min.   :5.790e+05  
##  1st Qu.:6.476e+07  
##  Median :1.609e+08  
##  Mean   :5.651e+08  
##  3rd Qu.:4.505e+08  
##  Max.   :5.275e+10  
##  NA's   :175

Corrección de outliers

my.raw.data %>%select(starts_with(c("tr",'co'))) -> only.my.numeric.data.with.outliers

out <- outForest(only.my.numeric.data.with.outliers, splitrule = "extratrees", 
                 num.trees = 50, verbose = 0) 

outliers(out)
summary(out)
## The following outlier counts have been detected:
## 
##             Number of outliers
## treatment_1                 31
## treatment_2                 34
## treatment_3                 27
## control_1                   48
## control_2                   50
## control_3                   23
## 
## These are the worst outliers:
## 
##      row         col    observed  predicted       rmse    score threshold
## 213 2911   control_3 64733320725  316861047 1401569048 45.96025         3
## 166 1711   control_2 19327358479  331167147  673894228 28.18868         3
## 24  3227 treatment_1 54606142695  927556716 1962817906 27.34772         3
## 78  3107 treatment_3 52748078459 1726705549 1933400095 26.38945         3
## 112 1578   control_1 20070994704  225278869  787985744 25.18537         3
## 62  3273 treatment_2 45893374100  643190069 1880148722 24.06734         3
##     replacement
## 213   166389437
## 166    76636877
## 24     85012063
## 78   1166139979
## 112  1809462423
## 62     31189623
# The fixed data
glimpse(Data(out))
## Rows: 3,300
## Columns: 6
## $ treatment_1 <dbl> 129530434, 2241465106, 117540770, 34219140, 321815870, 15…
## $ treatment_2 <dbl> 176360962, 22561072, 94265734, 157698187, 107352786, 4761…
## $ treatment_3 <dbl> 67537123, 15664673, 143891028, 584216472, 355705294, 4660…
## $ control_1   <dbl> 25657685, 1103536004, 292948883, 171528784, 859023174, 20…
## $ control_2   <dbl> 123657556, 136353202, 815357684, 157892186, 137106663, 19…
## $ control_3   <dbl> 70225079, 85215388, 66420621, 215923179, 178707559, 77373…
my.raw.data %>%select(!starts_with(c("tr",'co'))) %>% cbind((Data(out))) -> sim.without.outliers
sim.without.outliers

Generación de log2Ratio = FC

sim.without.outliers %>% mutate(treatment = rowMeans(select(., starts_with("treat")))) %>% mutate(control = rowMeans(select(., starts_with("control")))) %>%
    mutate(log2Ratio = log2(.[['treatment']] / .[['control']])) -> A

library(trqwe)

B.names <- c('treatment.B_1', 'treatment.B_2', 'treatment.B_3',  'treatment.B_mean')
C.names <- c('treatment.C_1', 'treatment.C_2', 'treatment.C_3',  'treatment.C_mean')
D.names <- c('treatment.D_1', 'treatment.D_2', 'treatment.D_3',  'treatment.D_mean')
E.names <- c('treatment.E_1', 'treatment.E_2', 'treatment.E_3',  'treatment.E_mean')

my.raw.data %>% select(starts_with("treat")) %>% "*"(2 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat")))) %>% 
  trqwe::set_colnames(B.names) -> B

my.raw.data %>% select(starts_with("treat")) %>% "*"(5 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>% 
  trqwe::set_colnames(C.names) -> C

my.raw.data %>% select(starts_with("treat")) %>% "*"(.5 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>% 
  trqwe::set_colnames(D.names) -> D

my.raw.data %>% select(starts_with("treat")) %>% "*"(.2 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>% 
  trqwe::set_colnames(E.names) -> E

cbind(A,B,C,D,E) %>%
    mutate(log2Ratio.B = log2(.[['treatment.B_mean']] / .[['control']])) %>%
    mutate(log2Ratio.C = log2(.[['treatment.C_mean']] / .[['control']])) %>%
    mutate(log2Ratio.D = log2(.[['treatment.D_mean']] / .[['control']])) %>%
    mutate(log2Ratio.E = log2(.[['treatment.E_mean']] / .[['control']])) %>% select(starts_with(c('na','log'))) -> log2Ratio_matrix

log2Ratio_matrix