options(install.packages.check.source = "yes")
install.packages("openssl", dependencies = TRUE, quiet=TRUE )
install.packages("fs", dependencies = TRUE, quiet=TRUE )
install.packages("broom", dependencies = TRUE, quiet=TRUE )
install.packages("dbplyr", dependencies = TRUE, quiet=TRUE )
install.packages("dplyr", dependencies = TRUE, quiet=TRUE )
install.packages("dplyr", dependencies = TRUE, quiet=TRUE )
install.packages("haven", dependencies = TRUE, quiet=TRUE )
install.packages("httr", dependencies = TRUE, quiet=TRUE )
install.packages("modelr", dependencies = TRUE, quiet=TRUE )
install.packages("readr", dependencies = TRUE, quiet=TRUE )
install.packages("tidyverse", dependencies = TRUE, quiet=TRUE )
devtools::install_github("traversc/trqwe", dependencies = T,quiet=TRUE)
library(tidyverse)
#bg: Background.
replicates = 3
bg_proteins = 3000 #Background proteins.
log2_mean_bg = 27 #Background mean.
log2_sd_bg = 2 #Background standard deviation.
bg_reps_by_prot <- rep((2*replicates), bg_proteins)
bg_all_3000_prots_by_6_reps <- rep(1:bg_proteins,bg_reps_by_prot)
bg_distrib_all_samples <- rnorm(2*replicates*bg_proteins, mean = log2_mean_bg, sd = log2_sd_bg)
sim_null <- data_frame(
name = paste0("bg_", bg_all_3000_prots_by_6_reps),
ID = bg_all_3000_prots_by_6_reps,
var = rep(c("control_1", "control_2", "control_3", "treatment_1","treatment_2","treatment_3"), bg_proteins),
val = 2^bg_distrib_all_samples)
# Histogram overlaid with kernel density curve
ggplot(as.data.frame(bg_distrib_all_samples), aes(x=bg_distrib_all_samples)) + geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") # Overlay with transparent density plot
ggplot(sim_null, aes(x=val)) + geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") # Overlay with transparent density plot
DE_proteins = 300
log2_mean_DE_control = 25
log2_mean_DE_treatment = 30
log2_sd_DE = 2
DE_reps_by_prot <- rep(replicates, DE_proteins)
DE_all_3000_prots_by_6_reps <- rep(1:DE_proteins, DE_reps_by_prot)
DE_distrib_control_samples <- rnorm(replicates*DE_proteins, mean = log2_mean_DE_control, sd = log2_sd_DE)
DE_distrib_treatment_samples <- rnorm(replicates*DE_proteins, mean = log2_mean_DE_treatment, sd = log2_sd_DE)
sim_diff <- rbind(
data_frame(
name = paste0("DE_", DE_all_3000_prots_by_6_reps),
ID = rep( (bg_proteins+1):(bg_proteins+DE_proteins), DE_reps_by_prot),
var = rep(c("control_1", "control_2", "control_3"), DE_proteins),
val = 2^DE_distrib_control_samples),
data_frame(
name = paste0("DE_", DE_all_3000_prots_by_6_reps),
ID = rep((bg_proteins+1):(bg_proteins+DE_proteins), DE_reps_by_prot),
var = rep(c("treatment_1", "treatment_2", "treatment_3"), DE_proteins),
val = 2^DE_distrib_treatment_samples))
rbind(sim_null, sim_diff)
# Combine null and DE data
#Funciones tradicionales
sim <- rbind(sim_null, sim_diff) %>%
spread(key = var, value = val) %>%
arrange(ID)
# Operación inversa
sim %>% gather(key = "var", value = "val", -name, -ID)
#################################################
# Con funciones nuevas:
sim <- rbind(sim_null, sim_diff) %>%
pivot_wider(names_from = var, values_from = val)%>%
arrange(ID)
sim %>% pivot_longer(cols = !c(name,ID), names_to = "var", values_to = "val")
missing at random (MAR) or missing not at random (MNAR). To mimick these two types of missing values, we introduce missing values randomly over all data points (MAR) and we introduce missing values in the control samples of 100 differentially expressed proteins (MNAR).
# Generate a MAR matrix
MAR_fraction = 0.05
MAR_matrix <- matrix(
data = sample(c(TRUE, FALSE),
size = 2*replicates*(bg_proteins+DE_proteins),
replace = TRUE,
prob = c(MAR_fraction, 1-MAR_fraction)),
nrow = bg_proteins+DE_proteins,
ncol = 2*replicates)
# Introduce missing values at random (MAR)
controls <- grep("control", colnames(sim))
treatments <- grep("treatment", colnames(sim))
sim[, c(controls, treatments)][MAR_matrix] <- NA
#sim$MAR <- apply(MAR_matrix, 1, any)
# Introduce missing values not at random (MNAR)
MNAR_proteins = 100
DE_protein_IDs <- grep("DE", sim$name)
DE_first_100 <- DE_protein_IDs[1:MNAR_proteins]
sim[DE_first_100, controls] <- NA
#sim$MNAR <- FALSE
#sim$MNAR[DE_first_100] <- TRUE
library(dplyr)
sim %>% slice_sample(n=100)
options(install.packages.check.source = "yes")
install.packages('outForest', dependencies = TRUE, quiet=TRUE)
install.packages('OutlierDetection', dependencies = TRUE, quiet=TRUE)
install.packages('missRanger', dependencies = TRUE, quiet=TRUE)
library(outForest)
library(dplyr)
library(OutlierDetection)
select_if(sim, is.numeric)
sim[,-c(1,2)]
sim %>%select(starts_with(c("tr",'co'))) -> only.my.numeric.data
only.my.numeric.data.with.outliers <- generateOutliers(only.my.numeric.data) %>% abs()
is.na(sim) %>% colSums
## name ID control_1 control_2 control_3 treatment_1
## 0 0 258 267 242 148
## treatment_2 treatment_3
## 170 175
is.na(drop_na(sim)) %>% colSums
## name ID control_1 control_2 control_3 treatment_1
## 0 0 0 0 0 0
## treatment_2 treatment_3
## 0 0
summary(sim)
## name ID control_1 control_2
## Length:3300 Min. : 1.0 Min. :6.531e+05 Min. :6.009e+05
## Class :character 1st Qu.: 825.8 1st Qu.:4.466e+07 1st Qu.:4.927e+07
## Mode :character Median :1650.5 Median :1.190e+08 Median :1.321e+08
## Mean :1650.5 Mean :3.325e+08 Mean :3.281e+08
## 3rd Qu.:2475.2 3rd Qu.:3.133e+08 3rd Qu.:3.226e+08
## Max. :3300.0 Max. :2.007e+10 Max. :1.933e+10
## NA's :258 NA's :267
## control_3 treatment_1 treatment_2
## Min. :1.082e+06 Min. :1.968e+06 Min. :7.877e+05
## 1st Qu.:4.870e+07 1st Qu.:5.735e+07 1st Qu.:5.713e+07
## Median :1.240e+08 Median :1.574e+08 Median :1.523e+08
## Mean :3.659e+08 Mean :5.604e+08 Mean :5.861e+08
## 3rd Qu.:3.137e+08 3rd Qu.:4.196e+08 3rd Qu.:4.457e+08
## Max. :6.473e+10 Max. :5.461e+10 Max. :4.589e+10
## NA's :242 NA's :148 NA's :170
## treatment_3
## Min. :5.790e+05
## 1st Qu.:6.476e+07
## Median :1.609e+08
## Mean :5.651e+08
## 3rd Qu.:4.505e+08
## Max. :5.275e+10
## NA's :175
my.raw.data %>%select(starts_with(c("tr",'co'))) -> only.my.numeric.data.with.outliers
out <- outForest(only.my.numeric.data.with.outliers, splitrule = "extratrees",
num.trees = 50, verbose = 0)
outliers(out)
summary(out)
## The following outlier counts have been detected:
##
## Number of outliers
## treatment_1 31
## treatment_2 34
## treatment_3 27
## control_1 48
## control_2 50
## control_3 23
##
## These are the worst outliers:
##
## row col observed predicted rmse score threshold
## 213 2911 control_3 64733320725 316861047 1401569048 45.96025 3
## 166 1711 control_2 19327358479 331167147 673894228 28.18868 3
## 24 3227 treatment_1 54606142695 927556716 1962817906 27.34772 3
## 78 3107 treatment_3 52748078459 1726705549 1933400095 26.38945 3
## 112 1578 control_1 20070994704 225278869 787985744 25.18537 3
## 62 3273 treatment_2 45893374100 643190069 1880148722 24.06734 3
## replacement
## 213 166389437
## 166 76636877
## 24 85012063
## 78 1166139979
## 112 1809462423
## 62 31189623
# The fixed data
glimpse(Data(out))
## Rows: 3,300
## Columns: 6
## $ treatment_1 <dbl> 129530434, 2241465106, 117540770, 34219140, 321815870, 15…
## $ treatment_2 <dbl> 176360962, 22561072, 94265734, 157698187, 107352786, 4761…
## $ treatment_3 <dbl> 67537123, 15664673, 143891028, 584216472, 355705294, 4660…
## $ control_1 <dbl> 25657685, 1103536004, 292948883, 171528784, 859023174, 20…
## $ control_2 <dbl> 123657556, 136353202, 815357684, 157892186, 137106663, 19…
## $ control_3 <dbl> 70225079, 85215388, 66420621, 215923179, 178707559, 77373…
my.raw.data %>%select(!starts_with(c("tr",'co'))) %>% cbind((Data(out))) -> sim.without.outliers
sim.without.outliers
sim.without.outliers %>% mutate(treatment = rowMeans(select(., starts_with("treat")))) %>% mutate(control = rowMeans(select(., starts_with("control")))) %>%
mutate(log2Ratio = log2(.[['treatment']] / .[['control']])) -> A
library(trqwe)
B.names <- c('treatment.B_1', 'treatment.B_2', 'treatment.B_3', 'treatment.B_mean')
C.names <- c('treatment.C_1', 'treatment.C_2', 'treatment.C_3', 'treatment.C_mean')
D.names <- c('treatment.D_1', 'treatment.D_2', 'treatment.D_3', 'treatment.D_mean')
E.names <- c('treatment.E_1', 'treatment.E_2', 'treatment.E_3', 'treatment.E_mean')
my.raw.data %>% select(starts_with("treat")) %>% "*"(2 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat")))) %>%
trqwe::set_colnames(B.names) -> B
my.raw.data %>% select(starts_with("treat")) %>% "*"(5 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>%
trqwe::set_colnames(C.names) -> C
my.raw.data %>% select(starts_with("treat")) %>% "*"(.5 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>%
trqwe::set_colnames(D.names) -> D
my.raw.data %>% select(starts_with("treat")) %>% "*"(.2 ) %>%mutate(treatment = rowMeans(select(., starts_with("treat"))))%>%
trqwe::set_colnames(E.names) -> E
cbind(A,B,C,D,E) %>%
mutate(log2Ratio.B = log2(.[['treatment.B_mean']] / .[['control']])) %>%
mutate(log2Ratio.C = log2(.[['treatment.C_mean']] / .[['control']])) %>%
mutate(log2Ratio.D = log2(.[['treatment.D_mean']] / .[['control']])) %>%
mutate(log2Ratio.E = log2(.[['treatment.E_mean']] / .[['control']])) %>% select(starts_with(c('na','log'))) -> log2Ratio_matrix
log2Ratio_matrix