Vignette Title

Vignette Author

2017-07-18

Read data

library(readr)
data_obs <- lapply(1:10, function(i){
    fname = system.file(paste0("extdata/Sub1/data_obs_", i, ".txt"), package="ProteogenomicsChallenge")
    x = data.frame(readr::read_tsv(fname))
    rownames(x) <- x[, 1]
    as.matrix(x[, -1])
})
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
names(data_obs) <- 1:10
data_true <- data.frame(readr::read_tsv(system.file("extdata/Sub1/data_true.txt", 
                                             package="ProteogenomicsChallenge")))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
rownames(data_true) <- data_true[, 1]
data_true <- as.matrix(data_true[, -1])
class(data_true)

Create MultiAssayExperiment

library(MultiAssayExperiment)
data_all <- MultiAssayExperiment(data_obs)
data_all <- c(data_all, list(data_true=data_true))

One example of an algorithm

Mean value imputation with the DMwR library:

cvalFUN <- function(x) t(DMwR::centralImputation(t(x)))

KNN imputation with the impute library:

knnFUN <- function(x, ...){ #see ?impute::impute.knn for optional args
    res <- impute::impute.knn(x, ...)
    set.seed(res$rng.state)
    return( res$data )
}

Function to calculate RMSE

for two matrices

Don’t count the identical entries; these weren’t missing.

rmse <- function(mat1, mat2){
    mat <- (mat1 - mat2)^2
    mse <- sum(mat) / sum(mat1 != mat2)
    return( sqrt(mse) )
}

for the MultiAssayExperiment

allrmse <- function(mae, FUN, ...){
    res <- sapply(1:10, function(i){
        imputedmat <- FUN(mae[[i]], ...)
        rmse(imputedmat, mae[[11]])
    })
    return( res )
}

Example of using these to calculate RMSE for a method

Let’s calculate RMSE for the 10 training datasets using the methods we’ve defined so far. For the central imputation approach, it’s very consistent:

library(DMwR)
library(impute)
cval <- allrmse(data_all, cvalFUN)
knn10 <- allrmse(data_all, knnFUN)
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 30 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 16 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 21 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 18 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 25 rows with more than 50 % entries missing;
##  mean imputation used for these rows
knn5 <- allrmse(data_all, knnFUN, k=5)
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 30 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 16 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 21 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 18 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
##  mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 25 rows with more than 50 % entries missing;
##  mean imputation used for these rows
plot(cval, type="both", ylim=c(12.75, 12.9))
## Warning in plot.xy(xy, type, ...): plot type 'both' will be truncated to
## first character
lines(knn10, type="both", pch=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): plot type 'both'
## will be truncated to first character
lines(knn5, type="both", pch=3)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): plot type 'both'
## will be truncated to first character
legend("topright", legend=c("central value", "k=10", "k=5"), pch=1:3)

Exploratory analysis

Main questions: * are the values missing completely at random? + does missingness depend on the true value? + does missingness of one value in one dataset increase the probability of missingness in another dataset? + is the probability of a protein being missing correlated across datasets? * how to assess accuracy of imputation? + Root Mean Square error - means that it’s important to predict outliers + if there is a tie, will use Pearson correlation * how is the distribution of each row? + how many missing NAs in each row + spagatti plot of each row

identical(dimnames(data_true), dimnames(data_obs[[1]]))
## [1] TRUE

Are all rows complete?

dim(data_true)
## [1] 7927   80
sum(complete.cases(data_true))
## [1] 7927
sapply(data_obs, function(x) sum(complete.cases(x)))
##  1  2  3  4  5  6  7  8  9 10 
##  0  0  0  0  0  0  0  0  0  0
par(mfrow=c(4, 3))
for (i in 1:10)
    hist(apply(data_obs[[i]], 1, function(x) sum(is.na(x))), main=i)

They look like the same normal distribution, let’s look at the means:

hist(rowMeans(data_true))

rowsd <- apply(data_true, 1, sd)
plot(rowsd ~ rowMeans(data_true))  #heteroskedastic

abundance of missing vs. non-missing values

par(mfrow=c(3, 4))
for (i in 1:10){
    mylist <- list(missing=as.numeric(data_true)[is.na(as.numeric(data_obs[[i]]))],
               notmissing=as.numeric(data_true)[!is.na(as.numeric(data_obs[[i]]))])
    boxplot(mylist, main=i)
}