library(readr)
data_obs <- lapply(1:10, function(i){
fname = system.file(paste0("extdata/Sub1/data_obs_", i, ".txt"), package="ProteogenomicsChallenge")
x = data.frame(readr::read_tsv(fname))
rownames(x) <- x[, 1]
as.matrix(x[, -1])
})
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
names(data_obs) <- 1:10
data_true <- data.frame(readr::read_tsv(system.file("extdata/Sub1/data_true.txt",
package="ProteogenomicsChallenge")))
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
rownames(data_true) <- data_true[, 1]
data_true <- as.matrix(data_true[, -1])
class(data_true)
library(MultiAssayExperiment)
data_all <- MultiAssayExperiment(data_obs)
data_all <- c(data_all, list(data_true=data_true))
Mean value imputation with the DMwR library:
cvalFUN <- function(x) t(DMwR::centralImputation(t(x)))
KNN imputation with the impute library:
knnFUN <- function(x, ...){ #see ?impute::impute.knn for optional args
res <- impute::impute.knn(x, ...)
set.seed(res$rng.state)
return( res$data )
}
Don’t count the identical entries; these weren’t missing.
rmse <- function(mat1, mat2){
mat <- (mat1 - mat2)^2
mse <- sum(mat) / sum(mat1 != mat2)
return( sqrt(mse) )
}
allrmse <- function(mae, FUN, ...){
res <- sapply(1:10, function(i){
imputedmat <- FUN(mae[[i]], ...)
rmse(imputedmat, mae[[11]])
})
return( res )
}
Let’s calculate RMSE for the 10 training datasets using the methods we’ve defined so far. For the central imputation approach, it’s very consistent:
library(DMwR)
library(impute)
cval <- allrmse(data_all, cvalFUN)
knn10 <- allrmse(data_all, knnFUN)
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 30 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 16 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 21 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 18 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 25 rows with more than 50 % entries missing;
## mean imputation used for these rows
knn5 <- allrmse(data_all, knnFUN, k=5)
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 30 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 16 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 21 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 18 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 27 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 17 rows with more than 50 % entries missing;
## mean imputation used for these rows
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 25 rows with more than 50 % entries missing;
## mean imputation used for these rows
plot(cval, type="both", ylim=c(12.75, 12.9))
## Warning in plot.xy(xy, type, ...): plot type 'both' will be truncated to
## first character
lines(knn10, type="both", pch=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): plot type 'both'
## will be truncated to first character
lines(knn5, type="both", pch=3)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): plot type 'both'
## will be truncated to first character
legend("topright", legend=c("central value", "k=10", "k=5"), pch=1:3)
Main questions: * are the values missing completely at random? + does missingness depend on the true value? + does missingness of one value in one dataset increase the probability of missingness in another dataset? + is the probability of a protein being missing correlated across datasets? * how to assess accuracy of imputation? + Root Mean Square error - means that it’s important to predict outliers + if there is a tie, will use Pearson correlation * how is the distribution of each row? + how many missing NAs in each row + spagatti plot of each row
identical(dimnames(data_true), dimnames(data_obs[[1]]))
## [1] TRUE
dim(data_true)
## [1] 7927 80
sum(complete.cases(data_true))
## [1] 7927
sapply(data_obs, function(x) sum(complete.cases(x)))
## 1 2 3 4 5 6 7 8 9 10
## 0 0 0 0 0 0 0 0 0 0
par(mfrow=c(4, 3))
for (i in 1:10)
hist(apply(data_obs[[i]], 1, function(x) sum(is.na(x))), main=i)
They look like the same normal distribution, let’s look at the means:
hist(rowMeans(data_true))
rowsd <- apply(data_true, 1, sd)
plot(rowsd ~ rowMeans(data_true)) #heteroskedastic
par(mfrow=c(3, 4))
for (i in 1:10){
mylist <- list(missing=as.numeric(data_true)[is.na(as.numeric(data_obs[[i]]))],
notmissing=as.numeric(data_true)[!is.na(as.numeric(data_obs[[i]]))])
boxplot(mylist, main=i)
}