Load in data
stability <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn/rembrandts_results/stability.filtered.mx.txt", row.names=1)
metadata <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn/samples_metadata.txt", sep="|")
#df_stability <- as.matrix(stability[,-1])
#rownames(df_stability) <- stability[,1]
metadata[,7] <- factor(metadata[,7],
levels = c('Male', 'Female'),
labels = c(0, 1)) # Sex
metadata[,9] <- factor(metadata[,9],
levels = c('ALS Spectrum MND', 'ALS Spectrum MND, Other Neurological Disorders', 'Non-Neurological Control', 'Other MND', 'Other Neurological Disorders', 'Other Neurological DIsorders', 'Pre-fALS, Other Neurological Disorders'),
labels = c(0, 0, 1, 0, 0 ,0, 0)) # Group
metadata[,30] <- factor(metadata[,30],
levels = c('HiSeq 2500', 'NovaSeq'),
labels = c(0, 1)) # Platform
Change Gene_IDs to Gene_Names
myGtf <- rtracklayer::import(gzfile('/Users/wede/Documents/University/Major_research_project/data/genenames/Homo_sapiens.GRCh38.99.chr.gtf.gz'))
test=as.data.frame(myGtf[myGtf$gene_id=='ENSG00000001629'])
gene_names <- c(rownames(stability))
gene_ids <- c(rownames(stability))
for (i in 1:nrow(stability)) {
gene_names[i]=as.data.frame(myGtf[myGtf$gene_id==gene_ids[i]])$gene_name[1]
print(gene_names[i])
}
rownames(stability) <- gene_names
Check for missing values
which(is.na(metadata[,19]))
# Imputation of RIN based on PCAs
df_stability_pca <- prcomp(stability)
df_stability_pca <- as.data.frame(df_stability_pca$rotation)
RIN <- metadata[,19]
lmRIN <- lm(RIN ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df_stability_pca)
summary(lmRIN)
metadata[168,19] <- predict(lmRIN, newdata = df_stability_pca[168,])
mean(metadata[,19])
# Sex
which(is.na(metadata[,7]))
# Use sex_call to fill missing value
# metadata[227,31], metadata[228,31]
# Both males
metadata[227,7] <- 0
metadata[228,7] <- 0
sex <- metadata[,7]
which(is.na(metadata[,30]))
which(is.na(metadata[,33:37]))
Cell proportion correction
library(BRETIGEA)
celltype_corrected_stability <- adjustBrainCells(stability, nMarker = 50, species = "combined", celltypes = c("ast", "end", "mic", "neu", "oli", "opc"), addMeans = FALSE, verbose = FALSE)
SVPs <- celltype_corrected_stability$SVPs
metadata[,42:47] <- SVPs[,1:6]
Which confounders are significant
confounder_vector = c(7,9,16:17,19,21:22,30,33:37,42:47)
confounders <- metadata[,confounder_vector]
rvals_sign = matrix(nrow=nrow(stability),ncol=length(confounder_vector))
pvals_sign = matrix(nrow=nrow(stability),ncol=length(confounder_vector))
for (j in 1:length(confounder_vector)) {
for (i in 1:nrow(stability)) {
conf_sign <- lm(unlist(stability[i,]) ~ confounders[,j])
rvals_sign[i,j] <- summary(conf_sign)$r.squared
pvals_sign[i,j] <- summary(conf_sign)$coefficients[,4][2]
}
}
colnames(rvals_sign) <- colnames(confounders)
colnames(pvals_sign) <- colnames(confounders)
#RIN
hist(rvals_sign[,2]) # RIN less important than expected
hist(pvals_sign[,2])
library(matrixStats)
high_r_means <- as.integer(colMeans(rvals_sign) > 0.1)
low_p_means <- as.integer(colMeans(pvals_sign) < 0.2)
cols_keep <- which(high_r_means*low_p_means == 1)
names(cols_keep) <- colnames(rvals_sign[,cols_keep])
cols_keep
Correcting for confounders
rvals <- c()
pvals <- c()
confounders = metadata[,c(19,42:47)]
corrected_stability <- stability
for (i in 1:nrow(stability)) {
lm_stability <- lm(unlist(stability[i,]) ~ ., data=confounders)
corrected_stability[i,] <- lm_stability$residuals
rvals[i] <- summary(lm_stability)$r.squared
pvals[i] <- summary(lm_stability)$coefficients[,4][2]
}
hist(rvals)
hist(pvals)
mean(rvals)
mean(pvals)