Load in data

stability <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn/rembrandts_results/stability.filtered.mx.txt", row.names=1)
metadata <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn/samples_metadata.txt", sep="|")

#df_stability <- as.matrix(stability[,-1])
#rownames(df_stability) <- stability[,1]

metadata[,7] <- factor(metadata[,7],
                         levels = c('Male', 'Female'),
                         labels = c(0, 1)) # Sex

metadata[,9] <- factor(metadata[,9],
                         levels = c('ALS Spectrum MND', 'ALS Spectrum MND, Other Neurological Disorders', 'Non-Neurological Control', 'Other MND', 'Other Neurological Disorders', 'Other Neurological DIsorders', 'Pre-fALS, Other Neurological Disorders'),
                         labels = c(0, 0, 1, 0, 0 ,0, 0)) # Group

metadata[,30] <- factor(metadata[,30],
                   levels = c('HiSeq 2500', 'NovaSeq'),
                   labels = c(0, 1)) # Platform

Change Gene_IDs to Gene_Names

myGtf <- rtracklayer::import(gzfile('/Users/wede/Documents/University/Major_research_project/data/genenames/Homo_sapiens.GRCh38.99.chr.gtf.gz'))
test=as.data.frame(myGtf[myGtf$gene_id=='ENSG00000001629'])

gene_names <- c(rownames(stability))
gene_ids <- c(rownames(stability))

for (i in 1:nrow(stability)) {
    gene_names[i]=as.data.frame(myGtf[myGtf$gene_id==gene_ids[i]])$gene_name[1]
    print(gene_names[i])
}
rownames(stability) <- gene_names

Check for missing values

which(is.na(metadata[,19]))
# Imputation of RIN based on PCAs
df_stability_pca <- prcomp(stability)
df_stability_pca <- as.data.frame(df_stability_pca$rotation)
RIN <- metadata[,19]
lmRIN <- lm(RIN ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df_stability_pca)
summary(lmRIN)
metadata[168,19] <- predict(lmRIN, newdata = df_stability_pca[168,])
mean(metadata[,19])
# Sex
which(is.na(metadata[,7]))
# Use sex_call to fill missing value
# metadata[227,31], metadata[228,31]
# Both males
metadata[227,7] <- 0
metadata[228,7] <- 0
sex <- metadata[,7]
 
which(is.na(metadata[,30]))
which(is.na(metadata[,33:37]))

Cell proportion correction

library(BRETIGEA)
celltype_corrected_stability <- adjustBrainCells(stability, nMarker = 50, species = "combined", celltypes = c("ast", "end", "mic", "neu", "oli", "opc"), addMeans = FALSE, verbose = FALSE)

SVPs <- celltype_corrected_stability$SVPs
metadata[,42:47] <- SVPs[,1:6]

Which confounders are significant

confounder_vector = c(7,9,16:17,19,21:22,30,33:37,42:47)
confounders <- metadata[,confounder_vector]
rvals_sign = matrix(nrow=nrow(stability),ncol=length(confounder_vector))
pvals_sign = matrix(nrow=nrow(stability),ncol=length(confounder_vector))


for (j in 1:length(confounder_vector)) {
  for (i in 1:nrow(stability)) {
    conf_sign <- lm(unlist(stability[i,]) ~ confounders[,j])
    rvals_sign[i,j] <- summary(conf_sign)$r.squared
    pvals_sign[i,j] <- summary(conf_sign)$coefficients[,4][2]
  }
}
colnames(rvals_sign) <- colnames(confounders)
colnames(pvals_sign) <- colnames(confounders)

#RIN
hist(rvals_sign[,2]) # RIN less important than expected
hist(pvals_sign[,2])
library(matrixStats)
high_r_means <- as.integer(colMeans(rvals_sign) > 0.1)
low_p_means <- as.integer(colMeans(pvals_sign) < 0.2)
cols_keep <- which(high_r_means*low_p_means == 1)
names(cols_keep) <- colnames(rvals_sign[,cols_keep])
cols_keep

Correcting for confounders

rvals <- c()
pvals <- c()

confounders = metadata[,c(19,42:47)]
corrected_stability <- stability
for (i in 1:nrow(stability)) {
  lm_stability <- lm(unlist(stability[i,]) ~ ., data=confounders)
  corrected_stability[i,] <- lm_stability$residuals
  rvals[i] <- summary(lm_stability)$r.squared
  pvals[i] <- summary(lm_stability)$coefficients[,4][2]
}

hist(rvals)
hist(pvals)
mean(rvals)
mean(pvals)