PCA for covariate selection
#remove low count genes, each gene must have 20 individuals with at least 10 counts of the gene
filt_counts <- counts[rowSums(counts >= 10) >= 20,]
dim(filt_counts)
## [1] 8500 86
#variance stabilisation transformation - normalises for same variance amongst low count genes
vsd_counts <- DESeq2::varianceStabilizingTransformation(as.matrix(filt_counts))
## converting counts to integer mode
#View histogram, clear peak in low counts (which couldn't be found in data for analysis)
hist(vsd_counts, main = 'vst of filtered Fibrous Astrocytes Counts, with outliers (n=86)')

#PCA
PCA_vsdbulk <- pca(vsd_counts, metadata = metadata_reduced, removeVar = 0.05)
## -- removing the lower 5% of variables based on variance
#new data frame with PC1 column
meta_wPC1 <- cbind(metadata_reduced, PCA_vsdbulk$rotated[,c("PC1")])
#rename column to PC1
meta_wPC1$PC1 <- meta_wPC1$'PCA_vsdbulk$rotated[, c("PC1")]'
# Calculate the mean and standard deviation of PC1
mean_PC1 <- mean(meta_wPC1$PC1, na.rm = TRUE)
sd_PC1 <- sd(meta_wPC1$PC1, na.rm = TRUE)
# Print the mean and standard deviation
cat("Mean of PP PC1:", mean_PC1, "\n")
## Mean of PP PC1: 2.478637e-15
cat("Standard PP Deviation of PC1:", sd_PC1, "\n")
## Standard PP Deviation of PC1: 18.60339
# Create a new column 'PC1_Z_Score' with z-scores for each row
meta_wPC1$PC1_Z_Score <- (meta_wPC1$PC1 - mean_PC1) / sd_PC1
# Create a new column 'outliers' to mark z-scores above an absolute value of 3
meta_wPC1$outliers <- abs(meta_wPC1$PC1_Z_Score) > 3
# Create a new filtered data frame 'meta_filt' that removes outliers
meta_remOutliers <- meta_wPC1[!meta_wPC1$outliers, ]
#Create new dataframe homologus to metadata_reduced in previous chunk
meta_filt <- meta_remOutliers[c("SU.Number","Age", "AgeBin", "Sex", "Status", 'Trauma.Code', 'Adversity.Code', "RIN", "pH", "PMI", "lib_batch", 'Suicide', "Days_in_Freezer", 'agonal_score', 'Classification.detail')]
#adjust the counts to remove outliers in the cell type your using. Endo = no outliers, FA = 'individual.404', 'individual.854', PP = 'individual.383'. CHeck dim to ensure the columns are removed
outlier_cols <- c('individual.404', 'individual.854')
counts <- counts[,!colnames(counts) %in% outlier_cols]
dim(counts)
## [1] 26195 84
#do the same for filtered counts
filt_counts <- filt_counts[, !colnames(filt_counts) %in% outlier_cols]
dim(filt_counts)
## [1] 8500 84
#variance stabilisation transformation for counts without outliers
vst <- DESeq2::varianceStabilizingTransformation(as.matrix(filt_counts))
## converting counts to integer mode
dim(vst)
## [1] 8500 84
#Vst histogram to visualise improvement in low count outliers
hist(vst, main = "Fibrous astrocyte counts w/z-score over |3| removed (n=84)")

#recalculate PC
PCA_vst <- pca(vst, metadata = meta_filt, removeVar = 0.05)
## -- removing the lower 5% of variables based on variance
#the 'PCA_vst' in #158 and #166 and is for the outliers removed, if you want to see the scree without use PCA_vsdbulk from #107
scree <- screeplot(PCA_vst, components = getComponents(PCA_vst)[1:20], axisLabSize = 18, titleLabSize = 22, colBar = "#2e134b" ,
drawCumulativeSumLine = TRUE,
colCumulativeSumLine = "#e38830", colCumulativeSumPoints = "#9a4a1e",
sizeCumulativeSumPoints = 2.5, title = 'SCREE: n = 84, without outliers')
scree + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, colour="black", size=15), axis.text.y = element_text(colour="black", size=15),
axis.title=element_text(face='bold'))

options(repr.plot.width = 100, repr.plot.height = 7, repr.plot.res = 150)
eigencorplot(PCA_vst, components = getComponents(PCA_vst)[1:10],
metavars = c('Age', 'RIN', 'pH', 'PMI', 'Days_in_Freezer'), rotLabX = 45, cexCorval = 1,
col = c("#fde725", '#21918c', '#440154'), corFUN = 'kendall', corMultipleTestCorrection = 'BH',
main = bquote(Kendall ~ Correlation), colCorval = "black", fontLabX = 4, fontLabY = 4)

Accounting for hidden confounders by calculating PC1
dfull <- DGEList(as.data.frame(counts), samples = meta_filt)
dim(dfull)
## [1] 26195 84
keep <- rowSums(dfull$counts >= 10) >= 20
dfilt <- dfull[keep,]
dim(dfilt)
## [1] 8500 84
#normalise filtered dgelist
dfilt <- calcNormFactors(dfilt)
#create model matrix for voom function
mm <- model.matrix(~ Sex + Age + pH + PMI + RIN + lib_batch + Status, data = dfilt$samples)
#voom - transforming counts for linear modelling
voom <- voom(dfilt, mm, plot =T)

#model matric for removing batch effects - note that lib_batch isn't included
design_rembatch <- model.matrix(~ Sex + Age + pH + PMI + RIN + Status, data = dfilt$samples)
#remove Batch Effect Function
voom_rembatch <- removeBatchEffect(
voom,
batch = dfilt$samples$lib_batch,
design = design_rembatch)
#calculating PCs on batch corrected matrix to account for hidden variance
pc <- pca(voom_rembatch,removeVar = 0.1)
## -- removing the lower 10% of variables based on variance
#head(pc$rotated)
head(pc$variance)
## PC1 PC2 PC3 PC4 PC5 PC6
## 10.391856 6.858424 4.737621 3.827702 3.234374 3.036731
#creating new metadata to include PCs for hidden variance
meta_redwPCA <- data.frame(cbind(dfilt$samples, pc$rotated[,c(1:3)] ))
#variance parition to visualise variance explained by PCs, covariates and residuals
varPartDes <- ~ Age + pH + RIN + PMI + PC1 + PC2 + (1 | Sex) +
(1 | lib_batch) + (1 | Status)
#adjusting optimsation algorithm to have less tolerance, this is to prevent roundoff errors
options(lmerControl = lmerControl(optimizer = "bobyqa", optCtrl = list(tol = 1e-4)))
# Run fitExtractVarPartModel with the new optimiser
varPart <- fitExtractVarPartModel(voom, varPartDes, meta_redwPCA)
#return optimisation algotrithm back to normal
options(lmerControl = "nloptwrap")
# Plotting variance partition results
vp <- sortCols(varPart)
plotPercentBars(vp[1:10, ])

plotVarPart(vp, main = 'Fib_A Variance Parition with Outliers excluded (n=84)')

#head(varPart[order(varPart$Status, decreasing = F), ])
Data prepared, now running DESeq
#ensure all categorical variables are recognised by R
meta_redwPCA$Sex <- as.factor(meta_redwPCA$Sex)
meta_redwPCA$lib_batch <- as.factor(meta_redwPCA$lib_batch)
meta_redwPCA$Status <- factor(meta_redwPCA$Status, c('0', '1'))
meta_redwPCA$Status <- ifelse(meta_redwPCA$Status == '0', 'Case', 'Control')
meta_redwPCA$Adversity.Code <- factor(meta_redwPCA$Adversity.Code, c('0', '1', '2'))
meta_redwPCA$Adversity.Code <- ifelse(meta_redwPCA$Adversity.Code == '0', 'noAdversity',
ifelse(meta_redwPCA$Adversity.Code == '1', 'control', 'Adversity'))
#ensure all numeric variables are recognised as such by R, also scaled and centred to prevent multicollinearity
meta_redwPCA$Age <- as.numeric(meta_redwPCA$Age)
meta_redwPCA$Brain.pH <- as.numeric(meta_redwPCA$pH)
meta_redwPCA$RIN <- as.numeric(meta_redwPCA$RIN)
meta_redwPCA$PMI <- as.numeric(meta_redwPCA$PMI)
#meta_redwPCA$metadata_PC1 <- as.numeric(meta_redwPCA$metadata_PC1)
meta_redwPCA <- meta_redwPCA %>%
mutate(across(contains("PC"), as.numeric))
meta_redwPCA$Age <- scale(meta_redwPCA$Age, center = TRUE, scale = TRUE)
meta_redwPCA$pH <- scale(meta_redwPCA$pH, center = TRUE, scale = TRUE)
meta_redwPCA$RIN <- scale(meta_redwPCA$RIN, center = TRUE, scale = TRUE)
meta_redwPCA$PMI <- scale(meta_redwPCA$PMI, center = TRUE, scale = TRUE)
meta_redwPCA <- meta_redwPCA %>% mutate(across(contains("PC"), ~ scale(.)))
#make DEseq object (used to analyse)
dds <- DESeqDataSetFromMatrix(countData = counts, colData=meta_redwPCA, design= ~ RIN + lib_batch + Age + Sex + PMI + pH + PC1 + PC2 + Status)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#relevel to make the control group reference level, the results will indicate the difference of case from control
dds$Status <- relevel(dds$Status, ref = "Control")
#Pre-filtering data to remove low counts - in this workflow remove genes that don't have 20 individuals with 10 counts
keep <- rowSums(counts(dds) >= 10) >= 20
dds <- dds[keep,]
#Run DEseq Analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#check dispersion as a validation
plotDispEsts(dds)

#form results table
res <- results(dds, name = "Status_Case_vs_Control")
#lfcshrink uses info from all genes to generate more accurate estimates, useful when you have low counts/high dispersion. If the gene has high residuals the log2FoldChange is reduced.
resShrink <- lfcShrink(dds, coef = "Status_Case_vs_Control", res = res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
head(resShrink[order(resShrink$pvalue),][1:10,])
## log2 fold change (MAP): Status Case vs Control
## Wald test p-value: Status Case vs Control
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## CAB39L 19.1423 0.453943 0.1221805 8.60428e-06 0.0731364
## TAFA1 95.8143 -0.357630 0.1087379 3.56518e-05 0.1515202
## LRIG1 244.1235 -0.197094 0.0630041 6.63110e-05 0.1878812
## NEK7 29.6787 0.272431 0.0946733 1.28830e-04 0.2737632
## AC087564.1 75.9921 -0.240778 0.0932219 2.62893e-04 0.4469183
## HIF3A 96.3103 0.385500 0.1558982 3.50807e-04 0.4969762
plotMA(res, ylim = c(-2,2), main = "without Shrink")

plotMA(resShrink, ylim = c(-2,2), main = 'with shrink')

#export full genelist as a csv so you can look at it
#write.csv(resShrink, file="E_DEG_Case_v_Control_PC2_wShrink.csv")
#write.csv(res, file = "E_DEG_Case_v_Control_PC2_woutShrink.csv")