Loading and presenting data in Correct format for DESeq2 Package

#load in raw counts and metadata
counts <- read.csv("~/Desktop/Honours/Data Files/Fibrous_Pseudobulk_Counts.csv")
metadata <- read.csv("~/Desktop/Honours/Data Files/Metadata.csv") 

#set row names of metadata as subject_id and remove 'X' column which corresponds with row names
rownames(metadata) <- metadata$subject_id
metadata = metadata[,-1]

#create reduced mwtadata for easier interpretation
metadata_reduced <- metadata[c("subject_id","SU.Number", "Age", "AgeBin", "Sex", "Status", 'Adversity.Code', 'Trauma.Code', "RIN", "Brain.pH", "PMI", "SixBatch", 'Suicide', "Freezer.storage.time..days.", 'Agonal.score', 'Classification.detail')]

rownames(metadata_reduced) <- metadata_reduced$subject_id

#change column names for easier interpretation
colnames(metadata_reduced)[which(names(metadata_reduced) == 'Brain.pH')] <- 'pH'
colnames(metadata_reduced)[which(names(metadata_reduced) == 'SixBatch')] <- 'lib_batch'
metadata_reduced$lib_batch <- as.factor(metadata_reduced$lib_batch)
metadata_reduced$Status <- as.factor(metadata_reduced$Status)
metadata_reduced$Adversity.Code <- as.factor(metadata_reduced$Adversity.Code)
colnames(metadata_reduced)[which(names(metadata_reduced) == 'Freezer.storage.time..days.')] <- 'Days_in_Freezer'
colnames(metadata_reduced)[which(names(metadata_reduced) == 'Agonal.score')] <- 'agonal_score'
metadata_reduced$Age <- as.numeric(metadata_reduced$Age)

# order individuals by increasing individual number
metadata_reduced <- metadata_reduced[order(metadata_reduced$SU.Number),]
head(metadata_reduced)
##                    subject_id SU.Number Age AgeBin    Sex Status Adversity.Code
## individual.255 individual.255       255  51  50-59   Male      0              0
## individual.294 individual.294       294  64  60-69   Male      0              0
## individual.306 individual.306       306  66  60-69 Female      0              0
## individual.308 individual.308       308  40  40-49   Male      0              0
## individual.319 individual.319       319  55  50-59   Male      0              0
## individual.323 individual.323       323  33  30-39 Female      1              1
##                Trauma.Code RIN   pH  PMI lib_batch Suicide Days_in_Freezer
## individual.255           0 7.5 6.62 18.0        15       0            6094
## individual.294           0 7.6 6.60 24.0        15       1            5583
## individual.306           0 8.1 6.52 12.5        10       0            5457
## individual.308           0 7.6 6.20 21.5        13       0            5445
## individual.319           0 6.4 6.34 72.0         5       0            5321
## individual.323           1 7.7 6.77 24.0        13       0            5292
##                agonal_score Classification.detail
## individual.255            0         Schizophrenia
## individual.294            0      Major Depression
## individual.306            0         Schizophrenia
## individual.308            1         Schizophrenia
## individual.319            0         Schizophrenia
## individual.323            0               Control
#removing individual in trauma group 5 - they were diagnosed after death by friends as having depression - not medically
metadata_reduced <- metadata_reduced[rownames(metadata_reduced) != "individual.758", ]
metadata_reduced <- metadata_reduced[,-1]

#this sets the row names as the values in the first column (initally row names in uploaded csv) then removes it 
rownames(counts)<-counts$X

#remove T.C 5 individual in 5 as well for counts
counts <- counts[, colnames(counts) != "individual.758"]
counts = counts[,-1]

#convert dataframe to matrix - didn't do this for metadata
counts <- as.matrix(counts)
dim(counts)
## [1] 26195    86
#check that ID order is the same in both, the column names of counts = row names of metadata
all.equal(colnames(counts),rownames(metadata_reduced))
## [1] TRUE
#all columns from counts are in the same order as rows in metadata
all(colnames(counts) == rownames(metadata_reduced))
## [1] TRUE

PCA for covariate selection

#remove low count genes, each gene must have 20 individuals with at least 10 counts of the gene
filt_counts <- counts[rowSums(counts >= 10) >= 20,]
dim(filt_counts)
## [1] 8500   86
#variance stabilisation transformation - normalises for same variance amongst low count genes
vsd_counts  <- DESeq2::varianceStabilizingTransformation(as.matrix(filt_counts))
## converting counts to integer mode
#View histogram, clear peak in low counts (which couldn't be found in data for analysis)
hist(vsd_counts, main = 'vst of filtered Fibrous Astrocytes Counts, with outliers (n=86)')

#PCA
PCA_vsdbulk <- pca(vsd_counts, metadata = metadata_reduced, removeVar = 0.05)
## -- removing the lower 5% of variables based on variance
#new data frame with PC1 column
meta_wPC1 <- cbind(metadata_reduced, PCA_vsdbulk$rotated[,c("PC1")])

#rename column to PC1
meta_wPC1$PC1 <- meta_wPC1$'PCA_vsdbulk$rotated[, c("PC1")]'

# Calculate the mean and standard deviation of PC1
mean_PC1 <- mean(meta_wPC1$PC1, na.rm = TRUE)
sd_PC1 <- sd(meta_wPC1$PC1, na.rm = TRUE)

# Print the mean and standard deviation
cat("Mean of PP PC1:", mean_PC1, "\n")
## Mean of PP PC1: 2.478637e-15
cat("Standard PP Deviation of PC1:", sd_PC1, "\n")
## Standard PP Deviation of PC1: 18.60339
# Create a new column 'PC1_Z_Score' with z-scores for each row
meta_wPC1$PC1_Z_Score <- (meta_wPC1$PC1 - mean_PC1) / sd_PC1

# Create a new column 'outliers' to mark z-scores above an absolute value of 3
meta_wPC1$outliers <- abs(meta_wPC1$PC1_Z_Score) > 3

# Create a new filtered data frame 'meta_filt' that removes outliers
meta_remOutliers <- meta_wPC1[!meta_wPC1$outliers, ]

#Create new dataframe homologus to metadata_reduced in previous chunk 
meta_filt <- meta_remOutliers[c("SU.Number","Age", "AgeBin", "Sex", "Status", 'Trauma.Code', 'Adversity.Code', "RIN", "pH", "PMI", "lib_batch", 'Suicide', "Days_in_Freezer", 'agonal_score', 'Classification.detail')]

#adjust the counts to remove outliers in the cell type your using. Endo = no outliers, FA = 'individual.404', 'individual.854', PP = 'individual.383'. CHeck dim to ensure the columns are removed
outlier_cols <- c('individual.404', 'individual.854')
counts <- counts[,!colnames(counts) %in% outlier_cols]
dim(counts)
## [1] 26195    84
#do the same for filtered counts
filt_counts <- filt_counts[, !colnames(filt_counts) %in% outlier_cols]
dim(filt_counts)
## [1] 8500   84
#variance stabilisation transformation for counts without outliers
vst  <- DESeq2::varianceStabilizingTransformation(as.matrix(filt_counts))
## converting counts to integer mode
dim(vst)
## [1] 8500   84
#Vst histogram to visualise improvement in low count outliers
hist(vst, main = "Fibrous astrocyte counts w/z-score over |3| removed (n=84)")

#recalculate PC
PCA_vst <- pca(vst, metadata = meta_filt, removeVar = 0.05)
## -- removing the lower 5% of variables based on variance
#the 'PCA_vst' in #158 and #166 and is for the outliers removed, if you want to see the scree without use PCA_vsdbulk from #107
scree <- screeplot(PCA_vst, components = getComponents(PCA_vst)[1:20], axisLabSize = 18, titleLabSize = 22, colBar = "#2e134b" ,
  drawCumulativeSumLine = TRUE,
  colCumulativeSumLine = "#e38830", colCumulativeSumPoints = "#9a4a1e",
  sizeCumulativeSumPoints = 2.5, title = 'SCREE: n = 84, without outliers')
scree + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, colour="black", size=15), axis.text.y = element_text(colour="black", size=15),
             axis.title=element_text(face='bold'))

options(repr.plot.width = 100, repr.plot.height = 7, repr.plot.res = 150)
eigencorplot(PCA_vst, components = getComponents(PCA_vst)[1:10],
             metavars = c('Age', 'RIN', 'pH', 'PMI', 'Days_in_Freezer'), rotLabX = 45, cexCorval = 1, 
             col = c("#fde725", '#21918c', '#440154'), corFUN = 'kendall', corMultipleTestCorrection = 'BH', 
             main = bquote(Kendall ~ Correlation), colCorval = "black", fontLabX = 4, fontLabY = 4)

Accounting for hidden confounders by calculating PC1

dfull <- DGEList(as.data.frame(counts), samples = meta_filt)
dim(dfull)
## [1] 26195    84
keep <- rowSums(dfull$counts >= 10) >= 20

dfilt <- dfull[keep,]
dim(dfilt)
## [1] 8500   84
#normalise filtered dgelist
dfilt <- calcNormFactors(dfilt)

#create model matrix for voom function
mm <- model.matrix(~ Sex + Age + pH + PMI + RIN + lib_batch + Status, data = dfilt$samples)

#voom - transforming counts for linear modelling
voom <- voom(dfilt, mm, plot =T)

#model matric for removing batch effects - note that lib_batch isn't included
design_rembatch <- model.matrix(~ Sex + Age + pH + PMI + RIN + Status, data = dfilt$samples)

#remove Batch Effect Function
voom_rembatch <- removeBatchEffect(
    voom, 
    batch = dfilt$samples$lib_batch, 
    design = design_rembatch)

#calculating PCs on batch corrected matrix to account for hidden variance
pc <- pca(voom_rembatch,removeVar = 0.1)
## -- removing the lower 10% of variables based on variance
#head(pc$rotated)
head(pc$variance)
##       PC1       PC2       PC3       PC4       PC5       PC6 
## 10.391856  6.858424  4.737621  3.827702  3.234374  3.036731
#creating new metadata to include PCs for hidden variance
meta_redwPCA <- data.frame(cbind(dfilt$samples, pc$rotated[,c(1:3)] ))

#variance parition to visualise variance explained by PCs, covariates and residuals
varPartDes <- ~ Age + pH + RIN + PMI + PC1 + PC2 + (1 | Sex) + 
  (1 | lib_batch) + (1 | Status)

#adjusting optimsation algorithm to have less tolerance, this is to prevent roundoff errors
options(lmerControl = lmerControl(optimizer = "bobyqa", optCtrl = list(tol = 1e-4)))

# Run fitExtractVarPartModel with the new optimiser
varPart <- fitExtractVarPartModel(voom, varPartDes, meta_redwPCA)

#return optimisation algotrithm back to normal
options(lmerControl = "nloptwrap")

# Plotting variance partition results
vp <- sortCols(varPart)

plotPercentBars(vp[1:10, ])

plotVarPart(vp, main = 'Fib_A Variance Parition with Outliers excluded (n=84)')

#head(varPart[order(varPart$Status, decreasing = F), ])

Data prepared, now running DESeq

#ensure all categorical variables are recognised by R
meta_redwPCA$Sex <- as.factor(meta_redwPCA$Sex)
meta_redwPCA$lib_batch <- as.factor(meta_redwPCA$lib_batch)
meta_redwPCA$Status <- factor(meta_redwPCA$Status, c('0', '1'))
meta_redwPCA$Status <- ifelse(meta_redwPCA$Status == '0', 'Case', 'Control')
meta_redwPCA$Adversity.Code <- factor(meta_redwPCA$Adversity.Code, c('0', '1', '2'))
meta_redwPCA$Adversity.Code <- ifelse(meta_redwPCA$Adversity.Code == '0', 'noAdversity',
                                      ifelse(meta_redwPCA$Adversity.Code == '1', 'control', 'Adversity'))

#ensure all numeric variables are recognised as such by R, also scaled and centred to prevent multicollinearity
meta_redwPCA$Age <- as.numeric(meta_redwPCA$Age)
meta_redwPCA$Brain.pH <- as.numeric(meta_redwPCA$pH)
meta_redwPCA$RIN <- as.numeric(meta_redwPCA$RIN)
meta_redwPCA$PMI <- as.numeric(meta_redwPCA$PMI)
#meta_redwPCA$metadata_PC1 <- as.numeric(meta_redwPCA$metadata_PC1)
meta_redwPCA <- meta_redwPCA %>%
  mutate(across(contains("PC"), as.numeric))

meta_redwPCA$Age <- scale(meta_redwPCA$Age, center = TRUE, scale = TRUE)
meta_redwPCA$pH <- scale(meta_redwPCA$pH, center = TRUE, scale = TRUE)
meta_redwPCA$RIN <- scale(meta_redwPCA$RIN, center = TRUE, scale = TRUE)
meta_redwPCA$PMI <- scale(meta_redwPCA$PMI, center = TRUE, scale = TRUE)
meta_redwPCA <- meta_redwPCA %>% mutate(across(contains("PC"), ~ scale(.)))

#make DEseq object (used to analyse)
dds <- DESeqDataSetFromMatrix(countData = counts, colData=meta_redwPCA, design= ~ RIN + lib_batch + Age + Sex + PMI + pH + PC1 + PC2 + Status)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#relevel to make the control group reference level, the results will indicate the difference of case from control
dds$Status <- relevel(dds$Status, ref = "Control")

#Pre-filtering data to remove low counts - in this workflow remove genes that don't have 20 individuals with 10 counts 
keep <- rowSums(counts(dds) >= 10) >= 20
dds <- dds[keep,]

#Run DEseq Analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#check dispersion as a validation
plotDispEsts(dds)

#form results table
res <- results(dds, name = "Status_Case_vs_Control")

#lfcshrink uses info from all genes to generate more accurate estimates, useful when you have low counts/high dispersion. If the gene has high residuals the log2FoldChange is reduced. 
resShrink <- lfcShrink(dds, coef = "Status_Case_vs_Control", res = res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
head(resShrink[order(resShrink$pvalue),][1:10,])
## log2 fold change (MAP): Status Case vs Control 
## Wald test p-value: Status Case vs Control 
## DataFrame with 6 rows and 5 columns
##             baseMean log2FoldChange     lfcSE      pvalue      padj
##            <numeric>      <numeric> <numeric>   <numeric> <numeric>
## CAB39L       19.1423       0.453943 0.1221805 8.60428e-06 0.0731364
## TAFA1        95.8143      -0.357630 0.1087379 3.56518e-05 0.1515202
## LRIG1       244.1235      -0.197094 0.0630041 6.63110e-05 0.1878812
## NEK7         29.6787       0.272431 0.0946733 1.28830e-04 0.2737632
## AC087564.1   75.9921      -0.240778 0.0932219 2.62893e-04 0.4469183
## HIF3A        96.3103       0.385500 0.1558982 3.50807e-04 0.4969762
plotMA(res, ylim = c(-2,2), main = "without Shrink")

plotMA(resShrink, ylim = c(-2,2), main = 'with shrink')

#export full genelist as a csv so you can look at it
#write.csv(resShrink, file="E_DEG_Case_v_Control_PC2_wShrink.csv")
#write.csv(res, file = "E_DEG_Case_v_Control_PC2_woutShrink.csv")