#BiocManager::install("DESeq2")
#library(DESeq2)
H <- read.table("C:\\Users\\beste\\Downloads\\hammer_count_table.txt", header = TRUE, row.names = 1)
H_phen <- read.table("C:\\Users\\beste\\Downloads\\hammer_phenodata.txt", header = TRUE, row.names = 1)
colnames(H) <- c("1_C_2M","2_C_2M","1_SNL_2M","2_SNL_2M",
"1_C_2W","2_C_2W","1_SNL_2W","2_SNL_2W")
rownames(H_phen)<-c("1_C_2M","2_C_2M","1_SNL_2M","2_SNL_2M",
"1_C_2W","2_C_2W","1_SNL_2W","2_SNL_2W")
colnames(H_phen)[2]<-"TX"
#drop 0 count data
n_count <- drop(as.matrix(H) %*% rep(1, 8))
H <- H[n_count > 0, ]
head(H_phen)
## num.tech.reps TX strain Time
## 1_C_2M 1 control Sprague_Dawley 2_months
## 2_C_2M 2 control Sprague_Dawley 2_months
## 1_SNL_2M 1 L5_SNL Sprague_Dawley 2_months
## 2_SNL_2M 2 L5_SNL Sprague_Dawley 2_months
## 1_C_2W 1 control Sprague_Dawley 2_weeks
## 2_C_2W 2 control Sprague_Dawley 2_weeks
#create DESeq data set
HdsF <- DESeqDataSetFromMatrix(countData = H, colData = H_phen,
design = ~ Time+TX+Time:TX)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#1a) Apply rlog transformation.
logH <- rlog(HdsF)
#1ai) Hierarchical clustering
par(mfrow=c(1,2))
HC <- hclust(dist(t(assay(logH))),"complete")
plot(HC)
plot(HC$height, main="Complete")
Based on the height plot, there is an “elbow” between the 6th and 7th index between the heights of about 30 and 60. Looking at the dendrogram, this corresponds to two different clusters which are clustered based on whether the rat was in the SNL group or the control group.
#1aii) PCA
plotPCA(logH, ntop = nrow(assay(logH)), intgroup = c("TX","Time"))
logH_pca <- prcomp(t(assay(logH)))
summary(logH_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 25.2269 10.0710 5.72482 4.82183 4.71630 4.3441 3.88274
## Proportion of Variance 0.7487 0.1193 0.03856 0.02735 0.02617 0.0222 0.01774
## Cumulative Proportion 0.7487 0.8680 0.90654 0.93390 0.96006 0.9823 1.00000
## PC8
## Standard deviation 1.172e-13
## Proportion of Variance 0.000e+00
## Cumulative Proportion 1.000e+00
logH_pca$x
## PC1 PC2 PC3 PC4 PC5 PC6
## 1_C_2M -22.05548 4.611323 1.3213478 -3.3147559 -9.8667005 2.164869
## 2_C_2M -21.75540 10.477693 8.9572233 -1.0108456 5.3787093 -2.011387
## 1_SNL_2M 23.59116 10.441329 -0.9655458 9.5316671 -1.5703159 1.462706
## 2_SNL_2M 26.20543 10.326241 -7.2173251 -6.9773028 2.2659411 -1.167478
## 1_C_2W -21.61026 -8.083427 -5.0783490 0.6127469 4.0143011 7.041393
## 2_C_2W -28.24297 -4.632872 -4.8098309 2.8693295 0.8340482 -5.710653
## 1_SNL_2W 25.46336 -9.481074 6.6466362 -1.6082772 0.9643171 3.358151
## 2_SNL_2W 18.40416 -13.659213 1.1458435 -0.1025620 -2.0203004 -5.137601
## PC7 PC8
## 1_C_2M 0.04321949 1.338374e-13
## 2_C_2M 2.09758569 2.641605e-13
## 1_SNL_2M 0.47968020 2.172053e-13
## 2_SNL_2M -0.35282895 3.052329e-13
## 1_C_2W 2.98767269 -2.323691e-13
## 2_C_2W -5.26766239 -4.031857e-13
## 1_SNL_2W -5.65504537 -1.708533e-13
## 2_SNL_2W 5.66737864 4.135833e-14
logH_svd <- svd(t(scale(t(assay(logH)), center = TRUE, scale = FALSE)))
dd = logH_svd$d ^ 2 / sum(logH_svd$d ^ 2)
round(dd, 4)
## [1] 0.7487 0.1193 0.0386 0.0274 0.0262 0.0222 0.0177 0.0000
#eigenvalues:
e<-logH_svd$d^2
round(e,4)
## [1] 4454.7732 709.9691 229.4147 162.7504 155.7044 132.0997 105.5295
## [8] 0.0000
e/sum(e)
## [1] 7.486710e-01 1.193177e-01 3.855553e-02 2.735191e-02 2.616775e-02
## [6] 2.220074e-02 1.773533e-02 1.634593e-29
Looking the principal component graph, we can see that the first principal component accounts for about 75% of the variance. Looking at the data points we can see that PC1 corresponds with the clusters from the hierarchical clustering, since all the rats in the control group have a PC1 value between -20 and -30 while all the rats in the SNL group have a PC1 value between 18 to 28. The second principal component accounts for about 12% of the variance. We can see that PC2 accounts for the differences between the 2 week samples and the 2 month samples; all the 2 week samples have a negative PC2 value between -8 and -14 while all the 2 month samples have a positive PC2 value bewteen 4 and 11. We can see that this also corresponds with the next branching in the dendrogram above, which clusters the samples based on the timing of the sample.
When we find the singular values and find the percentage of variation for each one, we see that the percentages correctly match the variation percentages for the PCA testing. The first eigenfeature accounts for about 75% of the variation.
# 1b) Differential expression analysis.
HdsF_de <- DESeq(HdsF,test="LRT",reduced=~Time)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
HdsF_de_out <- results(HdsF_de)
# 1bi) Histogram of p-values
par(mfrow = c(1, 1))
hist(HdsF_de_out$pvalue)
If there were no differentially expressed genes then we would expect to see a uniform distribution. However the very large number of p values close to 0 suggests that there are features for which the treatment is a significant factor. This matches with the findings we saw in (1) where the samples were clustered by treatment and PCA found that the difference in results due to treatment was the main source of variability in the data.
# 1bii) Significant features
keep<-which(abs(HdsF_de_out$log2FoldChange)>2)
H_sig<-HdsF_de_out[keep,]
H_sig
## log2 fold change (MLE): Time2 weeks.TXL5 SNL
## LRT p-value: '~ Time + TX + Time:TX' vs '~ Time'
## DataFrame with 1450 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSRNOG00000000008 2.514400 -2.28363 2.59728 1.53565 0.46402051
## ENSRNOG00000000014 0.722944 -3.42840 4.82604 1.04194 0.59394548
## ENSRNOG00000000021 14.479824 2.51065 1.21962 4.86766 0.08770019
## ENSRNOG00000000097 0.486246 4.54965 5.68345 1.07179 0.58514446
## ENSRNOG00000000138 8.628185 -3.96909 1.80379 14.20079 0.00082478
## ... ... ... ... ... ...
## ENSRNOG00000043417 1.380893 -2.01163 3.04948 0.487520 0.783676
## ENSRNOG00000043437 0.578284 2.02080 5.00159 0.523114 0.769852
## ENSRNOG00000043440 0.262075 3.37796 6.94827 0.612760 0.736107
## ENSRNOG00000043493 0.418214 2.02221 5.20439 0.333854 0.846261
## ENSRNOG00000043507 2.494835 -3.04767 2.44950 4.516139 0.104552
## padj
## <numeric>
## ENSRNOG00000000008 0.61733441
## ENSRNOG00000000014 NA
## ENSRNOG00000000021 0.16512754
## ENSRNOG00000000097 NA
## ENSRNOG00000000138 0.00252238
## ... ...
## ENSRNOG00000043417 NA
## ENSRNOG00000043437 NA
## ENSRNOG00000043440 NA
## ENSRNOG00000043493 NA
## ENSRNOG00000043507 0.190967
keep2<-which(H_sig$padj<0.001)
H_sig<-H_sig[keep2,]
H_sig
## log2 fold change (MLE): Time2 weeks.TXL5 SNL
## LRT p-value: '~ Time + TX + Time:TX' vs '~ Time'
## DataFrame with 52 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSRNOG00000001338 61.79126 -2.85504 1.49778 156.9423 8.32551e-35
## ENSRNOG00000001414 277.25044 2.10644 0.37205 253.0918 1.10108e-55
## ENSRNOG00000002316 9.80392 4.16251 1.77200 21.9822 1.68509e-05
## ENSRNOG00000002336 15.31254 3.02411 1.29517 23.0700 9.78161e-06
## ENSRNOG00000002460 10.75831 2.71550 1.64722 33.8535 4.45464e-08
## ... ... ... ... ... ...
## ENSRNOG00000032517 30.88575 2.57160 1.01554 55.7974 7.65135e-13
## ENSRNOG00000032868 10.83404 2.54778 2.25907 54.1506 1.74322e-12
## ENSRNOG00000034164 7.00829 -2.97095 1.99737 16.3023 2.88408e-04
## ENSRNOG00000036924 7.51317 2.82554 1.81395 19.0525 7.29119e-05
## ENSRNOG00000042516 8.67611 2.12656 1.79798 16.4163 2.72419e-04
## padj
## <numeric>
## ENSRNOG00000001338 2.85874e-33
## ENSRNOG00000001414 9.25572e-54
## ENSRNOG00000002316 6.75927e-05
## ENSRNOG00000002336 4.06677e-05
## ENSRNOG00000002460 2.42999e-07
## ... ...
## ENSRNOG00000032517 6.57546e-12
## ENSRNOG00000032868 1.45818e-11
## ENSRNOG00000034164 9.56365e-04
## ENSRNOG00000036924 2.65154e-04
## ENSRNOG00000042516 9.05688e-04
There are 52 genes for which the log2 fold change is <2 and the FDR is <0.001.
#1biii) Most significant features
H_sig_ord<-H_sig[order(-abs(H_sig$log2FoldChange)),]
H_sig_ord
## log2 fold change (MLE): Time2 weeks.TXL5 SNL
## LRT p-value: '~ Time + TX + Time:TX' vs '~ Time'
## DataFrame with 52 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSRNOG00000019445 99.90499 4.50002 0.798833 274.0155 3.14979e-60
## ENSRNOG00000030823 6.35257 4.29183 2.196986 22.3100 1.43035e-05
## ENSRNOG00000028707 11.18901 4.24308 1.578595 27.0064 1.36656e-06
## ENSRNOG00000010079 26.12105 -4.19453 1.022288 24.7783 4.16344e-06
## ENSRNOG00000002316 9.80392 4.16251 1.772003 21.9822 1.68509e-05
## ... ... ... ... ... ...
## ENSRNOG00000004747 11.21190 2.17796 1.73182 22.5868 1.24548e-05
## ENSRNOG00000006755 11.88412 -2.17086 1.70447 18.6959 8.71454e-05
## ENSRNOG00000042516 8.67611 2.12656 1.79798 16.4163 2.72419e-04
## ENSRNOG00000013496 124.69388 -2.12129 1.08740 394.0141 2.76006e-86
## ENSRNOG00000001414 277.25044 2.10644 0.37205 253.0918 1.10108e-55
## padj
## <numeric>
## ENSRNOG00000019445 3.23021e-58
## ENSRNOG00000030823 5.80405e-05
## ENSRNOG00000028707 6.34334e-06
## ENSRNOG00000010079 1.82318e-05
## ENSRNOG00000002316 6.75927e-05
## ... ...
## ENSRNOG00000004747 5.10638e-05
## ENSRNOG00000006755 3.13507e-04
## ENSRNOG00000042516 9.05688e-04
## ENSRNOG00000013496 8.16499e-84
## ENSRNOG00000001414 9.25572e-54
## Plots of (normalized) read counts for the top 4 genes.
par(mfrow = c(2, 2))
for(i in 1:4){
plotCounts(HdsF, gene = rownames(H_sig_ord)[i], intgroup = c("TX","Time"),normal=FALSE,transform=FALSE)
}
Looking at the four plots of the genes with the greatest absolute log2 fold change, it appears that there is a significant difference between the treatment groups and the control groups for all 4 genes. For ENSRNOG00000019445, ENSRNOG00000030823, and ENSRNOG00000028707 the gene count for the SNL group at 2 weeks is significantly larger than the gene count the other three groups. Lastly for ENSRNOG00000010079 the gene count for the control group at 2 weeks is significantly larger than the gene count for the other three groups. Altogether this supports the conclusion that these genes are differentially expressed for the control group vs. the treatment group and the SNL treatment has a significant effect.