Load packages
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
Load gene expression matrix
# Read the tab-separated expression matrix into `df`.
df <- read_tsv(file = "gene_expr_matrix.tsv")
Calculate mean and SD of each gene, and normalize SD according to mean (coefficient of variation)
# Reshape the matrix to long form (one row per gene and sample), then reduce
# each gene to its mean expression and its SD divided by that mean.
df.summarised <- df %>%
  gather(key = sample, value = expr, -gene_id, -gene_symbol) %>%
  group_by(gene_id, gene_symbol) %>%
  summarise(
    mean = mean(expr),
    # `mean` on the right-hand side is the column created just above
    sd_norm = sd(expr) / mean
  ) %>%
  # genes with zero mean expression are dropped
  filter(mean != 0) %>%
  # stack both statistics into statistic/value pairs
  gather(key = statistic, value = value, mean, sd_norm)
Show mean and normalized SD of all genes, highlighting TBP, PRKG1 and CDKN1A
# Gene symbols to call out on top of the boxplots.
genes <- c("TBP", "PRKG1", "CDKN1A")

# Subset once so the point layer and the label layer are guaranteed to use
# the same rows.
df.highlighted <- filter(df.summarised, gene_symbol %in% genes)

# One boxplot per statistic across all genes (log10 y axis), with the selected
# genes overlaid as red points and labelled by symbol.
p <- ggplot(data = df.summarised, aes(x = statistic, y = value)) +
  geom_boxplot() +
  scale_y_log10() +
  geom_point(data = df.highlighted, color = "red", size = 3) +
  geom_text(
    data = df.highlighted,
    aes(label = gene_symbol),
    hjust = -0.2,  # negative hjust places the label to the right of its point
    vjust = 0.5
  )
p

Find other genes with high mean (above Q3) and low normalized SD (below one quarter of Q1)
# NOTE(review): this block is disabled. As written it reads `mean` and
# `sd_norm` from `df`, but in the visible code those statistics are only
# computed into `df.summarised`, and there they are stored in long form
# (statistic/value pairs) rather than as two columns. Before re-enabling,
# a wide per-gene table is presumably needed, e.g.
#   df.wide <- spread(ungroup(df.summarised), statistic, value)
# with `df.wide` used in place of `df` below -- TODO confirm intent.
# mean_quants <- quantile(df$mean)
# sd_quants <- quantile(df$sd_norm)
# df.candidates <- filter(df, mean > mean_quants["75%"], sd_norm < sd_quants["25%"] / 4)
# print(df.candidates)