#This tutorial focuses on the use of dplyr for data manipulation
# Step 1: Load the two input tables of the yeast expression experiment from
# the working directory: the expression values and the condition annotations.
myexpression <- read.csv(file = "SC_expression.csv")
myconditions <- read.csv(file = "conditions_annotation.csv")
# Step 2: Keep only the annotation rows whose `primary` field contains
# "37 deg" (swap in a different pattern to study another condition).
myfilter <- myconditions[grep("37 deg", myconditions$primary), ]
#Step 3: Select only the expression columns named by the IDs (column 1) of the filtered annotation list
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the expression columns named by the filtered annotation IDs.
# all_of() marks myfilter$ID as an external vector of column names (rather
# than a column of myexpression) and fails with an error if any listed ID is
# not a column of myexpression.
myexpression2 <- myexpression %>%
  select(all_of(myfilter$ID))
#Step 4: Make data tidy (one observation per row) to use ggplot/dplyr
library(tidyr)
# Reshape to long format: every column of myexpression2 is stacked into
# name/value pairs (pivot_longer()'s default output column names), giving
# one observation per row.
tidyExpression <- pivot_longer(myexpression2, cols = everything())
# Step 5: Summarise the expression values per sample (mean, median, and
# number of values) as an easy-to-read tibble.
by_treatment <- tidyExpression %>%
  group_by(name)
# summarise() replaces the superseded summarise_all(); the output columns keep
# the names mean, median and n. `value` is the only non-grouping column, so
# naming it explicitly summarises the same data.
by_treatment %>%
  summarise(mean = mean(value), median = median(value), n = n())
## # A tibble: 8 × 4
## name mean median n
## <chr> <dbl> <dbl> <int>
## 1 SAABFI 165. 20.3 6071
## 2 SAABFN 165. 17.7 6071
## 3 SAABFS 165. 17.3 6071
## 4 SAABIB 165. 16.6 6071
## 5 SAABIC 165. 15.9 6071
## 6 SAABIF 165. 20.0 6071
## 7 SAABIQ 165. 19.3 6071
## 8 SAABQR 165. 17.6 6071
## # A tibble: 4 × 4
## name mean median n
## <chr> <dbl> <dbl> <int>
## 1 AFIQBR 165. 2.13 6071
## 2 AFIQCI 165. 3.31 6071
## 3 QCAQFI 165. 5.36 6071
## 4 QCAQFQ 165. 7.37 6071
#Step 6: Draw a violin plot of the expression data
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
# Violin plot of the log-transformed expression values, one violin per sample.
# Values whose log is non-finite are dropped by ggplot2, which reports the
# number of removed rows in a warning.
ggplot(data = tidyExpression, mapping = aes(x = name, y = log(value))) +
  geom_violin(alpha = 0.7, color = "black", fill = "pink") +
  labs(x = "Sample", y = "Log(Value)", title = "Log Expression by Sample") +
  theme_minimal(base_size = 14)
## Warning: Removed 1407 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
