# Data Transformation

#This tutorial focuses on the use of dplyr for data manipulation

# Step 1: Load the two input tables from the yeast expression experiment:
# the per-sample condition annotations and the expression values.
# Both files are read from the current working directory.

myconditions <- read.csv(file = "conditions_annotation.csv")
myexpression <- read.csv(file = "SC_expression.csv")
# Step 2: Restrict the annotations to a single condition (pick a new
# condition for your own script). Rows are kept when their `primary`
# column contains the text "37 deg".

myfilter <- subset(myconditions, grepl("37 deg", primary))
#Step 3: Keep only the expression columns named by the ID column (column 1) of the filtered annotations
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep only the expression columns whose names appear in the filtered
# annotation IDs. all_of() marks the IDs as an external character vector
# (not column names to be looked up in `myexpression`) and raises an error
# if any ID has no matching column. as.character() guards against `ID`
# having been read in as a factor.
myexpression2 <-
  myexpression %>%
  select(all_of(as.character(myfilter$ID)))

#Step 4: Make data tidy (one observation per row) to use ggplot/dplyr
library(tidyr)
# Reshape from one column per sample to one row per observation. The
# column names go into `name` and the cell values into `value` (the
# pivot_longer() defaults, written out because later steps use them).
tidyExpression <- pivot_longer(
  myexpression2,
  cols = everything(),
  names_to = "name",
  values_to = "value"
)

#Step 5: Create summary of expression value stats in an easy to read tibble (formatted dataframe)

# Group the long-format table by `name` (one group per selected
# expression column) so the summary below is computed per group.
by_treatment <- tidyExpression %>%
  group_by(name)

# Per-group mean, median, and number of observations of `value`.
# summarise_all() is superseded; naming the column explicitly gives the
# same `mean`, `median`, and `n` output columns and makes it clear which
# variable is being summarised.
by_treatment %>%
  summarise(
    mean = mean(value),
    median = median(value),
    n = n()
  )

## # A tibble: 8 × 4
##   name    mean median     n
##   <chr>  <dbl>  <dbl> <int>
## 1 SAABFI  165.   20.3  6071
## 2 SAABFN  165.   17.7  6071
## 3 SAABFS  165.   17.3  6071
## 4 SAABIB  165.   16.6  6071
## 5 SAABIC  165.   15.9  6071
## 6 SAABIF  165.   20.0  6071
## 7 SAABIQ  165.   19.3  6071
## 8 SAABQR  165.   17.6  6071

## # A tibble: 4 × 4
##   name    mean median     n
##   <chr>  <dbl>  <dbl> <int>
## 1 AFIQBR  165.   2.13  6071
## 2 AFIQCI  165.   3.31  6071
## 3 QCAQFI  165.   5.36  6071
## 4 QCAQFQ  165.   7.37  6071
#Step 6: Plot violin plot of expression data
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.1

# Violin plot of log-transformed expression values, one violin per `name`.
# log() of a zero value is -Inf (and of a negative value NaN), which
# ggplot2 would otherwise discard with a "Removed ... non-finite" warning.
# Dropping non-positive and missing values up front makes that exclusion
# explicit and keeps the plotted data the same.
tidyExpression %>%
  dplyr::filter(value > 0) %>%
  ggplot(aes(x = name, y = log(value))) +
  geom_violin(fill = "pink", color = "black", alpha = 0.7) +
  theme_minimal(base_size = 14) +
  labs(title = "Log Expression by Sample", x = "Sample", y = "Log(Value)")

## Warning: Removed 1407 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

# Data Transformation
#
# Alyson Barsalou
#
# 2025-09-19