Loading the required libraries

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyr")
library("tibble")
library("ggplot2")

Reading in the conditions_annotation.csv, SC_expression.csv, and mergedLabels.csv files

# Load the three input tables from CSV files in the working directory.
# Each file is read with its first line treated as the column header.
expression <- read.csv(file = "SC_expression.csv", header = TRUE)
conditions <- read.csv(file = "conditions_annotation.csv", header = TRUE)
mergedLabels <- read.csv(file = "mergedLabels.csv")

Filtering the condition data for rows whose `secondary` column contains “galactose”

# Keep only the condition rows whose `secondary` annotation mentions
# "galactose" (substring match; all columns are retained).
galacdata <- conditions[grep("galactose", conditions$secondary), ]

Merging the expression data with the labels (expression column `X` is matched to mergedLabels column `gene`)

# Join the expression table to the label table: column `X` of `expression`
# is matched against column `gene` of `mergedLabels`. merge() defaults to an
# inner join, so only keys present in both tables are kept.
expressionlabels <- merge(
  x = expression,
  y = mergedLabels,
  by.x = "X",
  by.y = "gene"
)

Selecting the gene ID column (`X`) together with the expression columns for the three galactose conditions (SAASAQ, SAASCC, SAASCF)

# Reduce the expression table to the key column `X` plus the three
# condition columns of interest (SAASAQ, SAASCC, SAASCF).
galacexpression <- dplyr::select(expression, X, SAASAQ, SAASCC, SAASCF)

Use `pivot_longer()` to reshape the data to long format: the treatment column names go into a new column named “treatment” and their values go into a new column named “count”

# Reshape from wide to long: columns 2 through 4 of `galacexpression` are
# stacked so that each former column name becomes a value of `treatment`
# and each cell value becomes `count`. Column 1 is left as an identifier.
# Selection is positional, so it relies on the column order of
# `galacexpression`.
pivotgalac <- pivot_longer(
  galacexpression,
  cols = 2:4,
  names_to = "treatment",
  values_to = "count"
)

Creating a tibble with the mean count, median count, and number of observations (n) for each treatment

# Per-treatment summary of the long-format counts: mean, median, and the
# number of observations in each treatment group.
# NOTE(review): the object name `tibble` is kept so any later code that
# refers to it still works, but it shares its name with tibble::tibble().
tibble <- pivotgalac %>%
  group_by(treatment) %>%
  summarise(
    mean = mean(count),
    median = median(count),
    # n() counts the rows of the current group, so the reported size stays
    # correct even if `pivotgalac` is filtered or rows are dropped upstream,
    # instead of borrowing the row count of a different data frame.
    n = n()
  )
# summarise() already returns a tibble here, so no as_tibble() conversion
# is needed. Print the summary table.
tibble
## # A tibble: 3 × 4
##   treatment  mean median     n
##   <chr>     <dbl>  <dbl> <int>
## 1 SAASAQ     165.   32.5  6071
## 2 SAASCC     165.   31.1  6071
## 3 SAASCF     165.   36.7  6071

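Creating a violin plot to show the counts for each condition tested with galactose. The mean is marked in pink and the median is marked in blue. Counts of 328 or higher have been filtered out as outliers before plotting, so the plotted mean and median are computed on the filtered data only.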

# Drop observations with count >= 328 before plotting.
filterpivotgalac <- dplyr::filter(pivotgalac, count < 328)

# Violin plot of count per treatment. The two stat_summary layers overlay
# one point per treatment: the mean (pink) and the median (sky blue). Both
# are computed from the filtered data passed to ggplot().
ggplot(data = filterpivotgalac, mapping = aes(x = treatment, y = count)) +
  geom_violin() +
  stat_summary(geom = "point", fun = mean, size = 2.5, color = "pink") +
  stat_summary(geom = "point", fun = median, size = 2.5, color = "skyblue")