# Load the three input tables: expression values, condition annotations, and
# labels. Paths are relative to the current working directory.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the header row is parsed.
SCexpression <- read.csv("SC_expression.csv", header = TRUE)
conditionData <- read.csv("conditions_annotation.csv", header = TRUE)
labels <- read.csv("mergedLabels.csv", header = TRUE)
# Keep only the annotation rows whose `secondary` field mentions "Citrinin".
Citrinin_condition <- conditionData[
  grepl(pattern = "Citrinin", x = conditionData$secondary),
]
# Join the expression table to the labels table, matching the expression
# table's `X` column against the `gene` column of the labels. With merge's
# defaults, only keys present in both tables are kept.
exlabels <- merge(
  x = SCexpression,
  y = labels,
  by.x = "X",
  by.y = "gene"
)
# Pull the identifier column `X` plus the expression columns of the two
# samples of interest.
# NOTE(review): the sample IDs IAAICF and IAAICQ are hard-coded; presumably
# they are the samples listed in `Citrinin_condition` -- confirm.
# NOTE(review): select() is assumed to be dplyr's; no library() call is
# visible in this part of the script -- confirm it is attached earlier.
Citrinin_expression <- select(SCexpression, X, IAAICF, IAAICQ)
# Reshape to long format: one row per gene/sample pair, with the sample ID in
# `treatment` and its expression value in `count`.
# Every column except the identifier `X` is pivoted. Selecting by name rather
# than by position keeps the reshape correct if the upstream column selection
# is reordered or gains more samples.
CitEX <- Citrinin_expression %>%
  pivot_longer(
    cols = -X,
    names_to = "treatment",
    values_to = "count"
  )
# Per-sample summary of the long table: mean and median of `count`, plus the
# number of rows contributing to each group.
# n() counts the rows of the current group. The previous value, the row count
# of the wide table, only matched it because each sample has exactly one row
# per gene.
# NOTE(review): the object name `tibble` is the same as tibble::tibble(); it
# is kept unchanged in case later code refers to it.
tibble <- CitEX %>%
  group_by(treatment) %>%
  summarise(
    mean = mean(count),
    median = median(count),
    n = n()
  ) %>%
  as_tibble()
tibble
## # A tibble: 2 × 4
##   treatment  mean median     n
##   <chr>     <dbl>  <dbl> <int>
## 1 IAAICF     165.   33.9  6071
## 2 IAAICQ     165.   33.9  6071
# Violin plot of `count` per sample. Only values below 328 are plotted (values
# of 328 or more are dropped as outliers). The red point marks each sample's
# mean and the blue point its median, both computed on the filtered data.
filteredCitEX <- filter(CitEX, count < 328)
ggplot(data = filteredCitEX, mapping = aes(x = treatment, y = count)) +
  geom_violin() +
  stat_summary(geom = "point", fun = mean, size = 3, color = "red") +
  stat_summary(geom = "point", fun = median, size = 3, color = "blue")