# Load the per-sample annotation table. All four columns are read as text
# ("c" is readr's shorthand for col_character()).
annotations <- read_csv(
  file = "conditions_annotation.csv",
  col_types = cols(
    ID = "c",
    primary = "c",
    secondary = "c",
    additional_information = "c"
  )
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# List the rows readr could not parse cleanly while reading the annotations.
problems(x = annotations)
## # A tibble: 86 × 5
##      row   col expected  actual    file                                         
##    <int> <int> <chr>     <chr>     <chr>                                        
##  1     2     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  2     3     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  3     4     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  4     5     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  5     6     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  6     7     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  7     8     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  8     9     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
##  9    10     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 10    11     3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## # ℹ 76 more rows
# Load the wide expression table; column types are guessed by readr and the
# column-specification message is suppressed.
expr_df <- read_csv(
  file = "SC_expression.csv",
  show_col_types = FALSE
)
## New names:
## • `` -> `...1`
## • `IFFAFA` -> `IFFAFA...5`
## • `INICIA` -> `INICIA...6`
## • `FFNAAA` -> `FFNAAA...9`
## • `SAABFA` -> `SAABFA...21`
## • `INICIA` -> `INICIA...25`
## • `IFAAAA` -> `IFAAAA...35`
## • `SAABFA` -> `SAABFA...42`
## • `IFFAFA` -> `IFFAFA...59`
## • `FFNAAA` -> `FFNAAA...83`
## • `IFAAAA` -> `IFAAAA...91`
# readr made the header unique by appending "...<position>" to the unnamed
# first column and to every repeated sample ID. Strip that suffix so the
# headers can be matched against the annotation IDs again. The pattern is
# anchored with `$` so that only a trailing suffix is removed, never a
# "...<digits>" run in the middle of a name.
# NOTE(review): this deliberately restores the repeated sample IDs, so those
# columns are no longer uniquely named; selecting one of them by name may fail
# or be ambiguous -- confirm how repeated samples should be handled.
colnames(expr_df) <- colnames(expr_df) %>%
  str_replace("\\.\\.\\.[0-9]+$", "")

# The first column had an empty header (now ""); it is used as the `gene` key
# by the downstream reshaping code.
colnames(expr_df)[1] <- "gene"

# Print the first ten headers as a quick sanity check.
colnames(expr_df)[1:10]
##  [1] "gene"   "IFFABF" "SAASCC" "IFFAFF" "IFFAFA" "INICIA" "INICIF" "SAABQI"
##  [9] "FFNAAA" "IFAAAR"
# Build a long-format expression table for the samples of one condition.
#
# Arguments:
#   condition_name  Label compared against the `primary` and `secondary`
#                   columns of `annotations_df`.
#   expr_df         Wide table with a `gene` column plus one column per
#                   sample ID.
#   annotations_df  Annotation table with (at least) the columns `ID`,
#                   `primary` and `secondary`.
#
# Returns:
#   One row per gene/sample pair with the columns `gene`, `ID`, `count`, the
#   annotation columns joined by `ID`, and `count_log`. Rows with a missing
#   count are dropped.
#
# Errors:
#   Stops when no annotation row matches `condition_name`, and (via
#   `all_of()`) when a matching ID has no column in `expr_df`.
process_condition <- function(condition_name, expr_df, annotations_df) {
  matching_samples <- annotations_df %>%
    filter(primary == condition_name | secondary == condition_name)

  sample_ids <- matching_samples$ID

  # Without this guard an unknown condition only fails later, inside
  # pivot_longer(), with an unrelated "must select at least one column" error.
  if (length(sample_ids) == 0) {
    stop(
      "No samples are annotated with condition '", condition_name, "'.",
      call. = FALSE
    )
  }

  expr_df %>%
    select(gene, all_of(sample_ids)) %>%
    pivot_longer(-gene, names_to = "ID", values_to = "count") %>%
    left_join(annotations_df, by = "ID") %>%
    # Despite its name, `count_log` is count + 1 (a pseudocount), not a
    # logarithm; the shift keeps zero counts finite on a log-scaled axis.
    mutate(count_log = count + 1) %>%
    filter(!is.na(count_log))
}
# Long-format expression tables for the two conditions compared below.
wildtype_long <- process_condition(
  condition_name = "wildtype",
  expr_df = expr_df,
  annotations_df = annotations
)
swr1_long <- process_condition(
  condition_name = "swr1",
  expr_df = expr_df,
  annotations_df = annotations
)
# Draw a violin plot of `count_log` per `primary` group, with a narrow boxplot
# overlaid on each violin and a log10-scaled y axis.
#
# Arguments:
#   data        Data frame with the columns `primary` and `count_log`.
#   title_text  Plot title.
#
# Returns:
#   The ggplot object.
plot_violin <- function(data, title_text) {
  mapping <- aes(x = primary, y = count_log, fill = primary)

  violin_layer <- geom_violin(trim = FALSE, alpha = 0.7)
  box_layer <- geom_boxplot(width = 0.1, color = "lightpink", alpha = 0.3)

  axis_labels <- labs(
    title = title_text,
    x = "Condition",
    y = "Expression Count (log10)"
  )
  tilted_x_text <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # theme_minimal() must be added before the axis-text tweak, otherwise the
  # complete theme would overwrite it.
  ggplot(data, mapping) +
    violin_layer +
    box_layer +
    scale_y_log10() +
    theme_minimal() +
    axis_labels +
    tilted_x_text
}

# Expression distribution for the wildtype samples.
plot_violin(
  data = wildtype_long,
  title_text = "Wildtype Gene Expression Distribution"
)

# Expression distribution for the swr1 samples.
plot_violin(
  data = swr1_long,
  title_text = "swr1 Gene Expression Distribution"
)

# Compute the mean and median of `count` for every `primary` group.
#
# Arguments:
#   long_data  Data frame with the columns `primary` and `count`.
#
# Returns:
#   One row per `primary` value with `mean_count` and `median_count`.
summarize_condition <- function(long_data) {
  by_primary <- group_by(long_data, primary)

  summarise(
    by_primary,
    mean_count = mean(count),
    median_count = median(count)
  )
}

# Per-group summary statistics for each condition table.
wildtype_summary <- summarize_condition(wildtype_long)
swr1_summary <- summarize_condition(swr1_long)

# Stack both summaries, tagging each row with the condition set it came from.
wildtype_tagged <- mutate(wildtype_summary, condition_set = "wildtype")
swr1_tagged <- mutate(swr1_summary, condition_set = "swr1")

bind_rows(wildtype_tagged, swr1_tagged)
## # A tibble: 2 × 4
##   primary  mean_count median_count condition_set
##   <chr>         <dbl>        <dbl> <chr>        
## 1 wildtype       165.         3.85 wildtype     
## 2 swr1           165.         6.73 swr1