# Load the sample annotation table. Every declared column is read as text so
# that sample IDs and condition labels are never type-guessed.
annotations <- read_csv(
  file = "conditions_annotation.csv",
  col_types = cols(
    ID                     = col_character(),
    primary                = col_character(),
    secondary              = col_character(),
    additional_information = col_character()
  )
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Show the parsing issues readr recorded while reading `annotations`
# (one row per problem: location, expected vs. actual content, source file).
problems(annotations)
## # A tibble: 86 × 5
## row col expected actual file
## <int> <int> <chr> <chr> <chr>
## 1 2 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 2 3 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 3 4 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 4 5 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 5 6 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 6 7 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 7 8 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 8 9 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 9 10 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## 10 11 3 4 columns 3 columns /Users/lydiapicariello/Desktop/DataTransform…
## # ℹ 76 more rows
# Load the wide expression table; the column-type summary is suppressed.
expr_df <- read_csv(
  file = "SC_expression.csv",
  show_col_types = FALSE
)
## New names:
## • `` -> `...1`
## • `IFFAFA` -> `IFFAFA...5`
## • `INICIA` -> `INICIA...6`
## • `FFNAAA` -> `FFNAAA...9`
## • `SAABFA` -> `SAABFA...21`
## • `INICIA` -> `INICIA...25`
## • `IFAAAA` -> `IFAAAA...35`
## • `SAABFA` -> `SAABFA...42`
## • `IFFAFA` -> `IFFAFA...59`
## • `FFNAAA` -> `FFNAAA...83`
## • `IFAAAA` -> `IFAAAA...91`
# readr renamed blank/repeated headers by appending "...<position>" (see the
# "New names" message printed when the file was read). Strip only that
# trailing suffix so the headers match the sample IDs again; anchoring the
# pattern with `$` leaves any "...<digits>" occurring inside an ID untouched.
# NOTE(review): this restores the repeated sample IDs (e.g. IFFAFA, INICIA),
# so those columns are no longer uniquely addressable by name -- confirm how
# repeated samples should be handled downstream.
colnames(expr_df) <- colnames(expr_df) %>%
  str_replace("\\.\\.\\.[0-9]+$", "")

# The first column had no header (it was `...1`, now an empty string);
# presumably it holds the gene identifiers -- it is named `gene` here and
# used under that name by the code below.
colnames(expr_df)[1] <- "gene"

# head() instead of [1:10] so a table with fewer than ten columns is not
# padded with NA entries.
head(colnames(expr_df), 10)
## [1] "gene" "IFFABF" "SAASCC" "IFFAFF" "IFFAFA" "INICIA" "INICIF" "SAABQI"
## [9] "FFNAAA" "IFAAAR"
#' Build a long-format expression table for one condition
#'
#' Selects the samples whose `primary` or `secondary` annotation equals
#' `condition_name`, keeps the matching sample columns of `expr_df`, and
#' reshapes them to one row per gene/sample pair with the sample annotations
#' joined on.
#'
#' @param condition_name Condition label to look up in the annotations.
#' @param expr_df Wide expression table: a `gene` column plus one column per
#'   sample ID.
#' @param annotations_df Sample annotations containing at least the columns
#'   `ID`, `primary` and `secondary`.
#' @return A long table with `gene`, `ID`, `count`, the columns of
#'   `annotations_df`, and `count_log` (`count + 1`). Rows whose count is
#'   missing are dropped.
process_condition <- function(condition_name, expr_df, annotations_df) {
  cond <- annotations_df %>%
    filter(primary == condition_name | secondary == condition_name)
  cond_cols <- cond$ID

  # Fail early and name the condition: an unmatched (e.g. misspelled) label
  # would otherwise reach pivot_longer() with no sample columns to reshape,
  # and any resulting failure would not mention the condition at all.
  if (length(cond_cols) == 0) {
    stop(
      "No samples are annotated with condition '", condition_name, "'.",
      call. = FALSE
    )
  }

  expr_df %>%
    # all_of() errors if an annotated sample ID has no column in `expr_df`.
    select(gene, all_of(cond_cols)) %>%
    pivot_longer(-gene, names_to = "ID", values_to = "count") %>%
    left_join(annotations_df, by = "ID") %>%
    # Despite its name, `count_log` is the count shifted by one, not a
    # logarithm; the column name is kept because plot_violin() maps it to y.
    mutate(count_log = count + 1) %>%
    filter(!is.na(count_log))
}
# Long-format tables for the two condition sets compared below.
wildtype_long <- process_condition(
  condition_name = "wildtype",
  expr_df = expr_df,
  annotations_df = annotations
)
swr1_long <- process_condition(
  condition_name = "swr1",
  expr_df = expr_df,
  annotations_df = annotations
)
#' Violin plot of expression values per primary condition
#'
#' Draws a violin with a narrow boxplot overlay for each `primary` group,
#' plotting `count_log` on a log10-scaled y axis.
#'
#' @param data Long table containing `primary` and `count_log` columns.
#' @param title_text Plot title.
#' @return A ggplot object.
plot_violin <- function(data, title_text) {
  canvas <- ggplot(data, aes(x = primary, y = count_log, fill = primary))

  violin_layer <- geom_violin(trim = FALSE, alpha = 0.7)
  box_layer <- geom_boxplot(width = 0.1, color = "lightpink", alpha = 0.3)

  axis_labels <- labs(
    title = title_text,
    x = "Condition",
    y = "Expression Count (log10)"
  )
  # Tilt the x tick labels; applied after theme_minimal() so it is not reset.
  tilted_ticks <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

  canvas +
    violin_layer +
    box_layer +
    scale_y_log10() +
    theme_minimal() +
    axis_labels +
    tilted_ticks
}
# One distribution plot per condition set.
plot_violin(
  data = wildtype_long,
  title_text = "Wildtype Gene Expression Distribution"
)

plot_violin(
  data = swr1_long,
  title_text = "swr1 Gene Expression Distribution"
)

#' Mean and median count per primary condition
#'
#' @param long_data Long table containing `primary` and `count` columns.
#' @return One row per `primary` value with `mean_count` and `median_count`.
summarize_condition <- function(long_data) {
  by_primary <- group_by(long_data, primary)

  summarise(
    by_primary,
    mean_count = mean(count),
    median_count = median(count)
  )
}
# Summarise each condition set separately.
wildtype_summary <- summarize_condition(long_data = wildtype_long)
swr1_summary <- summarize_condition(long_data = swr1_long)

# Stack the two summaries, tagging every row with the set it came from.
bind_rows(
  mutate(wildtype_summary, condition_set = "wildtype"),
  mutate(swr1_summary, condition_set = "swr1")
)
## # A tibble: 2 × 4
## primary mean_count median_count condition_set
## <chr> <dbl> <dbl> <chr>
## 1 wildtype 165. 3.85 wildtype
## 2 swr1 165. 6.73 swr1