#' Read a label file into a three-column tibble.
#'
#' Each line of the file is read as a single string and then split on its
#' first two commas only, so any further commas stay inside `description`.
#'
#' @param path Path to the label file.
#' @param skip_header If `TRUE`, drop the first line read from the file before
#'   splitting. Defaults to `FALSE`, which keeps every line as a data row.
#' @return A tibble with character columns `gene_id`, `term_id` and
#'   `description`.
read_labels <- function(path, skip_header = FALSE) {
  # quote = "" and comment.char = "" make read.table() treat quote characters
  # and "#" as ordinary text; with the defaults they are parsed as syntax and
  # can truncate or merge lines.
  raw <- read.table(
    path,
    sep = "\n",
    quote = "",
    comment.char = "",
    stringsAsFactors = FALSE,
    header = FALSE
  )
  lines <- raw$V1
  if (skip_header) {
    lines <- lines[-1]
  }
  parts <- stringr::str_split_fixed(lines, ",", 3)
  # Name the columns on the matrix itself: as_tibble() warns when it is handed
  # a matrix without column names.
  colnames(parts) <- c("gene_id", "term_id", "description")
  as_tibble(parts)
}
# Load the "BP" label file via read_labels().
labels_BP <- read_labels(path = "labels_BP.csv")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Load the "CC" and "MF" label files the same way as the "BP" file.
labels_CC <- read_labels(path = "labels_CC.csv")
labels_MF <- read_labels(path = "labels_MF.csv")

# Stack the three label tables, tagging every row with the file it came from.
# NOTE(review): the glimpse() output of labels_all further down shows a first
# row of "gene" / "validation" / "localization", which looks like a file header
# kept as data -- confirm and drop it if that is unintended.
labels_all <- bind_rows(
  mutate(labels_BP, source = "BP"),
  mutate(labels_CC, source = "CC"),
  mutate(labels_MF, source = "MF")
)
# Load the per-sample condition annotations (columns ID, primary, secondary,
# additional_information according to the column specification printed below).
conditionData <- read_csv(file = "conditions_annotation.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 92 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ID, primary, secondary, additional_information
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the expression matrix: one identifier column followed by one numeric
# column per sample.
# NOTE(review): the file has a blank first header and repeated sample names, so
# read_csv() renames those columns (`...1`, `IFFAFA...5`, ...); the renamed
# sample columns no longer equal the IDs as written in the file.
expression <- read_csv(file = "SC_expression.csv")
## New names:
## Rows: 6071 Columns: 93
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (92): IFFABF, SAASCC, IFFAFF, IFFAFA...5, INICIA...6, INICIF,
## SAABQI, FF...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `IFFAFA` -> `IFFAFA...5`
## • `INICIA` -> `INICIA...6`
## • `FFNAAA` -> `FFNAAA...9`
## • `SAABFA` -> `SAABFA...21`
## • `INICIA` -> `INICIA...25`
## • `IFAAAA` -> `IFAAAA...35`
## • `SAABFA` -> `SAABFA...42`
## • `IFFAFA` -> `IFFAFA...59`
## • `FFNAAA` -> `FFNAAA...83`
## • `IFAAAA` -> `IFAAAA...91`
# Diagnostics: compact column-by-column overview of the expression table.
glimpse(x = expression)
## Rows: 6,071
## Columns: 93
## $ ...1 <chr> "YAL008W", "YBR255W", "YGR164W", "YGR131W", "YNL003C", "YB…
## $ IFFABF <dbl> 70.80493083, 36.36878581, 0.35138923, 7.37917393, 82.04938…
## $ SAASCC <dbl> 30.970186, 16.860290, 0.904239, 5.858715, 78.480411, 21.00…
## $ IFFAFF <dbl> 118.38465326, 27.72645192, 0.45602717, 12.13032271, 67.218…
## $ IFFAFA...5 <dbl> 110.7039503, 34.5279281, 0.2926096, 12.6797476, 64.1790302…
## $ INICIA...6 <dbl> 74.102235, 35.440199, 0.000000, 11.276427, 75.713153, 17.7…
## $ INICIF <dbl> 89.8022199, 56.9713008, 1.1035603, 18.4846351, 75.3179909,…
## $ SAABQI <dbl> 7.3415361, 15.4521855, 0.7691133, 2.3772593, 20.1367847, 1…
## $ FFNAAA...9 <dbl> 112.091167, 26.858749, 18.085545, 22.251591, 43.326887, 21…
## $ IFAAAR <dbl> 5.9736681, 5.3840882, 0.7119455, 4.4051630, 30.3244305, 6.…
## $ AFIINC <dbl> 1.59994401, 3.20552163, 0.61406302, 0.46758927, 3.65057648…
## $ IFFABN <dbl> 80.906634, 35.588256, 0.399868, 10.263280, 84.638736, 35.0…
## $ SAARRS <dbl> 25.248508, 18.951094, 3.747059, 12.300555, 120.082451, 17.…
## $ FFNARN <dbl> 92.878348, 20.780555, 16.487858, 14.731755, 38.536709, 12.…
## $ SAABQR <dbl> 14.3453666, 19.8577066, 0.4648961, 4.2172721, 20.5550508, …
## $ FFNAAR <dbl> 97.811741, 26.511415, 21.625352, 19.091838, 48.860629, 22.…
## $ FAFASB <dbl> 9.70600282, 10.00263452, 1.10101133, 1.09566662, 10.529088…
## $ SAABBN <dbl> 11.5621808, 14.0147646, 0.9009491, 1.5015819, 15.2660828, …
## $ SAASCF <dbl> 53.7957089, 23.5684783, 0.8542478, 9.5719563, 126.0782168,…
## $ INICIQ <dbl> 87.2875111, 47.3030480, 1.2495145, 22.9375157, 95.4986063,…
## $ SAABFA...21 <dbl> 7.1145370, 12.5800091, 0.7067421, 0.9423228, 12.5800091, 9…
## $ IFFAFQ <dbl> 123.6629036, 32.2781138, 0.5589284, 13.1348169, 67.9097979…
## $ SAABRQ <dbl> 10.232139, 14.944556, 3.961434, 4.505896, 105.325323, 13.2…
## $ IFFAFB <dbl> 100.5829827, 32.7070538, 0.3516888, 9.6128259, 67.4070106,…
## $ INICIA...25 <dbl> 1.480642e+02, 3.722284e+01, 3.308697e-01, 5.469276e+02, 6.…
## $ QCFQIA <dbl> 7.2104514, 13.2555772, 2.8404808, 2.3488592, 17.1339261, 1…
## $ FFNAAC <dbl> 103.3929255, 27.3903134, 3.5247588, 18.4315513, 47.1436493…
## $ QNCSCN <dbl> 25.230781, 24.827382, 4.327372, 4.877462, 43.933831, 39.53…
## $ SAABQQ <dbl> 11.3035156, 14.4192283, 0.7245843, 2.8983373, 15.5061048, …
## $ INICIC <dbl> 1.197575e+02, 2.106200e+01, 3.481323e-01, 4.381245e+02, 8.…
## $ INICIB <dbl> 64.4814982, 44.9510444, 0.9300216, 22.9405330, 93.6221753,…
## $ IAAICI <dbl> 13.0985982, 19.5708468, 2.1574162, 2.4656185, 27.1218034, …
## $ SAABIC <dbl> 15.303059, 8.763290, 1.613143, 2.833900, 20.622071, 11.684…
## $ FAFAFF <dbl> 3.4209174, 3.8906289, 2.4881414, 1.6506370, 5.8957651, 4.3…
## $ IFAAAA...35 <dbl> 6.1470742, 5.8918708, 1.0541012, 4.6602368, 28.4385402, 7.…
## $ SAABIB <dbl> 14.107651, 9.637325, 1.393348, 4.412269, 19.913269, 12.946…
## $ SAARAA <dbl> 17.5620152, 23.2701674, 4.7137006, 11.1975250, 125.5395695…
## $ QCFFAB <dbl> 7.3864306, 14.1582085, 2.1512847, 2.2784542, 17.9415022, 1…
## $ INICSR <dbl> 84.979729, 63.216628, 7.254367, 14.508734, 42.489865, 7.25…
## $ AFNCCR <dbl> 1.6045494, 3.4161373, 0.6314678, 0.4934421, 4.6790730, 3.9…
## $ INICIR <dbl> 80.5725155, 38.1860263, 0.7637205, 26.0619629, 113.1261029…
## $ SAABFA...42 <dbl> 4.6628026, 7.5517129, 0.6081916, 0.7602396, 9.6297010, 6.4…
## $ SAARRR <dbl> 32.3772163, 30.5309261, 0.8056539, 17.1369299, 123.0132806…
## $ IFFABI <dbl> 86.34607969, 36.26903561, 0.46026695, 9.11328560, 76.49636…
## $ SARICB <dbl> 1.4843492, 6.2972391, 0.1349408, 0.6297239, 4.5430082, 2.4…
## $ INICSA <dbl> 40.7524726, 31.1578371, 9.8913769, 28.1904240, 126.7085375…
## $ QCAQFI <dbl> 21.09426000, 1.36283140, 1.83685972, 2.13312742, 16.502110…
## $ FAFAFQ <dbl> 3.7996701, 4.1538396, 2.6628298, 1.9129525, 6.3837956, 4.5…
## $ SAASAQ <dbl> 35.9634167, 17.6560149, 0.8228046, 5.7767738, 91.7255686, …
## $ IAAICF <dbl> 10.0163057, 29.7359076, 1.9823938, 0.8346921, 34.1180413, …
## $ INICIS <dbl> 7.132085e+01, 1.846377e+01, 1.206783e-01, 3.598626e+02, 8.…
## $ AFIQBR <dbl> 0.91311765, 1.72457240, 0.46395249, 0.16635747, 2.08686200…
## $ FABINF <dbl> 2.90347053, 2.68323256, 1.58020741, 0.43129935, 5.23248702…
## $ FABINB <dbl> 2.97372741, 3.36344121, 2.52450068, 0.51449900, 5.96089323…
## $ IFFABQ <dbl> 75.5213649, 33.7479669, 0.4115606, 8.4369917, 74.9040241, …
## $ IFFAFR <dbl> 78.8718471, 37.4465743, 1.1702054, 8.1914381, 75.7122924, …
## $ FFNAAQ <dbl> 126.891141, 30.376063, 4.090045, 19.253137, 45.589034, 31.…
## $ QCAQFQ <dbl> 34.9454436, 6.6040966, 1.4807391, 1.3326652, 17.8577140, 1…
## $ IFFAFA...59 <dbl> 117.398963, 28.296160, 0.301023, 10.686316, 59.301527, 40.…
## $ AFNCCA <dbl> 3.2448931, 7.3469277, 1.3906685, 1.1282782, 9.9533378, 7.3…
## $ QNCSCS <dbl> 21.068795, 23.305008, 2.353030, 4.514146, 41.645290, 54.47…
## $ AFIQCI <dbl> 1.57782490, 2.71069226, 0.54323037, 0.40128073, 3.78350400…
## $ QNCSCQ <dbl> 25.953675, 23.344930, 2.842864, 4.715810, 48.897260, 40.03…
## $ SAABFI <dbl> 15.434716, 13.780115, 1.728688, 4.667458, 22.695207, 18.62…
## $ IFFABC <dbl> 77.3586657, 37.1147756, 0.1738397, 11.5603399, 79.6185818,…
## $ IFFAFC <dbl> 106.1655599, 39.8865001, 0.3968806, 7.3422911, 71.4385076,…
## $ QCAQFS <dbl> 44.9967308, 51.1227918, 1.1926844, 2.1685171, 31.1182211, …
## $ INICII <dbl> 1.282320e+02, 4.711696e+01, 1.847724e-01, 4.430842e+02, 4.…
## $ IAAICQ <dbl> 15.5286189, 34.5035371, 1.4190644, 2.1083242, 39.4905348, …
## $ SAABFR <dbl> 9.8278590, 7.9692954, 0.5632011, 1.3235226, 12.4185840, 12…
## $ SAABFN <dbl> 14.5538531, 20.0536382, 0.7482701, 4.1528990, 19.2305412, …
## $ QNCSCI <dbl> 24.428979, 21.706792, 2.969659, 4.065604, 45.287296, 45.39…
## $ IAAICS <dbl> 11.4453489, 21.2120467, 1.6786512, 2.6197132, 25.2815041, …
## $ SAABFS <dbl> 10.3996370, 13.1875467, 1.8158096, 3.7416683, 21.5145929, …
## $ QCAQFN <dbl> 43.54055060, 59.07692485, 2.46038667, 0.49760629, 35.10888…
## $ SICIBA <dbl> 46.138127, 124.162123, 13.462221, 35.456834, 63.613732, 22…
## $ SAABQS <dbl> 11.8666284, 17.2840022, 0.5159404, 2.8376720, 22.5723909, …
## $ FAFAQQ <dbl> 10.106824, 9.793654, 1.610132, 1.357066, 12.805140, 14.326…
## $ IFFABS <dbl> 88.4735738, 36.0501088, 0.0000000, 8.9048077, 83.7339180, …
## $ SAABIF <dbl> 12.3602202, 21.6454588, 0.7838188, 3.0749816, 22.0072214, …
## $ FFNARI <dbl> 153.1139964, 22.2038424, 5.0883805, 20.8161022, 32.3806035…
## $ INICIN <dbl> 75.6719123, 33.9282329, 0.7355714, 22.3429826, 119.5303597…
## $ FFNAAA...83 <dbl> 95.292888, 20.870771, 15.831244, 22.346997, 39.756274, 19.…
## $ SAABIQ <dbl> 12.953973, 21.543401, 1.222073, 2.199731, 21.124404, 10.68…
## $ AFNAQI <dbl> 1.6381427, 3.5476134, 0.7122360, 0.4917820, 4.2327165, 2.4…
## $ SAANNN <dbl> 34.0660510, 28.9971045, 0.9045595, 11.0253852, 131.2123246…
## $ IFFABB <dbl> 81.3464947, 35.1378350, 0.0000000, 7.0596563, 84.7158761, …
## $ FFNARS <dbl> 127.981129, 31.181031, 2.586445, 21.936885, 41.766297, 25.…
## $ SAABQF <dbl> 14.649997, 13.574768, 1.209633, 2.016055, 17.539675, 10.95…
## $ IFFAFS <dbl> 101.58666703, 30.90338738, 0.08218986, 9.36964405, 66.1628…
## $ IFAAAA...91 <dbl> 6.5740958, 6.4596733, 0.9777927, 4.6913247, 26.7124654, 7.…
## $ IFFAFI <dbl> 110.2845047, 29.0860232, 0.1009931, 12.0181832, 75.2398865…
## $ SARIAI <dbl> 1.48926863, 5.11936093, 0.04653964, 0.93079290, 3.90933016…
# Diagnostics: overview of the condition annotation table.
glimpse(x = conditionData)
## Rows: 92
## Columns: 4
## $ ID <chr> "AFIQCI", "AFIQBR", "AFIINC", "AFNAQI", "AFNCCR…
## $ primary <chr> "wildtype", "wildtype", "itc1", "itc1", "swr1",…
## $ secondary <chr> "wildtype 1", "wildtype 2", "itc1-1_dUTP", "itc…
## $ additional_information <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Diagnostics: overview of the combined label table.
glimpse(x = labels_all)
## Rows: 18,218
## Columns: 4
## $ gene_id <chr> "gene", "YAL008W", "YBR255W", "YGR164W", "YGR131W", "YNL00…
## $ term_id <chr> "validation", "Verified", "Verified", "Dubious", "Verified…
## $ description <chr> "localization", "ion", "biological_process", "biological_p…
## $ source <chr> "BP", "BP", "BP", "BP", "BP", "BP", "BP", "BP", "BP", "BP"…
# Rename the first column of expression: read_csv() named the blank header
# `...1`, and the rest of the script refers to it as gene_id.
names(expression)[1] <- "gene_id"

# Keep only the conditions whose primary label contains "wildtype"
# (case-insensitive). Rows with a missing primary label are dropped by filter().
WTconditions <- conditionData %>%
  filter(str_detect(primary, regex("wildtype", ignore_case = TRUE)))

# Select the gene identifier plus the wildtype sample columns. any_of()
# silently skips IDs that are not column names of `expression`.
# NOTE(review): read_csv() renamed the duplicated sample columns (for example
# `IFFAFA...5`), so a wildtype ID that is duplicated in the expression file
# would not match here -- confirm none are affected.
WT_expr <- expression %>%
  select(gene_id, any_of(WTconditions$ID))

# Reshape to long format, one row per gene/sample pair. Every column except
# gene_id is pivoted, so `treatment` holds the sample ID and `count` its value;
# pivoting gene_id itself would put gene names into `count`.
WT_long <- WT_expr %>%
  pivot_longer(
    cols = -gene_id,
    names_to = "treatment",
    values_to = "count"
  ) %>%
  mutate(count = as.numeric(count))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `count = as.numeric(count)`.
## Caused by warning:
## ! NAs introduced by coercion
# Summary table: mean and median count per treatment (missing values ignored),
# ordered from the highest mean to the lowest, then printed.
summary.tbl <- WT_long %>%
  group_by(treatment) %>%
  summarise(
    across(
      count,
      list(
        mean = function(v) mean(v, na.rm = TRUE),
        median = function(v) median(v, na.rm = TRUE)
      ),
      # Yields the column names mean_count and median_count.
      .names = "{.fn}_{.col}"
    )
  ) %>%
  arrange(desc(mean_count))
print(summary.tbl)
## # A tibble: 1 × 3
## treatment mean_count median_count
## <chr> <dbl> <dbl>
## 1 gene_id NaN NA
# Plot the count distribution per treatment as a violin with a narrow boxplot
# on top. Rows with count above 328 (and rows with a missing count) are removed
# first; treatments are ordered by their mean count.
# NOTE(review): the 328 cut-off is hard-coded and its origin is not shown here.
WT_long %>%
  filter(!(count > 328)) %>%
  mutate(treatment = fct_reorder(.f = treatment, .x = count, .fun = mean)) %>%
  ggplot(mapping = aes(treatment, count)) +
  geom_violin(trim = FALSE) +
  # outlier.shape = NA hides the boxplot's outlier points.
  geom_boxplot(width = 0.12, outlier.shape = NA) +
  theme_bw() +
  coord_flip() +
  labs(
    title = "Distribution of Counts per Treatment",
    x = "Treatment",
    y = "Count"
  )