Main Features (I/O)
- Loading data (Import)
- parse and import a *.adat text file into an R session as a soma_adat object.
- Wrangling data (manipulation)
- subset, reorder, and list various fields of a soma_adat object.
- ?SeqId analyte (feature) matching.
- dplyr and tidyr verb S3 methods for the soma_adat class.
- ?rownames helpers that do not break soma_adat attributes.
- Exporting data (Output)
- write out a soma_adat object as a *.adat text file.
Loading an ADAT
# Sample file name
f <- system.file("example", "example_data.adat", package = "SomaDataIO", mustWork = TRUE)
my_adat <- read_adat(f)
is.soma_adat(my_adat)
#> [1] TRUE
# S3 print method forwards -> tibble
my_adat
#> ══ SomaScan Data ════════════════════════════════════════════════════════════════════════════════════
#> Attributes intact ✓
#> Rows 192
#> Columns 5318
#> Clinical Data 34
#> Features 5284
#> ── Column Meta ──────────────────────────────────────────────────────────────────────────────────────
#> ℹ SeqId, SeqIdVersion, SomaId, TargetFullName, Target, UniProt, EntrezGeneID, EntrezGeneSymbol,
#> ℹ Organism, Units, Type, Dilution, PlateScale_Reference, CalReference, Cal_Example_Adat_Set001,
#> ℹ ColCheck, CalQcRatio_Example_Adat_Set001_170255, QcReference_170255, Cal_Example_Adat_Set002,
#> ℹ CalQcRatio_Example_Adat_Set002_170255, Dilution2
#> ── Tibble ───────────────────────────────────────────────────────────────────────────────────────────
#> # A tibble: 192 × 5,319
#> row_names PlateId PlateRunDate ScannerID PlatePosition SlideId Subarray SampleId SampleType
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
#> 1 258495800012_3 Example … 2020-06-18 SG152144… H9 2.58e11 3 1 Sample
#> 2 258495800004_7 Example … 2020-06-18 SG152144… H8 2.58e11 7 2 Sample
#> 3 258495800010_8 Example … 2020-06-18 SG152144… H7 2.58e11 8 3 Sample
#> 4 258495800003_4 Example … 2020-06-18 SG152144… H6 2.58e11 4 4 Sample
#> 5 258495800009_4 Example … 2020-06-18 SG152144… H5 2.58e11 4 5 Sample
#> 6 258495800012_8 Example … 2020-06-18 SG152144… H4 2.58e11 8 6 Sample
#> 7 258495800001_3 Example … 2020-06-18 SG152144… H3 2.58e11 3 7 Sample
#> 8 258495800004_8 Example … 2020-06-18 SG152144… H2 2.58e11 8 8 Sample
#> 9 258495800001_8 Example … 2020-06-18 SG152144… H12 2.58e11 8 9 Sample
#> 10 258495800004_3 Example … 2020-06-18 SG152144… H11 2.58e11 3 170261 Calibrator
#> # … with 182 more rows, and 5,310 more variables: PercentDilution <int>, SampleMatrix <chr>,
#> # Barcode <lgl>, Barcode2d <lgl>, SampleName <lgl>, SampleNotes <lgl>, AliquotingNotes <lgl>,
#> # SampleDescription <chr>, AssayNotes <lgl>, TimePoint <lgl>, ExtIdentifier <lgl>, SsfExtId <lgl>,
#> # SampleGroup <lgl>, SiteId <lgl>, TubeUniqueID <lgl>, …
#> ═════════════════════════════════════════════════════════════════════════════════════════════════════
print(my_adat, show_header = TRUE) # if you simply wish to see the Header info
#> ══ SomaScan Data ════════════════════════════════════════════════════════════════════════════════════
#> Attributes intact ✓
#> Rows 192
#> Columns 5318
#> Clinical Data 34
#> Features 5284
#> ── Column Meta ──────────────────────────────────────────────────────────────────────────────────────
#> ℹ SeqId, SeqIdVersion, SomaId, TargetFullName, Target, UniProt, EntrezGeneID, EntrezGeneSymbol,
#> ℹ Organism, Units, Type, Dilution, PlateScale_Reference, CalReference, Cal_Example_Adat_Set001,
#> ℹ ColCheck, CalQcRatio_Example_Adat_Set001_170255, QcReference_170255, Cal_Example_Adat_Set002,
#> ℹ CalQcRatio_Example_Adat_Set002_170255, Dilution2
#> ── Header Data ──────────────────────────────────────────────────────────────────────────────────────
#> # A tibble: 35 × 2
#> Key Value
#> <chr> <chr>
#> 1 AdatId GID-1234-56-789-abcdef
#> 2 Version 1.2
#> 3 AssayType PharmaServices
#> 4 AssayVersion V4
#> 5 AssayRobot Fluent 1 L-307
#> 6 Legal Experiment details and data have been processed to protect Personally Identi…
#> 7 CreatedBy PharmaServices
#> 8 CreatedDate 2020-07-24
#> 9 EnteredBy Technician1
#> 10 ExpDate 2020-06-18, 2020-07-20
#> 11 GeneratedBy Px (Build: : ), Canopy_0.1.1
#> 12 RunNotes 2 columns ('Age' and 'Sex') have been added to this ADAT. Age has been rando…
#> 13 ProcessSteps Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, …
#> 14 ProteinEffectiveDate 2019-08-06
#> 15 StudyMatrix EDTA Plasma
#> # … with 20 more rows
#> ═════════════════════════════════════════════════════════════════════════════════════════════════════
# S3 summary method
# View Target and summary statistics
seqs <- tail(names(my_adat), 3)
summary(my_adat[, seqs])
#> seq.9995.6 seq.9997.12 seq.9999.1
#> Target : DUT Target : UBXN4 Target : IRF6
#> Min : 81.9 Min : 28.1 Min : 36.7
#> 1Q : 1637.0 1Q : 10172.4 1Q : 1395.2
#> Median : 4425.3 Median : 23352.8 Median : 2576.6
#> Mean : 5512.7 Mean : 25230.0 Mean : 2966.0
#> 3Q : 8452.8 3Q : 39643.7 3Q : 4280.5
#> Max : 26905.6 Max : 63583.3 Max : 8480.1
#> sd : 4484.2 sd : 16463.8 sd : 1869.7
#> MAD : 4537.9 MAD : 20865.2 MAD : 2041.0
#> IQR : 6815.8 IQR : 29471.2 IQR : 2885.2
# Summarize by Sex
my_adat[, seqs] %>%
split(my_adat$Sex) %>%
lapply(summary)
#> $F
#> seq.9995.6 seq.9997.12 seq.9999.1
#> Target : DUT Target : UBXN4 Target : IRF6
#> Min : 1130 Min : 5353 Min : 889.8
#> 1Q : 2114 1Q : 12830 1Q : 1652.1
#> Median : 6466 Median : 32204 Median : 3264.7
#> Mean : 6306 Mean : 29141 Mean : 3333.2
#> 3Q : 8763 3Q : 42488 3Q : 4366.0
#> Max : 26906 Max : 63583 Max : 7801.8
#> sd : 4537 sd : 15693 sd : 1780.5
#> MAD : 4834 MAD : 20822 MAD : 2183.0
#> IQR : 6649 IQR : 29658 IQR : 2713.9
#>
#> $M
#> seq.9995.6 seq.9997.12 seq.9999.1
#> Target : DUT Target : UBXN4 Target : IRF6
#> Min : 1121 Min : 5206 Min : 853.9
#> 1Q : 2282 1Q : 12492 1Q : 1703.1
#> Median : 4902 Median : 24027 Median : 2872.5
#> Mean : 5922 Mean : 26936 Mean : 3189.8
#> 3Q : 8325 3Q : 38187 3Q : 4423.1
#> Max : 21190 Max : 60322 Max : 8480.1
#> sd : 4316 sd : 15065 sd : 1784.2
#> MAD : 4538 MAD : 19345 MAD : 1996.9
#> IQR : 6043 IQR : 25695 IQR : 2720.0
Wrangling
Attributes Contain File and Feature Information
names(attributes(my_adat))
#> [1] "names" "class" "row.names" "Header.Meta" "Col.Meta" "file_specs" "row_meta"
# The `Col.Meta` attribute contains
# target annotation information
attr(my_adat, "Col.Meta")
#> # A tibble: 5,284 × 21
#> SeqId SeqIdVersion SomaId TargetFullName Target UniProt EntrezGeneID EntrezGeneSymbol Organism
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 10000-28 3 SL019… Beta-crystall… CRBB2 P43320 "1415" "CRYBB2" Human
#> 2 10001-7 3 SL002… RAF proto-onc… c-Raf P04049 "5894" "RAF1" Human
#> 3 10003-15 3 SL019… Zinc finger p… ZNF41 P51814 "7592" "ZNF41" Human
#> 4 10006-25 3 SL019… ETS domain-co… ELK1 P19419 "2002" "ELK1" Human
#> 5 10008-43 3 SL019… Guanylyl cycl… GUC1A P43080 "2978" "GUCA1A" Human
#> 6 10011-65 3 SL019… Inositol poly… OCRL Q01968 "4952" "OCRL" Human
#> 7 10012-5 3 SL014… SAM pointed d… SPDEF O95238 "25803" "SPDEF" Human
#> 8 10013-34 3 SL025… Fc_MOUSE Fc_MO… Q99LC4 "" "" Mouse
#> 9 10014-31 3 SL007… Zinc finger p… SLUG O43623 "6591" "SNAI2" Human
#> 10 10015-119 3 SL014… Voltage-gated… KCAB2 Q13303 "8514" "KCNAB2" Human
#> # … with 5,274 more rows, and 12 more variables: Units <chr>, Type <chr>, Dilution <chr>,
#> # PlateScale_Reference <dbl>, CalReference <dbl>, Cal_Example_Adat_Set001 <dbl>, ColCheck <chr>,
#> # CalQcRatio_Example_Adat_Set001_170255 <dbl>, QcReference_170255 <dbl>,
#> # Cal_Example_Adat_Set002 <dbl>, CalQcRatio_Example_Adat_Set002_170255 <dbl>, Dilution2 <dbl>
Analyte Features (seq.xxxx.xx)
getAnalytes(my_adat) %>% head(20) # first 20 analytes; see AptName above
#> [1] "seq.10000.28" "seq.10001.7" "seq.10003.15" "seq.10006.25" "seq.10008.43" "seq.10011.65"
#> [7] "seq.10012.5" "seq.10013.34" "seq.10014.31" "seq.10015.119" "seq.10021.1" "seq.10022.207"
#> [13] "seq.10023.32" "seq.10024.44" "seq.10030.8" "seq.10034.16" "seq.10035.6" "seq.10036.201"
#> [19] "seq.10037.98" "seq.10040.63"
getAnalytes(my_adat) %>% length() # how many analytes
#> [1] 5284
getAnalytes(my_adat, n = TRUE) # the `n` argument; no. analytes
#> [1] 5284
Feature Data
The getAnalyteInfo() function creates a lookup table
that links analyte feature names in the soma_adat object to
the annotation data in ?Col.Meta via the common index-key,
AptName, in column 1:
getAnalyteInfo(my_adat)
#> # A tibble: 5,284 × 22
#> AptName SeqId SeqIdVersion SomaId TargetFullName Target UniProt EntrezGeneID EntrezGeneSymbol
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 seq.10000.28 1000… 3 SL019… Beta-crystall… CRBB2 P43320 "1415" "CRYBB2"
#> 2 seq.10001.7 1000… 3 SL002… RAF proto-onc… c-Raf P04049 "5894" "RAF1"
#> 3 seq.10003.15 1000… 3 SL019… Zinc finger p… ZNF41 P51814 "7592" "ZNF41"
#> 4 seq.10006.25 1000… 3 SL019… ETS domain-co… ELK1 P19419 "2002" "ELK1"
#> 5 seq.10008.43 1000… 3 SL019… Guanylyl cycl… GUC1A P43080 "2978" "GUCA1A"
#> 6 seq.10011.65 1001… 3 SL019… Inositol poly… OCRL Q01968 "4952" "OCRL"
#> 7 seq.10012.5 1001… 3 SL014… SAM pointed d… SPDEF O95238 "25803" "SPDEF"
#> 8 seq.10013.34 1001… 3 SL025… Fc_MOUSE Fc_MO… Q99LC4 "" ""
#> 9 seq.10014.31 1001… 3 SL007… Zinc finger p… SLUG O43623 "6591" "SNAI2"
#> 10 seq.10015.1… 1001… 3 SL014… Voltage-gated… KCAB2 Q13303 "8514" "KCNAB2"
#> # … with 5,274 more rows, and 13 more variables: Organism <chr>, Units <chr>, Type <chr>,
#> # Dilution <chr>, PlateScale_Reference <dbl>, CalReference <dbl>, Cal_Example_Adat_Set001 <dbl>,
#> # ColCheck <chr>, CalQcRatio_Example_Adat_Set001_170255 <dbl>, QcReference_170255 <dbl>,
#> # Cal_Example_Adat_Set002 <dbl>, CalQcRatio_Example_Adat_Set002_170255 <dbl>, Dilution2 <dbl>
See ?colmeta or ?annotations for further
details about these fields.
Clinical Data
getMeta(my_adat) # clinical meta data for each sample
#> [1] "PlateId" "PlateRunDate" "ScannerID"
#> [4] "PlatePosition" "SlideId" "Subarray"
#> [7] "SampleId" "SampleType" "PercentDilution"
#> [10] "SampleMatrix" "Barcode" "Barcode2d"
#> [13] "SampleName" "SampleNotes" "AliquotingNotes"
#> [16] "SampleDescription" "AssayNotes" "TimePoint"
#> [19] "ExtIdentifier" "SsfExtId" "SampleGroup"
#> [22] "SiteId" "TubeUniqueID" "CLI"
#> [25] "HybControlNormScale" "RowCheck" "NormScale_20"
#> [28] "NormScale_0_005" "NormScale_0_5" "ANMLFractionUsed_20"
#> [31] "ANMLFractionUsed_0_005" "ANMLFractionUsed_0_5" "Age"
#> [34] "Sex"
getMeta(my_adat, n = TRUE) # also an `n` argument
#> [1] 34
Group Generics
Basic mathematical transformations can be applied to the feature
data (and only the feature data) via special soma_adat S3 methods
(see ?groupGenerics):
head(my_adat$seq.2429.27)
#> [1] 8642.3 12472.1 14627.7 13579.8 8938.8 6738.8
logData <- log10(my_adat) # a typical log10() transform
head(logData$seq.2429.27)
#> [1] 3.936629 4.095940 4.165176 4.132893 3.951279 3.828583
roundData <- round(my_adat)
head(roundData$seq.2429.27)
#> [1] 8642 12472 14628 13580 8939 6739
sqData <- sqrt(my_adat)
head(sqData$seq.2429.27)
#> [1] 92.96397 111.67856 120.94503 116.53240 94.54523 82.09019
antilog(1:4)
#> [1] 10 100 1000 10000
sum(my_adat < 100) # low signalling values
#> [1] 41721
all.equal(my_adat, sqrt(my_adat^2))
#> [1] TRUE
all.equal(my_adat, antilog(log10(my_adat)))
#> [1] TRUE
Full Complement of dplyr S3 Methods
The soma_adat class also comes with numerous class-specific
methods for the most popular dplyr generics, which make working
with soma_adat objects simpler for those familiar with this
standard toolkit:
dim(my_adat)
#> [1] 192 5318
males <- dplyr::filter(my_adat, Sex == "M")
dim(males)
#> [1] 85 5318
males %>%
dplyr::select(SampleType, SampleMatrix, starts_with("NormScale"))
#> ══ SomaScan Data ════════════════════════════════════════════════════════════════════════════════════
#> Attributes intact ✓
#> Rows 85
#> Columns 5
#> Clinical Data 5
#> Features 0
#> ── Column Meta ──────────────────────────────────────────────────────────────────────────────────────
#> ℹ SeqId, SeqIdVersion, SomaId, TargetFullName, Target, UniProt, EntrezGeneID, EntrezGeneSymbol,
#> ℹ Organism, Units, Type, Dilution, PlateScale_Reference, CalReference, Cal_Example_Adat_Set001,
#> ℹ ColCheck, CalQcRatio_Example_Adat_Set001_170255, QcReference_170255, Cal_Example_Adat_Set002,
#> ℹ CalQcRatio_Example_Adat_Set002_170255, Dilution2
#> ── Tibble ───────────────────────────────────────────────────────────────────────────────────────────
#> # A tibble: 85 × 6
#> row_names SampleType SampleMatrix NormScale_20 NormScale_0_005 NormScale_0_5
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 258495800010_8 Sample Plasma-PPT 0.984 1.03 0.915
#> 2 258495800003_4 Sample Plasma-PPT 1.08 0.946 0.912
#> 3 258495800001_3 Sample Plasma-PPT 0.921 1.13 0.953
#> 4 258495800012_5 Sample Plasma-PPT 0.861 1.08 0.829
#> 5 258495800006_2 Sample Plasma-PPT 0.874 1.01 0.822
#> 6 258495800011_3 Sample Plasma-PPT 0.928 1.13 0.930
#> 7 258495800003_2 Sample Plasma-PPT 1.12 1.15 0.943
#> 8 258495800005_2 Sample Plasma-PPT 0.884 0.921 0.762
#> 9 258495800008_4 Sample Plasma-PPT 0.991 0.979 0.920
#> 10 258495800006_6 Sample Plasma-PPT 0.862 0.964 0.999
#> # … with 75 more rows
#> ═════════════════════════════════════════════════════════════════════════════════════════════════════
Available S3 Methods for soma_adat
# see full complement of `soma_adat` methods
methods(class = "soma_adat")
#> [1] [ [[ [[<- [<- == $ $<-
#> [8] anti_join arrange count filter full_join getAnalytes getMeta
#> [15] group_by inner_join is_seqFormat left_join Math median mutate
#> [22] Ops print rename right_join sample_frac sample_n select
#> [29] semi_join separate slice_sample slice summary Summary transform
#> [36] ungroup unite
#> see '?methods' for accessing help and source code
Writing a soma_adat
is.intact.attributes(my_adat) # attributes MUST be intact to write to file
#> [1] TRUE
write_adat(my_adat, file = tempfile("my-adat-", fileext = ".adat"))
#> ✔ ADAT passed all checks and traps.
#> ✔ ADAT written to: '/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpxwy6fk/my-adat-7ee19f25cd2.adat'