# Load the birth weight records from a CSV file in the working directory.
birthweight <- read.csv(file = "birthweight.csv")
Tidyverse adalah kumpulan paket dari pembuat RStudio yang berbagi pendekatan yang sama terhadap ilmu data.
Penulis memodelkan ilmu data seperti ini:
Siklus hidup ilmu data
Paket-paket tidyverse menggantikan beberapa fungsi dasar R dengan alternatif yang dimaksudkan agar lebih ramah pengguna bagi ilmuwan data yang mengikuti siklus hidup ini.
Kita hanya akan membahas beberapa paket dari tidyverse.
# Fetch the miRNA expression table from the course repository.
download.file(
  url = "https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2022_February_Introduction_to_R_for_Bioinformatics/main/miRNA.csv",
  destfile = "miRNA.csv"
)
# The first column of the file supplies the row names.
mir <- read.csv(file = "miRNA.csv", row.names = 1)
mir
## sample.27 sample.1522 sample.569 sample.365 sample.1369 sample.1023
## miR-16 46 56 47 54 56 59
## miR-21 52 43 40 35 59 47
## miR-146a 98 97 87 96 84 96
## miR-182 53 45 63 41 46 50
## sample.1272 sample.1262 sample.575 sample.792 sample.752 sample.619
## miR-16 49 55 62 63 46 52
## miR-21 42 45 55 45 42 43
## miR-146a 88 97 96 104 103 92
## miR-182 49 50 62 51 64 58
## sample.1764 sample.516 sample.272 sample.1388 sample.1363 sample.300
## miR-16 46 61 49 46 61 60
## miR-21 40 51 43 44 47 48
## miR-146a 98 97 91 105 77 89
## miR-182 57 59 55 60 60 65
## sample.57 sample.431 sample.532 sample.223 sample.1107 sample.697
## miR-16 46 70 60 60 57 68
## miR-21 39 51 44 46 49 47
## miR-146a 105 84 94 87 116 98
## miR-182 40 48 49 52 48 57
## sample.1683 sample.808 sample.822 sample.1016 sample.820 sample.1081
## miR-16 49 59 54 69 58 55
## miR-21 48 56 52 41 55 52
## miR-146a 98 101 86 98 102 93
## miR-182 55 74 49 51 53 52
## sample.321 sample.1636 sample.1360 sample.1058 sample.755 sample.462
## miR-16 68 63 70 77 56 65
## miR-21 46 39 57 55 46 58
## miR-146a 125 104 111 124 101 101
## miR-182 60 43 46 56 50 60
## sample.1088 sample.553 sample.1191 sample.1313 sample.1600 sample.1187
## miR-16 42 63 66 64 50 57
## miR-21 54 54 48 47 44 46
## miR-146a 107 106 102 104 111 86
## miR-182 63 60 50 42 67 43
# Transpose so that each sample becomes a row and each miRNA a column.
mir <- as.data.frame(t(mir))
# Strip the literal "sample." prefix from the row names to recover the ID.
# The pattern is anchored and the dot escaped so that it only removes the
# leading prefix and never matches elsewhere in a name.
mir$ID <- sub("^sample\\.", "", rownames(mir))
# Join the two tables on their shared ID column, named explicitly rather
# than inferred from whichever column names happen to overlap.
experiment <- merge(birthweight, mir, by = "ID")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
# Print the merged data frame to inspect the result of the join.
experiment
## ID birth.date location length birthweight head.circumference
## 1 27 3/9/1967 Silver Hill 53 3.55 37
## 2 57 8/12/1968 Memorial 51 3.32 38
## 3 223 12/11/1968 General 50 3.87 33
## 4 272 1/10/1968 Memorial 52 3.86 36
## 5 300 7/18/1968 Silver Hill 46 2.05 32
## 6 321 1/21/1968 Silver Hill 48 3.11 33
## 7 365 4/23/1967 Memorial 52 3.53 37
## 8 431 9/16/1968 Silver Hill 48 1.92 30
## 9 462 6/19/1968 Silver Hill 58 4.10 39
## 10 516 1/8/1968 Silver Hill 47 2.66 33
## 11 532 10/25/1968 General 53 3.59 34
## 12 553 8/17/1968 Silver Hill 54 3.94 37
## 13 569 3/23/1967 Memorial 50 2.51 35
## 14 575 7/12/1967 Memorial 50 2.78 30
## 15 619 11/1/1967 Memorial 52 3.41 33
## 16 697 2/6/1967 Silver Hill 48 3.03 35
## 17 752 10/19/1967 General 49 3.32 36
## 18 755 4/25/1968 Memorial 53 3.20 33
## 19 792 9/7/1967 Memorial 53 3.64 38
## 20 808 5/5/1967 Silver Hill 48 2.92 33
## 21 820 10/7/1967 General 52 3.77 34
## 22 822 6/14/1967 Memorial 50 3.42 35
## 23 1016 7/13/1967 Silver Hill 53 4.32 36
## 24 1023 6/7/1967 Memorial 52 3.00 35
## 25 1058 4/24/1968 Silver Hill 53 3.15 34
## 26 1081 12/14/1967 Silver Hill 54 3.63 38
## 27 1088 7/24/1968 General 51 3.27 36
## 28 1107 1/25/1967 General 52 3.23 36
## 29 1187 12/19/1968 Silver Hill 53 4.07 38
## 30 1191 9/7/1968 General 53 3.65 33
## 31 1262 6/25/1967 Silver Hill 53 3.19 34
## 32 1272 6/20/1967 Memorial 53 2.75 32
## 33 1313 9/27/1968 Silver Hill 43 2.65 32
## 34 1360 2/16/1968 General 56 4.55 34
## 35 1363 4/2/1968 General 48 2.37 30
## 36 1369 6/4/1967 Silver Hill 49 3.18 34
## 37 1388 2/22/1968 Memorial 51 3.14 33
## 38 1522 3/13/1967 Memorial 50 2.74 33
## 39 1600 10/9/1968 General 53 2.90 34
## 40 1636 2/2/1968 Silver Hill 51 3.93 38
## 41 1683 2/14/1967 Silver Hill 53 3.35 33
## 42 1764 12/7/1967 Silver Hill 58 4.57 39
## weeks.gestation smoker maternal.age maternal.cigarettes maternal.height
## 1 41 yes 37 25 161
## 2 39 yes 23 17 157
## 3 45 yes 28 25 163
## 4 39 yes 30 25 170
## 5 35 yes 41 7 166
## 6 37 no 28 0 158
## 7 40 yes 26 25 170
## 8 33 yes 20 7 161
## 9 41 no 35 0 172
## 10 35 yes 20 35 170
## 11 40 yes 31 12 163
## 12 42 no 24 0 175
## 13 39 yes 22 7 159
## 14 37 yes 19 7 165
## 15 39 yes 23 25 181
## 16 39 no 27 0 162
## 17 40 yes 27 12 152
## 18 41 no 21 0 155
## 19 40 yes 20 2 170
## 20 34 no 26 0 167
## 21 40 no 24 0 157
## 22 38 no 20 0 157
## 23 40 no 19 0 171
## 24 38 yes 30 12 165
## 25 40 no 29 0 167
## 26 38 no 18 0 172
## 27 40 no 24 0 168
## 28 38 no 31 0 164
## 29 44 no 20 0 174
## 30 42 no 21 0 165
## 31 41 yes 27 35 163
## 32 40 yes 37 50 168
## 33 33 no 24 0 149
## 34 44 no 20 0 162
## 35 37 yes 20 7 163
## 36 38 yes 31 25 162
## 37 41 yes 22 7 160
## 38 39 yes 21 17 156
## 39 39 no 19 0 165
## 40 38 no 29 0 165
## 41 41 no 27 0 164
## 42 41 yes 32 12 173
## maternal.prepregnant.weight paternal.age paternal.education
## 1 66 46 NA
## 2 48 NA NA
## 3 54 30 16
## 4 78 40 16
## 5 57 37 14
## 6 54 39 10
## 7 62 30 10
## 8 50 20 10
## 9 58 31 16
## 10 57 23 12
## 11 49 41 12
## 12 66 30 12
## 13 52 23 14
## 14 60 20 14
## 15 69 23 16
## 16 62 27 14
## 17 48 37 12
## 18 55 25 14
## 19 59 24 12
## 20 64 25 12
## 21 50 31 16
## 22 48 22 14
## 23 62 19 12
## 24 64 38 14
## 25 60 30 16
## 26 50 20 12
## 27 53 29 16
## 28 57 NA NA
## 29 68 26 14
## 30 61 21 10
## 31 51 31 16
## 32 61 31 16
## 33 45 26 16
## 34 57 23 10
## 35 47 20 10
## 36 57 32 16
## 37 53 24 16
## 38 53 24 12
## 39 57 NA NA
## 40 61 NA NA
## 41 62 37 14
## 42 70 38 14
## paternal.cigarettes paternal.height low.birthweight geriatric.pregnancy
## 1 0 175 0 TRUE
## 2 NA NA 0 FALSE
## 3 0 183 0 FALSE
## 4 50 178 0 FALSE
## 5 25 173 1 TRUE
## 6 0 171 0 FALSE
## 7 25 181 0 FALSE
## 8 35 180 1 FALSE
## 9 25 185 0 TRUE
## 10 50 186 1 FALSE
## 11 50 191 0 FALSE
## 12 0 184 0 FALSE
## 13 25 NA 1 FALSE
## 14 0 183 0 FALSE
## 15 2 181 0 FALSE
## 16 0 178 0 FALSE
## 17 25 170 0 FALSE
## 18 25 183 0 FALSE
## 19 12 185 0 FALSE
## 20 25 175 0 FALSE
## 21 0 173 0 FALSE
## 22 0 179 0 FALSE
## 23 0 183 0 FALSE
## 24 50 180 0 FALSE
## 25 NA 182 0 FALSE
## 26 7 172 0 FALSE
## 27 0 181 0 FALSE
## 28 NA NA 0 FALSE
## 29 25 189 0 FALSE
## 30 25 185 0 FALSE
## 31 25 185 0 FALSE
## 32 0 173 0 TRUE
## 33 0 169 1 FALSE
## 34 35 179 0 FALSE
## 35 35 185 1 FALSE
## 36 50 194 0 FALSE
## 37 12 176 0 FALSE
## 38 7 179 0 FALSE
## 39 NA NA 0 FALSE
## 40 NA NA 0 FALSE
## 41 0 170 0 FALSE
## 42 25 180 0 FALSE
## miR-16 miR-21 miR-146a miR-182
## 1 46 52 98 53
## 2 46 39 105 40
## 3 60 46 87 52
## 4 49 43 91 55
## 5 60 48 89 65
## 6 68 46 125 60
## 7 54 35 96 41
## 8 70 51 84 48
## 9 65 58 101 60
## 10 61 51 97 59
## 11 60 44 94 49
## 12 63 54 106 60
## 13 47 40 87 63
## 14 62 55 96 62
## 15 52 43 92 58
## 16 68 47 98 57
## 17 46 42 103 64
## 18 56 46 101 50
## 19 63 45 104 51
## 20 59 56 101 74
## 21 58 55 102 53
## 22 54 52 86 49
## 23 69 41 98 51
## 24 59 47 96 50
## 25 77 55 124 56
## 26 55 52 93 52
## 27 42 54 107 63
## 28 57 49 116 48
## 29 57 46 86 43
## 30 66 48 102 50
## 31 55 45 97 50
## 32 49 42 88 49
## 33 64 47 104 42
## 34 70 57 111 46
## 35 61 47 77 60
## 36 56 59 84 46
## 37 46 44 105 60
## 38 56 43 97 45
## 39 50 44 111 67
## 40 63 39 104 43
## 41 49 48 98 55
## 42 46 40 98 57
Dalam data “rapi” (tidy), setiap kolom adalah variabel, setiap baris adalah observasi, dan setiap sel berisi satu nilai. Apakah data frame birthweight sudah rapi? Mengapa atau mengapa tidak?
Tidyverse menggunakan pipa (piping) untuk mengirim keluaran dari satu fungsi ke fungsi berikutnya, alih-alih pemanggilan fungsi bersarang (nesting) yang digunakan di R dasar. “Pipa” ditulis dengan simbol lebih besar dari (>) yang diapit oleh dua tanda persen, seperti ini: %>%.
# Keep only the low-birth-weight infants (flag stored as 0/1), then narrow
# the table to four columns of interest.
experiment %>%
  filter(low.birthweight == 1L) %>%
  select(birth.date, length, birthweight, smoker)
## birth.date length birthweight smoker
## 1 7/18/1968 46 2.05 yes
## 2 9/16/1968 48 1.92 yes
## 3 1/8/1968 47 2.66 yes
## 4 3/23/1967 50 2.51 yes
## 5 9/27/1968 43 2.65 no
## 6 4/2/1968 48 2.37 yes
# Equivalent to the pipeline above, written in base R. which() discards rows
# where the comparison is NA, matching filter(); a bare logical index would
# instead return all-NA rows for them.
experiment[
  which(experiment$low.birthweight == 1L),
  c("birth.date", "length", "birthweight", "smoker")
]
## birth.date length birthweight smoker
## 5 7/18/1968 46 2.05 yes
## 8 9/16/1968 48 1.92 yes
## 10 1/8/1968 47 2.66 yes
## 13 3/23/1967 50 2.51 yes
## 33 9/27/1968 43 2.65 no
## 35 4/2/1968 48 2.37 yes
Fungsi separate() membuat pemecahan kolom “birth.date” menjadi kolom “month”, “day”, dan “year” menjadi sangat mudah.
# Low-birth-weight infants, with the birth date split on "/" into separate
# month, day and year columns.
experiment %>%
  filter(low.birthweight == 1L) %>%
  select(birth.date, length, birthweight, smoker) %>%
  separate(
    col = birth.date,
    into = c("month", "day", "year"),
    sep = "[/]"
  )
## month day year length birthweight smoker
## 1 7 18 1968 46 2.05 yes
## 2 9 16 1968 48 1.92 yes
## 3 1 8 1968 47 2.66 yes
## 4 3 23 1967 50 2.51 yes
## 5 9 27 1968 43 2.65 no
## 6 4 2 1968 48 2.37 yes
Fungsi mutate() menambahkan kolom baru berdasarkan data yang terdapat di kolom yang sudah ada.
# Low-birth-weight infants, with a derived column d holding the ratio of
# birth weight to length.
experiment %>%
  filter(low.birthweight == 1L) %>%
  select(birth.date, length, birthweight, smoker) %>%
  mutate(d = birthweight / length)
## birth.date length birthweight smoker d
## 1 7/18/1968 46 2.05 yes 0.04456522
## 2 9/16/1968 48 1.92 yes 0.04000000
## 3 1/8/1968 47 2.66 yes 0.05659574
## 4 3/23/1967 50 2.51 yes 0.05020000
## 5 9/27/1968 43 2.65 no 0.06162791
## 6 4/2/1968 48 2.37 yes 0.04937500
Fungsi group_by() dan summarize() menerapkan suatu fungsi pada grup-grup yang ditentukan oleh satu atau lebih variabel kategorikal.
# Mean birth weight for each level of the smoker column.
experiment %>%
  group_by(smoker) %>%
  summarize(mean.birthweight = mean(birthweight))
## # A tibble: 2 × 2
## smoker mean.birthweight
## <chr> <dbl>
## 1 no 3.51
## 2 yes 3.13
# Mean birth weight for every combination of smoker and low.birthweight.
# summarize() peels off the last grouping variable, so the result remains
# grouped by smoker.
experiment %>%
  group_by(smoker, low.birthweight) %>%
  summarize(mean.birthweight = mean(birthweight))
## `summarise()` has grouped output by 'smoker'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: smoker [2]
## smoker low.birthweight mean.birthweight
## <chr> <int> <dbl>
## 1 no 0 3.55
## 2 no 1 2.65
## 3 yes 0 3.38
## 4 yes 1 2.30
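Untuk mengubah urutan baris, gunakan arrange(). Untuk mengembalikan satu atau lebih baris tertentu, gunakan slice(). Varian slice_max() pada contoh berikut mengambil baris dengan nilai terbesar pada suatu kolom di dalam setiap grup.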
# The five heaviest infants within each smoker group. The columns are
# narrowed first; smoker is retained so it can serve as the grouping key.
experiment %>%
  select(smoker, birthweight, length, head.circumference, weeks.gestation) %>%
  group_by(smoker) %>%
  slice_max(birthweight, n = 5)
## # A tibble: 10 × 5
## # Groups: smoker [2]
## smoker birthweight length head.circumference weeks.gestation
## <chr> <dbl> <int> <int> <int>
## 1 no 4.55 56 34 44
## 2 no 4.32 53 36 40
## 3 no 4.1 58 39 41
## 4 no 4.07 53 38 44
## 5 no 3.94 54 37 42
## 6 yes 4.57 58 39 41
## 7 yes 3.87 50 33 45
## 8 yes 3.86 52 36 39
## 9 yes 3.64 53 38 40
## 10 yes 3.59 53 34 40
Fungsi pivot_longer() dan pivot_wider() mengatur ulang data dengan mengurangi atau menambah jumlah kolom. Kegunaannya akan menjadi lebih jelas saat visualisasi.
# Reshape the miRNA measurements of the low-birth-weight infants from one
# column per miRNA into gene/expression pairs, keeping the smoker label.
experiment %>%
  filter(low.birthweight == 1L) %>%
  select(smoker, starts_with("miR-")) %>%
  pivot_longer(
    cols = starts_with("miR-"),
    names_to = "gene",
    values_to = "expression"
  )
## # A tibble: 24 × 3
## smoker gene expression
## <chr> <chr> <int>
## 1 yes miR-16 60
## 2 yes miR-21 48
## 3 yes miR-146a 89
## 4 yes miR-182 65
## 5 yes miR-16 70
## 6 yes miR-21 51
## 7 yes miR-146a 84
## 8 yes miR-182 48
## 9 yes miR-16 61
## 10 yes miR-21 51
## # ℹ 14 more rows