# Fetch the two course data sets from the training repository into the
# working directory, then load each one into a data frame.

# Birth records, read with read.csv() defaults.
download.file(
  url = "https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2022_February_Introduction_to_R_for_Bioinformatics/main/birthweight.csv",
  destfile = "birthweight.csv"
)
birthweight <- read.csv(file = "birthweight.csv")

# miRNA expression values; the first CSV column supplies the row names.
download.file(
  url = "https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2022_February_Introduction_to_R_for_Bioinformatics/main/miRNA.csv",
  destfile = "miRNA.csv"
)
mir <- read.csv(file = "miRNA.csv", row.names = 1)

# Display the expression table as loaded.
print(mir)
## sample.27 sample.1522 sample.569 sample.365 sample.1369 sample.1023
## miR-16 46 56 47 54 56 59
## miR-21 52 43 40 35 59 47
## miR-146a 98 97 87 96 84 96
## miR-182 53 45 63 41 46 50
## sample.1272 sample.1262 sample.575 sample.792 sample.752 sample.619
## miR-16 49 55 62 63 46 52
## miR-21 42 45 55 45 42 43
## miR-146a 88 97 96 104 103 92
## miR-182 49 50 62 51 64 58
## sample.1764 sample.516 sample.272 sample.1388 sample.1363 sample.300
## miR-16 46 61 49 46 61 60
## miR-21 40 51 43 44 47 48
## miR-146a 98 97 91 105 77 89
## miR-182 57 59 55 60 60 65
## sample.57 sample.431 sample.532 sample.223 sample.1107 sample.697
## miR-16 46 70 60 60 57 68
## miR-21 39 51 44 46 49 47
## miR-146a 105 84 94 87 116 98
## miR-182 40 48 49 52 48 57
## sample.1683 sample.808 sample.822 sample.1016 sample.820 sample.1081
## miR-16 49 59 54 69 58 55
## miR-21 48 56 52 41 55 52
## miR-146a 98 101 86 98 102 93
## miR-182 55 74 49 51 53 52
## sample.321 sample.1636 sample.1360 sample.1058 sample.755 sample.462
## miR-16 68 63 70 77 56 65
## miR-21 46 39 57 55 46 58
## miR-146a 125 104 111 124 101 101
## miR-182 60 43 46 56 50 60
## sample.1088 sample.553 sample.1191 sample.1313 sample.1600 sample.1187
## miR-16 42 63 66 64 50 57
## miR-21 54 54 48 47 44 46
## miR-146a 107 106 102 104 111 86
## miR-182 63 60 50 42 67 43
# Transpose so that each sample becomes a row and each miRNA a column;
# t() returns a matrix, so convert the result back to a data frame.
mir <- as.data.frame(t(mir))

# Derive the sample ID from the row names by stripping the literal
# "sample." prefix. The pattern is anchored and the dot is escaped so that
# only that exact leading prefix is removed (an unescaped "." would match
# any character, and gsub() would also strip matches later in the name).
mir$ID <- sub("^sample\\.", "", rownames(mir))

# Join the birth records with the expression values on the shared ID column.
# Naming the key explicitly prevents an accidental join on any other column
# the two tables might come to share. merge() keeps only IDs present in both.
experiment <- merge(birthweight, mir, by = "ID")
experiment
## ID birth.date location length birthweight head.circumference
## 1 27 3/9/1967 Silver Hill 53 3.55 37
## 2 57 8/12/1968 Memorial 51 3.32 38
## 3 223 12/11/1968 General 50 3.87 33
## 4 272 1/10/1968 Memorial 52 3.86 36
## 5 300 7/18/1968 Silver Hill 46 2.05 32
## 6 321 1/21/1968 Silver Hill 48 3.11 33
## 7 365 4/23/1967 Memorial 52 3.53 37
## 8 431 9/16/1968 Silver Hill 48 1.92 30
## 9 462 6/19/1968 Silver Hill 58 4.10 39
## 10 516 1/8/1968 Silver Hill 47 2.66 33
## 11 532 10/25/1968 General 53 3.59 34
## 12 553 8/17/1968 Silver Hill 54 3.94 37
## 13 569 3/23/1967 Memorial 50 2.51 35
## 14 575 7/12/1967 Memorial 50 2.78 30
## 15 619 11/1/1967 Memorial 52 3.41 33
## 16 697 2/6/1967 Silver Hill 48 3.03 35
## 17 752 10/19/1967 General 49 3.32 36
## 18 755 4/25/1968 Memorial 53 3.20 33
## 19 792 9/7/1967 Memorial 53 3.64 38
## 20 808 5/5/1967 Silver Hill 48 2.92 33
## 21 820 10/7/1967 General 52 3.77 34
## 22 822 6/14/1967 Memorial 50 3.42 35
## 23 1016 7/13/1967 Silver Hill 53 4.32 36
## 24 1023 6/7/1967 Memorial 52 3.00 35
## 25 1058 4/24/1968 Silver Hill 53 3.15 34
## 26 1081 12/14/1967 Silver Hill 54 3.63 38
## 27 1088 7/24/1968 General 51 3.27 36
## 28 1107 1/25/1967 General 52 3.23 36
## 29 1187 12/19/1968 Silver Hill 53 4.07 38
## 30 1191 9/7/1968 General 53 3.65 33
## 31 1262 6/25/1967 Silver Hill 53 3.19 34
## 32 1272 6/20/1967 Memorial 53 2.75 32
## 33 1313 9/27/1968 Silver Hill 43 2.65 32
## 34 1360 2/16/1968 General 56 4.55 34
## 35 1363 4/2/1968 General 48 2.37 30
## 36 1369 6/4/1967 Silver Hill 49 3.18 34
## 37 1388 2/22/1968 Memorial 51 3.14 33
## 38 1522 3/13/1967 Memorial 50 2.74 33
## 39 1600 10/9/1968 General 53 2.90 34
## 40 1636 2/2/1968 Silver Hill 51 3.93 38
## 41 1683 2/14/1967 Silver Hill 53 3.35 33
## 42 1764 12/7/1967 Silver Hill 58 4.57 39
## weeks.gestation smoker maternal.age maternal.cigarettes maternal.height
## 1 41 yes 37 25 161
## 2 39 yes 23 17 157
## 3 45 yes 28 25 163
## 4 39 yes 30 25 170
## 5 35 yes 41 7 166
## 6 37 no 28 0 158
## 7 40 yes 26 25 170
## 8 33 yes 20 7 161
## 9 41 no 35 0 172
## 10 35 yes 20 35 170
## 11 40 yes 31 12 163
## 12 42 no 24 0 175
## 13 39 yes 22 7 159
## 14 37 yes 19 7 165
## 15 39 yes 23 25 181
## 16 39 no 27 0 162
## 17 40 yes 27 12 152
## 18 41 no 21 0 155
## 19 40 yes 20 2 170
## 20 34 no 26 0 167
## 21 40 no 24 0 157
## 22 38 no 20 0 157
## 23 40 no 19 0 171
## 24 38 yes 30 12 165
## 25 40 no 29 0 167
## 26 38 no 18 0 172
## 27 40 no 24 0 168
## 28 38 no 31 0 164
## 29 44 no 20 0 174
## 30 42 no 21 0 165
## 31 41 yes 27 35 163
## 32 40 yes 37 50 168
## 33 33 no 24 0 149
## 34 44 no 20 0 162
## 35 37 yes 20 7 163
## 36 38 yes 31 25 162
## 37 41 yes 22 7 160
## 38 39 yes 21 17 156
## 39 39 no 19 0 165
## 40 38 no 29 0 165
## 41 41 no 27 0 164
## 42 41 yes 32 12 173
## maternal.prepregnant.weight paternal.age paternal.education
## 1 66 46 NA
## 2 48 NA NA
## 3 54 30 16
## 4 78 40 16
## 5 57 37 14
## 6 54 39 10
## 7 62 30 10
## 8 50 20 10
## 9 58 31 16
## 10 57 23 12
## 11 49 41 12
## 12 66 30 12
## 13 52 23 14
## 14 60 20 14
## 15 69 23 16
## 16 62 27 14
## 17 48 37 12
## 18 55 25 14
## 19 59 24 12
## 20 64 25 12
## 21 50 31 16
## 22 48 22 14
## 23 62 19 12
## 24 64 38 14
## 25 60 30 16
## 26 50 20 12
## 27 53 29 16
## 28 57 NA NA
## 29 68 26 14
## 30 61 21 10
## 31 51 31 16
## 32 61 31 16
## 33 45 26 16
## 34 57 23 10
## 35 47 20 10
## 36 57 32 16
## 37 53 24 16
## 38 53 24 12
## 39 57 NA NA
## 40 61 NA NA
## 41 62 37 14
## 42 70 38 14
## paternal.cigarettes paternal.height low.birthweight geriatric.pregnancy
## 1 0 175 0 TRUE
## 2 NA NA 0 FALSE
## 3 0 183 0 FALSE
## 4 50 178 0 FALSE
## 5 25 173 1 TRUE
## 6 0 171 0 FALSE
## 7 25 181 0 FALSE
## 8 35 180 1 FALSE
## 9 25 185 0 TRUE
## 10 50 186 1 FALSE
## 11 50 191 0 FALSE
## 12 0 184 0 FALSE
## 13 25 NA 1 FALSE
## 14 0 183 0 FALSE
## 15 2 181 0 FALSE
## 16 0 178 0 FALSE
## 17 25 170 0 FALSE
## 18 25 183 0 FALSE
## 19 12 185 0 FALSE
## 20 25 175 0 FALSE
## 21 0 173 0 FALSE
## 22 0 179 0 FALSE
## 23 0 183 0 FALSE
## 24 50 180 0 FALSE
## 25 NA 182 0 FALSE
## 26 7 172 0 FALSE
## 27 0 181 0 FALSE
## 28 NA NA 0 FALSE
## 29 25 189 0 FALSE
## 30 25 185 0 FALSE
## 31 25 185 0 FALSE
## 32 0 173 0 TRUE
## 33 0 169 1 FALSE
## 34 35 179 0 FALSE
## 35 35 185 1 FALSE
## 36 50 194 0 FALSE
## 37 12 176 0 FALSE
## 38 7 179 0 FALSE
## 39 NA NA 0 FALSE
## 40 NA NA 0 FALSE
## 41 0 170 0 FALSE
## 42 25 180 0 FALSE
## miR-16 miR-21 miR-146a miR-182
## 1 46 52 98 53
## 2 46 39 105 40
## 3 60 46 87 52
## 4 49 43 91 55
## 5 60 48 89 65
## 6 68 46 125 60
## 7 54 35 96 41
## 8 70 51 84 48
## 9 65 58 101 60
## 10 61 51 97 59
## 11 60 44 94 49
## 12 63 54 106 60
## 13 47 40 87 63
## 14 62 55 96 62
## 15 52 43 92 58
## 16 68 47 98 57
## 17 46 42 103 64
## 18 56 46 101 50
## 19 63 45 104 51
## 20 59 56 101 74
## 21 58 55 102 53
## 22 54 52 86 49
## 23 69 41 98 51
## 24 59 47 96 50
## 25 77 55 124 56
## 26 55 52 93 52
## 27 42 54 107 63
## 28 57 49 116 48
## 29 57 46 86 43
## 30 66 48 102 50
## 31 55 45 97 50
## 32 49 42 88 49
## 33 64 47 104 42
## 34 70 57 111 46
## 35 61 47 77 60
## 36 56 59 84 46
## 37 46 44 105 60
## 38 56 43 97 45
## 39 50 44 111 67
## 40 63 39 104 43
## 41 49 48 98 55
## 42 46 40 98 57
# Install any packages this tutorial needs that are not yet available.
#
# requireNamespace() is used to test for each package instead of scanning the
# whole library with installed.packages() once per package, which is slow and
# which the R documentation advises against for checking a named package.

# BiocManager itself comes from CRAN and is needed to install the rest.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Packages installed through BiocManager when missing.
required_pkgs <- c("ggplot2", "tidyr", "dplyr", "magrittr", "viridis")
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
The tidyverse is a collection of packages by the creators of RStudio that share an approach to data science.
The authors model data science like this:
Data science life cycle
The tidyverse packages replace some of the base R functions with alternatives that are intended to be more user friendly for data scientists who are following this life cycle.
We will only be covering a few of the packages from the tidyverse.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## 7.1 Defining tidy data — In “tidy” data, every column is a variable, every row is an observation, and every cell contains a single value. Is the birthweight data frame tidy? Why or why not?
## 7.2 Pipes: combining tidyverse functions — The tidyverse employs piping to send the output of one function to another function, rather than the nesting used in base R. The “pipe” is written with a greater-than symbol sandwiched between two percent signs, like this: %>%.
# Keep only the low-birthweight babies (the flag column is coded 0/1),
# then reduce the table to the four columns of interest.
experiment %>%
  filter(low.birthweight == 1) %>%
  select(c(birth.date, length, birthweight, smoker))
## birth.date length birthweight smoker
## 1 7/18/1968 46 2.05 yes
## 2 9/16/1968 48 1.92 yes
## 3 1/8/1968 47 2.66 yes
## 4 3/23/1967 50 2.51 yes
## 5 9/27/1968 43 2.65 no
## 6 4/2/1968 48 2.37 yes
## 7.3 Transforming data — The separate() function makes the conversion of the “birth.date” column into “month,” “day,” and “year” columns trivial.
# Same low-birthweight subset as before, with the birth date split at each
# "/" into three new columns that replace the original one.
experiment %>%
  filter(low.birthweight == 1) %>%
  select(c(birth.date, length, birthweight, smoker)) %>%
  separate(
    col = birth.date,
    into = c("month", "day", "year"),
    sep = "[/]"
  )
## month day year length birthweight smoker
## 1 7 18 1968 46 2.05 yes
## 2 9 16 1968 48 1.92 yes
## 3 1 8 1968 47 2.66 yes
## 4 3 23 1967 50 2.51 yes
## 5 9 27 1968 43 2.65 no
## 6 4 2 1968 48 2.37 yes
The mutate() function adds a new column based on data contained in the existing columns.
# Low-birthweight subset with an added column `d`: birthweight divided by
# length, computed row by row from the existing columns.
experiment %>%
  filter(low.birthweight == 1) %>%
  select(c(birth.date, length, birthweight, smoker)) %>%
  mutate(
    d = birthweight / length
  )
## birth.date length birthweight smoker d
## 1 7/18/1968 46 2.05 yes 0.04456522
## 2 9/16/1968 48 1.92 yes 0.04000000
## 3 1/8/1968 47 2.66 yes 0.05659574
## 4 3/23/1967 50 2.51 yes 0.05020000
## 5 9/27/1968 43 2.65 no 0.06162791
## 6 4/2/1968 48 2.37 yes 0.04937500
## 7.4 Summarizing data — The group_by() and summarize() functions apply a function to each group defined by one or more categorical variables.
# Mean birthweight for each smoking status: one output row per value of
# `smoker`.
experiment %>%
  group_by(smoker) %>%
  summarise(
    mean.birthweight = mean(birthweight)
  )
## # A tibble: 2 × 2
## smoker mean.birthweight
## <chr> <dbl>
## 1 no 3.51
## 2 yes 3.13
# Mean birthweight for every combination of smoking status and the
# low-birthweight flag. With two grouping variables, summarise() reports
# that the result is still grouped by `smoker`.
experiment %>%
  group_by(smoker, low.birthweight) %>%
  summarise(
    mean.birthweight = mean(birthweight)
  )
## `summarise()` has grouped output by 'smoker'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: smoker [2]
## smoker low.birthweight mean.birthweight
## <chr> <int> <dbl>
## 1 no 0 3.55
## 2 no 1 2.65
## 3 yes 0 3.38
## 4 yes 1 2.30
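To change the order of rows, use arrange(). To return one or more specified rows, use slice() or one of its variants; the example below uses slice_max() to return the rows with the largest values of a variable within each group.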
# The five heaviest babies within each smoking status.
# Narrow the table to the columns of interest first, then group by `smoker`
# so that slice_max() picks the top rows separately inside each group.
experiment %>%
  select(smoker, birthweight, length, head.circumference, weeks.gestation) %>%
  group_by(smoker) %>%
  slice_max(birthweight, n = 5)
## # A tibble: 10 × 5
## # Groups: smoker [2]
## smoker birthweight length head.circumference weeks.gestation
## <chr> <dbl> <int> <int> <int>
## 1 no 4.55 56 34 44
## 2 no 4.32 53 36 40
## 3 no 4.1 58 39 41
## 4 no 4.07 53 38 44
## 5 no 3.94 54 37 42
## 6 yes 4.57 58 39 41
## 7 yes 3.87 50 33 45
## 8 yes 3.86 52 36 39
## 9 yes 3.64 53 38 40
## 10 yes 3.59 53 34 40
The pivot_longer() and pivot_wider() functions reshape data: pivot_longer() decreases the number of columns, while pivot_wider() increases it. Their usefulness will become more evident during visualization.
# Reshape the miRNA measurements of the low-birthweight babies to long form:
# one row per (baby, miRNA) pair, with the miRNA name in `gene` and its
# value in `expression`.
experiment %>%
  filter(low.birthweight == 1) %>%
  select(smoker, `miR-16`, `miR-21`, `miR-146a`, `miR-182`) %>%
  # After the select() above, every column except `smoker` is a miRNA column.
  pivot_longer(
    cols = -smoker,
    names_to = "gene",
    values_to = "expression"
  )
## # A tibble: 24 × 3
## smoker gene expression
## <chr> <chr> <int>
## 1 yes miR-16 60
## 2 yes miR-21 48
## 3 yes miR-146a 89
## 4 yes miR-182 65
## 5 yes miR-16 70
## 6 yes miR-21 51
## 7 yes miR-146a 84
## 8 yes miR-182 48
## 9 yes miR-16 61
## 10 yes miR-21 51
## # ℹ 14 more rows
# References: