Bioinformatics uses computational tools to analyze biological data. This project demonstrates data wrangling, expression analysis, phylogenetics, and genome-wide analysis using simple example datasets in R.
# Toy sample sheet: five samples (S1-S5), the first two in group "A" and the
# remaining three in group "B", each with one numeric measurement.
sample_tbl <- tibble(
  sample_id = sprintf("S%d", 1:5),
  group = rep(c("A", "B"), times = c(2, 3)),
  value = c(10.5, 12.3, 9.8, 11.2, 10.9)
)

# Auto-print the table.
sample_tbl
## # A tibble: 5 × 3
## sample_id group value
## <chr> <chr> <dbl>
## 1 S1 A 10.5
## 2 S2 A 12.3
## 3 S3 B 9.8
## 4 S4 B 11.2
## 5 S5 B 10.9
# Wide layout: one row per gene and one expression column per condition.
wide_tbl <- tibble(
  gene = sprintf("Gene%d", 1:2),
  condA = c(10, 20),
  condB = c(15, 25)
)

# Long layout: the two condition columns are stacked, so each row holds a
# single (gene, condition, expression) observation.
long_tbl <- pivot_longer(
  wide_tbl,
  cols = c(condA, condB),
  names_to = "condition",
  values_to = "expression"
)
long_tbl
## # A tibble: 4 × 3
## gene condition expression
## <chr> <chr> <dbl>
## 1 Gene1 condA 10
## 2 Gene1 condB 15
## 3 Gene2 condA 20
## 4 Gene2 condB 25
# Long-format expression values whose `condition_rep` label packs two fields
# ("<condition>_<replicate>") into a single string.
long_tbl2 <- tibble(
  gene = c("Gene1", "Gene2", "Gene1", "Gene2"),
  condition_rep = c("A_rep1", "A_rep2", "B_rep1", "B_rep2"),
  expression = c(10, 12, 15, 17)
)

# Split the combined label into `condition` and `replicate` columns.
# separate_wider_delim() replaces the superseded separate(): it produces the
# same columns in the same position, but raises an error (instead of silently
# dropping or NA-filling pieces) if a label does not contain exactly one "_".
long_sep <- long_tbl2 |>
  separate_wider_delim(
    condition_rep,
    delim = "_",
    names = c("condition", "replicate")
  )
long_sep
## # A tibble: 4 × 4
## gene condition replicate expression
## <chr> <chr> <chr> <dbl>
## 1 Gene1 A rep1 10
## 2 Gene2 A rep2 12
## 3 Gene1 B rep1 15
## 4 Gene2 B rep2 17
# Mean expression for every gene/condition pair. `.groups = "drop"` returns an
# ungrouped tibble, so no explicit ungroup() is needed afterwards.
summary_tbl <- summarise(
  group_by(long_sep, gene, condition),
  mean_expression = mean(expression),
  .groups = "drop"
)
summary_tbl
## # A tibble: 4 × 3
## gene condition mean_expression
## <chr> <chr> <dbl>
## 1 Gene1 A 10
## 2 Gene1 B 15
## 3 Gene2 A 12
## 4 Gene2 B 17
# Five measurements, two of which (S2 and S4) are missing.
missing_tbl <- tibble(
  sample_id = sprintf("S%d", 1:5),
  measurement = c(5.1, NA, 4.8, NA, 5.3)
)

# Mean of the observed (non-missing) measurements.
mean_value <- mean(missing_tbl[["measurement"]], na.rm = TRUE)

# Mean imputation: keep observed values and fill each NA with `mean_value`.
# The original column is left untouched; the filled values go in a new column.
missing_imputed <- mutate(
  missing_tbl,
  measurement_imputed = coalesce(measurement, mean_value)
)
missing_imputed
## # A tibble: 5 × 3
## sample_id measurement measurement_imputed
## <chr> <dbl> <dbl>
## 1 S1 5.1 5.1
## 2 S2 NA 5.07
## 3 S3 4.8 4.8
## 4 S4 NA 5.07
## 5 S5 5.3 5.3
# Make sure stringr is available (installing it on first use), then attach it.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

# Example identifiers: two with an "ENSG" prefix and one with a "LOC" prefix.
gene_ids <- c("ENSG000001234", "ENSG000009876", "LOC123456")

# Flag the identifiers that begin with "ENSG".
str_detect(string = gene_ids, pattern = "^ENSG")
## [1] TRUE TRUE FALSE
# Pull the first run of digits out of each identifier.
str_extract(string = gene_ids, pattern = "[0-9]+")
## [1] "000001234" "000009876" "123456"
# Optional limma example: fit a two-group linear model to simulated expression
# values. Skipped with a message when limma is not installed.
if (!requireNamespace("limma", quietly = TRUE)) {
  cat("limma not installed.\n")
} else {
  library(limma)

  # 100 genes x 10 samples of standard-normal noise (seeded for repeatability).
  set.seed(1)
  exprs_mat <- matrix(
    rnorm(1000),
    nrow = 100,
    ncol = 10,
    dimnames = list(paste0("Gene", 1:100), NULL)
  )

  # Five Control samples followed by five Treatment samples.
  group <- gl(2, 5, labels = c("Control", "Treatment"))
  design <- model.matrix(~ group)

  # Per-gene linear fit followed by empirical-Bayes moderation.
  fit <- eBayes(lmFit(exprs_mat, design))
  head(topTable(fit))
}
## limma not installed.
# Simulated count data for the differential-expression example:
# 20 genes x 10 samples (five Control, then five Treatment).
#
# The previous version drew only 100 counts with nrow = 20, which yields a
# 20 x 5 matrix; that does not match the 10-entry `group` factor, so
# DGEList() could not be built. The matrix is now sized from `group`, and both
# dimensions are named so sample names are available to downstream code.
# The simulation needs only base R, so it runs whether or not edgeR is present.
set.seed(123)
group <- factor(rep(c("Control", "Treatment"), each = 5))
counts <- matrix(
  rnbinom(20 * length(group), mu = 20, size = 1),
  nrow = 20,
  dimnames = list(
    paste0("Gene", 1:20),
    paste0("Sample", seq_along(group))
  )
)

# Optional edgeR example: negative-binomial GLM with a likelihood-ratio test
# on the second design column (Treatment vs Control). Skipped with a message
# when edgeR is not installed.
if (requireNamespace("edgeR", quietly = TRUE)) {
  library(edgeR)
  dge <- DGEList(counts = counts, group = group)
  dge <- calcNormFactors(dge)
  design <- model.matrix(~ group)
  dge <- estimateDisp(dge, design)
  fit <- glmFit(dge, design)
  lrt <- glmLRT(fit, coef = 2)
  head(topTags(lrt))
} else {
  cat("edgeR not installed.\n")
}
## edgeR not installed.
# Optional DESeq2 example: Control vs Treatment on simulated counts.
# Skipped with a message when DESeq2 is not installed.
#
# This block builds its own inputs. It previously reused `counts` and `group`,
# which are only created when an earlier optional example actually ran, and it
# took sample names from colnames(counts), which had never been set.
if (requireNamespace("DESeq2", quietly = TRUE)) {
  library(DESeq2)

  # 20 genes x 10 samples (five Control, then five Treatment), seeded for
  # repeatability. rnbinom() returns integers, as DESeq2 expects for counts.
  set.seed(123)
  deseq_condition <- factor(rep(c("Control", "Treatment"), each = 5))
  deseq_counts <- matrix(
    rnbinom(20 * length(deseq_condition), mu = 20, size = 1),
    nrow = 20,
    dimnames = list(
      paste0("Gene", 1:20),
      paste0("Sample", seq_along(deseq_condition))
    )
  )

  # Sample annotation: row names match the count-matrix column names.
  coldata <- data.frame(
    row.names = colnames(deseq_counts),
    condition = deseq_condition
  )
  dds <- DESeqDataSetFromMatrix(
    countData = deseq_counts,
    colData = coldata,
    design = ~ condition
  )
  dds <- DESeq(dds)
  head(results(dds))
} else {
  cat("DESeq2 not installed.\n")
}
## DESeq2 not installed.
# Optional pairwise protein alignment example. Skipped with a message when
# Biostrings is not installed.
if (requireNamespace("Biostrings", quietly = TRUE)) {
  library(Biostrings)
  seq1 <- AAString("MSTNPKPQRIT")
  seq2 <- AAString("MSSSSKPQRTT")

  # Recent Bioconductor releases moved pairwiseAlignment() out of Biostrings
  # into the pwalign package. Prefer pwalign when it is installed; otherwise
  # fall back to the call as originally written for older Biostrings versions.
  if (requireNamespace("pwalign", quietly = TRUE)) {
    pwalign::pairwiseAlignment(seq1, seq2)
  } else {
    pairwiseAlignment(seq1, seq2)
  }
} else {
  cat("Biostrings not installed.\n")
}
## Biostrings not installed.
# Gene start coordinates for two genomes, five genes each. Genes are numbered
# Gene1-Gene10 across both genomes.
synteny_tbl <- tibble(
  genome = rep(paste0("Species", 1:2), each = 5),
  gene = sprintf("Gene%d", 1:10),
  start = c(
    1, 100, 200, 300, 400, # Species1
    5, 105, 205, 305, 405  # Species2
  )
)
synteny_tbl
## # A tibble: 10 × 3
## genome gene start
## <chr> <chr> <dbl>
## 1 Species1 Gene1 1
## 2 Species1 Gene2 100
## 3 Species1 Gene3 200
## 4 Species1 Gene4 300
## 5 Species1 Gene5 400
## 6 Species2 Gene6 5
## 7 Species2 Gene7 105
## 8 Species2 Gene8 205
## 9 Species2 Gene9 305
## 10 Species2 Gene10 405
(fixed version)
# Nine consecutive 1 kb windows on chr1 (1-1000, 1001-2000, ..., 8001-9000),
# with the annotation flag alternating TRUE/FALSE starting at TRUE.
genome_regions <- tibble(
  chr = rep("chr1", 9),
  start = (0:8) * 1000 + 1,
  end = (1:9) * 1000,
  annotated = rep(c(TRUE, FALSE), length.out = 9)
)
genome_regions
## # A tibble: 9 × 4
## chr start end annotated
## <chr> <dbl> <dbl> <lgl>
## 1 chr1 1 1000 TRUE
## 2 chr1 1001 2000 FALSE
## 3 chr1 2001 3000 TRUE
## 4 chr1 3001 4000 FALSE
## 5 chr1 4001 5000 TRUE
## 6 chr1 5001 6000 FALSE
## 7 chr1 6001 7000 TRUE
## 8 chr1 7001 8000 FALSE
## 9 chr1 8001 9000 TRUE
# Optional ape example: generate a random 10-tip tree (seeded so the same tree
# is produced on every run) and print its summary. Skipped with a message when
# ape is not installed.
if (!requireNamespace("ape", quietly = TRUE)) {
  cat("ape not installed.\n")
} else {
  library(ape)
  set.seed(42)
  tree <- rtree(n = 10)
  tree
}
##
## Phylogenetic tree with 10 tips and 9 internal nodes.
##
## Tip labels:
## t10, t1, t8, t7, t4, t9, ...
##
## Rooted; includes branch length(s).
# Placeholder for the treespace example: only a note is printed here.
cat("Treespace package not installed or not required for this example.\n", sep = "")
## Treespace package not installed or not required for this example.
# Prune the first three tips from `tree` and print the resulting tree.
# Requires both an existing `tree` object and the ape package; otherwise only
# a message is printed.
if (!exists("tree") || !requireNamespace("ape", quietly = TRUE)) {
  cat("Tree not found.\n")
} else {
  library(ape)
  drop.tip(tree, tip = tree[["tip.label"]][seq_len(3)])
}
##
## Phylogenetic tree with 7 tips and 6 internal nodes.
##
## Tip labels:
## t7, t4, t9, t5, t2, t3, ...
##
## Rooted; includes branch length(s).
# Placeholder for the phangorn example: only a note is printed here.
cat(
  "phangorn example not run in this environment, but the package can be used for maximum likelihood phylogenetic analysis.\n",
  sep = ""
)
## phangorn example not run in this environment, but the package can be used for maximum likelihood phylogenetic analysis.
# Optional VariantAnnotation example: attach the package when it is installed
# and print a note; otherwise report that it is missing.
if (!requireNamespace("VariantAnnotation", quietly = TRUE)) {
  cat("VariantAnnotation not installed.\n")
} else {
  library(VariantAnnotation)
  cat("VariantAnnotation can read VCF files.\n")
}
## VariantAnnotation not installed.
# Locate open reading frames (ORFs) on the forward strand of a DNA sequence.
#
# An ORF here runs from a start codon to the first in-frame stop codon after
# it (stop codon included). Start codons with no in-frame stop are ignored.
#
# Arguments:
#   dna         A single sequence: a character string or any object that
#               as.character() converts to one. Case-insensitive.
#   start_codon Start codon to search for.
#   stop_codons Codons that terminate an ORF.
#   min_length  Minimum ORF width in nucleotides (start and stop included).
#
# Returns a data.frame with integer columns `start`, `end` (1-based,
# inclusive) and `width`, one row per ORF, ordered by start position.
find_orfs <- function(dna, start_codon = "ATG",
                      stop_codons = c("TAA", "TAG", "TGA"),
                      min_length = 0L) {
  bases <- toupper(as.character(dna))
  stopifnot(length(bases) == 1L, !is.na(bases))

  no_orfs <- data.frame(
    start = integer(0),
    end = integer(0),
    width = integer(0)
  )
  n_bases <- nchar(bases)
  # A start codon plus a stop codon needs at least six bases.
  if (n_bases < 6L) {
    return(no_orfs)
  }

  # Every overlapping 3-mer, indexed by the position of its first base.
  pos <- seq_len(n_bases - 2L)
  codons <- substring(bases, pos, pos + 2L)
  starts <- which(codons == start_codon)
  stops <- which(codons %in% stop_codons)

  # For each start, find the last base of the first stop codon that lies
  # downstream in the same reading frame (NA when there is none).
  ends <- vapply(
    starts,
    function(s) {
      in_frame <- stops[stops > s & (stops - s) %% 3L == 0L]
      if (length(in_frame) > 0L) in_frame[1L] + 2L else NA_integer_
    },
    integer(1)
  )

  orfs <- data.frame(start = starts, end = ends, width = ends - starts + 1L)
  orfs <- orfs[!is.na(orfs$end) & orfs$width >= min_length, , drop = FALSE]
  rownames(orfs) <- NULL
  orfs
}

# Optional ORF example on a short DNA sequence. Skipped with a message when
# Biostrings is not installed.
#
# Biostrings does not export a findORFs() function, so the earlier call failed
# whenever Biostrings was installed; the base-R helper above is used instead.
# Both start codons in this sequence end at the same stop codon (ORFs of 24
# and 12 nt), so the 30-nt minimum returns an empty table.
if (requireNamespace("Biostrings", quietly = TRUE)) {
  library(Biostrings)
  dna <- DNAString("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
  find_orfs(dna, start_codon = "ATG", min_length = 30)
} else {
  cat("Biostrings not installed.\n")
}
## Biostrings not installed.
# Optional karyoploteR example: attach the package when it is installed and
# print a note; otherwise report that it is missing.
if (!requireNamespace("karyoploteR", quietly = TRUE)) {
  cat("karyoploteR not installed.\n")
} else {
  library(karyoploteR)
  cat("karyoploteR could plot chromosomes here.\n")
}
## karyoploteR not installed.
This project demonstrates foundational bioinformatics workflows using R, including data wrangling, expression analysis, phylogenetics, and genome-wide exploration. These basic tools support modern biological research.
# Print the R version, platform, locale, and attached/loaded package versions
# for this run, so the results above can be tied to a specific environment.
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ape_5.8-1 lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [5] dplyr_1.1.4 purrr_1.2.0 readr_2.1.5 tidyr_1.3.1
## [9] tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 Rcpp_1.1.0
## [5] tidyselect_1.2.1 parallel_4.5.1 jquerylib_0.1.4 scales_1.4.0
## [9] yaml_2.3.10 fastmap_1.2.0 lattice_0.22-7 R6_2.6.1
## [13] generics_0.1.4 knitr_1.50 bslib_0.9.0 pillar_1.11.1
## [17] RColorBrewer_1.1-3 tzdb_0.5.0 rlang_1.1.6 utf8_1.2.6
## [21] stringi_1.8.7 cachem_1.1.0 xfun_0.54 sass_0.4.10
## [25] S7_0.2.0 timechange_0.3.0 cli_3.6.5 withr_3.0.2
## [29] magrittr_2.0.4 digest_0.6.38 grid_4.5.1 rstudioapi_0.17.1
## [33] hms_1.1.4 nlme_3.1-168 lifecycle_1.0.4 vctrs_0.6.5
## [37] evaluate_1.0.5 glue_1.8.0 farver_2.1.2 rmarkdown_2.30
## [41] tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1