Introduction to Bioinformatics with R

Bioinformatics uses computational tools to analyze biological data. This project demonstrates data wrangling, expression analysis, phylogenetics, and genome-wide analysis using simple example datasets in R.

Data Wrangling

Tibbles

# Five demo samples: the first two belong to group A, the last three to B.
sample_tbl <- tibble(
  sample_id = sprintf("S%d", 1:5),
  group = rep(c("A", "B"), times = c(2, 3)),
  value = c(10.5, 12.3, 9.8, 11.2, 10.9)
)
sample_tbl

## # A tibble: 5 × 3
##   sample_id group value
##   <chr>     <chr> <dbl>
## 1 S1        A      10.5
## 2 S2        A      12.3
## 3 S3        B       9.8
## 4 S4        B      11.2
## 5 S5        B      10.9

tidyr

# Wide layout: one row per gene, one numeric column per condition.
wide_tbl <- tribble(
  ~gene,   ~condA, ~condB,
  "Gene1",     10,     15,
  "Gene2",     20,     25
)

# Stack the condition columns into (condition, expression) pairs so that
# each row holds a single gene/condition measurement.
long_tbl <- pivot_longer(
  wide_tbl,
  cols = starts_with("cond"),
  names_to = "condition",
  values_to = "expression"
)
long_tbl

## # A tibble: 4 × 3
##   gene  condition expression
##   <chr> <chr>          <dbl>
## 1 Gene1 condA             10
## 2 Gene1 condB             15
## 3 Gene2 condA             20
## 4 Gene2 condB             25

Separate / Gather

# Long-format expression values whose `condition_rep` column packs two
# variables ("<condition>_<replicate>") into one string.
long_tbl2 <- tibble(
  gene = c("Gene1", "Gene2", "Gene1", "Gene2"),
  condition_rep = c("A_rep1", "A_rep2", "B_rep1", "B_rep2"),
  expression = c(10, 12, 15, 17)
)

# Split the packed column on the literal "_" delimiter.
# separate_wider_delim() replaces the superseded separate(); it puts the new
# columns where `condition_rep` was and errors (instead of warning) if a value
# does not split into exactly two pieces.
long_sep <- long_tbl2 |>
  separate_wider_delim(
    condition_rep,
    delim = "_",
    names = c("condition", "replicate")
  )
long_sep

## # A tibble: 4 × 4
##   gene  condition replicate expression
##   <chr> <chr>     <chr>          <dbl>
## 1 Gene1 A         rep1              10
## 2 Gene2 A         rep2              12
## 3 Gene1 B         rep1              15
## 4 Gene2 B         rep2              17

dplyr

# Mean expression for every gene/condition pair. `.groups = "drop"` returns an
# ungrouped tibble so later verbs are not silently applied per group.
summary_tbl <- summarize(
  group_by(long_sep, gene, condition),
  mean_expression = mean(expression),
  .groups = "drop"
)
summary_tbl

## # A tibble: 4 × 3
##   gene  condition mean_expression
##   <chr> <chr>               <dbl>
## 1 Gene1 A                      10
## 2 Gene1 B                      15
## 3 Gene2 A                      12
## 4 Gene2 B                      17

Missing Values

# Measurements with two missing values (samples S2 and S4).
missing_tbl <- tibble(
  sample_id = paste0("S", seq_len(5)),
  measurement = c(5.1, NA, 4.8, NA, 5.3)
)

# Mean of the observed (non-missing) measurements.
mean_value <- mean(missing_tbl[["measurement"]], na.rm = TRUE)

# Mean imputation: coalesce() keeps each observed value and substitutes
# `mean_value` wherever `measurement` is NA.
missing_imputed <- mutate(
  missing_tbl,
  measurement_imputed = coalesce(measurement, mean_value)
)

missing_imputed

## # A tibble: 5 × 3
##   sample_id measurement measurement_imputed
##   <chr>           <dbl>               <dbl>
## 1 S1                5.1                5.1 
## 2 S2               NA                  5.07
## 3 S3                4.8                4.8 
## 4 S4               NA                  5.07
## 5 S5                5.3                5.3

stringr

# Install stringr on first use, then attach it.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

# Two Ensembl-style identifiers and one LOC identifier.
gene_ids <- c("ENSG000001234", "ENSG000009876", "LOC123456")

# Which identifiers start with the "ENSG" prefix?
str_detect(string = gene_ids, pattern = "^ENSG")

## [1]  TRUE  TRUE FALSE

# First run of digits in each identifier.
str_extract(string = gene_ids, pattern = "[0-9]+")

## [1] "000001234" "000009876" "123456"

Microarrays

# Differential expression on simulated microarray data with limma.
# Runs only when limma is installed; otherwise prints a notice.
if (requireNamespace("limma", quietly = TRUE)) {
  library(limma)
  set.seed(1)
  # 100 genes x 10 samples of standard-normal noise.
  exprs_mat <- matrix(rnorm(1000), nrow = 100, ncol = 10)
  rownames(exprs_mat) <- paste0("Gene", 1:100)
  # Samples 1-5 are Control, samples 6-10 are Treatment.
  group <- factor(rep(c("Control", "Treatment"), each = 5))
  # Columns: "(Intercept)" and "groupTreatment" (Treatment vs Control).
  design <- model.matrix(~ group)
  fit <- lmFit(exprs_mat, design)
  fit <- eBayes(fit)
  # Name the tested coefficient explicitly instead of relying on topTable()
  # dropping the intercept on its own.
  head(topTable(fit, coef = "groupTreatment"))
} else {
  cat("limma not installed.\n")
}

## limma not installed.

RNA-Seq

edgeR

# Differential expression on simulated RNA-Seq counts with edgeR.
# Runs only when edgeR is installed; otherwise prints a notice.
if (requireNamespace("edgeR", quietly = TRUE)) {
  library(edgeR)
  set.seed(123)
  # Samples 1-5 are Control, samples 6-10 are Treatment.
  group <- factor(rep(c("Control", "Treatment"), each = 5))
  # 20 genes x one column per sample. The matrix must have exactly
  # length(group) columns, otherwise DGEList() rejects the group factor.
  counts <- matrix(
    rnbinom(20 * length(group), mu = 20, size = 1),
    nrow = 20,
    dimnames = list(
      paste0("Gene", 1:20),
      paste0("Sample", seq_along(group))
    )
  )

  dge <- DGEList(counts = counts, group = group)
  dge <- calcNormFactors(dge)
  design <- model.matrix(~ group)
  dge <- estimateDisp(dge, design)
  fit <- glmFit(dge, design)
  # coef = 2 is the Treatment-vs-Control column of the design matrix.
  lrt <- glmLRT(fit, coef = 2)
  head(topTags(lrt))
} else {
  cat("edgeR not installed.\n")
}

## edgeR not installed.

DESeq2

# Differential expression on simulated RNA-Seq counts with DESeq2.
# Runs only when DESeq2 is installed; otherwise prints a notice.
if (requireNamespace("DESeq2", quietly = TRUE)) {
  library(DESeq2)
  # Simulate the inputs inside this chunk so it does not depend on objects
  # created by another chunk that may have been skipped.
  set.seed(123)
  # Samples 1-5 are Control, samples 6-10 are Treatment.
  group <- factor(rep(c("Control", "Treatment"), each = 5))
  # 20 genes x one column per sample.
  counts <- matrix(
    rnbinom(20 * length(group), mu = 20, size = 1),
    nrow = 20,
    dimnames = list(
      paste0("Gene", 1:20),
      paste0("Sample", seq_along(group))
    )
  )
  # Sample sheet: one row per count-matrix column, in the same order.
  coldata <- data.frame(
    row.names = colnames(counts),
    condition = group
  )

  dds <- DESeqDataSetFromMatrix(
    countData = counts,
    colData = coldata,
    design = ~ condition
  )
  dds <- DESeq(dds)
  head(results(dds))
} else {
  cat("DESeq2 not installed.\n")
}

## DESeq2 not installed.

Other Tools

Protein Alignment

# Pairwise alignment of two short protein sequences.
# Runs only when Biostrings is installed; otherwise prints a notice.
if (requireNamespace("Biostrings", quietly = TRUE)) {
  library(Biostrings)
  seq1 <- AAString("MSTNPKPQRIT")
  seq2 <- AAString("MSSSSKPQRTT")
  # Recent Bioconductor releases moved pairwiseAlignment() from Biostrings
  # into the pwalign package. Use pwalign when it is available and fall back
  # to the Biostrings function on older installations.
  if (requireNamespace("pwalign", quietly = TRUE)) {
    pwalign::pairwiseAlignment(seq1, seq2)
  } else {
    pairwiseAlignment(seq1, seq2)
  }
} else {
  cat("Biostrings not installed.\n")
}

## Biostrings not installed.

Synteny

# Toy synteny table: five genes per species with their start coordinates.
synteny_tbl <- tibble(
  genome = paste0("Species", rep(1:2, each = 5)),
  gene   = sprintf("Gene%d", 1:10),
  start  = c(
    1, seq(100, 400, by = 100), # Species1
    seq(5, 405, by = 100)       # Species2
  )
)
synteny_tbl

## # A tibble: 10 × 3
##    genome   gene   start
##    <chr>    <chr>  <dbl>
##  1 Species1 Gene1      1
##  2 Species1 Gene2    100
##  3 Species1 Gene3    200
##  4 Species1 Gene4    300
##  5 Species1 Gene5    400
##  6 Species2 Gene6      5
##  7 Species2 Gene7    105
##  8 Species2 Gene8    205
##  9 Species2 Gene9    305
## 10 Species2 Gene10   405

Unannotated Regions

(fixed version)

# Nine consecutive 1 kb windows on chr1; annotation alternates, starting
# with an annotated window.
genome_regions <- tibble(
  chr = rep("chr1", times = 9),
  start = 1000 * (0:8) + 1,
  end   = 1000 * (1:9),
  annotated = rep(c(TRUE, FALSE), length.out = 9)
)
genome_regions

## # A tibble: 9 × 4
##   chr   start   end annotated
##   <chr> <dbl> <dbl> <lgl>    
## 1 chr1      1  1000 TRUE     
## 2 chr1   1001  2000 FALSE    
## 3 chr1   2001  3000 TRUE     
## 4 chr1   3001  4000 FALSE    
## 5 chr1   4001  5000 TRUE     
## 6 chr1   5001  6000 FALSE    
## 7 chr1   6001  7000 TRUE     
## 8 chr1   7001  8000 FALSE    
## 9 chr1   8001  9000 TRUE

Phylogenetic Analysis

Treeio / ape

# Simulate a random 10-tip tree with ape, or print a notice when ape is
# missing.
if (!requireNamespace("ape", quietly = TRUE)) {
  cat("ape not installed.\n")
} else {
  library(ape)
  set.seed(42) # fixed seed so the same tree is generated on every run
  tree <- rtree(n = 10)
  tree
}

## 
## Phylogenetic tree with 10 tips and 9 internal nodes.
## 
## Tip labels:
##   t10, t1, t8, t7, t4, t9, ...
## 
## Rooted; includes branch length(s).

Treespace (fixed version)

# Placeholder: no treespace analysis is run here, only a notice is printed.
cat("Treespace package not installed or not required for this example.\n", sep = "")

## Treespace package not installed or not required for this example.

Subtrees

# Prune the first three tips from `tree`. Skipped with a notice when no
# `tree` object exists or ape is unavailable.
if (!exists("tree") || !requireNamespace("ape", quietly = TRUE)) {
  cat("Tree not found.\n")
} else {
  library(ape)
  drop.tip(tree, tip = tree$tip.label[seq_len(3)])
}

## 
## Phylogenetic tree with 7 tips and 6 internal nodes.
## 
## Tip labels:
##   t7, t4, t9, t5, t2, t3, ...
## 
## Rooted; includes branch length(s).

Phangorn

# Placeholder: no phangorn analysis is run here, only a notice is printed.
cat("phangorn example not run in this environment, but the package can be used for maximum likelihood phylogenetic analysis.\n", sep = "")

## phangorn example not run in this environment, but the package can be used for maximum likelihood phylogenetic analysis.

GWAS

Variant Tools

# Attach VariantAnnotation when it is installed; either way print a one-line
# status message.
if (!requireNamespace("VariantAnnotation", quietly = TRUE)) {
  cat("VariantAnnotation not installed.\n")
} else {
  library(VariantAnnotation)
  cat("VariantAnnotation can read VCF files.\n")
}

## VariantAnnotation not installed.

Predicting ORFs

# Predict open reading frames in a short DNA sequence.
# findORFs() is provided by the ORFik package, not by Biostrings, so both
# packages must be installed for the example to run.
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  cat("Biostrings not installed.\n")
} else if (!requireNamespace("ORFik", quietly = TRUE)) {
  cat("ORFik not installed.\n")
} else {
  library(Biostrings)
  dna <- DNAString("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
  # ORFik measures minimumLength in codons between the start and stop codon.
  # The demo sequence is only 39 nt long, so a small threshold is needed for
  # its single ATG...TGA reading frame to be reported.
  ORFik::findORFs(as.character(dna), startCodon = "ATG", minimumLength = 5)
}

## Biostrings not installed.

karyoploteR

# Attach karyoploteR when it is installed; either way print a one-line
# status message.
if (!requireNamespace("karyoploteR", quietly = TRUE)) {
  cat("karyoploteR not installed.\n")
} else {
  library(karyoploteR)
  cat("karyoploteR could plot chromosomes here.\n")
}

## karyoploteR not installed.

Conclusion

This project demonstrates foundational bioinformatics workflows using R, including data wrangling, expression analysis, phylogenetics, and genome-wide exploration. These basic tools support modern biological research.

# Record the R version and package versions used to produce this report.
utils::sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ape_5.8-1       lubridate_1.9.4 forcats_1.0.1   stringr_1.6.0  
##  [5] dplyr_1.1.4     purrr_1.2.0     readr_2.1.5     tidyr_1.3.1    
##  [9] tibble_3.3.0    ggplot2_4.0.0   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     Rcpp_1.1.0        
##  [5] tidyselect_1.2.1   parallel_4.5.1     jquerylib_0.1.4    scales_1.4.0      
##  [9] yaml_2.3.10        fastmap_1.2.0      lattice_0.22-7     R6_2.6.1          
## [13] generics_0.1.4     knitr_1.50         bslib_0.9.0        pillar_1.11.1     
## [17] RColorBrewer_1.1-3 tzdb_0.5.0         rlang_1.1.6        utf8_1.2.6        
## [21] stringi_1.8.7      cachem_1.1.0       xfun_0.54          sass_0.4.10       
## [25] S7_0.2.0           timechange_0.3.0   cli_3.6.5          withr_3.0.2       
## [29] magrittr_2.0.4     digest_0.6.38      grid_4.5.1         rstudioapi_0.17.1 
## [33] hms_1.1.4          nlme_3.1-168       lifecycle_1.0.4    vctrs_0.6.5       
## [37] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.30    
## [41] tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1

Bioinformatics Final Project

Edmun Williams