Purpose: A living notebook of short, reliable R snippets I actually use. Keep it short. Keep it runnable. Update as I learn.

1. Quick Setup

# Install-once (uncomment as needed)
# install.packages(c("tidyverse", "janitor", "lubridate", "readr", "readxl", "openxlsx",
#                    "skimr", "here", "fs", "glue", "knitr", "rmarkdown", "ggthemes",
#                    "patchwork", "gt", "gtExtras", "stringr", "forcats"))

# Load every session
suppressPackageStartupMessages({
  library(tidyverse)
  library(janitor)
  library(lubridate)
  library(here)
  library(glue)
  library(skimr)
  library(gt)
})

# Reproducibility: fix the RNG seed so any random step (sampling, splits)
# gives the same result on every run
set.seed(42)

Project tip: Use an RStudio Project and here::here() for paths. Never hard‑code "C:/Users/...".

2. Reading & Writing Data (fast + safe)

# CSV: build the project-relative path first, then parse it with readr
df_csv <- here("data", "my_data.csv") %>%
  readr::read_csv()

# Excel: same pattern, reading the first worksheet
df_xlsx <- here("data", "my_data.xlsx") %>%
  readxl::read_excel(sheet = 1)

# Date-stamped output file name (YYYYMMDD) under output/
out_path <- here("output", glue("cleaned_{format(Sys.Date(), '%Y%m%d')}.csv"))
# readr::write_csv(df_csv, out_path)

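Gotcha: readr already assumes UTF-8, so garbled characters usually mean the file is in another encoding. Check with readr::guess_encoding(path), then pass it explicitly, e.g. locale = locale(encoding = "latin1") in read_csv().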

3. Inspecting Data (what is this?)

# High‑level skim: per-column summary (type, missingness, basic stats)
skimr::skim(df_csv)

# Structure & types: compact listing of each column's class and first values
str(df_csv)

# Column names (clean + check)
names(df_csv)
# With a single data frame this lists each column's class; pass several data
# frames to compare their column types side by side
janitor::compare_df_cols(df_csv)

Rule of thumb: If a column should be a date, convert it immediately with lubridate.

4. Cleaning Columns & Rows

# Consistent names + whitespace
clean <- df_csv %>%
  janitor::clean_names() %>%                  # snake_case column names
  mutate(across(where(is.character), trimws)) # trim leading/trailing spaces

# Handle blanks as NA (character columns only: dplyr >= 1.1 na_if() is
# type-strict and errors when "" is compared against numeric/date columns)
clean <- mutate(clean, across(where(is.character), ~ na_if(.x, "")))

# Remove complete duplicate rows (after normalising blanks, so rows that
# differ only by "" vs NA collapse together)
clean <- distinct(clean)

Tip: Use distinct(col_a, col_b, .keep_all = TRUE) to de‑dupe by a subset of columns while keeping the other columns from the first matching row.

5. dplyr Cheatsheet (minimal set)

# Drop rows without an id, tidy up types, then count and average per category
result <- clean %>%
  filter(!is.na(id)) %>%
  mutate(
    # keep the 5 most frequent categories, lump the rest together
    category = forcats::fct_lump_n(as.factor(category), n = 5),
    date = lubridate::ymd(date)
  ) %>%
  group_by(category) %>%
  summarise(
    n = n(),
    mean_val = mean(value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # most frequent category first
  arrange(desc(n))

Mnemonic: Select–Filter–Mutate–Summarize–Arrange covers 80% of wrangling.

6. Joins (I always mix these up)

# left_join: every row of x is kept; columns from y are filled where ids match
joined <- left_join(df_csv, df_xlsx, by = "id")

# anti_join: only the rows of x whose id has no partner in y (great for QA)
missing_keys <- anti_join(df_csv, df_xlsx, by = "id")

QA trick: anti_join() first to see what won’t match before any heavy processing.

7. Dates & Times (lubridate)

# Parse the date column, then derive calendar parts from it
clean_dates <- mutate(
  clean,
  date = lubridate::ymd(date),
  year = lubridate::year(date),
  # labelled month (Jan, Feb, ...) rather than a number
  month = lubridate::month(date, label = TRUE, abbr = TRUE),
  # ISO 8601 week number
  wk = lubridate::isoweek(date)
)

Tip: If parsing fails, inspect with parse_date_time(x, orders = c("ymd", "mdy", "dmy")).

8. Strings (stringr)

# Normalise e-mail case, then pull out the domain part
text_clean <- clean %>%
  mutate(
    email = str_to_lower(email),
    # Look-behind keeps the "@" out of the match, so the result is
    # "example.com" rather than "@example.com"; NA when there is no "@"
    domain = str_extract(email, "(?<=@)[^@]+$")
  )

Regex sanity: Test patterns at https://regex101.com/ before committing.

9. Factors (forcats)

fac <- mutate(
  clean,
  # Move these levels to the front, in this order
  status = fct_relevel(as.factor(status), c("new", "active", "inactive")),
  # Keep the 6 most frequent categories; the rest are lumped together
  top_cat = fct_lump_n(as.factor(category), n = 6)
)

Plotting tip: Relevel factors to control ggplot ordering.

10. Plotting (ggplot2: minimal patterns)

# Bar (counts): geom_bar() tallies rows per category on its own
ggplot(clean, aes(x = category)) +
  geom_bar(fill = "#2E86AB") +
  labs(title = "Counts by Category", x = NULL, y = "Count") +
  theme_minimal(base_size = 12)

# Line (time series): one row per day with its row count, drawn as a line
clean_dates %>%
  count(date) %>%
  ggplot(aes(x = date, y = n)) +
  geom_line(colour = "#7D3C98", linewidth = 0.9) +
  labs(title = "Daily Counts", x = NULL, y = NULL) +
  theme_minimal(base_size = 12)

Small multiples: Use + facet_wrap(~group) when categories are many.

11. Tables (gt quick pattern)

# Render the summary as a gt table
result %>%
  gt::gt() %>%
  # Two decimals for double columns only (mean_val); the integer count `n`
  # stays a whole number instead of rendering as e.g. "12.00"
  gt::fmt_number(columns = where(is.double), decimals = 2) %>%
  # md() is namespaced like the other gt calls, so the snippet also works
  # when gt is not attached
  gt::tab_header(title = gt::md("**Summary by Category**"))

Export: gt::gtsave(tbl, "table.png") saves a gt table to PNG/PDF/HTML; pass the table object first (or pipe it in), and the file extension picks the format.

12. Modeling (tidymodels tiny starter)

# install.packages("tidymodels")  # once
# library(tidymodels)
# set.seed(42)
# split <- initial_split(clean, prop = 0.8)
# train <- training(split); test <- testing(split)
# rec <- recipe(target ~ ., data = train) %>% step_dummy(all_nominal(), -all_outcomes())
# mod <- linear_reg() %>% set_engine("lm")
# wf  <- workflow() %>% add_model(mod) %>% add_recipe(rec)
# fit <- fit(wf, data = train)
# metrics <- predict(fit, test) %>% bind_cols(test) %>% metrics(truth = target, estimate = .pred)

Reality check: Always baseline with a simple model (e.g., lm) before anything fancy.

13. Debugging & Safety Nets

# ids should be unique: anyDuplicated() returns the index of the first
# duplicate, or 0 when there is none, so compare against 0 explicitly
stopifnot(anyDuplicated(clean$id) == 0L)

14. Reproducible Paths & Projects

# Print the R version, platform, and attached/loaded package versions;
# keep this alongside results or paste it into bug reports
sessionInfo()

15. Handy Snippets I Reuse

# Percent of total
#
# x:      numeric vector of counts or amounts; NAs are ignored in the total
#         but stay NA in the result.
# digits: number of decimal places to round to (default 1, the original
#         behaviour).
# Returns a numeric vector the same length as x; each element is its share
# of sum(x) in percent. A zero total yields NaN (or Inf), as with any
# division by zero in R.
percent_of_total <- function(x, digits = 1) {
  total <- sum(x, na.rm = TRUE)
  round(100 * x / total, digits)
}

# Not-in operator: TRUE for each element of x that has no match in y
`%nin%` <- function(x, y) {
  match(x, y, nomatch = 0L) == 0L
}

# Local alias for purrr::quietly(): wraps a *function* (not an expression) so
# that calling it returns a list with result, output, messages, and warnings
# instead of printing them
quietly <- purrr::quietly

16. Checklist Before You Ship

17. Appendix: swirl (learn by doing)

18. Appendix: Keyboard Macros (RStudio)

19. To Do / Parking Lot

