1. Choose and Import a Dataset

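This document is parameterized: by default it uses a built-in dataset (`mtcars`; `iris` is also supported), or you can point it to a CSV file by setting `csv_path`, which takes precedence whenever it is non-empty. To switch sources, edit the `params:` section at the top of this file before knitting.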

# Resolve the data source from the document parameters.
# A non-empty params$csv_path takes precedence; otherwise params$dataset must
# name one of the allow-listed built-in datasets.
# Outputs used by later chunks: `df` (data as loaded), `src` (human-readable
# description of the source) and `data` (the working copy).
csv_path <- params$csv_path
if (length(csv_path) > 1L) {
  stop("params$csv_path must be a single file path.", call. = FALSE)
}

# Treat NULL, NA and "" alike as "no CSV supplied". A bare nzchar() is not
# enough: nzchar(NULL) is logical(0), which makes if() fail, and nzchar(NA)
# is TRUE, which would send an NA path to the reader.
has_csv <- length(csv_path) == 1L && !is.na(csv_path) && nzchar(csv_path)

if (has_csv) {
  message("Loading from CSV: ", csv_path)
  df <- readr::read_csv(csv_path, show_col_types = FALSE)
  src <- paste0("CSV file at '", csv_path, "'")
} else {
  # Defensive: only allow certain built-ins for simplicity. The datasets are
  # referenced through the datasets namespace so that a same-named object in
  # the knitting environment can never be picked up by mistake.
  builtin_sets <- list(mtcars = datasets::mtcars, iris = datasets::iris)
  builtin_ok <- names(builtin_sets)

  dataset_name <- params$dataset
  # %in% never returns NA, so an NA name is simply rejected here.
  is_supported <- is.character(dataset_name) &&
    length(dataset_name) == 1L &&
    dataset_name %in% builtin_ok
  if (!is_supported) {
    stop("Unsupported built-in dataset in params$dataset. Use one of: ", paste(builtin_ok, collapse = ", "))
  }

  df <- builtin_sets[[dataset_name]]
  src <- paste0("built-in dataset '", dataset_name, "'")
}

# Later chunks work on `data`; R's copy-on-modify semantics mean that changing
# `data` afterwards leaves `df` as loaded.
data <- df

Data source used: built-in dataset ‘mtcars’

1.1 Quick Peek

# Report how many rows and columns the working dataset has
n_rows <- dim(data)[1L]
n_cols <- dim(data)[2L]
size_line <- sprintf(
  "This dataset has %d rows and %d columns.\n\n",
  n_rows,
  n_cols
)
cat(size_line)
## This dataset has 32 rows and 11 columns.
# Preview the leading six rows as a captioned table
data |>
  head(n = 6L) |>
  knitr::kable(caption = "First 6 rows")
First 6 rows
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

2. Basic Summary Statistics

We’ll compute summary statistics for all numeric columns: count (non-missing), mean, median, standard deviation, min, and max.

library(dplyr)

# Build `summary_tbl`: one row per numeric column of `data`, with the
# non-missing count, mean, median, standard deviation, minimum and maximum
# (missing values are ignored by every statistic), then render it as a table.

# Keep only numeric columns for summary stats
numeric_cols <- dplyr::select(data, where(is.numeric))

if (ncol(numeric_cols) == 0) {
  stop("No numeric columns found. Provide a dataset/CSV with numeric variables.")
}

# Summary functions; the list names become the statistic columns of the table.
stat_fns <- list(
  n = \(x) sum(!is.na(x)),
  mean = \(x) mean(x, na.rm = TRUE),
  median = \(x) median(x, na.rm = TRUE),
  sd = \(x) sd(x, na.rm = TRUE),
  min = \(x) min(x, na.rm = TRUE),
  max = \(x) max(x, na.rm = TRUE)
)

# across() names its results "<column>__<stat>". Splitting on every "__"
# breaks as soon as a column name itself contains "__", so the name is instead
# matched with a pattern that anchors the known statistic names at the end and
# lets the (greedy) first group keep everything before the final "__".
stat_pattern <- paste0(
  "^(.*)__(", paste(names(stat_fns), collapse = "|"), ")$"
)

summary_tbl <- numeric_cols |>
  summarise(dplyr::across(
    .cols = everything(),
    .fns = stat_fns,
    .names = "{.col}__{.fn}"
  )) |>
  tidyr::pivot_longer(
    cols = everything(),
    names_to = c("variable", ".value"),
    names_pattern = stat_pattern
  ) |>
  dplyr::arrange(variable)

knitr::kable(summary_tbl, digits = 3, caption = "Summary statistics for numeric variables")
Summary statistics for numeric variables
variable n mean median sd min max
am 32 0.406 0.000 0.499 0.000 1.000
carb 32 2.812 2.000 1.615 1.000 8.000
cyl 32 6.188 6.000 1.786 4.000 8.000
disp 32 230.722 196.300 123.939 71.100 472.000
drat 32 3.597 3.695 0.535 2.760 4.930
gear 32 3.688 4.000 0.738 3.000 5.000
hp 32 146.688 123.000 68.563 52.000 335.000
mpg 32 20.091 19.200 6.027 10.400 33.900
qsec 32 17.849 17.710 1.787 14.500 22.900
vs 32 0.438 0.000 0.504 0.000 1.000
wt 32 3.217 3.325 0.978 1.513 5.424

2.1 Friendly Sentences (Optional)

Sometimes you want human-friendly sentences instead of a table. The chunk below prints one sentence per numeric variable.

# Format numbers as fixed-point text with two decimal places and a comma as
# the thousands separator, e.g. 1234.5 becomes "1,234.50".
fmt_num <- function(x) {
  formatC(x, big.mark = ",", digits = 2, format = "f")
}

# Print one Markdown-formatted sentence per row of `summary_tbl`.
#
# The sentences are built directly from the typed columns: sprintf() and
# fmt_num() are vectorised, so no row-wise apply() is needed. apply() would
# first coerce the whole table to a character matrix, forcing every number
# through a text round-trip, and its return value (NULL) would be printed
# after the sentences.
sentences <- sprintf(
  "**%s**: n = %d, mean = %s, median = %s, sd = %s, min = %s, max = %s.\n\n",
  summary_tbl[["variable"]],
  as.integer(summary_tbl[["n"]]),
  fmt_num(summary_tbl[["mean"]]),
  fmt_num(summary_tbl[["median"]]),
  fmt_num(summary_tbl[["sd"]]),
  fmt_num(summary_tbl[["min"]]),
  fmt_num(summary_tbl[["max"]])
)

# Each sentence already ends in a blank line, so join them with no separator.
# cat() returns NULL invisibly, so nothing extra is echoed.
cat(sentences, sep = "")

am: n = 32, mean = 0.41, median = 0.00, sd = 0.50, min = 0.00, max = 1.00.

carb: n = 32, mean = 2.81, median = 2.00, sd = 1.62, min = 1.00, max = 8.00.

cyl: n = 32, mean = 6.19, median = 6.00, sd = 1.79, min = 4.00, max = 8.00.

disp: n = 32, mean = 230.72, median = 196.30, sd = 123.94, min = 71.10, max = 472.00.

drat: n = 32, mean = 3.60, median = 3.69, sd = 0.53, min = 2.76, max = 4.93.

gear: n = 32, mean = 3.69, median = 4.00, sd = 0.74, min = 3.00, max = 5.00.

hp: n = 32, mean = 146.69, median = 123.00, sd = 68.56, min = 52.00, max = 335.00.

mpg: n = 32, mean = 20.09, median = 19.20, sd = 6.03, min = 10.40, max = 33.90.

qsec: n = 32, mean = 17.85, median = 17.71, sd = 1.79, min = 14.50, max = 22.90.

vs: n = 32, mean = 0.44, median = 0.00, sd = 0.50, min = 0.00, max = 1.00.

wt: n = 32, mean = 3.22, median = 3.33, sd = 0.98, min = 1.51, max = 5.42.

NULL

3. Save Cleaned/Transformed Data (Optional)

If you want to save a cleaned or transformed version of the dataset for later use, do it here. The line is commented out by default.

# readr::write_csv(data, "cleaned_data.csv")