This document is parameterized so you can either use a built-in dataset (default: `mtcars`) or point to a CSV file. To switch sources, edit the `params:` section at the top of this file before knitting.
# Resolve the data source from the document parameters.
# A usable `params$csv_path` (a single, non-NA, non-empty string) takes
# priority; otherwise fall back to a built-in dataset selected by name.
csv_path <- params$csv_path

# Validate before calling nzchar(): a NULL parameter would make `if` fail
# with "argument is of length zero", and nzchar(NA) is TRUE.
has_csv <- is.character(csv_path) && length(csv_path) == 1L &&
  !is.na(csv_path) && nzchar(csv_path)

if (has_csv) {
  message("Loading from CSV: ", csv_path)
  df <- readr::read_csv(csv_path, show_col_types = FALSE)
  src <- paste0("CSV file at '", csv_path, "'")
} else {
  # Defensive: only allow certain built-ins for simplicity
  builtin_ok <- c("mtcars", "iris")
  dataset <- params$dataset
  dataset_ok <- is.character(dataset) && length(dataset) == 1L &&
    dataset %in% builtin_ok
  if (!dataset_ok) {
    stop(
      "Unsupported built-in dataset in params$dataset. Use one of: ",
      paste(builtin_ok, collapse = ", "),
      call. = FALSE
    )
  }
  # Fetch from the datasets package explicitly so that an object with the
  # same name elsewhere on the search path cannot shadow the built-in.
  df <- getExportedValue("datasets", dataset)
  src <- paste0("built-in dataset '", dataset, "'")
}

# Work on `data` from here on; `df` stays as the unmodified load result.
data <- df
Data source used: built-in dataset ‘mtcars’
# Report the dataset's dimensions (rows first, then columns)
n_rows <- dim(data)[1L]
n_cols <- dim(data)[2L]
cat(
  sprintf(
    "This dataset has %d rows and %d columns.\n\n",
    n_rows,
    n_cols
  )
)
## This dataset has 32 rows and 11 columns.
# Render a preview table of the first 6 rows
data |>
  head() |>
  knitr::kable(caption = "First 6 rows")
 | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
---|---|---|---|---|---|---|---|---|---|---|---|
Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
We’ll compute summary statistics for all numeric columns: count (non-missing), mean, median, standard deviation, min, and max.
library(dplyr)
# Keep only numeric columns for summary stats
numeric_cols <- dplyr::select(data, where(is.numeric))
if (ncol(numeric_cols) == 0) {
  stop(
    "No numeric columns found. Provide a dataset/CSV with numeric variables.",
    call. = FALSE
  )
}

# Statistics computed per column; the list names become the output columns.
stat_fns <- list(
  n = ~sum(!is.na(.x)),
  mean = ~mean(.x, na.rm = TRUE),
  median = ~median(.x, na.rm = TRUE),
  sd = ~sd(.x, na.rm = TRUE),
  min = ~min(.x, na.rm = TRUE),
  max = ~max(.x, na.rm = TRUE)
)

# Split "<column>__<stat>" on the LAST "__" only. The first group is greedy
# and the second is anchored to the known statistic names, so a column whose
# own name contains "__" (possible with a user CSV) is still split correctly.
stat_pattern <- paste0(
  "^(.*)__(", paste(names(stat_fns), collapse = "|"), ")$"
)

# summarise() yields a single wide row of "<column>__<stat>" values;
# pivot_longer() reshapes it to one row per variable, one column per stat.
summary_tbl <- numeric_cols |>
  dplyr::summarise(dplyr::across(
    .cols = everything(),
    .fns = stat_fns,
    .names = "{.col}__{.fn}"
  )) |>
  tidyr::pivot_longer(
    cols = everything(),
    names_to = c("variable", ".value"),
    names_pattern = stat_pattern
  ) |>
  dplyr::arrange(variable)

knitr::kable(
  summary_tbl,
  digits = 3,
  caption = "Summary statistics for numeric variables"
)
variable | n | mean | median | sd | min | max |
---|---|---|---|---|---|---|
am | 32 | 0.406 | 0.000 | 0.499 | 0.000 | 1.000 |
carb | 32 | 2.812 | 2.000 | 1.615 | 1.000 | 8.000 |
cyl | 32 | 6.188 | 6.000 | 1.786 | 4.000 | 8.000 |
disp | 32 | 230.722 | 196.300 | 123.939 | 71.100 | 472.000 |
drat | 32 | 3.597 | 3.695 | 0.535 | 2.760 | 4.930 |
gear | 32 | 3.688 | 4.000 | 0.738 | 3.000 | 5.000 |
hp | 32 | 146.688 | 123.000 | 68.563 | 52.000 | 335.000 |
mpg | 32 | 20.091 | 19.200 | 6.027 | 10.400 | 33.900 |
qsec | 32 | 17.849 | 17.710 | 1.787 | 14.500 | 22.900 |
vs | 32 | 0.438 | 0.000 | 0.504 | 0.000 | 1.000 |
wt | 32 | 3.217 | 3.325 | 0.978 | 1.513 | 5.424 |
Sometimes you want human-friendly sentences instead of a table. The chunk below prints one sentence per numeric variable.
# Format numbers as fixed-point text with two decimals and a comma as the
# thousands separator.
fmt_num <- function(x) {
  formatC(x, digits = 2, format = "f", big.mark = ",")
}
# Print one Markdown sentence per numeric variable.
# sprintf() and formatC() are vectorised, so the sentences are built directly
# from the summary columns. This avoids apply(), which coerces the table to a
# character matrix (numbers pass through format() and can lose precision) and
# whose return value is auto-printed as a stray "NULL" after the sentences.
stat_sentences <- sprintf(
  "**%s**: n = %d, mean = %s, median = %s, sd = %s, min = %s, max = %s.\n\n",
  summary_tbl[["variable"]],
  as.integer(summary_tbl[["n"]]),
  fmt_num(summary_tbl[["mean"]]),
  fmt_num(summary_tbl[["median"]]),
  fmt_num(summary_tbl[["sd"]]),
  fmt_num(summary_tbl[["min"]]),
  fmt_num(summary_tbl[["max"]])
)
# cat() returns NULL invisibly, so nothing extra is printed.
cat(stat_sentences, sep = "")
am: n = 32, mean = 0.41, median = 0.00, sd = 0.50, min = 0.00, max = 1.00.
carb: n = 32, mean = 2.81, median = 2.00, sd = 1.62, min = 1.00, max = 8.00.
cyl: n = 32, mean = 6.19, median = 6.00, sd = 1.79, min = 4.00, max = 8.00.
disp: n = 32, mean = 230.72, median = 196.30, sd = 123.94, min = 71.10, max = 472.00.
drat: n = 32, mean = 3.60, median = 3.69, sd = 0.53, min = 2.76, max = 4.93.
gear: n = 32, mean = 3.69, median = 4.00, sd = 0.74, min = 3.00, max = 5.00.
hp: n = 32, mean = 146.69, median = 123.00, sd = 68.56, min = 52.00, max = 335.00.
mpg: n = 32, mean = 20.09, median = 19.20, sd = 6.03, min = 10.40, max = 33.90.
qsec: n = 32, mean = 17.85, median = 17.71, sd = 1.79, min = 14.50, max = 22.90.
vs: n = 32, mean = 0.44, median = 0.00, sd = 0.50, min = 0.00, max = 1.00.
wt: n = 32, mean = 3.22, median = 3.33, sd = 0.98, min = 1.51, max = 5.42.
NULL
If you want to save a cleaned or transformed version of the dataset for later use, do it here. The line is commented out by default.
# readr::write_csv(data, "cleaned_data.csv")