Render this report

rmarkdown::render("combine_farm_inputs.Rmd")
# Override directories at render time:
rmarkdown::render("combine_farm_inputs.Rmd",
                  params = list(data_dir="path/to/DATA", output_dir="path/to/OUTPUT"))

Place all farm .xlsx files in DATA/. Master Excel is written to OUTPUT/SAS_4842_Master_Farm_Inputs.xlsx.


1 Processing

All data loading, parsing, combining, and Excel export runs in the single chunk below. Output objects (master_fuel, master_yield, master_fert, master_flags, structure_audit, parse_summary, FILE_REGISTRY) are available to every display chunk that follows.

# =============================================================================
# ── 1.  SETTINGS  ─────────────────────────────────────────────────────────────
# =============================================================================
DATA_DIR   <- params$data_dir
OUTPUT_DIR <- params$output_dir

# Farm keyword matching table.
# Add a row here to onboard an additional farm — no other changes needed.
FARM_PATTERNS <- data.frame(
  farm_id  = c("Dyt Farms","Hettinga Farms","Mineral King Sawtooth",
               "Vander Dussen","DCB Farming","Eric Shannon Farming",
               "B&R TeVelde","Ron Colburn"),
  pai_id   = 1:8,
  keywords = c("dyt","hettinga","sawtooth|mk_saw|mksawtooth","vander",
               "dcb","shannon|eric_shannon","tevelde|b_r_te","colburn|ron_col"),
  stringsAsFactors = FALSE
)

# =============================================================================
# ── 2.  FILE DISCOVERY & MATCHING  ────────────────────────────────────────────
# =============================================================================
# ── Working-directory note ─────────────────────────────────────────────────
# When rendered via rmarkdown::render(), R's working directory is set to the
# folder containing the .Rmd file.  DATA_DIR = "DATA" therefore resolves to
# a DATA/ subfolder *next to* the .Rmd — not the RStudio project root or the
# shell directory you ran the render command from.
# If your files are elsewhere, pass the full absolute path:
#   rmarkdown::render("...", params = list(data_dir = "/full/path/to/DATA"))

wd_at_render <- getwd()

if (!dir.exists(DATA_DIR)) {
  msg <- paste0(
    "DATA directory not found.\n\n",
    "  Looking for : ", normalizePath(DATA_DIR, mustWork=FALSE), "\n",
    "  Working dir : ", wd_at_render, "\n\n",
    "Fix options:\n",
    "  1. Create a DATA/ folder next to the .Rmd and put your .xlsx files in it.\n",
    "  2. Override at render time:\n",
    "       rmarkdown::render(\"your.Rmd\",\n",
    "         params = list(data_dir = \"/absolute/path/to/DATA\"))\n"
  )
  stop(msg, call.=FALSE)
}

dir.create(OUTPUT_DIR, showWarnings=FALSE, recursive=TRUE)

# List ALL files in DATA/ (any extension) so the error message is informative
all_files <- list.files(DATA_DIR, full.names=TRUE, recursive=FALSE)
all_xlsx  <- all_files[grepl("\\.xlsx$", all_files, ignore.case=TRUE)]

if (length(all_xlsx) == 0) {
  all_names <- if (length(all_files) > 0)
                 paste0("  - ", basename(all_files), collapse="\n")
               else
                 "  (folder is empty)"
  msg <- paste0(
    "No .xlsx files found in DATA/.\n\n",
    "  DATA dir resolved to : ", normalizePath(DATA_DIR), "\n",
    "  Working dir          : ", wd_at_render, "\n",
    "  Files present        :\n", all_names, "\n\n",
    "Fix options:\n",
    "  1. Move your .xlsx files into: ", normalizePath(DATA_DIR), "\n",
    "  2. Pass a different directory:\n",
    "       rmarkdown::render(\"your.Rmd\",\n",
    "         params = list(data_dir = \"/absolute/path/to/DATA\"))\n"
  )
  stop(msg, call.=FALSE)
}

match_farm <- function(filepath) {
  base <- tolower(basename(filepath))
  for (i in seq_len(nrow(FARM_PATTERNS))) {
    if (grepl(FARM_PATTERNS$keywords[i], base, perl=TRUE))
      return(data.frame(farm_id  = FARM_PATTERNS$farm_id[i],
                        pai_id   = FARM_PATTERNS$pai_id[i],
                        filename = basename(filepath),
                        filepath = filepath,
                        stringsAsFactors = FALSE))
  }
  NULL
}

matched_list <- Filter(Negate(is.null), lapply(all_xlsx, match_farm))
if (length(matched_list) == 0)
  stop("No files matched any farm pattern. Check FARM_PATTERNS.", call.=FALSE)

FILE_REGISTRY <- bind_rows(matched_list) |> arrange(pai_id)
unmatched     <- setdiff(basename(all_xlsx), sapply(matched_list, `[[`, "filename"))

# =============================================================================
# ── 3.  HELPER FUNCTIONS  ─────────────────────────────────────────────────────
# =============================================================================
safe_read <- function(path, sheet, ...) {
  tryCatch(suppressMessages(readxl::read_excel(path, sheet=sheet, ...)),
           error=function(e) NULL)
}

clean_cols <- function(df) {
  if (is.null(df)) return(NULL)
  names(df) <- trimws(as.character(names(df)))
  df
}

tag <- function(df, farm_id, pai_id, source_file, source_sheet) {
  df |> mutate(farm_id=farm_id, pai_id=as.integer(pai_id),
               source_file=source_file, source_sheet=source_sheet, .before=1)
}

year_cols_of <- function(df) {
  nms <- names(df)
  nms[vapply(nms, function(x) {
    v <- suppressWarnings(as.integer(x))
    !is.na(v) && v >= 2010L && v <= 2035L
  }, logical(1))]
}

rename_first <- function(df, pattern, new_name) {
  idx <- which(grepl(pattern, names(df), ignore.case=TRUE, perl=TRUE))[1]
  if (!is.na(idx) && !(new_name %in% names(df))) names(df)[idx] <- new_name
  df
}

classify_fuel <- function(fuel_name) {
  fn <- toupper(trimws(as.character(fuel_name)))
  dplyr::case_when(
    grepl("RENEW|NESTE|HVO|R99",              fn) ~ "Renewable Diesel",
    grepl("UNLEAD|GASOLINE|\\b87\\b|\\b91\\b", fn) ~ "Gasoline",
    grepl("LPG|PROPANE",                      fn) ~ "LPG",
    TRUE                                           ~ "Conventional Diesel"
  )
}

tier_letter <- function(q) {
  q <- as.character(q)
  dplyr::case_when(startsWith(q,"A")~"A", startsWith(q,"B")~"B",
                   startsWith(q,"C")~"C", startsWith(q,"D")~"D",
                   startsWith(q,"E")~"E", startsWith(q,"F")~"F", TRUE~"C")
}
tier_u_ad <- function(t) unname(c(A=5,B=10,C=15,D=25,E=50,F=75)[t])

coalesce_first <- function(...) {
  for (x in list(...)) if (!is.null(x)) return(x)
  NULL
}

# Accumulators filled by parse_farm()
fuel_list  <- list()
yield_list <- list()
fert_list  <- list()
audit_log  <- list()

# =============================================================================
# ── 4.  PER-FARM PARSER  ──────────────────────────────────────────────────────
# =============================================================================
parse_farm <- function(fp, fn, farm, pai) {

  sheets <- tryCatch(readxl::excel_sheets(fp), error=function(e) character(0))

  rd <- function(sheet, ...) {
    if (!(sheet %in% sheets)) return(NULL)
    df <- safe_read(fp, sheet, ...) |> clean_cols()
    if (!is.null(df) && nrow(df) == 0) return(NULL)
    df
  }

  # ── YIELD ─────────────────────────────────────────────────────────────────
  parse_yield <- function() {
    df <- rd("Yield Summary")
    if (is.null(df)) return(NULL)

    df <- df |>
      rename_first("^Farm *ID$|^Farm ID *$","Farm") |>
      rename_first("^Farm *$","Farm")               |>
      rename_first("^Block ID *$","Block_ID")       |>
      rename_first("^Crop name$","Crop")            |>
      rename_first("^Farm name$","Farm")            |>
      rename_first("^Yield/acre$","Yield_per_acre")

    yr_c <- year_cols_of(df)

    # Float year-header fix (DCB: "2020.0" → Unnamed cols)
    if (length(yr_c) == 0) {
      first_row <- suppressWarnings(as.numeric(unlist(df[1, ])))
      if (any(!is.na(first_row) & first_row >= 2010 & first_row <= 2035)) {
        df_retry <- tryCatch(
          suppressMessages(
            readxl::read_excel(fp, sheet="Yield Summary", skip=1)) |> clean_cols(),
          error=function(e) NULL)
        if (!is.null(df_retry)) { df <- df_retry; yr_c <- year_cols_of(df) }
      }
    }

    # Wide format → long (DCB aggregate row)
    if (!("Yield" %in% names(df)) && length(yr_c) > 0) {
      fc <- names(df)[1]
      cc <- names(df)[grepl("crop", names(df), TRUE)][1]
      uc <- names(df)[grepl("unit", names(df), TRUE)][1]
      df <- df |>
        mutate(across(all_of(yr_c), as.character)) |>
        filter(!is.na(.data[[fc]]),
               !grepl("entity|yield|lbs|acre", tolower(as.character(.data[[fc]])), TRUE)) |>
        slice(1) |>
        pivot_longer(cols=all_of(yr_c), names_to="Year", values_to="Yield") |>
        mutate(Farm    = farm,
               Crop    = if (!is.na(cc)) as.character(.data[[cc]]) else "Almonds",
               Unit    = if (!is.na(uc)) as.character(.data[[uc]]) else "lbs",
               Variety = "All sub-farms (aggregate)")
    }

    df <- df |>
      mutate(Year  = suppressWarnings(as.integer(Year)),
             Yield = suppressWarnings(as.numeric(Yield))) |>
      filter(!is.na(Yield), !is.na(Year))

    if (nrow(df) == 0) {
      audit_log[[paste0(farm,"_yld_empty")]] <<- data.frame(
        farm_id=farm, sheet="Yield Summary",
        issue="Sheet present but no valid data rows",
        action="Obtain yield records from farm", stringsAsFactors=FALSE)
      return(NULL)
    }

    uc2 <- intersect(c("Unit","unit"), names(df))[1]
    df  <- df |> mutate(
      data_quality = dplyr::case_when(
        !is.na(uc2) & grepl("\\?",           as.character(.data[[uc2]])) ~ "A — Unit uncertain (lbs?)",
        !is.na(uc2) & grepl("(?i)^ton$|^t$", as.character(.data[[uc2]])) ~ "A — Unit=metric ton",
        !is.na(uc2) & grepl("(?i)bu|bushel", as.character(.data[[uc2]])) ~ "A — Unit=BU/acre",
        TRUE ~ "A — Source document"),
      Notes = data_quality)

    keep <- intersect(c("Farm","Year","Crop","Variety","Block_ID","Yield","Unit",
                        "Acres","Yield_per_acre","Source","data_quality","Notes"), names(df))
    tag(df[, keep, drop=FALSE], farm, pai, fn, "Yield Summary")
  }

  # ── FUEL ──────────────────────────────────────────────────────────────────
  parse_fuel <- function() {
    results <- list()
    df <- rd("Fuel Summary")

    # Two-row header fix (B&R TeVelde, Ron Colburn: "Gal/Acre" in col names)
    if (!is.null(df) && any(grepl("Gal.Acre|Gal/Acre", names(df), TRUE))) {
      df_retry <- tryCatch(
        suppressMessages(
          readxl::read_excel(fp, sheet="Fuel Summary", skip=1)) |> clean_cols(),
        error=function(e) df)
      if (!is.null(df_retry)) df <- df_retry
    }

    if (!is.null(df)) {
      yr_c    <- year_cols_of(df)
      has_qty <- any(grepl("qty|quantity", names(df), TRUE))

      if (length(yr_c) > 0 && !has_qty) {
        # ── WIDE FORMAT (Dyt, DCB) ─────────────────────────────────────────
        fc <- names(df)[grepl("^farm$",      names(df), TRUE)][1]
        tc <- names(df)[grepl("fuel.*type$", names(df), TRUE)][1]

        if (!is.na(fc) && !is.na(tc)) {
          # Cast all year cols to character — prevents character/double mismatch
          df <- df |> mutate(across(all_of(yr_c), as.character))

          long1 <- df |>
            filter(!is.na(.data[[fc]]),
                   !grepl("^total$|^na$|^nan$", tolower(as.character(.data[[fc]])), TRUE)) |>
            select(Farm=all_of(fc), fuel_name=all_of(tc), all_of(yr_c)) |>
            pivot_longer(yr_c, names_to="Year", values_to="qty_gal")

          # Mirror Diesel block (DCB has .1-suffix duplicate columns)
          fc2    <- names(df)[grepl("farm\\.",      names(df), TRUE)][1]
          tc2    <- names(df)[grepl("fuel.*type\\.", names(df), TRUE)][1]
          yr_dot <- names(df)[grepl(paste0("^(",paste(yr_c,collapse="|"),")\\.\\d"), names(df))]

          long2 <- if (!is.na(fc2) && !is.na(tc2) && length(yr_dot) > 0) {
            yr_c_clean <- gsub("\\.\\d+$","", yr_dot)
            df |>
              mutate(across(all_of(yr_dot), as.character)) |>
              filter(!is.na(.data[[fc2]]),
                     !grepl("^total$|^na$|^nan$", tolower(as.character(.data[[fc2]])), TRUE)) |>
              select(Farm=all_of(fc2), fuel_name=all_of(tc2), all_of(yr_dot)) |>
              rename_with(~gsub("\\.\\d+$","",.), all_of(yr_dot)) |>
              pivot_longer(intersect(yr_c, yr_c_clean), names_to="Year", values_to="qty_gal")
          } else NULL

          # LPG rows (DCB)
          long_lpg <- if (!is.na(tc) &&
              any(grepl("LPG", toupper(as.character(df[[tc]])), fixed=TRUE))) {
            df |>
              filter(grepl("LPG", toupper(as.character(.data[[tc]])), fixed=TRUE)) |>
              select(Farm=all_of(fc), fuel_name=all_of(tc), all_of(yr_c)) |>
              pivot_longer(yr_c, names_to="Year", values_to="qty_gal") |>
              mutate(fuel_name="LPG")
          } else NULL

          wide_long <- bind_rows(long1, long2, long_lpg) |>
            filter(!is.na(qty_gal)) |>
            mutate(qty_gal = suppressWarnings(as.numeric(qty_gal))) |>
            filter(!is.na(qty_gal), qty_gal != 0) |>
            mutate(Year         = suppressWarnings(as.integer(Year)),
                   fuel_type    = classify_fuel(fuel_name),
                   unit         = "Gal",
                   gal_per_acre = NA_real_,
                   data_quality = "B — Annual summary (no individual invoice dates)")
          results[["wide"]] <- tag(wide_long, farm, pai, fn, "Fuel Summary")

        } else {
          # ── GAL/ACRE FORMAT (B&R TeVelde, Ron Colburn) ────────────────────
          tbl <- df |>
            rename_first("^Unnamed.*0$|^Crop$","Crop") |>
            filter(!is.na(Crop),
                   !grepl("source|note|gal|please|crop|^na$", tolower(Crop), TRUE)) |>
            select(Crop, all_of(yr_c)) |>
            pivot_longer(yr_c, names_to="Year", values_to="gal_per_acre") |>
            mutate(Year         = suppressWarnings(as.integer(Year)),
                   gal_per_acre = suppressWarnings(as.numeric(gal_per_acre)),
                   Farm=farm, fuel_type="Conventional Diesel",
                   fuel_name="Diesel", unit="Gal/Acre", qty_gal=NA_real_,
                   data_quality="E — Gal/Acre only; area not provided; qty not calculable") |>
            filter(!is.na(gal_per_acre))
          results[["galacre"]] <- tag(tbl, farm, pai, fn, "Fuel Summary")
        }

      } else {
        # ── LONG FORMAT (Hettinga, Vander Dussen, MK Sawtooth, Eric Shannon)
        df <- df |>
          rename_first("^Fuel Name$",       "fuel_name")     |>
          rename_first("^Fuel Type$",       "fuel_type_raw") |>
          rename_first("^Quantity$",        "qty_gal")       |>
          rename_first("^Unit$",            "unit")          |>
          rename_first("^SOURCE$|^Source$", "Source")

        # Guard: if Year column is absent, log and skip rather than crash
        if (!"Year" %in% names(df)) {
          audit_log[[paste0(farm,"_fuel_no_year")]] <<- data.frame(
            farm_id=farm, sheet="Fuel Summary",
            issue="Year column absent — check for two-row header",
            action="Inspect source file", stringsAsFactors=FALSE)
          return(NULL)
        }

        # Resolve fuel name column before entering the pipe (names(.) unavailable with |>)
        fuel_col <- if ("fuel_name"     %in% names(df)) "fuel_name"     else
                    if ("fuel_type_raw" %in% names(df)) "fuel_type_raw" else NULL

        df <- df |>
          filter(!is.na(qty_gal), !is.na(Year)) |>
          mutate(Year         = suppressWarnings(as.integer(Year)),
                 qty_gal      = suppressWarnings(as.numeric(qty_gal)),
                 unit         = gsub("(?i)gallon","Gal", as.character(unit)),
                 fuel_type    = if (!is.null(fuel_col)) classify_fuel(.data[[fuel_col]])
                                else classify_fuel(NA_character_),
                 gal_per_acre = NA_real_,
                 data_quality = "A — Dated purchase/sales invoice")

        keep <- intersect(c("Date","Year","fuel_name","fuel_type","qty_gal",
                            "gal_per_acre","unit","Source","data_quality"), names(df))
        results[["long"]] <- tag(df[, keep, drop=FALSE], farm, pai, fn, "Fuel Summary")
      }
    }

    # ── HANDWRITTEN FUEL SUMMARY (Eric Shannon only) ───────────────────────
    df_hw <- rd("Handwritten Fuel Summary")
    if (!is.null(df_hw)) {
      yr_hw <- year_cols_of(df_hw)
      fixed <- df_hw |>
        slice(1:5) |>
        rename_first("Ranch|Unnamed.*0","Ranch") |>
        rename_first("Area.*acres","Acres")      |>
        rename_first("Gal.Acre","GalAcre")

      hw_rows <- list()
      for (i in seq_len(nrow(fixed))) {
        ranch <- as.character(fixed$Ranch[i])
        if (is.na(ranch) || ranch == "NA") next
        acres <- suppressWarnings(as.numeric(fixed$Acres[i]))
        gpa   <- suppressWarnings(as.numeric(fixed$GalAcre[i]))
        base  <- NA_real_
        for (yr in sort(yr_hw, decreasing=TRUE)) {
          v <- suppressWarnings(as.numeric(as.character(fixed[[as.character(yr)]][i])))
          if (!is.na(v)) base <- v else v <- base
          hw_rows[[length(hw_rows)+1]] <- data.frame(
            Farm=paste(farm,"—",ranch), Year=as.integer(yr),
            Acres=acres, gal_per_acre=gpa, qty_gal=v,
            fuel_type="Conventional Diesel", unit="Gal",
            data_quality="D — Estimated gal/acre × acres (handwritten)",
            stringsAsFactors=FALSE)
        }
      }
      # South Cairns block (dynamic detection)
      sc_idx <- which(grepl("SOUTH CAIRNS", toupper(as.character(df_hw[[1]]))))
      if (length(sc_idx) > 0) {
        blk <- df_hw[sc_idx[1]:(sc_idx[1]+5), ]
        yr_r <- blk[grepl("year",    tolower(as.character(blk[[1]])), TRUE), ]
        qt_r <- blk[grepl("consumpt",tolower(as.character(blk[[1]])), TRUE), ]
        if (nrow(yr_r) > 0 && nrow(qt_r) > 0) {
          yrs  <- suppressWarnings(as.integer(unlist(yr_r[1, 2:5])))
          qtys <- suppressWarnings(as.numeric(unlist(qt_r[1, 2:5])))
          for (j in seq_along(yrs))
            if (!is.na(yrs[j]) && !is.na(qtys[j]))
              hw_rows[[length(hw_rows)+1]] <- data.frame(
                Farm=paste(farm,"— South Cairns"), Year=yrs[j],
                Acres=394.6, gal_per_acre=NA_real_, qty_gal=qtys[j],
                fuel_type="Conventional Diesel", unit="Gal",
                data_quality="B — Handwritten fuel usage estimate",
                stringsAsFactors=FALSE)
        }
      }
      if (length(hw_rows) > 0) {
        hw_df <- bind_rows(hw_rows) |> filter(!is.na(qty_gal))
        results[["handwritten"]] <- tag(hw_df, farm, pai, fn, "Handwritten Fuel Summary")
      }
    }

    if (length(results) == 0) {
      audit_log[[paste0(farm,"_fuel_absent")]] <<- data.frame(
        farm_id=farm, sheet="Fuel",
        issue="No fuel sheet found in source file",
        action="Request fuel records from farm", stringsAsFactors=FALSE)
      return(NULL)
    }
    bind_rows(results)
  }

  # ── FERTILISER ────────────────────────────────────────────────────────────
  parse_fert <- function() {
    df <- coalesce_first(rd("Fertiliser Summary"), rd("Fertilizer Summary"))
    if (is.null(df)) {
      audit_log[[paste0(farm,"_fert_absent")]] <<- data.frame(
        farm_id=farm, sheet="Fertiliser",
        issue="No Fertiliser Summary sheet in source file",
        action="Request fertiliser records from farm", stringsAsFactors=FALSE)
      return(NULL)
    }

    df <- df |>
      rename_first("Farm.ID|Farm ID", "Farm")        |>
      rename_first("^N%$",            "N_pct")       |>
      rename_first("Note.*Comment",   "Notes")       |>
      rename_first("^Consumption",    "Consumption") |>
      rename_first("Rate.Acre",       "Rate_per_acre") |>
      rename_first("Qty.*\\(kg\\)",   "qty_kg")      |>
      rename_first("Qty N",           "qty_n_kg")    |>
      rename_first("(?i)^year$",      "Year")        # normalise case variants

    if (!"Year"       %in% names(df)) {
      audit_log[[paste0(farm,"_fert_no_year")]] <<- data.frame(
        farm_id=farm, sheet="Fertiliser Summary",
        issue="Year column not found — check sheet structure",
        action="Inspect source file", stringsAsFactors=FALSE)
      return(NULL)
    }
    if (!"Fertilizer" %in% names(df)) {
      audit_log[[paste0(farm,"_fert_no_fert")]] <<- data.frame(
        farm_id=farm, sheet="Fertiliser Summary",
        issue="Fertilizer column not found — check sheet structure",
        action="Inspect source file", stringsAsFactors=FALSE)
      return(NULL)
    }

    df <- df |>
      filter(!is.na(Fertilizer), !is.na(Year)) |>
      mutate(Year  = suppressWarnings(as.integer(Year)),
             N_pct = suppressWarnings(as.numeric(N_pct)))

    if (nrow(df) == 0) return(NULL)

    notes_v <- if ("Notes" %in% names(df)) df$Notes else rep(NA_character_, nrow(df))
    df <- df |> mutate(
      data_quality = dplyr::case_when(
        grepl("sawtooth|mk",   tolower(farm), TRUE) ~ "D — Sold qty (not confirmed consumption)",
        !is.na(notes_v) & grepl("confirm blend|customized",
                                 tolower(as.character(notes_v)), TRUE) ~ "C — Blend composition unconfirmed",
        !is.na(notes_v) & grepl("name\\?|unit\\?",
                                 tolower(as.character(notes_v)), TRUE) ~ "C — Name or unit unconfirmed",
        !is.na(notes_v) & grepl("n% not avail",
                                 tolower(as.character(notes_v)), TRUE) ~ "D — N% unavailable",
        grepl("tevelde|colburn", tolower(farm), TRUE) ~ "E — Rate/acre only; area not provided",
        TRUE ~ "A — Source document"
      )
    )

    keep <- intersect(
      c("Farm","Year","Date","Fertilizer","Unit","N_pct","Consumption","qty_kg",
        "qty_n_kg","Rate_per_acre","Category","Name","Block_ID","Notes",
        "Source","data_quality"),
      names(df))

    # Append Sheet5 (MK Sawtooth 2025 raw invoice)
    df5 <- rd("Sheet5")
    if (!is.null(df5) && ncol(df5) >= 5) {
      names(df5)[1:5] <- c("Date","Fertilizer","Consumption","Unit","N_pct")
      df5 <- df5 |>
        filter(!is.na(Fertilizer), !is.na(Date)) |>
        mutate(Year=2025L, Farm=farm, data_quality="D — Sold qty 2025 raw invoice")
      df <- bind_rows(df[, keep, drop=FALSE],
                      df5[, intersect(keep, names(df5)), drop=FALSE])
    } else {
      df <- df[, keep, drop=FALSE]
    }

    tag(df, farm, pai, fn, "Fertiliser Summary")
  }

  # ── Dispatch & store
  y <- parse_yield()
  f <- parse_fuel()
  r <- parse_fert()

  if (!is.null(y) && is.data.frame(y) && nrow(y) > 0) yield_list[[farm]] <<- y
  if (!is.null(f) && is.data.frame(f) && nrow(f) > 0) fuel_list[[farm]]  <<- f
  if (!is.null(r) && is.data.frame(r) && nrow(r) > 0) fert_list[[farm]]  <<- r

  data.frame(PAI=pai, Farm=farm,
             Yield = if (!is.null(y) && is.data.frame(y) && nrow(y)>0) nrow(y) else 0L,
             Fuel  = if (!is.null(f) && is.data.frame(f) && nrow(f)>0) nrow(f) else 0L,
             Fert  = if (!is.null(r) && is.data.frame(r) && nrow(r)>0) nrow(r) else 0L,
             stringsAsFactors=FALSE)
}

# =============================================================================
# ── 5.  RUN ALL FARMS  ────────────────────────────────────────────────────────
# =============================================================================
parse_summary <- bind_rows(Map(parse_farm,
                               FILE_REGISTRY$filepath,
                               FILE_REGISTRY$filename,
                               FILE_REGISTRY$farm_id,
                               FILE_REGISTRY$pai_id))

# =============================================================================
# ── 6.  BUILD MASTER TABLES  ──────────────────────────────────────────────────
# =============================================================================
master_fuel <- bind_rows(fuel_list) |>
  mutate(Year      = suppressWarnings(as.integer(Year)),
         qty_gal   = suppressWarnings(as.numeric(qty_gal)),
         data_tier = tier_letter(data_quality),
         u_ad_pct  = tier_u_ad(data_tier)) |>
  arrange(pai_id, Year, fuel_type)

master_yield <- bind_rows(yield_list) |>
  mutate(Year  = suppressWarnings(as.integer(Year)),
         Yield = suppressWarnings(as.numeric(Yield))) |>
  arrange(pai_id, Year)

master_fert <- bind_rows(fert_list) |>
  mutate(Year      = suppressWarnings(as.integer(Year)),
         N_pct     = suppressWarnings(as.numeric(N_pct)),
         data_tier = tier_letter(data_quality),
         u_ad_pct  = tier_u_ad(data_tier)) |>
  arrange(pai_id, Year, Fertilizer)

# =============================================================================
# ── 7.  DATA FLAGS  ───────────────────────────────────────────────────────────
# =============================================================================
manual_flags <- bind_rows(lapply(audit_log, as.data.frame))

auto_fuel_flags <- master_fuel |>
  filter(grepl("Gal/Acre|rate|sold", data_quality, ignore.case=TRUE)) |>
  distinct(farm_id, source_sheet, data_quality) |>
  rename(sheet=source_sheet, issue=data_quality) |>
  mutate(action="Review with IQC before use in emissions model")

auto_fert_flags <- master_fert |>
  filter(grepl("rate|sold|N%", data_quality, ignore.case=TRUE)) |>
  distinct(farm_id, source_sheet, data_quality) |>
  rename(sheet=source_sheet, issue=data_quality) |>
  mutate(action="Resolve before PAI calculation")

master_flags <- bind_rows(manual_flags, auto_fuel_flags, auto_fert_flags) |>
  distinct() |>
  arrange(farm_id, sheet)

# =============================================================================
# ── 8.  STRUCTURE AUDIT  ──────────────────────────────────────────────────────
# =============================================================================
structure_audit <- FILE_REGISTRY |>
  select(pai_id, farm_id, source_filename=filename) |>
  left_join(master_fuel  |> count(farm_id, name="fuel_rows"),  by="farm_id") |>
  left_join(master_yield |> count(farm_id, name="yield_rows"), by="farm_id") |>
  left_join(master_fert  |> count(farm_id, name="fert_rows"),  by="farm_id") |>
  replace_na(list(fuel_rows=0L, yield_rows=0L, fert_rows=0L)) |>
  mutate(
    extra_sheets = mapply(function(fp_val) {
      known <- c("Yield Summary","Fuel Summary","Fertiliser Summary","Fertilizer Summary",
                 "Handwritten Fuel Summary","Sheet5","Sheet4","Sheet1","Sheet1 (2)","Sheet2",
                 "Fertiliser data 2020-25","DATA CAPTURE TEMPLATE")
      extra <- setdiff(tryCatch(readxl::excel_sheets(fp_val), error=function(e)""), known)
      if (length(extra)==0) "—" else paste(extra, collapse="; ")
    }, FILE_REGISTRY$filepath),
    data_gaps = dplyr::case_when(
      yield_rows==0 & fert_rows==0 ~ "⚠ Yield and Fert absent",
      yield_rows==0 ~ "⚠ Yield absent",
      fert_rows ==0 ~ "⚠ Fert absent",
      TRUE ~ "✓ OK")
  )

# =============================================================================
# ── 9.  EXPORT MASTER EXCEL  ──────────────────────────────────────────────────
# =============================================================================
if (params$write_excel) {

  out_path <- file.path(OUTPUT_DIR, "SAS_4842_Master_Farm_Inputs.xlsx")

  # Ensure OUTPUT folder exists (recursive = TRUE handles nested paths)
  if (!dir.create(OUTPUT_DIR, showWarnings=FALSE, recursive=TRUE) &&
      !dir.exists(OUTPUT_DIR)) {
    stop("Cannot create OUTPUT directory: '",
         normalizePath(OUTPUT_DIR, mustWork=FALSE), "'.\n",
         "Check that you have write permission to that location.", call.=FALSE)
  }

  # If a previous version of the file is open in Excel, delete it first.
  # On Windows, an open .xlsx is locked — write_xlsx fails with a permissions
  # error even though you own the file.  Removing it before writing sidesteps
  # the lock (the open Excel window will show a "file not found" notice but
  # will not lose the data it already has loaded).
  if (file.exists(out_path)) {
    del_ok <- tryCatch({ file.remove(out_path); TRUE },
                       warning = function(w) FALSE,
                       error   = function(e) FALSE)
    if (!del_ok) {
      stop(
        "Cannot overwrite: '", normalizePath(out_path), "'\n\n",
        "The file is most likely open in Excel.\n",
        "  → Close Excel (or just the file), then re-render.\n\n",
        "Alternative: change the output filename in the params:\n",
        "    rmarkdown::render(\"your.Rmd\",\n",
        "      params = list(output_dir = \"OUTPUT_v2\"))",
        call.=FALSE)
    }
  }

  write_ok <- tryCatch({
    writexl::write_xlsx(
      list("STRUCTURE AUDIT"   = structure_audit,
           "FUEL MASTER"       = master_fuel,
           "YIELD MASTER"      = master_yield,
           "FERTILISER MASTER" = master_fert,
           "DATA FLAGS"        = master_flags),
      path = out_path
    )
    TRUE
  }, error = function(e) {
    stop(
      "write_xlsx failed: ", conditionMessage(e), "\n\n",
      "Attempted path : ", normalizePath(out_path, mustWork=FALSE), "\n",
      "OUTPUT dir     : ", normalizePath(OUTPUT_DIR), "\n",
      "Dir writable?  : ", file.access(OUTPUT_DIR, mode=2) == 0, "\n\n",
      "Common fixes:\n",
      "  • Close the file in Excel if it is open (Windows file lock)\n",
      "  • Check write permissions on: ", normalizePath(OUTPUT_DIR), "\n",
      "  • Pass a different output_dir param if the default is read-only:\n",
      "      rmarkdown::render(\"your.Rmd\",\n",
      "        params = list(output_dir = \"/writable/path/OUTPUT\"))",
      call.=FALSE)
  })

}

2 Results

2.1 File Registry

FILE_REGISTRY |>
  select(PAI=pai_id, Farm=farm_id, `Source file`=filename) |>
  fmt_tbl(caption="Files matched to farms")
Files matched to farms
PAI Farm Source file
1 Dyt Farms 4842 Dyt Farms Summary Input.xlsx
2 Hettinga Farms 4842 Hettinga Farms Summary Input.xlsx
3 Mineral King Sawtooth 4842 MK Sawtooth Summary Input.xlsx
4 Vander Dussen 4842 Vander Dussen Summary Input.xlsx
5 DCB Farming 4842 DCB Farming Summary Input.xlsx
6 Eric Shannon Farming 4842 Eric Shannon Farming Summary Input.xlsx
7 B&R TeVelde 4842 B&R Tevelde Summary Input.xlsx
8 Ron Colburn 4842 Ron Colburn Summary Input.xlsx
if (length(unmatched) > 0) {
  cat("::: {.warn-box}\n**Unmatched files** (skipped):\n\n")
  cat(paste0("- `", unmatched, "`\n")); cat(":::\n")
} else {
  cat("::: {.ok-box}\nAll files in DATA/ matched to a farm.\n:::\n")
}

All files in DATA/ matched to a farm.

2.2 Parsing Summary

parse_summary |>
  mutate(across(c(Yield,Fuel,Fert),
                ~cell_spec(.x, color=ifelse(.x==0,"#e74c3c","#2c3e50")))) |>
  kbl(escape=FALSE, caption="Rows extracted per farm and data type") |>
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=FALSE, font_size=12) |>
  row_spec(0, bold=TRUE, background="#2c3e50", color="white") |>
  column_spec(3:5, width="80px")
Rows extracted per farm and data type
PAI Farm Yield Fuel Fert
1 Dyt Farms 5 7 43
2 Hettinga Farms 1 0 243
3 Mineral King Sawtooth 0 48 419
4 Vander Dussen 1 9 0
5 DCB Farming 5 0 196
6 Eric Shannon Farming 1 61 166
7 B&R TeVelde 0 28 161
8 Ron Colburn 0 33 0

2.3 Master Table Dimensions

tibble(
  Table   = c("FUEL MASTER","YIELD MASTER","FERTILISER MASTER"),
  Rows    = c(nrow(master_fuel), nrow(master_yield), nrow(master_fert)),
  Columns = c(ncol(master_fuel), ncol(master_yield), ncol(master_fert)),
  Farms   = c(n_distinct(master_fuel$farm_id), n_distinct(master_yield$farm_id),
              n_distinct(master_fert$farm_id)),
  Years   = c(paste(range(master_fuel$Year,  na.rm=TRUE), collapse="–"),
              paste(range(master_yield$Year, na.rm=TRUE), collapse="–"),
              paste(range(master_fert$Year,  na.rm=TRUE), collapse="–"))
) |> fmt_tbl(caption="Master table dimensions")
Master table dimensions
Table Rows Columns Farms Years
FUEL MASTER 186 18 6 2020–2025
YIELD MASTER 13 12 5 2019–2024
FERTILISER MASTER 1228 19 6 2020–2025

3 Master Table Previews

3.1 Fuel

DT::datatable(
  master_fuel |> select(-source_file) |> head(300),
  caption="FUEL MASTER", filter="top", extensions="Buttons",
  options=list(pageLength=10, scrollX=TRUE, dom="Bfrtip", buttons=c("csv","excel")),
  rownames=FALSE)

3.2 Yield

DT::datatable(
  master_yield |> select(-source_file) |> head(300),
  caption="YIELD MASTER", filter="top", extensions="Buttons",
  options=list(pageLength=10, scrollX=TRUE, dom="Bfrtip", buttons=c("csv","excel")),
  rownames=FALSE)

3.3 Fertiliser

DT::datatable(
  master_fert |> select(-source_file) |> head(300),
  caption="FERTILISER MASTER", filter="top", extensions="Buttons",
  options=list(pageLength=10, scrollX=TRUE, dom="Bfrtip", buttons=c("csv","excel")),
  rownames=FALSE)

4 Data Quality

4.1 Tier Definitions

tibble(
  Tier=c("A","B","C","D","E","F"), `U_AD %`=c(5,10,15,25,50,75),
  Description=c("Invoice/receipt; confirmed composition",
                "Source document present; minor unit ambiguity",
                "Blend/composition needs confirmation",
                "Sold quantity (not confirmed consumption); estimated N%",
                "Missing N%; rate/acre only; unconfirmed composition",
                "Unknown unit; no source document")
) |>
  kbl(caption="Data quality tier definitions") |>
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=FALSE, font_size=12) |>
  row_spec(0, bold=TRUE, background="#2c3e50", color="white") |>
  column_spec(1, bold=TRUE, background=TIER_COLS[c("A","B","C","D","E","F")], color="white")
Data quality tier definitions
Tier U_AD % Description
A 5 Invoice/receipt; confirmed composition
B 10 Source document present; minor unit ambiguity
C 15 Blend/composition needs confirmation
D 25 Sold quantity (not confirmed consumption); estimated N%
E 50 Missing N%; rate/acre only; unconfirmed composition
F 75 Unknown unit; no source document

4.2 Tier Distribution — Fuel

master_fuel |>
  count(farm_id, data_tier) |>
  mutate(farm_id   = factor(farm_id,   levels=rev(sort(unique(farm_id)))),
         data_tier = factor(data_tier, levels=c("A","B","C","D","E","F"))) |>
  ggplot(aes(x=n, y=farm_id, fill=data_tier)) +
  geom_col(position="stack", width=0.65) +
  scale_fill_manual(values=TIER_COLS, name="Tier", guide=guide_legend(reverse=TRUE)) +
  scale_x_continuous(expand=expansion(mult=c(0,0.05))) +
  labs(x="Records", y=NULL, title="Fuel Records by Data Quality Tier") +
  theme_minimal(base_size=12) +
  theme(panel.grid.major.y=element_blank(), legend.position="right")
Fuel records by data quality tier per farm

Fuel records by data quality tier per farm

4.3 Tier Distribution — Fertiliser

master_fert |>
  count(farm_id, data_tier) |>
  mutate(farm_id   = factor(farm_id,   levels=rev(sort(unique(farm_id)))),
         data_tier = factor(data_tier, levels=c("A","B","C","D","E","F"))) |>
  ggplot(aes(x=n, y=farm_id, fill=data_tier)) +
  geom_col(position="stack", width=0.65) +
  scale_fill_manual(values=TIER_COLS, name="Tier", guide=guide_legend(reverse=TRUE)) +
  scale_x_continuous(expand=expansion(mult=c(0,0.05))) +
  labs(x="Records", y=NULL, title="Fertiliser Records by Data Quality Tier") +
  theme_minimal(base_size=12) +
  theme(panel.grid.major.y=element_blank(), legend.position="right")
Fertiliser records by data quality tier per farm

Fertiliser records by data quality tier per farm

4.4 Annual Coverage

fuel_cov <- master_fuel |> filter(!is.na(Year)) |> count(farm_id, Year) |>
  mutate(farm_id=factor(farm_id, levels=names(FARM_COLS)), type="Fuel")
fert_cov <- master_fert |> filter(!is.na(Year)) |> count(farm_id, Year) |>
  mutate(farm_id=factor(farm_id, levels=names(FARM_COLS)), type="Fertiliser")

bind_rows(fuel_cov, fert_cov) |>
  ggplot(aes(x=Year, y=farm_id, fill=farm_id, size=n)) +
  geom_point(shape=21, colour="white", stroke=0.4) +
  facet_wrap(~type) +
  scale_fill_manual(values=FARM_COLS, guide="none") +
  scale_size_area(max_size=12, name="Records") +
  scale_x_continuous(breaks=2019:2025) +
  labs(x=NULL, y=NULL) +
  theme_minimal(base_size=11) +
  theme(panel.grid.minor=element_blank(),
        strip.background=element_rect(fill="#2c3e50"),
        strip.text=element_text(colour="white", face="bold"))
Record coverage by farm and year (bubble = record count)

Record coverage by farm and year (bubble = record count)

4.5 Fuel Volume (Tier A & B only)

master_fuel |>
  filter(!is.na(qty_gal), !is.na(Year), data_tier %in% c("A","B")) |>
  group_by(farm_id, Year, fuel_type) |>
  summarise(total_gal=sum(qty_gal, na.rm=TRUE), .groups="drop") |>
  ggplot(aes(x=Year, y=total_gal, fill=fuel_type)) +
  geom_col(width=0.7) +
  facet_wrap(~farm_id, scales="free_y", ncol=2) +
  scale_fill_brewer(palette="Set2", name="Fuel type") +
  scale_y_continuous(labels=scales::comma) +
  scale_x_continuous(breaks=2019:2025) +
  labs(x=NULL, y="Gallons",
       title="Annual Fuel Use by Farm",
       caption="Farms with Gal/Acre-only data (B&R TeVelde, Ron Colburn) excluded") +
  theme_minimal(base_size=11) +
  theme(strip.background=element_rect(fill="#2c3e50"),
        strip.text=element_text(colour="white", face="bold"),
        panel.grid.minor=element_blank())
Annual fuel consumption — invoice-backed records only

Annual fuel consumption — invoice-backed records only


5 Data Flags

n_open <- nrow(master_flags)
if (n_open == 0) {
  cat("::: {.ok-box}\nNo data flags raised.\n:::\n")
} else {
  cat(sprintf("::: {.alert-box}\n**%d flag(s)** require attention before IQC sign-off.\n:::\n", n_open))
  DT::datatable(
    master_flags,
    caption="Open data flags — resolve before PAI calculation",
    filter="top", extensions="Buttons",
    options=list(pageLength=15, scrollX=TRUE, dom="Bfrtip", buttons=c("csv","excel")),
    rownames=FALSE) |>
    DT::formatStyle("farm_id",
                    backgroundColor=DT::styleEqual(names(FARM_COLS), unname(FARM_COLS)),
                    color="white", fontWeight="bold")
}

11 flag(s) require attention before IQC sign-off.


6 Structure Audit

structure_audit |>
  select(-source_filename) |>
  kbl(caption="Structure audit — one row per farm") |>
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=TRUE, font_size=12) |>
  row_spec(0, bold=TRUE, background="#2c3e50", color="white") |>
  column_spec(7,
              color=ifelse(grepl("✓", structure_audit$data_gaps),"#27ae60","#e74c3c"),
              bold=TRUE)
Structure audit — one row per farm
pai_id farm_id fuel_rows yield_rows fert_rows extra_sheets data_gaps
1 Dyt Farms 7 5 43 ✓ OK
2 Hettinga Farms 0 1 243 ✓ OK
3 Mineral King Sawtooth 48 0 419 ⚠ Yield absent
4 Vander Dussen 9 1 0 ⚠ Fert absent
5 DCB Farming 0 5 196 ✓ OK
6 Eric Shannon Farming 61 1 166 ✓ OK
7 B&R TeVelde 28 0 161 ⚠ Yield absent
8 Ron Colburn 33 0 0 ⚠ Yield and Fert absent

7 Export

if (params$write_excel) {
  out_path <- file.path(OUTPUT_DIR, "SAS_4842_Master_Farm_Inputs.xlsx")
  cat(sprintf(paste0("::: {.ok-box}\n**Master Excel written:** `%s`\n\n",
              "Sheets: STRUCTURE AUDIT | FUEL MASTER | YIELD MASTER | FERTILISER MASTER | DATA FLAGS\n\n",
              "Totals: **%d** fuel rows | **%d** yield rows | **%d** fert rows | **%d** flags\n:::"),
      normalizePath(out_path, mustWork=FALSE),
      nrow(master_fuel), nrow(master_yield), nrow(master_fert), nrow(master_flags)))
} else {
  cat("::: {.info-box}\n`write_excel = FALSE` — Excel export skipped (report-only mode).\n:::\n")
}

Master Excel written: C:\Users\Quan\Downloads\OUTPUT\SAS_4842_Master_Farm_Inputs.xlsx

Sheets: STRUCTURE AUDIT | FUEL MASTER | YIELD MASTER | FERTILISER MASTER | DATA FLAGS

Totals: 186 fuel rows | 13 yield rows | 1228 fert rows | 11 flags


8 Session Info

cat(sprintf("R        : %s\n", R.version$version.string))
R        : R version 4.4.1 (2024-06-14 ucrt)
cat(sprintf("Platform : %s\n", R.version$platform))
Platform : x86_64-w64-mingw32
cat(sprintf("Rendered : %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")))
Rendered : 2026-03-30 09:36:12 AEST
cat(sprintf("DATA dir : %s\n\n", normalizePath(DATA_DIR, mustWork=FALSE)))
DATA dir : C:\Users\Quan\Downloads\DATA
pi <- installed.packages()[pkgs, c("Package","Version"), drop=FALSE]
cat(paste(sprintf("  %-16s %s", pi[,"Package"], pi[,"Version"]), collapse="\n"))
  readxl           1.4.5
  writexl          1.5.4
  dplyr            1.1.4
  tidyr            1.3.1
  stringr          1.5.1
  knitr            1.50
  kableExtra       1.4.0
  ggplot2          4.0.0
  scales           1.4.0
  DT               0.34.0