1 OVERVIEW

Goal: trial two ML procedures to classify the UPCs into Market Basket categories.

Compare two ML procedures in R (Naive Bayes + TF-IDF; Random Forest + GloVe embeddings).

Use USDA TFP 2021 top categories + add one for “Non-Food Item”.

Nest subcategories in PP-NAP1718 under those top-level categories.

Output for review: food_description, predictions, confidence, and agreement flag.

Verify ~500 sample rows before scaling to UPCs.

DATA SOURCES

USDA PP National Average Prices (PP-NAP1718 sheet, 2017–2018).

USDA Thrifty Food Plan, 2021 (Table 1 categories).

2 METHODOLOGY

2.1 Data Preparation

Quanteda Label Assignment + Cosine Similarity

library(quanteda)

# Subcategory corpus/dfm
subcategory_corpus <- corpus(tfp_lookup$MB_Cats)
subcategory_tokens <- tokens(subcategory_corpus, remove_punct=TRUE) %>% tokens_tolower()
subcategory_dfm <- dfm(subcategory_tokens)

# Food description corpus/dfm
food_corpus <- corpus(pp_nap$food_description)
food_tokens <- tokens(food_corpus, remove_punct=TRUE) %>% tokens_tolower()
food_dfm <- dfm(food_tokens)

# Align features for cosine similarity

similarity_matrix <- textstat_simil(food_dfm, subcategory_dfm, margin="documents", method="cosine")

# Assign best-matching top-level category (via subcategory similarity)
sim_matrix <- as.matrix(similarity_matrix)
best_match_idx <- as.integer(apply(sim_matrix, 1, which.max))
matched_labels <- tfp_lookup$tfp_category[best_match_idx]

pp_nap$tfp_category[which(!is.na(pp_nap$food_description))] <- matched_labels

# Join back to ensure every description has a label where possible
food_descriptions <- pp_nap %>%
  filter(!is.na(food_description)) %>% distinct(food_description)
food_descriptions$tfp_category <- matched_labels

pp_nap <- left_join(pp_nap, food_descriptions, by="food_description") %>%
  select(-tfp_category.x) %>% rename(tfp_category=tfp_category.y)

pp_nap$tfp_category[is.na(pp_nap$tfp_category)] <- "Unknown"

Category Frequencies

pp_nap %>% count(tfp_category, sort=TRUE) %>%
  knitr::kable("html", caption="TFP Category Frequencies") %>%
  kableExtra::kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))

TFP Category Frequencies
tfp_category	n
Vegetables	2029
Dairy	1073
Miscellaneous	687
Fruits	281
Grains	228
Protein foods	137

Training Data Construction for Models

# Labeled rows for modeling
nb_data <- pp_nap %>%
  filter(!is.na(food_description), !is.na(tfp_category)) %>%
  distinct(food_description, tfp_category)

# Quanteda corpus/dfm for NB
nb_corpus <- corpus(nb_data, text_field="food_description")
docvars(nb_corpus,"food_description") <- nb_data$food_description
docvars(nb_corpus,"tfp_category")     <- nb_data$tfp_category

nb_tokens <- tokens(nb_corpus, remove_punct=TRUE, remove_symbols=TRUE) %>%
  tokens_tolower() %>% tokens_remove(stopwords("en"))
nb_dfm <- dfm(nb_tokens)

MODELS Model 1: Naive Bayes + TF-IDF (baseline)

set.seed(123)
train_idx <- sample(1:ndoc(nb_dfm), size=0.8*ndoc(nb_dfm))

dfm_train <- nb_dfm[train_idx, ]
dfm_test  <- nb_dfm[-train_idx, ]

labels_train <- docvars(dfm_train,"tfp_category")
labels_test  <- docvars(dfm_test,"tfp_category")

# Train NB
nb_model <- textmodel_nb(dfm_train, labels_train)

# Predict on test
nb_preds <- predict(nb_model, newdata=dfm_test)

# Accuracy + confusion
nb_accuracy <- mean(nb_preds == labels_test)
cat("Naive Bayes Accuracy (holdout):", round(nb_accuracy, 3), "\n")

## Naive Bayes Accuracy (holdout): 0.66

nb_cm <- caret::confusionMatrix(factor(nb_preds), factor(labels_test))
print(nb_cm$table)

##                Reference
## Prediction      Dairy Fruits Grains Miscellaneous Protein foods Vegetables
##   Dairy           192      4      0            12             2         40
##   Fruits            3     45      2            10             0         28
##   Grains            3      2     25            10             2         17
##   Miscellaneous    12      1      3            74             3         46
##   Protein foods     2      1      1             1            10         32
##   Vegetables       20      2      5            33             5        239

Model 2: Random Forest + GloVe Embeddings

# Train/test split for RF using the same labeled frame (nb_data)
set.seed(123)
rf_split <- initial_split(nb_data, prop=0.8)
train_data <- training(rf_split)
test_data  <- testing(rf_split)

# Determine needed words from test set (to filter GloVe for efficiency)
target_text <- test_data %>% pull(food_description)
needed_words <- unique(unlist(stringr::str_split(tolower(target_text), "\\s+")))
needed_words <- needed_words[nchar(needed_words) > 0]

# Load pre-trained GloVe (filtered)
glove_file <- "G:/.shortcut-targets-by-id/10hwxlrEnEox7VqS6tvo44Q8rX59qZcSg/Drones_MV/GITHUB/ISER/MJones/FOOD_SECURITY/FOOD_PRICING/USDA_ML/USDA_ML_Data/dolma_300_2024_1.2M.100_combined.txt"
cache_rds  <- "glove_dolma_filtered_300d.rds"

if (file.exists(cache_rds)) {
  glove_vectors <- readRDS(cache_rds)
} else {
  con <- file(glove_file,"r"); on.exit(close(con))
  keep <- list(); keep_words <- character(0); i <- 0L
  repeat {
    line <- readLines(con, n=10000, warn=FALSE); if (length(line)==0) break
    spl <- strsplit(line, " ", fixed=TRUE)
    for (s in spl) {
      if (length(s) != 301) next
      w <- s[[1]]
      if (!(w %in% needed_words)) next
      vec <- suppressWarnings(as.numeric(s[-1])); if (anyNA(vec)) next
      i <- i + 1L; keep[[i]] <- vec; keep_words <- c(keep_words, w)
    }
  }
  glove_vectors <- do.call(rbind, keep); rownames(glove_vectors) <- keep_words
  saveRDS(glove_vectors, cache_rds)
}

# Embedding helpers
get_avg_embedding <- function(txt, embeddings) {
  words <- stringr::str_split(tolower(txt %||% ""), "\\s+")[[1]]
  valid <- intersect(words, rownames(embeddings))
  if (length(valid)==0) return(rep(0, ncol(embeddings)))
  colMeans(embeddings[valid,,drop=FALSE])
}
embed_text_vec <- function(text_vec, embeddings) {
  mat <- t(vapply(text_vec, function(z) get_avg_embedding(z, embeddings), numeric(ncol(embeddings))))
  as.data.frame(mat)
}

# Train RF on training set embeddings
rf_train_X <- embed_text_vec(train_data$food_description, glove_vectors)
rf_train_y <- factor(train_data$tfp_category)

set.seed(123)
rf_model <- randomForest::randomForest(
  x = rf_train_X, y = rf_train_y, ntree = 300, mtry = floor(sqrt(ncol(rf_train_X)))
)

# Evaluate RF on its test set
rf_test_X <- embed_text_vec(test_data$food_description, glove_vectors)
rf_probs_test <- predict(rf_model, rf_test_X, type = "prob")
rf_preds_test <- predict(rf_model, rf_test_X)

rf_out <- tibble(
  food_description        = test_data$food_description,
  tfp_category            = test_data$tfp_category,
  tfp_basket_cat_ML2      = rf_preds_test,
  tfp_basket_cat_ML2_conf = apply(rf_probs_test, 1, max)
)

rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
cat("Random Forest Accuracy (holdout):", round(rf_accuracy, 3), "\n")

## Random Forest Accuracy (holdout): 0.729

rf_cm <- caret::confusionMatrix(
  factor(rf_out$tfp_basket_cat_ML2),
  factor(rf_out$tfp_category, levels = union(levels(factor(rf_out$tfp_category)), levels(factor(rf_out$tfp_basket_cat_ML2))))
)
print(rf_cm$table)

##                Reference
## Prediction      Dairy Fruits Grains Miscellaneous Protein foods Vegetables
##   Dairy           172      6      1            13             0          7
##   Fruits            1     23      0             0             0          1
##   Grains            0      1     14             1             1          1
##   Miscellaneous     4      2      0            49             1          8
##   Protein foods     0      1      0             0             4          0
##   Vegetables       55     22     21            77            16        385

Diagnostics Helpers

# ===== Diagnostics helpers (coverage + pretty prob table) =====
count_covered_tokens <- function(text_vec, embeddings) {
  vapply(text_vec, function(txt) {
    words <- stringr::str_split(tolower(txt %||% ""), "\\s+")[[1]]
    sum(words %in% rownames(embeddings))
  }, integer(1))
}

show_rf_probs <- function(text_vec, embeddings, model, top_n = 6) {
  # Embed and predict
  X <- embed_text_vec(text_vec, embeddings)
  prob <- predict(model, X, type = "prob")
  pred <- colnames(prob)[max.col(prob, ties.method = "first")]
  conf <- apply(prob, 1, max)
  covc <- count_covered_tokens(text_vec, embeddings)

  # Build per-row "top_n" probability strings for quick eyeballing
  topk <- apply(prob, 1, function(row) {
    s <- sort(row, decreasing = TRUE)
    s <- head(s, min(top_n, length(s)))
    paste(paste0(names(s), "=", sprintf("%.3f", s)), collapse = " | ")
  })

  tibble::tibble(
    text             = text_vec,
    rf_pred          = pred,
    rf_confidence    = round(conf, 3),
    rf_cov_tokens    = covc,
    rf_top_probs     = topk
  )
}

3 Model Diagnostics

# ===== MODEL DIAGNOSTICS: fit/accuracy health check =====

cat("\n=== RF Holdout Accuracy ===\n")

## 
## === RF Holdout Accuracy ===

rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
print(round(rf_accuracy, 3))

## [1] 0.729

rf_cm <- caret::confusionMatrix(
  factor(rf_out$tfp_basket_cat_ML2),
  factor(rf_out$tfp_category,
         levels = union(levels(factor(rf_out$tfp_category)),
                        levels(factor(rf_out$tfp_basket_cat_ML2))))
)

cat("\n=== RF Confusion Matrix (Holdout) ===\n")

## 
## === RF Confusion Matrix (Holdout) ===

print(rf_cm$table)

##                Reference
## Prediction      Dairy Fruits Grains Miscellaneous Protein foods Vegetables
##   Dairy           172      6      1            13             0          7
##   Fruits            1     23      0             0             0          1
##   Grains            0      1     14             1             1          1
##   Miscellaneous     4      2      0            49             1          8
##   Protein foods     0      1      0             0             4          0
##   Vegetables       55     22     21            77            16        385

cat("\n=== RF Per-Class Accuracy (Holdout) ===\n")

## 
## === RF Per-Class Accuracy (Holdout) ===

rf_out %>%
  mutate(correct = tfp_basket_cat_ML2 == tfp_category) %>%
  group_by(tfp_category) %>%
  summarise(n = n(), accuracy = mean(correct), .groups = "drop") %>%
  arrange(accuracy) %>%
  print(n = Inf)

## # A tibble: 6 × 3
##   tfp_category      n accuracy
##   <chr>         <int>    <dbl>
## 1 Protein foods    22    0.182
## 2 Miscellaneous   140    0.35 
## 3 Grains           36    0.389
## 4 Fruits           55    0.418
## 5 Dairy           232    0.741
## 6 Vegetables      402    0.958

4 Sanity Check

# ===== SANITY CHECKS: clearly-typed items (with coverage + top probs) =====
sanity_items <- c("sea bass", "salmon", "cheddar cheese", "milk",
                  "apple", "broccoli", "white bread")

sanity_tbl <- show_rf_probs(sanity_items, glove_vectors, rf_model, top_n = 6)
cat("\n=== RF Sanity Spot Checks ===\n")

## 
## === RF Sanity Spot Checks ===

print(sanity_tbl)

## # A tibble: 7 × 5
##   text           rf_pred    rf_confidence rf_cov_tokens rf_top_probs            
##   <chr>          <chr>              <dbl>         <int> <chr>                   
## 1 sea bass       Vegetables         0.417             1 Vegetables=0.417 | Misc…
## 2 salmon         Vegetables         0.5               1 Vegetables=0.500 | Misc…
## 3 cheddar cheese Dairy              0.677             2 Dairy=0.677 | Vegetable…
## 4 milk           Grains             0.657             1 Grains=0.657 | Dairy=0.…
## 5 apple          Vegetables         0.77              1 Vegetables=0.770 | Misc…
## 6 broccoli       Vegetables         0.477             1 Vegetables=0.477 | Misc…
## 7 white bread    Vegetables         0.373             2 Vegetables=0.373 | Misc…

# Specific printout for 'sea bass' row (easy to find)
if ("sea bass" %in% sanity_items) {
  sb_row <- dplyr::filter(sanity_tbl, text == "sea bass")
  cat("\n--- Focus: 'sea bass' ---\n")
  print(sb_row)
  if (sb_row$rf_cov_tokens < 2) {
    cat("\n[Note] 'sea bass' has very low embedding coverage (rf_cov_tokens < 2).\n",
        "Consider adding coverage-aware prediction or augmenting GloVe vocab.\n")
  }
}

## 
## --- Focus: 'sea bass' ---
## # A tibble: 1 × 5
##   text     rf_pred    rf_confidence rf_cov_tokens rf_top_probs                  
##   <chr>    <chr>              <dbl>         <int> <chr>                         
## 1 sea bass Vegetables         0.417             1 Vegetables=0.417 | Miscellane…
## 
## [Note] 'sea bass' has very low embedding coverage (rf_cov_tokens < 2).
##  Consider adding coverage-aware prediction or augmenting GloVe vocab.

# ===== MODEL DIAGNOSTICS: fit/accuracy health check =====

cat("\n=== RF Holdout Accuracy ===\n")

## 
## === RF Holdout Accuracy ===

rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
print(round(rf_accuracy, 3))

## [1] 0.729

rf_cm <- caret::confusionMatrix(
  factor(rf_out$tfp_basket_cat_ML2),
  factor(rf_out$tfp_category,
         levels = union(levels(factor(rf_out$tfp_category)),
                        levels(factor(rf_out$tfp_basket_cat_ML2))))
)

cat("\n=== RF Confusion Matrix (Holdout) ===\n")

## 
## === RF Confusion Matrix (Holdout) ===

print(rf_cm$table)

##                Reference
## Prediction      Dairy Fruits Grains Miscellaneous Protein foods Vegetables
##   Dairy           172      6      1            13             0          7
##   Fruits            1     23      0             0             0          1
##   Grains            0      1     14             1             1          1
##   Miscellaneous     4      2      0            49             1          8
##   Protein foods     0      1      0             0             4          0
##   Vegetables       55     22     21            77            16        385

cat("\n=== RF Per-Class Accuracy (Holdout) ===\n")

## 
## === RF Per-Class Accuracy (Holdout) ===

rf_out %>%
  mutate(correct = tfp_basket_cat_ML2 == tfp_category) %>%
  group_by(tfp_category) %>%
  summarise(n = n(), accuracy = mean(correct), .groups = "drop") %>%
  arrange(accuracy) %>%
  print(n = Inf)

## # A tibble: 6 × 3
##   tfp_category      n accuracy
##   <chr>         <int>    <dbl>
## 1 Protein foods    22    0.182
## 2 Miscellaneous   140    0.35 
## 3 Grains           36    0.389
## 4 Fruits           55    0.418
## 5 Dairy           232    0.741
## 6 Vegetables      402    0.958

MODEL COMPARISON & VISUALIZATION

comparison_df <- tibble::tibble(
  Model    = c("Naive Bayes (TF-IDF)", "Random Forest (GloVe)"),
  Accuracy = c(nb_accuracy, rf_accuracy)
)
knitr::kable(comparison_df, "html", caption = "Model Accuracy Summary") %>%
  kableExtra::kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))

Model Accuracy Summary
Model	Accuracy
Naive Bayes (TF-IDF)	0.6595265
Random Forest (GloVe)	0.7294250

  # NB plot: distribution of predictions vs actual (NB uses dfm_test / labels_test)
nb_plot_df <- tibble(actual = as.character(labels_test), pred = as.character(nb_preds))
p1 <- ggplot(nb_plot_df, aes(x = actual, fill = pred)) +
  geom_bar(position = "dodge") + theme_minimal() +
  labs(title = "Naive Bayes Predictions vs Actual", x = "Actual Category", fill = "Predicted") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# RF plot: distribution on rf_out
p2 <- ggplot(rf_out, aes(x = tfp_category, fill = tfp_basket_cat_ML2)) +
  geom_bar(position = "dodge") + theme_minimal() +
  labs(title = "Random Forest Predictions vs Actual", x = "Actual Category", fill = "Predicted") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p1); print(p2)

5 EXTERNAL VALIDATION & REVIEWER BATCH

6.1 Create 500-Row Reviewer Batch (Predictions + Confidence + Agreement)

# Reuse rf_predict_with_conf helper
rf_predict_with_conf <- function(text_vec, embeddings, model, threshold = 0.0) {
  X <- embed_text_vec(text_vec, embeddings)
  prob <- predict(model, X, type = "prob")
  pred <- colnames(prob)[max.col(prob, ties.method = "first")]
  conf <- apply(prob, 1, max)
  pred_thr <- ifelse(conf >= threshold, pred, "Low-Confidence")
  tibble(
    tfp_basket_cat_ML2      = pred_thr,
    tfp_basket_cat_ML2_conf = conf
  )
}

set.seed(2025)

# Candidate pool with usable descriptions
candidates <- pp_nap %>%
  filter(!is.na(food_description) & nchar(food_description) > 0)

n_review <- min(500, nrow(candidates))
review_batch <- candidates %>% slice_sample(n = n_review)

# RF predictions with configurable threshold (default 0.60)
preds_rf <- rf_predict_with_conf(
  text_vec   = review_batch$food_description,
  embeddings = glove_vectors,
  model      = rf_model,
  threshold  = 0.60
)

review_batch_out <- review_batch %>%
  transmute(
    food_description,
    tfp_category = tfp_category  # ground truth if present for USDA rows
  ) %>%
  bind_cols(preds_rf)

# NB overlap (optional): include if NB objects exist
if (exists("nb_model") && exists("dfm_train")) {
  nb_dfm_review <- dfm(tokens(corpus(review_batch$food_description), remove_punct = TRUE) %>% tokens_tolower())
  nb_dfm_review <- dfm_match(nb_dfm_review, features = featnames(dfm_train))
  nb_preds_review <- predict(nb_model, newdata = nb_dfm_review)
  nb_conf_review  <- apply(predict(nb_model, newdata = nb_dfm_review, type = "prob"), 1, max)
  review_batch_out <- review_batch_out %>%
    mutate(
      tfp_basket_cat_ML1      = as.character(nb_preds_review),
      tfp_basket_cat_ML1_conf = nb_conf_review
    )
} else {
  review_batch_out <- review_batch_out %>%
    mutate(
      tfp_basket_cat_ML1      = NA_character_,
      tfp_basket_cat_ML1_conf = NA_real_
    )
}

# Agreement flag + blank manual verification
review_batch_out <- review_batch_out %>%
  mutate(
    ML_model_agreement = as.integer(tfp_basket_cat_ML1 == tfp_basket_cat_ML2),
    manual_verification = NA_character_
  )

# Export reviewer file (exact columns MJ requested)
reviewer_csv <- review_batch_out %>%
  select(food_description,
         tfp_basket_cat_ML1, tfp_basket_cat_ML1_conf,
         tfp_basket_cat_ML2, tfp_basket_cat_ML2_conf,
         ML_model_agreement,
         manual_verification)

readr::write_csv(reviewer_csv, "upc_review_batch_500.csv")

# Preview first 20
knitr::kable(head(reviewer_csv, 20), "html", caption = "Preview: Reviewer Batch (first 20)") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","hover","condensed","responsive"))

Preview: Reviewer Batch (first 20)
food_description	tfp_basket_cat_ML1	tfp_basket_cat_ML1_conf	tfp_basket_cat_ML2	tfp_basket_cat_ML2_conf	ML_model_agreement	manual_verification
Sea bass, baked or broiled, fat added (Includes sauteed; fried with no coating; grouper; striped bass; wreakfish; bass, NFS; NS as to fat)	Dairy	0.9981215	Dairy	0.8466667	1	NA
Breadsticks, soft, from frozen (Includes homemade; school cafeteria (K-12); garlic knot)	Miscellaneous	0.9432947	Low-Confidence	0.3633333	0	NA
Ice cream cone, chocolate, prepackaged (Includes with nuts)	Miscellaneous	0.8324897	Low-Confidence	0.5933333	0	NA
Shrimp and clams in tomato-based sauce, with noodles, frozen meal (Includes Budget Gourmet Linguini with Shrimp and Clams Marinara)	Vegetables	0.9819825	Low-Confidence	0.5833333	0	NA
Infant formula, powder, made with baby water (Enfamil Newborn) (Includes made with Nursery water; Gerber Pure water; Beech-Nut Spring water; Enfamil PREMIUM Newborn)	Vegetables	0.9999998	Vegetables	0.9100000	1	NA
Pork and vegetables excluding carrots, broccoli, and dark-green leafy; no potatoes, tomato-based sauce	Vegetables	1.0000000	Vegetables	0.9666667	1	NA
Vegetable noodle soup, reduced sodium, canned, prepared with water or ready-to-serve (Includes Campbell’s Healthy Request and Select Harvest varieties; Progresso Light and Reduced Sodium varieties)	Miscellaneous	0.9964942	Miscellaneous	0.6766667	1	NA
Bread, Cuban, toasted (Includes Spanish; Portuguese; pre-sliced or not)	Protein foods	0.4885441	Vegetables	0.6433333	0	NA
Chicken or turkey vegetable soup with noodles, stew type, chunky style, canned or ready-to-serve	Vegetables	0.9982100	Vegetables	0.7033333	1	NA
Goat, boiled	Vegetables	0.3234620	Vegetables	0.8900000	1	NA
Coffee, instant, decaffeinated, pre-lightened and pre-sweetened with sugar, reconsitituted (Includes powdered mix; Maxwell House International, flavors other than chocolate, cocoa, or mocha)	Miscellaneous	0.9999969	Miscellaneous	0.8333333	1	NA
Congee (Includes Conjee; rice porridge or gruel; Asian)	Protein foods	0.5720587	Low-Confidence	0.3666667	0	NA
Pizza, cheese, from school lunch, medium crust (Includes NS as to toppings; original or NS as to type of crust)	Dairy	0.9392463	Dairy	0.7300000	1	NA
Almonds, flavored (Includes all flavors other than honey roasted)	Miscellaneous	0.6784770	Vegetables	0.8000000	0	NA
Chicken breast, baked, coated, skin / coating not eaten (Includes broiled or roasted with coating; NS as to prepared with skin; NS as to coated or uncoated; any source)	Vegetables	0.9999987	Vegetables	0.9366667	1	NA
Dosa (Indian), with filling (Includes Indian pancake or crepe; Masala Dosa; any filling; any source)	Vegetables	0.9919578	Vegetables	0.8333333	1	NA
Chicken or turkey, potatoes, and vegetables excluding carrots, broccoli, and dark-green leafy; cream sauce, white sauce, or mushroom sauce	Vegetables	1.0000000	Vegetables	0.9866667	1	NA
Macaroni or noodles with cheese and egg	Dairy	0.9327330	Dairy	0.8066667	1	NA
Popcorn, microwave, kettle corn (Includes sugar flavored)	Vegetables	0.6408330	Vegetables	0.7933333	1	NA
Potato chips, restructured, reduced fat, lightly salted (Includes reduced fat Pringles; potato crisps; reduced fat and reduced sodium; low fat)	Dairy	1.0000000	Dairy	0.6000000	1	NA

6.2 Inside-Batch Split (~300/200) & Misclassification Deep-Dive

# Keep rows with ground truth and concrete RF predictions
batch_labeled <- review_batch_out %>%
  filter(!is.na(tfp_category), tfp_basket_cat_ML2 != "Low-Confidence")

set.seed(2025)
split_idx <- sample(seq_len(nrow(batch_labeled)), size = floor(0.6 * nrow(batch_labeled)))
batch_train <- batch_labeled[split_idx, ]
batch_test  <- batch_labeled[-split_idx, ]

# Quick confusion for RF on ~200 test subset
rf_cm_batch <- caret::confusionMatrix(
  factor(batch_test$tfp_basket_cat_ML2),
  factor(batch_test$tfp_category,
         levels = union(levels(factor(batch_labeled$tfp_category)),
                        levels(factor(batch_labeled$tfp_basket_cat_ML2))))
)

cat("RF Accuracy on ~200 test (inside-review batch):", round(rf_cm_batch$overall["Accuracy"], 3), "\n")

## RF Accuracy on ~200 test (inside-review batch): 0.988

# Misclassification deep-dive
mis <- batch_test %>%
  filter(tfp_basket_cat_ML2 != tfp_category) %>%
  mutate(
    token_count = stringr::str_count(food_description, "\\S+"),
    has_number  = stringr::str_detect(food_description, "\\d"),
    has_size    = stringr::str_detect(food_description, "\\b(oz|lb|lbs|g|kg|ml|l)\\b")
  )

top_confusions <- mis %>%
  count(tfp_category, tfp_basket_cat_ML2, sort = TRUE) %>%
  head(15)

cat("\nTop confusion pairs (truth vs predicted):\n")

## 
## Top confusion pairs (truth vs predicted):

print(top_confusions)

## # A tibble: 2 × 3
##   tfp_category  tfp_basket_cat_ML2     n
##   <chr>         <chr>              <int>
## 1 Miscellaneous Vegetables             1
## 2 Vegetables    Dairy                  1

cat("\nAmbiguity indicators among misclassifications:\n")

## 
## Ambiguity indicators among misclassifications:

summary_df <- mis %>%
  summarise(
    n = n(),
    median_tokens   = median(token_count, na.rm = TRUE),
    pct_with_number = mean(has_number, na.rm = TRUE),
    pct_with_size   = mean(has_size,   na.rm = TRUE)
  )
print(summary_df)

## # A tibble: 1 × 4
##       n median_tokens pct_with_number pct_with_size
##   <int>         <dbl>           <dbl>         <dbl>
## 1     2            10               0             0

6.3 Overfitting Guard: 5-Fold Cross Validation for RF–GloVe

set.seed(42)

# Use labeled USDA rows for CV
cv_df <- nb_data %>% filter(!is.na(food_description), !is.na(tfp_category))

folds <- rsample::vfold_cv(cv_df, v = 5, strata = tfp_category)

cv_acc <- purrr::map_dbl(folds$splits, function(s) {
  train <- rsample::analysis(s)
  test  <- rsample::assessment(s)

  Xtr <- embed_text_vec(train$food_description, glove_vectors)
  ytr <- factor(train$tfp_category)

  Xte <- embed_text_vec(test$food_description, glove_vectors)
  yte <- factor(test$tfp_category, levels = levels(ytr))

  mdl <- randomForest::randomForest(x = Xtr, y = ytr, ntree = 300, mtry = floor(sqrt(ncol(Xtr))))
  pred <- predict(mdl, Xte)
  mean(pred == yte, na.rm = TRUE)
})

cat("5-fold CV Accuracy (RF-GloVe) — mean:", round(mean(cv_acc), 3),
    " | sd:", round(sd(cv_acc), 3), "\n")

## 5-fold CV Accuracy (RF-GloVe) — mean: 0.735  | sd: 0.016

6.4 (Optional) Low-Confidence Exclusion Toggle for Reviewer Export

# Keep only RF predictions with conf >= 0.60 and not flagged as "Low-Confidence"
keep <- review_batch_out$tfp_basket_cat_ML2_conf >= 0.60 &
        review_batch_out$tfp_basket_cat_ML2 != "Low-Confidence"

reviewer_csv_strict <- review_batch_out %>%
  filter(keep) %>%
  select(food_description,
         tfp_basket_cat_ML1, tfp_basket_cat_ML1_conf,
         tfp_basket_cat_ML2, tfp_basket_cat_ML2_conf,
         ML_model_agreement,
         manual_verification)

readr::write_csv(reviewer_csv_strict, "upc_review_batch_500_high_conf.csv")

knitr::kable(head(reviewer_csv_strict, 20), "html",
             caption = "Preview: Reviewer Batch (High-Confidence, first 20)") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","hover","condensed","responsive"))

Preview: Reviewer Batch (High-Confidence, first 20)
food_description	tfp_basket_cat_ML1	tfp_basket_cat_ML1_conf	tfp_basket_cat_ML2	tfp_basket_cat_ML2_conf	ML_model_agreement	manual_verification
Sea bass, baked or broiled, fat added (Includes sauteed; fried with no coating; grouper; striped bass; wreakfish; bass, NFS; NS as to fat)	Dairy	0.9981215	Dairy	0.8466667	1	NA
Infant formula, powder, made with baby water (Enfamil Newborn) (Includes made with Nursery water; Gerber Pure water; Beech-Nut Spring water; Enfamil PREMIUM Newborn)	Vegetables	0.9999998	Vegetables	0.9100000	1	NA
Pork and vegetables excluding carrots, broccoli, and dark-green leafy; no potatoes, tomato-based sauce	Vegetables	1.0000000	Vegetables	0.9666667	1	NA
Vegetable noodle soup, reduced sodium, canned, prepared with water or ready-to-serve (Includes Campbell’s Healthy Request and Select Harvest varieties; Progresso Light and Reduced Sodium varieties)	Miscellaneous	0.9964942	Miscellaneous	0.6766667	1	NA
Bread, Cuban, toasted (Includes Spanish; Portuguese; pre-sliced or not)	Protein foods	0.4885441	Vegetables	0.6433333	0	NA
Chicken or turkey vegetable soup with noodles, stew type, chunky style, canned or ready-to-serve	Vegetables	0.9982100	Vegetables	0.7033333	1	NA
Goat, boiled	Vegetables	0.3234620	Vegetables	0.8900000	1	NA
Coffee, instant, decaffeinated, pre-lightened and pre-sweetened with sugar, reconsitituted (Includes powdered mix; Maxwell House International, flavors other than chocolate, cocoa, or mocha)	Miscellaneous	0.9999969	Miscellaneous	0.8333333	1	NA
Pizza, cheese, from school lunch, medium crust (Includes NS as to toppings; original or NS as to type of crust)	Dairy	0.9392463	Dairy	0.7300000	1	NA
Almonds, flavored (Includes all flavors other than honey roasted)	Miscellaneous	0.6784770	Vegetables	0.8000000	0	NA
Chicken breast, baked, coated, skin / coating not eaten (Includes broiled or roasted with coating; NS as to prepared with skin; NS as to coated or uncoated; any source)	Vegetables	0.9999987	Vegetables	0.9366667	1	NA
Dosa (Indian), with filling (Includes Indian pancake or crepe; Masala Dosa; any filling; any source)	Vegetables	0.9919578	Vegetables	0.8333333	1	NA
Chicken or turkey, potatoes, and vegetables excluding carrots, broccoli, and dark-green leafy; cream sauce, white sauce, or mushroom sauce	Vegetables	1.0000000	Vegetables	0.9866667	1	NA
Macaroni or noodles with cheese and egg	Dairy	0.9327330	Dairy	0.8066667	1	NA
Popcorn, microwave, kettle corn (Includes sugar flavored)	Vegetables	0.6408330	Vegetables	0.7933333	1	NA
Potato chips, restructured, reduced fat, lightly salted (Includes reduced fat Pringles; potato crisps; reduced fat and reduced sodium; low fat)	Dairy	1.0000000	Dairy	0.6000000	1	NA
Breadsticks, soft, with parmesan cheese, from frozen (Includes homemade; school cafeteria (K-12); NS as to cheese kind)	Dairy	0.9986839	Dairy	0.8366667	1	NA
Pistachio nuts, salted (Includes NS as to salt; sea salt)	Protein foods	0.9985744	Protein foods	0.7400000	1	NA
Lamb or mutton stew with potatoes and vegetables excluding carrots, broccoli, and dark-green leafy; tomato-based sauce	Vegetables	1.0000000	Vegetables	0.9533333	1	NA
Coffee, instant, pre-lightened and pre-sweetened with low calorie sweetener, reconstituted (Includes powdered mix; sugar free; Maxwell House International Sugar Free Coffee; flavors other than chocolate, cocoa, or mocha)	Miscellaneous	0.9999999	Miscellaneous	0.9166667	1	NA

USDA TFP Market Basket Classification

V.L.Collier, M.S.Jones

2025-09-02