Goal: trial two ML procedures to classify the UPCs into Market Basket categories.
Compare two ML procedures in R (Naive Bayes + TF-IDF; Random Forest + GloVe embeddings).
Use USDA TFP 2021 top categories + add one for “Non-Food Item”.
Nest subcategories in PP-NAP1718 under those top-level categories.
Output for review: food_description, predictions, confidence, and agreement flag.
Verify ~500 sample rows before scaling to UPCs.
DATA SOURCES
USDA PP National Average Prices (PP-NAP1718 sheet, 2017–2018).
USDA Thrifty Food Plan, 2021 (Table 1 categories).
Quanteda Label Assignment + Cosine Similarity
library(quanteda)
# Subcategory corpus/dfm
subcategory_corpus <- corpus(tfp_lookup$MB_Cats)
subcategory_tokens <- tokens(subcategory_corpus, remove_punct=TRUE) %>% tokens_tolower()
subcategory_dfm <- dfm(subcategory_tokens)
# Food description corpus/dfm
food_corpus <- corpus(pp_nap$food_description)
food_tokens <- tokens(food_corpus, remove_punct=TRUE) %>% tokens_tolower()
food_dfm <- dfm(food_tokens)
# Align features for cosine similarity
similarity_matrix <- textstat_simil(food_dfm, subcategory_dfm, margin="documents", method="cosine")
# Assign best-matching top-level category (via subcategory similarity)
sim_matrix <- as.matrix(similarity_matrix)
best_match_idx <- as.integer(apply(sim_matrix, 1, which.max))
matched_labels <- tfp_lookup$tfp_category[best_match_idx]
pp_nap$tfp_category[which(!is.na(pp_nap$food_description))] <- matched_labels
# Join back to ensure every description has a label where possible
food_descriptions <- pp_nap %>%
filter(!is.na(food_description)) %>% distinct(food_description)
food_descriptions$tfp_category <- matched_labels
pp_nap <- left_join(pp_nap, food_descriptions, by="food_description") %>%
select(-tfp_category.x) %>% rename(tfp_category=tfp_category.y)
pp_nap$tfp_category[is.na(pp_nap$tfp_category)] <- "Unknown"
Category Frequencies
pp_nap %>% count(tfp_category, sort=TRUE) %>%
knitr::kable("html", caption="TFP Category Frequencies") %>%
kableExtra::kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))
| tfp_category | n |
|---|---|
| Vegetables | 2029 |
| Dairy | 1073 |
| Miscellaneous | 687 |
| Fruits | 281 |
| Grains | 228 |
| Protein foods | 137 |
Training Data Construction for Models
# Labeled rows for modeling
nb_data <- pp_nap %>%
filter(!is.na(food_description), !is.na(tfp_category)) %>%
distinct(food_description, tfp_category)
# Quanteda corpus/dfm for NB
nb_corpus <- corpus(nb_data, text_field="food_description")
docvars(nb_corpus,"food_description") <- nb_data$food_description
docvars(nb_corpus,"tfp_category") <- nb_data$tfp_category
nb_tokens <- tokens(nb_corpus, remove_punct=TRUE, remove_symbols=TRUE) %>%
tokens_tolower() %>% tokens_remove(stopwords("en"))
nb_dfm <- dfm(nb_tokens)
MODELS Model 1: Naive Bayes + TF-IDF (baseline)
set.seed(123)
train_idx <- sample(1:ndoc(nb_dfm), size=0.8*ndoc(nb_dfm))
dfm_train <- nb_dfm[train_idx, ]
dfm_test <- nb_dfm[-train_idx, ]
labels_train <- docvars(dfm_train,"tfp_category")
labels_test <- docvars(dfm_test,"tfp_category")
# Train NB
nb_model <- textmodel_nb(dfm_train, labels_train)
# Predict on test
nb_preds <- predict(nb_model, newdata=dfm_test)
# Accuracy + confusion
nb_accuracy <- mean(nb_preds == labels_test)
cat("Naive Bayes Accuracy (holdout):", round(nb_accuracy, 3), "\n")
## Naive Bayes Accuracy (holdout): 0.66
nb_cm <- caret::confusionMatrix(factor(nb_preds), factor(labels_test))
print(nb_cm$table)
## Reference
## Prediction Dairy Fruits Grains Miscellaneous Protein foods Vegetables
## Dairy 192 4 0 12 2 40
## Fruits 3 45 2 10 0 28
## Grains 3 2 25 10 2 17
## Miscellaneous 12 1 3 74 3 46
## Protein foods 2 1 1 1 10 32
## Vegetables 20 2 5 33 5 239
Model 2: Random Forest + GloVe Embeddings
# Train/test split for RF using the same labeled frame (nb_data)
set.seed(123)
rf_split <- initial_split(nb_data, prop=0.8)
train_data <- training(rf_split)
test_data <- testing(rf_split)
# Determine needed words from test set (to filter GloVe for efficiency)
target_text <- test_data %>% pull(food_description)
needed_words <- unique(unlist(stringr::str_split(tolower(target_text), "\\s+")))
needed_words <- needed_words[nchar(needed_words) > 0]
# Load pre-trained GloVe (filtered)
glove_file <- "G:/.shortcut-targets-by-id/10hwxlrEnEox7VqS6tvo44Q8rX59qZcSg/Drones_MV/GITHUB/ISER/MJones/FOOD_SECURITY/FOOD_PRICING/USDA_ML/USDA_ML_Data/dolma_300_2024_1.2M.100_combined.txt"
cache_rds <- "glove_dolma_filtered_300d.rds"
if (file.exists(cache_rds)) {
glove_vectors <- readRDS(cache_rds)
} else {
con <- file(glove_file,"r"); on.exit(close(con))
keep <- list(); keep_words <- character(0); i <- 0L
repeat {
line <- readLines(con, n=10000, warn=FALSE); if (length(line)==0) break
spl <- strsplit(line, " ", fixed=TRUE)
for (s in spl) {
if (length(s) != 301) next
w <- s[[1]]
if (!(w %in% needed_words)) next
vec <- suppressWarnings(as.numeric(s[-1])); if (anyNA(vec)) next
i <- i + 1L; keep[[i]] <- vec; keep_words <- c(keep_words, w)
}
}
glove_vectors <- do.call(rbind, keep); rownames(glove_vectors) <- keep_words
saveRDS(glove_vectors, cache_rds)
}
# Embedding helpers
get_avg_embedding <- function(txt, embeddings) {
words <- stringr::str_split(tolower(txt %||% ""), "\\s+")[[1]]
valid <- intersect(words, rownames(embeddings))
if (length(valid)==0) return(rep(0, ncol(embeddings)))
colMeans(embeddings[valid,,drop=FALSE])
}
embed_text_vec <- function(text_vec, embeddings) {
mat <- t(vapply(text_vec, function(z) get_avg_embedding(z, embeddings), numeric(ncol(embeddings))))
as.data.frame(mat)
}
# Train RF on training set embeddings
rf_train_X <- embed_text_vec(train_data$food_description, glove_vectors)
rf_train_y <- factor(train_data$tfp_category)
set.seed(123)
rf_model <- randomForest::randomForest(
x = rf_train_X, y = rf_train_y, ntree = 300, mtry = floor(sqrt(ncol(rf_train_X)))
)
# Evaluate RF on its test set
rf_test_X <- embed_text_vec(test_data$food_description, glove_vectors)
rf_probs_test <- predict(rf_model, rf_test_X, type = "prob")
rf_preds_test <- predict(rf_model, rf_test_X)
rf_out <- tibble(
food_description = test_data$food_description,
tfp_category = test_data$tfp_category,
tfp_basket_cat_ML2 = rf_preds_test,
tfp_basket_cat_ML2_conf = apply(rf_probs_test, 1, max)
)
rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
cat("Random Forest Accuracy (holdout):", round(rf_accuracy, 3), "\n")
## Random Forest Accuracy (holdout): 0.729
rf_cm <- caret::confusionMatrix(
factor(rf_out$tfp_basket_cat_ML2),
factor(rf_out$tfp_category, levels = union(levels(factor(rf_out$tfp_category)), levels(factor(rf_out$tfp_basket_cat_ML2))))
)
print(rf_cm$table)
## Reference
## Prediction Dairy Fruits Grains Miscellaneous Protein foods Vegetables
## Dairy 172 6 1 13 0 7
## Fruits 1 23 0 0 0 1
## Grains 0 1 14 1 1 1
## Miscellaneous 4 2 0 49 1 8
## Protein foods 0 1 0 0 4 0
## Vegetables 55 22 21 77 16 385
Diagnostics Helpers
# ===== Diagnostics helpers (coverage + pretty prob table) =====
count_covered_tokens <- function(text_vec, embeddings) {
vapply(text_vec, function(txt) {
words <- stringr::str_split(tolower(txt %||% ""), "\\s+")[[1]]
sum(words %in% rownames(embeddings))
}, integer(1))
}
show_rf_probs <- function(text_vec, embeddings, model, top_n = 6) {
# Embed and predict
X <- embed_text_vec(text_vec, embeddings)
prob <- predict(model, X, type = "prob")
pred <- colnames(prob)[max.col(prob, ties.method = "first")]
conf <- apply(prob, 1, max)
covc <- count_covered_tokens(text_vec, embeddings)
# Build per-row "top_n" probability strings for quick eyeballing
topk <- apply(prob, 1, function(row) {
s <- sort(row, decreasing = TRUE)
s <- head(s, min(top_n, length(s)))
paste(paste0(names(s), "=", sprintf("%.3f", s)), collapse = " | ")
})
tibble::tibble(
text = text_vec,
rf_pred = pred,
rf_confidence = round(conf, 3),
rf_cov_tokens = covc,
rf_top_probs = topk
)
}
# ===== MODEL DIAGNOSTICS: fit/accuracy health check =====
cat("\n=== RF Holdout Accuracy ===\n")
##
## === RF Holdout Accuracy ===
rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
print(round(rf_accuracy, 3))
## [1] 0.729
rf_cm <- caret::confusionMatrix(
factor(rf_out$tfp_basket_cat_ML2),
factor(rf_out$tfp_category,
levels = union(levels(factor(rf_out$tfp_category)),
levels(factor(rf_out$tfp_basket_cat_ML2))))
)
cat("\n=== RF Confusion Matrix (Holdout) ===\n")
##
## === RF Confusion Matrix (Holdout) ===
print(rf_cm$table)
## Reference
## Prediction Dairy Fruits Grains Miscellaneous Protein foods Vegetables
## Dairy 172 6 1 13 0 7
## Fruits 1 23 0 0 0 1
## Grains 0 1 14 1 1 1
## Miscellaneous 4 2 0 49 1 8
## Protein foods 0 1 0 0 4 0
## Vegetables 55 22 21 77 16 385
cat("\n=== RF Per-Class Accuracy (Holdout) ===\n")
##
## === RF Per-Class Accuracy (Holdout) ===
rf_out %>%
mutate(correct = tfp_basket_cat_ML2 == tfp_category) %>%
group_by(tfp_category) %>%
summarise(n = n(), accuracy = mean(correct), .groups = "drop") %>%
arrange(accuracy) %>%
print(n = Inf)
## # A tibble: 6 × 3
## tfp_category n accuracy
## <chr> <int> <dbl>
## 1 Protein foods 22 0.182
## 2 Miscellaneous 140 0.35
## 3 Grains 36 0.389
## 4 Fruits 55 0.418
## 5 Dairy 232 0.741
## 6 Vegetables 402 0.958
# ===== SANITY CHECKS: clearly-typed items (with coverage + top probs) =====
sanity_items <- c("sea bass", "salmon", "cheddar cheese", "milk",
"apple", "broccoli", "white bread")
sanity_tbl <- show_rf_probs(sanity_items, glove_vectors, rf_model, top_n = 6)
cat("\n=== RF Sanity Spot Checks ===\n")
##
## === RF Sanity Spot Checks ===
print(sanity_tbl)
## # A tibble: 7 × 5
## text rf_pred rf_confidence rf_cov_tokens rf_top_probs
## <chr> <chr> <dbl> <int> <chr>
## 1 sea bass Vegetables 0.417 1 Vegetables=0.417 | Misc…
## 2 salmon Vegetables 0.5 1 Vegetables=0.500 | Misc…
## 3 cheddar cheese Dairy 0.677 2 Dairy=0.677 | Vegetable…
## 4 milk Grains 0.657 1 Grains=0.657 | Dairy=0.…
## 5 apple Vegetables 0.77 1 Vegetables=0.770 | Misc…
## 6 broccoli Vegetables 0.477 1 Vegetables=0.477 | Misc…
## 7 white bread Vegetables 0.373 2 Vegetables=0.373 | Misc…
# Specific printout for 'sea bass' row (easy to find)
if ("sea bass" %in% sanity_items) {
sb_row <- dplyr::filter(sanity_tbl, text == "sea bass")
cat("\n--- Focus: 'sea bass' ---\n")
print(sb_row)
if (sb_row$rf_cov_tokens < 2) {
cat("\n[Note] 'sea bass' has very low embedding coverage (rf_cov_tokens < 2).\n",
"Consider adding coverage-aware prediction or augmenting GloVe vocab.\n")
}
}
##
## --- Focus: 'sea bass' ---
## # A tibble: 1 × 5
## text rf_pred rf_confidence rf_cov_tokens rf_top_probs
## <chr> <chr> <dbl> <int> <chr>
## 1 sea bass Vegetables 0.417 1 Vegetables=0.417 | Miscellane…
##
## [Note] 'sea bass' has very low embedding coverage (rf_cov_tokens < 2).
## Consider adding coverage-aware prediction or augmenting GloVe vocab.
# ===== MODEL DIAGNOSTICS: fit/accuracy health check =====
cat("\n=== RF Holdout Accuracy ===\n")
##
## === RF Holdout Accuracy ===
rf_accuracy <- mean(rf_out$tfp_basket_cat_ML2 == rf_out$tfp_category)
print(round(rf_accuracy, 3))
## [1] 0.729
rf_cm <- caret::confusionMatrix(
factor(rf_out$tfp_basket_cat_ML2),
factor(rf_out$tfp_category,
levels = union(levels(factor(rf_out$tfp_category)),
levels(factor(rf_out$tfp_basket_cat_ML2))))
)
cat("\n=== RF Confusion Matrix (Holdout) ===\n")
##
## === RF Confusion Matrix (Holdout) ===
print(rf_cm$table)
## Reference
## Prediction Dairy Fruits Grains Miscellaneous Protein foods Vegetables
## Dairy 172 6 1 13 0 7
## Fruits 1 23 0 0 0 1
## Grains 0 1 14 1 1 1
## Miscellaneous 4 2 0 49 1 8
## Protein foods 0 1 0 0 4 0
## Vegetables 55 22 21 77 16 385
cat("\n=== RF Per-Class Accuracy (Holdout) ===\n")
##
## === RF Per-Class Accuracy (Holdout) ===
rf_out %>%
mutate(correct = tfp_basket_cat_ML2 == tfp_category) %>%
group_by(tfp_category) %>%
summarise(n = n(), accuracy = mean(correct), .groups = "drop") %>%
arrange(accuracy) %>%
print(n = Inf)
## # A tibble: 6 × 3
## tfp_category n accuracy
## <chr> <int> <dbl>
## 1 Protein foods 22 0.182
## 2 Miscellaneous 140 0.35
## 3 Grains 36 0.389
## 4 Fruits 55 0.418
## 5 Dairy 232 0.741
## 6 Vegetables 402 0.958
MODEL COMPARISON & VISUALIZATION
comparison_df <- tibble::tibble(
Model = c("Naive Bayes (TF-IDF)", "Random Forest (GloVe)"),
Accuracy = c(nb_accuracy, rf_accuracy)
)
knitr::kable(comparison_df, "html", caption = "Model Accuracy Summary") %>%
kableExtra::kable_styling(bootstrap_options=c("striped","hover","condensed","responsive"))
| Model | Accuracy |
|---|---|
| Naive Bayes (TF-IDF) | 0.6595265 |
| Random Forest (GloVe) | 0.7294250 |
# NB plot: distribution of predictions vs actual (NB uses dfm_test / labels_test)
nb_plot_df <- tibble(actual = as.character(labels_test), pred = as.character(nb_preds))
p1 <- ggplot(nb_plot_df, aes(x = actual, fill = pred)) +
geom_bar(position = "dodge") + theme_minimal() +
labs(title = "Naive Bayes Predictions vs Actual", x = "Actual Category", fill = "Predicted") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# RF plot: distribution on rf_out
p2 <- ggplot(rf_out, aes(x = tfp_category, fill = tfp_basket_cat_ML2)) +
geom_bar(position = "dodge") + theme_minimal() +
labs(title = "Random Forest Predictions vs Actual", x = "Actual Category", fill = "Predicted") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p1); print(p2)
6.1 Create 500-Row Reviewer Batch (Predictions + Confidence + Agreement)
# Reuse rf_predict_with_conf helper
rf_predict_with_conf <- function(text_vec, embeddings, model, threshold = 0.0) {
X <- embed_text_vec(text_vec, embeddings)
prob <- predict(model, X, type = "prob")
pred <- colnames(prob)[max.col(prob, ties.method = "first")]
conf <- apply(prob, 1, max)
pred_thr <- ifelse(conf >= threshold, pred, "Low-Confidence")
tibble(
tfp_basket_cat_ML2 = pred_thr,
tfp_basket_cat_ML2_conf = conf
)
}
set.seed(2025)
# Candidate pool with usable descriptions
candidates <- pp_nap %>%
filter(!is.na(food_description) & nchar(food_description) > 0)
n_review <- min(500, nrow(candidates))
review_batch <- candidates %>% slice_sample(n = n_review)
# RF predictions with configurable threshold (default 0.60)
preds_rf <- rf_predict_with_conf(
text_vec = review_batch$food_description,
embeddings = glove_vectors,
model = rf_model,
threshold = 0.60
)
review_batch_out <- review_batch %>%
transmute(
food_description,
tfp_category = tfp_category # ground truth if present for USDA rows
) %>%
bind_cols(preds_rf)
# NB overlap (optional): include if NB objects exist
if (exists("nb_model") && exists("dfm_train")) {
nb_dfm_review <- dfm(tokens(corpus(review_batch$food_description), remove_punct = TRUE) %>% tokens_tolower())
nb_dfm_review <- dfm_match(nb_dfm_review, features = featnames(dfm_train))
nb_preds_review <- predict(nb_model, newdata = nb_dfm_review)
nb_conf_review <- apply(predict(nb_model, newdata = nb_dfm_review, type = "prob"), 1, max)
review_batch_out <- review_batch_out %>%
mutate(
tfp_basket_cat_ML1 = as.character(nb_preds_review),
tfp_basket_cat_ML1_conf = nb_conf_review
)
} else {
review_batch_out <- review_batch_out %>%
mutate(
tfp_basket_cat_ML1 = NA_character_,
tfp_basket_cat_ML1_conf = NA_real_
)
}
# Agreement flag + blank manual verification
review_batch_out <- review_batch_out %>%
mutate(
ML_model_agreement = as.integer(tfp_basket_cat_ML1 == tfp_basket_cat_ML2),
manual_verification = NA_character_
)
# Export reviewer file (exact columns MJ requested)
reviewer_csv <- review_batch_out %>%
select(food_description,
tfp_basket_cat_ML1, tfp_basket_cat_ML1_conf,
tfp_basket_cat_ML2, tfp_basket_cat_ML2_conf,
ML_model_agreement,
manual_verification)
readr::write_csv(reviewer_csv, "upc_review_batch_500.csv")
# Preview first 20
knitr::kable(head(reviewer_csv, 20), "html", caption = "Preview: Reviewer Batch (first 20)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped","hover","condensed","responsive"))
| food_description | tfp_basket_cat_ML1 | tfp_basket_cat_ML1_conf | tfp_basket_cat_ML2 | tfp_basket_cat_ML2_conf | ML_model_agreement | manual_verification |
|---|---|---|---|---|---|---|
| Sea bass, baked or broiled, fat added (Includes sauteed; fried with no coating; grouper; striped bass; wreakfish; bass, NFS; NS as to fat) | Dairy | 0.9981215 | Dairy | 0.8466667 | 1 | NA |
| Breadsticks, soft, from frozen (Includes homemade; school cafeteria (K-12); garlic knot) | Miscellaneous | 0.9432947 | Low-Confidence | 0.3633333 | 0 | NA |
| Ice cream cone, chocolate, prepackaged (Includes with nuts) | Miscellaneous | 0.8324897 | Low-Confidence | 0.5933333 | 0 | NA |
| Shrimp and clams in tomato-based sauce, with noodles, frozen meal (Includes Budget Gourmet Linguini with Shrimp and Clams Marinara) | Vegetables | 0.9819825 | Low-Confidence | 0.5833333 | 0 | NA |
| Infant formula, powder, made with baby water (Enfamil Newborn) (Includes made with Nursery water; Gerber Pure water; Beech-Nut Spring water; Enfamil PREMIUM Newborn) | Vegetables | 0.9999998 | Vegetables | 0.9100000 | 1 | NA |
| Pork and vegetables excluding carrots, broccoli, and dark-green leafy; no potatoes, tomato-based sauce | Vegetables | 1.0000000 | Vegetables | 0.9666667 | 1 | NA |
| Vegetable noodle soup, reduced sodium, canned, prepared with water or ready-to-serve (Includes Campbell’s Healthy Request and Select Harvest varieties; Progresso Light and Reduced Sodium varieties) | Miscellaneous | 0.9964942 | Miscellaneous | 0.6766667 | 1 | NA |
| Bread, Cuban, toasted (Includes Spanish; Portuguese; pre-sliced or not) | Protein foods | 0.4885441 | Vegetables | 0.6433333 | 0 | NA |
| Chicken or turkey vegetable soup with noodles, stew type, chunky style, canned or ready-to-serve | Vegetables | 0.9982100 | Vegetables | 0.7033333 | 1 | NA |
| Goat, boiled | Vegetables | 0.3234620 | Vegetables | 0.8900000 | 1 | NA |
| Coffee, instant, decaffeinated, pre-lightened and pre-sweetened with sugar, reconsitituted (Includes powdered mix; Maxwell House International, flavors other than chocolate, cocoa, or mocha) | Miscellaneous | 0.9999969 | Miscellaneous | 0.8333333 | 1 | NA |
| Congee (Includes Conjee; rice porridge or gruel; Asian) | Protein foods | 0.5720587 | Low-Confidence | 0.3666667 | 0 | NA |
| Pizza, cheese, from school lunch, medium crust (Includes NS as to toppings; original or NS as to type of crust) | Dairy | 0.9392463 | Dairy | 0.7300000 | 1 | NA |
| Almonds, flavored (Includes all flavors other than honey roasted) | Miscellaneous | 0.6784770 | Vegetables | 0.8000000 | 0 | NA |
| Chicken breast, baked, coated, skin / coating not eaten (Includes broiled or roasted with coating; NS as to prepared with skin; NS as to coated or uncoated; any source) | Vegetables | 0.9999987 | Vegetables | 0.9366667 | 1 | NA |
| Dosa (Indian), with filling (Includes Indian pancake or crepe; Masala Dosa; any filling; any source) | Vegetables | 0.9919578 | Vegetables | 0.8333333 | 1 | NA |
| Chicken or turkey, potatoes, and vegetables excluding carrots, broccoli, and dark-green leafy; cream sauce, white sauce, or mushroom sauce | Vegetables | 1.0000000 | Vegetables | 0.9866667 | 1 | NA |
| Macaroni or noodles with cheese and egg | Dairy | 0.9327330 | Dairy | 0.8066667 | 1 | NA |
| Popcorn, microwave, kettle corn (Includes sugar flavored) | Vegetables | 0.6408330 | Vegetables | 0.7933333 | 1 | NA |
| Potato chips, restructured, reduced fat, lightly salted (Includes reduced fat Pringles; potato crisps; reduced fat and reduced sodium; low fat) | Dairy | 1.0000000 | Dairy | 0.6000000 | 1 | NA |
6.2 Inside-Batch Split (~300/200) & Misclassification Deep-Dive
# Keep rows with ground truth and concrete RF predictions
batch_labeled <- review_batch_out %>%
filter(!is.na(tfp_category), tfp_basket_cat_ML2 != "Low-Confidence")
set.seed(2025)
split_idx <- sample(seq_len(nrow(batch_labeled)), size = floor(0.6 * nrow(batch_labeled)))
batch_train <- batch_labeled[split_idx, ]
batch_test <- batch_labeled[-split_idx, ]
# Quick confusion for RF on ~200 test subset
rf_cm_batch <- caret::confusionMatrix(
factor(batch_test$tfp_basket_cat_ML2),
factor(batch_test$tfp_category,
levels = union(levels(factor(batch_labeled$tfp_category)),
levels(factor(batch_labeled$tfp_basket_cat_ML2))))
)
cat("RF Accuracy on ~200 test (inside-review batch):", round(rf_cm_batch$overall["Accuracy"], 3), "\n")
## RF Accuracy on ~200 test (inside-review batch): 0.988
# Misclassification deep-dive
mis <- batch_test %>%
filter(tfp_basket_cat_ML2 != tfp_category) %>%
mutate(
token_count = stringr::str_count(food_description, "\\S+"),
has_number = stringr::str_detect(food_description, "\\d"),
has_size = stringr::str_detect(food_description, "\\b(oz|lb|lbs|g|kg|ml|l)\\b")
)
top_confusions <- mis %>%
count(tfp_category, tfp_basket_cat_ML2, sort = TRUE) %>%
head(15)
cat("\nTop confusion pairs (truth vs predicted):\n")
##
## Top confusion pairs (truth vs predicted):
print(top_confusions)
## # A tibble: 2 × 3
## tfp_category tfp_basket_cat_ML2 n
## <chr> <chr> <int>
## 1 Miscellaneous Vegetables 1
## 2 Vegetables Dairy 1
cat("\nAmbiguity indicators among misclassifications:\n")
##
## Ambiguity indicators among misclassifications:
summary_df <- mis %>%
summarise(
n = n(),
median_tokens = median(token_count, na.rm = TRUE),
pct_with_number = mean(has_number, na.rm = TRUE),
pct_with_size = mean(has_size, na.rm = TRUE)
)
print(summary_df)
## # A tibble: 1 × 4
## n median_tokens pct_with_number pct_with_size
## <int> <dbl> <dbl> <dbl>
## 1 2 10 0 0
6.3 Overfitting Guard: 5-Fold Cross Validation for RF–GloVe
set.seed(42)
# Use labeled USDA rows for CV
cv_df <- nb_data %>% filter(!is.na(food_description), !is.na(tfp_category))
folds <- rsample::vfold_cv(cv_df, v = 5, strata = tfp_category)
cv_acc <- purrr::map_dbl(folds$splits, function(s) {
train <- rsample::analysis(s)
test <- rsample::assessment(s)
Xtr <- embed_text_vec(train$food_description, glove_vectors)
ytr <- factor(train$tfp_category)
Xte <- embed_text_vec(test$food_description, glove_vectors)
yte <- factor(test$tfp_category, levels = levels(ytr))
mdl <- randomForest::randomForest(x = Xtr, y = ytr, ntree = 300, mtry = floor(sqrt(ncol(Xtr))))
pred <- predict(mdl, Xte)
mean(pred == yte, na.rm = TRUE)
})
cat("5-fold CV Accuracy (RF-GloVe) — mean:", round(mean(cv_acc), 3),
" | sd:", round(sd(cv_acc), 3), "\n")
## 5-fold CV Accuracy (RF-GloVe) — mean: 0.735 | sd: 0.016
6.4 (Optional) Low-Confidence Exclusion Toggle for Reviewer Export
# Keep only RF predictions with conf >= 0.60 and not flagged as "Low-Confidence"
keep <- review_batch_out$tfp_basket_cat_ML2_conf >= 0.60 &
review_batch_out$tfp_basket_cat_ML2 != "Low-Confidence"
reviewer_csv_strict <- review_batch_out %>%
filter(keep) %>%
select(food_description,
tfp_basket_cat_ML1, tfp_basket_cat_ML1_conf,
tfp_basket_cat_ML2, tfp_basket_cat_ML2_conf,
ML_model_agreement,
manual_verification)
readr::write_csv(reviewer_csv_strict, "upc_review_batch_500_high_conf.csv")
knitr::kable(head(reviewer_csv_strict, 20), "html",
caption = "Preview: Reviewer Batch (High-Confidence, first 20)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped","hover","condensed","responsive"))
| food_description | tfp_basket_cat_ML1 | tfp_basket_cat_ML1_conf | tfp_basket_cat_ML2 | tfp_basket_cat_ML2_conf | ML_model_agreement | manual_verification |
|---|---|---|---|---|---|---|
| Sea bass, baked or broiled, fat added (Includes sauteed; fried with no coating; grouper; striped bass; wreakfish; bass, NFS; NS as to fat) | Dairy | 0.9981215 | Dairy | 0.8466667 | 1 | NA |
| Infant formula, powder, made with baby water (Enfamil Newborn) (Includes made with Nursery water; Gerber Pure water; Beech-Nut Spring water; Enfamil PREMIUM Newborn) | Vegetables | 0.9999998 | Vegetables | 0.9100000 | 1 | NA |
| Pork and vegetables excluding carrots, broccoli, and dark-green leafy; no potatoes, tomato-based sauce | Vegetables | 1.0000000 | Vegetables | 0.9666667 | 1 | NA |
| Vegetable noodle soup, reduced sodium, canned, prepared with water or ready-to-serve (Includes Campbell’s Healthy Request and Select Harvest varieties; Progresso Light and Reduced Sodium varieties) | Miscellaneous | 0.9964942 | Miscellaneous | 0.6766667 | 1 | NA |
| Bread, Cuban, toasted (Includes Spanish; Portuguese; pre-sliced or not) | Protein foods | 0.4885441 | Vegetables | 0.6433333 | 0 | NA |
| Chicken or turkey vegetable soup with noodles, stew type, chunky style, canned or ready-to-serve | Vegetables | 0.9982100 | Vegetables | 0.7033333 | 1 | NA |
| Goat, boiled | Vegetables | 0.3234620 | Vegetables | 0.8900000 | 1 | NA |
| Coffee, instant, decaffeinated, pre-lightened and pre-sweetened with sugar, reconsitituted (Includes powdered mix; Maxwell House International, flavors other than chocolate, cocoa, or mocha) | Miscellaneous | 0.9999969 | Miscellaneous | 0.8333333 | 1 | NA |
| Pizza, cheese, from school lunch, medium crust (Includes NS as to toppings; original or NS as to type of crust) | Dairy | 0.9392463 | Dairy | 0.7300000 | 1 | NA |
| Almonds, flavored (Includes all flavors other than honey roasted) | Miscellaneous | 0.6784770 | Vegetables | 0.8000000 | 0 | NA |
| Chicken breast, baked, coated, skin / coating not eaten (Includes broiled or roasted with coating; NS as to prepared with skin; NS as to coated or uncoated; any source) | Vegetables | 0.9999987 | Vegetables | 0.9366667 | 1 | NA |
| Dosa (Indian), with filling (Includes Indian pancake or crepe; Masala Dosa; any filling; any source) | Vegetables | 0.9919578 | Vegetables | 0.8333333 | 1 | NA |
| Chicken or turkey, potatoes, and vegetables excluding carrots, broccoli, and dark-green leafy; cream sauce, white sauce, or mushroom sauce | Vegetables | 1.0000000 | Vegetables | 0.9866667 | 1 | NA |
| Macaroni or noodles with cheese and egg | Dairy | 0.9327330 | Dairy | 0.8066667 | 1 | NA |
| Popcorn, microwave, kettle corn (Includes sugar flavored) | Vegetables | 0.6408330 | Vegetables | 0.7933333 | 1 | NA |
| Potato chips, restructured, reduced fat, lightly salted (Includes reduced fat Pringles; potato crisps; reduced fat and reduced sodium; low fat) | Dairy | 1.0000000 | Dairy | 0.6000000 | 1 | NA |
| Breadsticks, soft, with parmesan cheese, from frozen (Includes homemade; school cafeteria (K-12); NS as to cheese kind) | Dairy | 0.9986839 | Dairy | 0.8366667 | 1 | NA |
| Pistachio nuts, salted (Includes NS as to salt; sea salt) | Protein foods | 0.9985744 | Protein foods | 0.7400000 | 1 | NA |
| Lamb or mutton stew with potatoes and vegetables excluding carrots, broccoli, and dark-green leafy; tomato-based sauce | Vegetables | 1.0000000 | Vegetables | 0.9533333 | 1 | NA |
| Coffee, instant, pre-lightened and pre-sweetened with low calorie sweetener, reconstituted (Includes powdered mix; sugar free; Maxwell House International Sugar Free Coffee; flavors other than chocolate, cocoa, or mocha) | Miscellaneous | 0.9999999 | Miscellaneous | 0.9166667 | 1 | NA |